In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pickle
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Base directory for the input CSV files; every path below is built from it.
datadir = '~/data/'

# train_dev is only referenced by a commented-out line in build_train_data.
# NOTE(review): presumably a smaller development sample of train.csv - confirm.
train_dev = datadir + 'train_rand_10.csv'
train = datadir + 'train.csv'

In [None]:
# Read the training CSV and key every row by its MachineIdentifier.
train_df = pd.read_csv(train).set_index('MachineIdentifier')
train_df.shape

In [None]:
# Load train_bias_weights.csv and keep only the rows whose 'prob' exceeds 0.6.
bias_df = pd.read_csv(datadir + 'train_bias_weights.csv').set_index('MachineIdentifier')
bias_df = bias_df.loc[bias_df['prob'] > 0.6]

# Restrict the training frame to the machines that survived the filter; 'prob'
# is only needed for the filtering, so it is dropped again after the join.
train_df = (
    train_df
    .merge(bias_df, how='inner', left_index=True, right_index=True)
    .drop('prob', axis=1)
)
train_df.shape, bias_df.shape

In [None]:
y = 'HasDetections'

#Organize columns into groups by type
# select_dtypes lists the numeric columns without computing the summary
# statistics that describe() would calculate over the whole frame.
numeric_cols = list(train_df.select_dtypes(include=[np.number]).columns)
if y not in numeric_cols:
    # The target must never leak into the string feature group below.
    raise ValueError('target column {!r} is not a numeric column'.format(y))

x_ident = [x for x in numeric_cols if 'Identifier' in x] #Some numeric columns actually IDs
ident_cols = set(x_ident)

# List comprehensions (rather than set arithmetic) keep the original column
# order, so the feature layout is identical from one kernel session to the next.
x_numeric = [x for x in numeric_cols if x not in ident_cols and x != y] #Keep only true numerics
numeric_non_ident = set(numeric_cols) - ident_cols
# Everything that is not a true numeric: string columns plus the numeric IDs.
x_str = [x for x in train_df.columns.values if x not in numeric_non_ident]

In [None]:
# Sanity check: number of numeric feature columns vs. columns handled by the string path.
len(x_numeric), len(x_str)

Things to do:
- create string and numeric matrices
- impute missing values for numeric
- create string transformations 
- feature selection
- modeling

In [None]:
'''
This block will be data transformations only - to be packaged up into a function
'''
from sklearn.metrics  import mutual_info_score

#Helper functions

def cond_join(c):
    """Flatten a two-level column label into a single string.

    ``c`` is indexed as ``(top, sub)``. When ``sub`` is empty only ``top`` is
    returned; otherwise the two parts are joined with an underscore.
    """
    top, sub = c[0], c[1]
    return top if len(sub) == 0 else '{}_{}'.format(top, sub)
    
def entropy(p):
    """Binary (Bernoulli) entropy in nats of a probability ``p``.

    Uses the convention ``0 * log(0) == 0`` so that ``entropy(0)`` and
    ``entropy(1)`` are exactly 0 instead of NaN. A NaN here would make every
    downstream comparison (e.g. ``relent > rel_thresh``) False and silently
    discard perfectly pure categories.

    Parameters
    ----------
    p : float or array-like of float
        Probability (or probabilities) of the positive class.

    Returns
    -------
    numpy.float64 for scalar input, numpy.ndarray for array-like input.
    NaN inputs still propagate as NaN.
    """
    q = 1 - p
    # log(0) and 0 * -inf are evaluated before np.where discards them, so the
    # floating-point warnings they trigger are silenced locally.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_term = np.where(p == 0, 0.0, p * np.log(p))
        q_term = np.where(q == 0, 0.0, q * np.log(q))
    # Adding 0.0 turns a possible -0.0 into 0.0 and changes nothing else.
    h = -(p_term + q_term) + 0.0
    return np.float64(h) if np.ndim(h) == 0 else h

# Entropy of the mean of the target column over the whole training frame;
# rel_row_entropy measures each category's entropy relative to this baseline.
base_rate_entropy = entropy(train_df[y].mean())

def rel_row_entropy(row):
    """Relative entropy reduction of one grouped row versus the base rate.

    ``row`` must expose ``row.HasDetections['mean']`` (the mean of the target
    within the group). The result is the fraction by which the group's entropy
    falls below ``base_rate_entropy``.
    """
    group_entropy = entropy(row.HasDetections['mean'])
    entropy_drop = base_rate_entropy - group_entropy
    return entropy_drop / base_rate_entropy


#feature transformation functions

def clean_numeric(df, filler=None, columns=None):
    """Select the numeric feature columns and fill their missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the requested columns. It is not modified.
    filler : mapping or pandas.Series, optional
        Per-column fill values (typically the medians learned on the training
        data). When None, the column medians of ``df`` itself are used.
    columns : list of str, optional
        Columns to keep. Defaults to the module-level ``x_numeric`` list.

    Returns
    -------
    (df_numeric, medians) : the filled numeric frame and the fill values that
    were applied, so the same values can be reused on other data.
    """
    if columns is None:
        columns = x_numeric
    df_numeric = df[list(columns)]

    medians = df_numeric.median() if filler is None else filler

    # fillna returns a new frame; this avoids the in-place update of a column
    # slice, which pandas treats as a chained assignment.
    return df_numeric.fillna(medians), medians

def get_booleans(df, y, x, rel_thresh=0.001, n_thresh=2000):
    '''
    Feature construction/selection for one categorical column:
    - compute the entropy of the target within each category of ``x``
    - keep a category when its entropy reduction relative to
      ``base_rate_entropy`` exceeds ``rel_thresh`` AND it has more than
      ``n_thresh`` rows
    - never return all K categories (the last one is dropped in that case) so
      the resulting indicator columns are not perfectly collinear

    The statistics are computed on column ``y`` as passed in, instead of a
    hard-coded 'HasDetections' label, and in one vectorised step rather than a
    row-wise ``apply``. The formula is the same as in ``rel_row_entropy``.

    Parameters:
        df         - frame holding columns ``x`` and ``y``
        y          - name of the target column
        x          - name of the categorical column to analyse
        rel_thresh - minimum relative entropy reduction for a category
        n_thresh   - minimum (exclusive) number of rows for a category

    Returns:
        list of the selected category values, in sorted group order.
    '''
    # One row per category: group size and mean of the target.
    grp_stats = df.groupby(x)[y].agg(['size', 'mean'])

    relent = (base_rate_entropy - entropy(grp_stats['mean'].values)) / base_rate_entropy
    # NaN relative entropies compare as False and are therefore never selected.
    keep = np.asarray(relent > rel_thresh) & np.asarray(grp_stats['size'].values > n_thresh)

    n_cats = grp_stats.shape[0]
    chosen = list(grp_stats.index.values[keep])
    if n_cats == len(chosen):
        return chosen[:-1]
    else:
        return chosen
    
def get_booleans_all_x(df):
    """Select the indicator categories for every column listed in ``x_str``.

    Returns a dict ``{column: {position: category_value}}`` where the positions
    number the categories chosen by ``get_booleans`` starting at 0. The
    'MachineIdentifier' label is skipped.
    """
    id_label = 'MachineIdentifier'
    return {
        col: dict(enumerate(get_booleans(df, y, col)))
        for col in x_str
        if col != id_label
    }

def transform_strings(df, str_trans):
    """Build the 0/1 indicator matrix described by ``str_trans``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``str_trans``.
    str_trans : dict
        ``{column: {position: category_value}}`` with positions ``0..n-1``,
        as produced by ``get_booleans_all_x``.

    Returns
    -------
    pandas.DataFrame indexed like ``df`` with one integer column
    ``'<column>_<position>'`` per selected category (1 where the column equals
    that category, else 0). Columns with an empty mapping contribute nothing.
    """
    # Collect the indicators first and build the frame once; inserting columns
    # one at a time into a DataFrame fragments it and gets slow for wide outputs.
    indicators = {}
    for x, loc_dict in str_trans.items():
        for i in range(len(loc_dict)):
            indicators[x + '_{}'.format(i)] = 1 * (df[x] == loc_dict[i]).values

    if not indicators:
        return pd.DataFrame(index=df.index)
    return pd.DataFrame(indicators, index=df.index)
        
#Training 
def build_train_data(train_df):
    """Turn the raw training frame into model-ready features.

    Returns ``(features, target, string_mappings, fill_values)``: the numeric
    columns (missing values filled) joined on the index with the indicator
    columns, the target series, and the two fitted artefacts needed to apply
    the same transformation to other data.
    """
    string_mappings = get_booleans_all_x(train_df)
    indicator_part = transform_strings(train_df, string_mappings)
    numeric_part, fill_values = clean_numeric(train_df)
    features = numeric_part.merge(indicator_part, how='inner',
                                  left_index=True, right_index=True)
    return features, train_df[y], string_mappings, fill_values
    
#Test
def build_test_data(test_file, fillna_train, str_trans_train):
    """Read a test CSV and apply the transformations fitted on the training data.

    ``fillna_train`` supplies the numeric fill values and ``str_trans_train``
    the category mappings. Returns the numeric columns joined on the index with
    the indicator columns, indexed by MachineIdentifier.
    """
    test_df = pd.read_csv(test_file).set_index('MachineIdentifier')
    indicator_part = transform_strings(test_df, str_trans_train)
    numeric_part, _ = clean_numeric(test_df, fillna_train)
    return numeric_part.merge(indicator_part, how='inner',
                              left_index=True, right_index=True)

class BestModelPackage(object):
    """Container bundling a model with what is needed to score new data.

    Attributes are stored exactly as given:
    model, features, fillna_values and str_transforms.
    """

    def __init__(self, model, features, fillna_values, str_transforms):
        # The estimator and the feature columns it is used with.
        self.model, self.features = model, features
        # Fitted preprocessing artefacts (numeric fill values, string mappings).
        self.fillna_values, self.str_transforms = fillna_values, str_transforms


### Build Actual Sets

In [None]:
# Build the feature matrix and target, and keep the fitted string mappings and
# numeric fill values so the same transformation can be applied to test data.
train_all_x, train_y, str_trans_train, fillna_train = build_train_data(train_df)
train_all_x.shape

### 2nd stage feature selection

- partition the data into multiple sets
- get marginal info gain (via RF) for each partition
- keep features that meet threshold in each partition
- take the intersection of all partitions

In [None]:

def get_rf_featimp(df_x, df_y, n_estimators=100, random_state=None, n_jobs=None):
    """Fit a random forest and return its feature importances.

    Parameters
    ----------
    df_x, df_y : features and target used to fit the forest.
    n_estimators : number of trees (default 100, as before).
    random_state : seed forwarded to the forest; pass an int for repeatable
        importances. The default None keeps the previous unseeded behaviour.
    n_jobs : parallelism forwarded to the forest (default None, as before).

    Returns
    -------
    The fitted forest's ``feature_importances_`` array, one entry per column
    of ``df_x``.
    """
    rf = RandomForestClassifier(n_estimators=n_estimators,
                                random_state=random_state,
                                n_jobs=n_jobs)
    rf.fit(df_x, df_y)
    return rf.feature_importances_

def get_robust_featimp(feat_importances, mi_thresh=0.98, columns=None):
    """Features that rank among the most important ones in every partition.

    For each partition the features are sorted by decreasing importance and
    kept while the cumulative importance stays strictly below ``mi_thresh``.
    The result is the intersection of the kept sets over all partitions.

    Parameters
    ----------
    feat_importances : dict or sequence of 1-D arrays
        One importance vector per partition (dict values are used in
        insertion order).
    mi_thresh : float
        Cumulative-importance cut-off.
    columns : array-like of str, optional
        Feature names aligned with the importance vectors. Defaults to the
        columns of the module-level ``train_all_x``.

    Returns
    -------
    Sorted list of the selected feature names; empty when there are no
    partitions or nothing survives the intersection.
    """
    if columns is None:
        columns = train_all_x.columns.values
    columns = np.asarray(columns)

    if isinstance(feat_importances, dict):
        partitions = list(feat_importances.values())
    else:
        partitions = list(feat_importances)

    best_set = None
    for importances in partitions:
        importances = np.asarray(importances)
        order = np.argsort(importances)[::-1]  # most important first
        keep_n = int((np.cumsum(importances[order]) < mi_thresh).sum())
        # Slice from the front of the descending order: a negative slice
        # `[-keep_n:]` would select EVERY feature when keep_n is 0.
        best_feats = set(columns[order[:keep_n]])
        best_set = best_feats if best_set is None else best_set & best_feats

    # Sorting makes the feature order repeatable (set order is arbitrary).
    return sorted(best_set) if best_set else []
        
parts = 4

def hash_index(row):
    """Partition id (0..parts-1) of a single row, derived from its index label.

    Uses pandas' deterministic hashing instead of the built-in ``hash()``,
    whose value for strings changes between interpreter sessions. Kept for
    callers that still apply it row by row; the vectorised computation below
    assigns the same partitions.
    """
    row_hash = pd.util.hash_pandas_object(pd.Index([row.name])).iloc[0]
    return int(row_hash) % parts

# Hash the whole index at once; a row-wise DataFrame.apply over the full
# training frame is far slower.
index_partitions = pd.Series(
    (pd.util.hash_pandas_object(train_all_x.index).values % np.uint64(parts)).astype(int),
    index=train_all_x.index,
)

# Fit one random forest per partition and record its feature importances.
feat_imp_dict = {}
for i in range(parts):
    print(i)
    in_partition = (index_partitions == i)
    df_x = train_all_x[in_partition]
    df_y = train_y[in_partition]
    feat_imp_dict[i] = get_rf_featimp(df_x, df_y)

# Keep only the features that make the importance cut in every partition.
best_featset = get_robust_featimp(feat_imp_dict, 0.9999)
len(best_featset)

### Training Pipeline
- Run with two validations
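- Grid search over scikit-learn RandomForest and GradientBoosting models, scored by ROC AUC on the held-out validation split (XGBoost and k-fold cross-validation are not implemented in this notebook)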

In [None]:
#Split data into 80/10/10 for training, validating, test
parts = 100

def hash_index(row):
    """Partition id (0..parts-1) of a single row, derived from its index label.

    Uses pandas' deterministic hashing instead of the built-in ``hash()``,
    whose value for strings changes between interpreter sessions. Kept for
    callers that still apply it row by row; the vectorised computation below
    assigns the same partitions.
    """
    row_hash = pd.util.hash_pandas_object(pd.Index([row.name])).iloc[0]
    return int(row_hash) % parts

# Hash the whole index at once so the split is repeatable across kernel
# restarts and does not need a row-wise DataFrame.apply.
index_partitions = pd.Series(
    (pd.util.hash_pandas_object(train_all_x.index).values % np.uint64(parts)).astype(int),
    index=train_all_x.index,
)

# Validation and test each get about holdout_n rows: 10% of the data, capped
# at 200000 rows. The split is therefore 80/10/10 only while 10% of the rows
# stays under the cap; the training split takes everything else.
holdout_n = min(200000, round(train_all_x.shape[0]*.1))
# At least one hash bucket per hold-out set, otherwise both would be empty.
partition_n = max(1, int(np.floor(parts * (holdout_n / train_all_x.shape[0]))))
part_1 = parts - partition_n
part_2 = parts - 2*partition_n

filt_train = (index_partitions < part_2)
filt_val = (index_partitions >= part_2) & (index_partitions < part_1)
filt_test = (index_partitions >= part_1)

trainsamp_x = train_all_x[filt_train]
trainsamp_y = train_y[filt_train]

valsamp_x = train_all_x[filt_val]
valsamp_y = train_y[filt_val]

testsamp_x = train_all_x[filt_test]
testsamp_y = train_y[filt_test]



Train every candidate model on the training split and record its ROC AUC on the validation split.

In [None]:
# Hyper-parameter grids for the two candidate model families.
pg_rf_d = {'n_estimators':[200, 500]}

pg_gbdt_d = {'n_estimators':[100, 200],
           'learning_rate':[0.05, 0.1],
           'max_depth':[7]}

pg_rf = ParameterGrid(pg_rf_d)
pg_gbdt = ParameterGrid(pg_gbdt_d)

def _validation_auc(model):
    """Fit ``model`` on the training split and return its validation ROC AUC."""
    model.fit(trainsamp_x[best_featset], trainsamp_y)
    pred = model.predict_proba(valsamp_x[best_featset])[:,1]
    return roc_auc_score(valsamp_y.values, pred)

# Each entry is (validation AUC, algorithm label, parameter dict).
aucs = []

candidates = (('rf', RandomForestClassifier, pg_rf),
              ('gb', GradientBoostingClassifier, pg_gbdt))

for algo, model_cls, grid in candidates:
    for g in grid:
        print(g)
        # The fitted model only lives inside the helper call, so it is released
        # before the next fit (the old `gp = None` typo kept the last
        # gradient-boosting model alive).
        aucs.append((_validation_auc(model_cls(**g)), algo, g))

# Release the split copies and the raw frame before the final fit on all data.
trainsamp_x = None
trainsamp_y = None
valsamp_x = None
valsamp_y = None
testsamp_x = None
testsamp_y = None
train_df = None

print(aucs)

In [None]:
#Get the best model and save it
print('Starting the final model')

# Sort on the AUC only: comparing whole tuples falls through to the parameter
# dicts when two runs tie on AUC and algorithm, which raises TypeError.
aucs.sort(key=lambda result: result[0], reverse=True)
best_auc, best_algo, best_params = aucs[0]

print(best_algo)
print(best_params)

# Save the preprocessing artefacts first (model=None) so they survive even if
# the long final fit below is interrupted.
bm_pack_pre = BestModelPackage(None, best_featset, fillna_train, str_trans_train)

# NOTE(review): absolute, user-specific output paths - consider deriving them
# from a configurable data directory.
modfile = '/Users/briand/data/MSFT_Best_Model_ALL_pre3.pickle'
with open(modfile, 'wb') as w:
    pickle.dump(bm_pack_pre, w)

if best_algo=='rf':
    best_model = RandomForestClassifier(**best_params)
else:
    best_model = GradientBoostingClassifier(**best_params)

# Refit the winning configuration on the complete training data.
best_model.fit(train_all_x[best_featset], train_y)

bm_pack = BestModelPackage(best_model, best_featset, fillna_train, str_trans_train)

modfile = '/Users/briand/data/MSFT_Best_Model_ALL3.pickle'
with open(modfile, 'wb') as w:
    pickle.dump(bm_pack, w)
    

### Load the final model (built on all training data) and score the test set

In [None]:
modfile = '/Users/briand/data/MSFT_Best_Model_ALL3.pickle'

# NOTE(review): pickle.load executes arbitrary code from the file - only load
# model files that this notebook (or another trusted source) produced.
with open(modfile, 'rb') as r:
    best_mod = pickle.load(r)

# Apply the transformations stored with the model to the raw test file.
test_all_x = build_test_data(datadir + 'test.csv',
                             best_mod.fillna_values,
                             best_mod.str_transforms)

# Probability of the positive class for every test machine.
test_preds = best_mod.model.predict_proba(test_all_x[best_mod.features])[:,1]

test_pred_df = pd.DataFrame(test_preds, index=test_all_x.index.values,
                            columns=['HasDetections'])

# Strip only the final extension: split('.')[0] would cut the path at the first
# dot anywhere in it (e.g. a dotted directory name).
test_pred_df.to_csv(modfile.rsplit('.', 1)[0] + '.csv', sep=',',
                    header=True, index=True, index_label='MachineIdentifier')

### This is all scratch

### Exploratory work