In [2]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score as auc
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier

import transforms as t
import classifiers as clsf

# Seed NumPy's global RNG. The later train_test_split and RandomForestClassifier
# calls pass no random_state, so (per scikit-learn's convention) they draw from
# this global state; results are only repeatable when cells run in order.
np.random.seed(69)

# Data Preprocessing

In [3]:
# Read the raw training table and run the project's transform on it.
train_df0 = pd.read_csv('train.csv')

# transform_df (train mode) yields a feature frame plus two targets.
# NOTE(review): Y_c0 looks like the class labels (it is what the classifier is
# fit on further down) and Y0 a second form of the target — confirm in transforms.
X0_df, Y_c0, Y0 = t.transform_df(train_df0, as_df=True, train=True)

# Plain ndarray view of the features, and its dimensions (rows, columns).
X0 = X0_df.values
N, d = X0.shape

# Model Selection

### Grid Search

In [4]:
# Hold out a random 10% of the rows for validation.
X, valX, Y_c, valY_c, Y, valY = train_test_split(
    X0, Y_c0, Y0, test_size=0.1, shuffle=True)

# Estimate the processing parameters on the training split only, then apply
# the same parameters to both splits.
pars = t.get_pars_for_processing(X)
X = t.process_with_pars(X, pars)
valX = t.process_with_pars(valX, pars)

# Only the first 1% (floor) of each split is handed to the tuner —
# presumably to keep the search quick.
t_size = X.shape[0] // 100
v_size = valX.shape[0] // 100

# NOTE(review): tuning is driven by Y / valY, whereas cross-validation and the
# final fit below use Y_c0 — confirm which target TuneClassifiers expects.
train_res, test_res, models = clsf.TuneClassifiers(
    X[:t_size], valX[:v_size], Y[:t_size], valY[:v_size], algs=['RF'])
test_res

Tuning RF ...
Tuned in: 8.710937976837158
{'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 20}


Unnamed: 0,Classifier,Type,Tuning,Accuracy,AUC
1,RF,test,base,0.560811,0.554077
3,RF,test,tuned,0.52027,0.516013


### Save best pars

In [5]:
# Collect each tuned model's best hyper-parameters, keyed by algorithm name.
# This is done BEFORE opening the file: mode 'wb' truncates immediately, so a
# failure while reading best_params_ would otherwise leave an empty
# best_pars.pkl behind (and clobber any previously saved copy).
tuned_pars = {name: search.best_params_ for name, search in models.items()}

with open('best_pars.pkl', 'wb') as f:
    pickle.dump(tuned_pars, f)

# Read the file straight back so `best_pars` holds exactly what was persisted.
# NOTE: pickle.load executes arbitrary code from the file — only load a
# best_pars.pkl that this notebook itself wrote.
with open('best_pars.pkl', 'rb') as f:
    best_pars = pickle.load(f)

### Cross Validation

In [None]:
# Instantiate a fresh forest with the hyper-parameters found by the grid search.
# NOTE(review): `best_pars` reloaded from best_pars.pkl holds the same values
# but is not used here — this cell still needs `models` in memory.
rf_params = models['RF'].best_params_
clf = RandomForestClassifier(**rf_params)

# cross_val gets the unprocessed X0 together with the (estimate, apply)
# processing functions — presumably so processing is redone per fold; confirm
# in classifiers.cross_val.
fold_processing = (t.get_pars_for_processing, t.process_with_pars)
aucs, accs = clsf.cross_val(clf, X0, Y_c0, fold_processing)

# Mean AUC and mean accuracy over the folds.
(aucs.mean(), accs.mean())

CV fold: 0it [00:00, ?it/s]

# Final Fitting

In [None]:
# Re-estimate the processing parameters on the full training set for the final model.
pars = t.get_pars_for_processing(X0)


def process(x, pars=pars):
    """Apply the final-fit processing parameters to `x`.

    `pars` is captured as a default argument when this cell runs. The Grid
    Search cell also assigns the global `pars` (from its 90% training split);
    binding the value here means re-running that cell afterwards cannot
    silently change how the test set is processed.
    """
    return t.process_with_pars(x, pars)


# Fit the classifier on the whole (processed) training set.
clf.fit(process(X0), Y_c0);

### Feature Importances

In [None]:
# Read the importances once. On scikit-learn forests this attribute is a
# property recomputed from every tree on each access, and the previous sort
# key accessed it once per feature.
importances = clf.feature_importances_

# Stable argsort of the negated values gives descending importance with ties
# kept in column order — the same ordering as sorted(..., reverse=True).
inds = np.argsort(-importances, kind='stable')

# Column name -> importance, most important first.
imp_dict = dict(zip(X0_df.columns[inds], importances[inds]))
imp_dict

# Get Predictions on Test Set

In [None]:
# Load the test table, transform it, then apply the processing defined in the
# Final Fitting cell (`process`).
test_df0 = pd.read_csv('test.csv')
tX = process(t.transform_df(test_df0))

# Keep column 1 of predict_proba as the score for each row.
output = clf.predict_proba(tX)[:, 1]

# Write the submission file: one row per test id with its predicted score.
output_df = pd.DataFrame({
    'id': test_df0['id'],
    'Predicted': output,
})
output_df.to_csv('submission.csv', index=False)

### Sanity checks on output

In [None]:
# Every predicted score must be a valid probability.
assert np.all((0 <= output) & (output <= 1)), 'predictions outside [0, 1]'

# Compare the mean training label with the mean prediction. `output` holds the
# TEST-set predictions written to submission.csv, so it is labelled as such.
# NOTE(review): assumes Y_c0 is 0/1 so its mean is the positive rate — confirm.
print(f'mean of train_labels: {np.mean(Y_c0)}')
print(f'mean of test_preds: {np.mean(output)}')

# Distribution of the predicted probabilities on the test set.
plt.hist(output)
plt.xlabel('predicted probability')
plt.ylabel('count')
plt.title('Test-set predictions');