# WiDS 2018 Datathon

Predictive Analytics for Social Impact

Competition on [Kaggle](https://www.kaggle.com/c/wids2018datathon)

In [14]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from helpers import format_submission

In [11]:
seed = 37

train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print 'Train Shape: {}'.format(train.shape)
print 'Test Shape: {}'.format(test.shape)

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)


In [3]:
# How much data is each train set feature missing?
size = len(train)
insufficient_cols = []
#print('Feature', 'Percentage Missing')
for col in train.columns:
    num_present = len(train[col].dropna(how='any', axis=0))
    proportion = 1-(float(num_present)/size)
    if proportion >= .75:
        insufficient_cols.append(col)
        #print(col, '{:.2f}'.format(proportion))
print 'Number of features missing 75%+ of data: {}'.format(len(insufficient_cols))
print 'Total number of features: {}'.format(train.shape[1])

Number of features missing 75%+ of data: 801
Total number of features: 1235


In [4]:
# Drop the columns missing 75%+ data
train.drop(insufficient_cols, axis=1, inplace=True)
test.drop(insufficient_cols, axis=1, inplace=True)
print 'New shape train set: {}'.format(train.shape)
print 'New shape test set: {}'.format(train.shape)

New shape train set: (18255, 434)
New shape test set: (18255, 434)


In [5]:
label = train['is_female']
del train['is_female']

del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')

# Convert to categorical
train_str = train.applymap(str)
train_dum = pd.get_dummies(train_str)

# Split into train and validation set
X_train, X_test, y_train, y_test = train_test_split(train_dum, label, test_size=0.2, random_state=42)

In [47]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Logistic Regression'
print 'ROC AUC Score: {}'.format(score)

Logistic Regression
ROC AUC Score: 0.886394832787


In [48]:
# Random Forest
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Random Forest'
print 'ROC AUC Score: {}'.format(score)

Random Forest
ROC AUC Score: 0.857780478454


In [49]:
# Multi-layer Perceptron
clf = MLPClassifier()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Multi-layer Perceptron'
print 'ROC AUC Score: {}'.format(score)

Multi-layer Perceptron
ROC AUC Score: 0.882140364013


In [50]:
# XGBoost Classifier with default parameters
clf = XGBClassifier()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)

XGBoost Classifier
ROC AUC Score: 0.903562812416


In [6]:
# Try max_depth=10 with XGBoost Classifier
clf = XGBClassifier(max_depth=10)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)

XGBoost Classifier
ROC AUC Score: 0.911228090164


In [4]:
# Fit XGBoost Classifier, specifying eval_metric='auc'
start = time.time()
clf = XGBClassifier(max_depth=10, n_estimators=100, random_state=seed)
clf.fit(X_train, y_train, eval_metric='auc')

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:2} minutes'.format(run_time)

XGBoost Classifier
ROC AUC Score: 0.911228090164
Run time: 21.7721663992 minutes


In [4]:
# Repeat above with n_jobs=2 to compare run times
start = time.time()
clf = XGBClassifier(max_depth=10, n_estimators=100, n_jobs=2, random_state=seed)
clf.fit(X_train, y_train, eval_metric='auc')

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {} minutes'.format(run_time)

XGBoost Classifier
ROC AUC Score: 0.911228090164
Run time: 25.4510277033 minutes


In [5]:
# Fit XGBoost Classifier, increase n_estimators to 200
start = time.time()

clf = XGBClassifier(max_depth=10, n_estimators=200, random_state=seed)
clf.fit(X_train, y_train, eval_metric='auc')

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {} minutes'.format(run_time)

XGBoost Classifier
ROC AUC Score: 0.912035128077
Run time: 53.4536111633 minutes


In [8]:
# Fit XGBoost Classifier, increase max_depth to 20, keep n_estimators at default (100)
start = time.time()

clf = XGBClassifier(max_depth=20, n_estimators=100, random_state=seed)
clf.fit(X_train, y_train, eval_metric='auc')

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {} minutes'.format(run_time)

XGBoost Classifier
ROC AUC Score: 0.909064233888
Run time: 43.1484755158 minutes


In [13]:
# Tune max_depth (1-10) for XGBoost Classifier
start = time.time()

parameters = {'max_depth': range(1, 11, 2)}
xgb = XGBClassifier()
clf = GridSearchCV(xgb, parameters, scoring='roc_auc', refit=True)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier Tuned with GridSearchCV'
print 'Best parameters: {}'.format(clf.best_params_)
print 'Best GridSearchCV ROC AUC score: {}'.format(clf.best_score_)
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {} minutes'.format(run_time)

XGBoost Classifier Tuned with GridSearchCV
Best parameters: {'max_depth': 7}
Best GridSearchCV ROC AUC score: 0.967213341827
ROC AUC Score: 0.911375481962
Run time: 139.383742003 minutes


In [6]:
# Format test set
test_str = test.applymap(str)
total = pd.concat([train_str, test_str], ignore_index=True)
total_dummies = pd.get_dummies(total)
train_dummies = total_dummies.head(len(train))
test_dummies = total_dummies.tail(len(test))

# # Export modified train and test sets to CSV to use with Keras
# train_dummies.to_csv('data/x_train.csv', index=False)
# pd.DataFrame(label).to_csv('data/y_train.csv', index=False)
# test_dummies.to_csv('data/x_test.csv', index=False)

In [14]:
# Update submission number 
sub_number = 9

# Fit best model to full train set
start = time.time()
print 'Fitting model...\n'
clf = XGBClassifier(max_depth=7, n_estimators=100, random_state=seed)
clf.fit(train_dummies, label)

# Make predictions
print 'Making predictions...\n'
preds = clf.predict_proba(test_dummies)

end = time.time()
run_time = float(end - start)/60
print 'Run time: {:0.2f} minutes'.format(run_time)

# Format submission 
print '\nFormatting submission...\n'
df = format_submission(preds, len(test_dummies))
df.to_csv('submissions/submission{}.csv'.format(sub_number), index=False)

print 'Program complete!'

Fitting model...

Making predictions...

Run time: 23.66 minutes

Formatting submission...

Program complete!


In [8]:
# Cross-validation on full training set

# Update submission number 
start = time.time()
sub_number = 10

# GridSearchCV
print 'Tuning model...\n'
parameters = {'max_depth': range(1, 11, 2), 'n_estimators': range(50,300,50)}
xgb = XGBClassifier()
clf = GridSearchCV(xgb, parameters, scoring='roc_auc', refit=True)
clf.fit(train_dummies, label)

# Make predictions
print 'Making predictions...\n'
preds = clf.predict_proba(test_dummies)

# Format submission 
print 'Formatting submission...\n'
df = format_submission(preds, len(test_dummies))
df.to_csv('submissions/submission{}.csv'.format(sub_number), index=False)

end = time.time()
run_time = float(end - start)/60

print 'Program complete!\n'
print 'XGBoost Classifier Tuned with GridSearchCV'
print 'Best parameters: {}'.format(clf.best_params_)
print 'Best GridSearchCV ROC AUC score: {}'.format(clf.best_score_)
print 'Run time: {:0.2f} minutes'.format(run_time)

Tuning model...

Making predictions...


Formatting submission...

Program complete!

XGBoost Classifier Tuned with GridSearchCV
Best parameters: {'n_estimators': 250, 'max_depth': 9}
Best GridSearchCV ROC AUC score: 0.969377013799
Run time: 687.11 minutes
