# WiDS 2018 Datathon

Predictive Analytics for Social Impact

Competition on [Kaggle](https://www.kaggle.com/c/wids2018datathon)

## 0. Import Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from helpers import format_submission

## 1. Load and Prepare Data

In [2]:
seed = 37

print 'Loading data...\n'
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print 'Train Shape: {}'.format(train.shape)
print 'Test Shape: {}\n'.format(test.shape)

label = train['is_female']
del train['is_female']

del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')

# Convert to dummy variables
print 'Converting to dummy variables...\n'
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# Split into train and validation set
print 'Splitting into train and validation set...\n'
X_train, X_test, y_train, y_test = train_test_split(train_dummies, label, test_size=0.2, random_state=seed)

print 'Complete.'

Loading data...

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Converting to dummy variables...

Splitting into train and validation set...

Complete.


## 2. Feature Selection

In [6]:
def log_reg(trainX, trainY, testX, testY):
    print 'Fitting logistic regression...\n'
    clf = LogisticRegression(C=0.2, penalty='l1', solver='liblinear')
    clf.fit(trainX, trainY)
    preds = clf.predict(testX)
    score = roc_auc_score(testY, preds)
    return score

def xgb_clf(trainX, trainY, testX, testY, seed):
    print 'Fitting XGB Classifier...\n'
    clf = XGBClassifier(max_depth=7, n_estimators=100, random_state=seed)
    clf.fit(trainX, trainY, eval_metric='auc')
    preds = clf.predict(testX)
    score = roc_auc_score(testY, preds)
    return score

In [9]:
# Variance Threshold 
print 'Selecting features with VarianceThreshold...\n'
for t in np.arange(0.0, 0.16, 0.01):
    sel = VarianceThreshold(threshold=t)
    X_train_new = sel.fit_transform(X_train)
    X_test_new = sel.transform(X_test)

    start = time.time()
    score = log_reg(X_train_new, y_train, X_test_new, y_test)
    end = time.time()
    run_time = float(end - start)/60

    print 'Logistic Regression with VarianceThreshold of {}'.format(t)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

    start = time.time()
    score = xgb_clf(X_train_new, y_train, X_test_new, y_test, seed)
    end = time.time()
    run_time = float(end - start)/60

    print 'XGBoost Classifier with VarianceThreshold of {}'.format(t)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)


# SelectKBest 
print 'Selecting features with SelectKBest...\n'
for k in range(995,1006,1):
    ch2 = SelectKBest(chi2, k=k)
    X_train_new = ch2.fit_transform(X_train, y_train)
    X_test_new = ch2.transform(X_test)

    start = time.time()
    score = log_reg(X_train_new, y_train, X_test_new, y_test)
    end = time.time()
    run_time = float(end - start)/60

    print 'Logistic Regression with SelectKBest, {} Features'.format(k)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

    start = time.time()
    score = xgb_clf(X_train_new, y_train, X_test_new, y_test, seed)
    end = time.time()
    run_time = float(end - start)/60

    print 'XGBoost Classifier with SelectKBest, {} Features'.format(k)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)


# SelectFromModel
print 'Selecting features with SelectFromModel...\n'
clf = XGBClassifier(max_depth=7, n_estimators=100, random_state=seed)
sfm = SelectFromModel(clf, threshold='median')
X_train_new = sfm.fit_transform(X_train, y_train)
X_test_new = sfm.transform(X_test)

start = time.time()
score = log_reg(X_train_new, y_train, X_test_new, y_test)
end = time.time()
run_time = float(end - start)/60

print 'Logistic Regression with SelectFromModel (Threshold = Median)'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)

start = time.time()
score = xgb_clf(X_train_new, y_train, X_test_new, y_test, seed)
end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier with SelectFromModel (Threshold = Median)'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)


# PCA
print 'Selecting features with PCA...\n'
n='mle'
solver='full'
for w in [True, False]:
    pca = PCA(n_components=n, whiten=w, svd_solver=solver, random_state=seed)
    pca_train = pca.fit_transform(X_train)
    pca_test = pca.transform(X_test)

    start = time.time()
    score = log_reg(pca_train, y_train, pca_test, y_test)
    end = time.time()
    run_time = float(end - start)/60

    print 'Logistic Regression with PCA, {} Components, Whitened = {}'.format(pca.n_components_, w)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

    start = time.time()
    score = xgb_clf(pca_train, y_train, pca_test, y_test, seed)
    end = time.time()
    run_time = float(end - start)/60

    print 'XGBoost Classifier with PCA, {} Components, Whitened = {}'.format(pca.n_components_, w)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

Selecting features with VarianceThreshold...

Fitting logistic regression...

Logistic Regression with VarianceThreshold of 0.0
ROC AUC Score: 0.902420528395
Run time: 0.05 minutes

Fitting XGB Classifier...

XGBoost Classifier with VarianceThreshold of 0.0
ROC AUC Score: 0.905588094399
Run time: 15.59 minutes

Fitting logistic regression...

Logistic Regression with VarianceThreshold of 0.01
ROC AUC Score: 0.900150944078
Run time: 0.03 minutes

Fitting XGB Classifier...

XGBoost Classifier with VarianceThreshold of 0.01
ROC AUC Score: 0.905400092193
Run time: 3.66 minutes

Fitting logistic regression...

Logistic Regression with VarianceThreshold of 0.02
ROC AUC Score: 0.899809738153
Run time: 0.03 minutes

Fitting XGB Classifier...

XGBoost Classifier with VarianceThreshold of 0.02
ROC AUC Score: 0.904112096315
Run time: 2.83 minutes

Fitting logistic regression...

Logistic Regression with VarianceThreshold of 0.03
ROC AUC Score: 0.900874933341
Run time: 0.05 minutes

Fitting XGB Cl

KeyboardInterrupt: 

In [13]:
# PCA
print 'Selecting features with PCA...\n'
#n='mle'
#solver='full'
n=610
solver='auto'
for w in [True, False]:
    pca = PCA(n_components=n, whiten=w, svd_solver=solver, random_state=seed)
    pca_train = pca.fit_transform(X_train)
    pca_test = pca.transform(X_test)

    start = time.time()
    score = log_reg(pca_train, y_train, pca_test, y_test)
    end = time.time()
    run_time = float(end - start)/60

    print 'Logistic Regression with PCA, {} Components, Whitened = {}'.format(n, w)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

    start = time.time()
    score = xgb_clf(pca_train, y_train, pca_test, y_test, seed)
    end = time.time()
    run_time = float(end - start)/60

    print 'XGBoost Classifier with PCA, {} Components, Whitened = {}'.format(n, w)
    print 'ROC AUC Score: {}'.format(score)
    print 'Run time: {:0.2f} minutes\n'.format(run_time)

Selecting features with PCA...

Fitting logistic regression...

Logistic Regression with PCA, 610 Components, Whitened = True
ROC AUC Score: 0.896767356309
Run time: 0.03 minutes

Fitting XGB Classifier...

XGBoost Classifier with PCA, 610 Components, Whitened = True
ROC AUC Score: 0.858323616873
Run time: 2.61 minutes

Fitting logistic regression...

Logistic Regression with PCA, 610 Components, Whitened = False
ROC AUC Score: 0.894226163038
Run time: 0.02 minutes

Fitting XGB Classifier...

XGBoost Classifier with PCA, 610 Components, Whitened = False
ROC AUC Score: 0.858323616873
Run time: 2.57 minutes



In [14]:
# All features
start = time.time()
score = log_reg(X_train, y_train, X_test, y_test)
end = time.time()
run_time = float(end - start)/60

print 'Logistic Regression with All Features'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)

start = time.time()
score = xgb_clf(X_train, y_train, X_test, y_test, seed)
end = time.time()
run_time = float(end - start)/60

print 'XGBoost Classifier with All Features'
print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)

Fitting logistic regression...

Logistic Regression with All Features
ROC AUC Score: 0.902162929219
Run time: 0.09 minutes

Fitting XGB Classifier...

XGBoost Classifier with All Features
ROC AUC Score: 0.905588094399
Run time: 16.14 minutes

