# WiDS 2018 Datathon

Predictive Analytics for Social Impact

Competition on [Kaggle](https://www.kaggle.com/c/wids2018datathon)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from helpers import format_submission

In [2]:
# Data preparation: load the competition CSVs, separate the target, drop
# identifier columns and empty rows/columns, one-hot encode every feature,
# and hold out 20% of the training rows for validation.
seed = 37  # shared random_state so the train/validation split is reproducible

print('Loading data...\n')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print('Train Shape: {}'.format(train.shape))
print('Test Shape: {}\n'.format(test.shape))

# Separate the target column from the features.
label = train.pop('is_female')

# Row identifiers carry no predictive signal.
del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')

# Keep the target aligned with the surviving rows; without this, any dropped
# row leaves `label` longer than the feature frame and the split below fails.
label = label.loc[train.index]

# Convert to dummy variables
# Every value is stringified first, so each distinct value of each column
# (including missing values, which become the string 'nan') gets its own
# indicator column.
print('Converting to dummy variables...\n')
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# NOTE(review): `test` is not passed through the same dropna/get_dummies steps
# in this cell; it will need a matching column layout before it can be scored.

# Split into train and validation set
print('Splitting into train and validation set...\n')
X_train, X_test, y_train, y_test = train_test_split(
    train_dummies, label, test_size=0.2, random_state=seed)

print('Data preparation complete.')

Loading data...

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Converting to dummy variables...

Splitting into train and validation set...

Data preparation complete.


In [3]:
# Feature selection + hyper-parameter search + validation scoring.
# Keeps the k best chi-squared features, grid-searches an XGBoost classifier
# on them (scored by ROC AUC), then evaluates the refit best model on the
# held-out validation split.
print('Selecting features with SelectKBest...\n')
k = 1003
ch2 = SelectKBest(chi2, k=k)
# Fit the selector on the training split only, then apply the same column
# mask to the validation split.
X_train_new = ch2.fit_transform(X_train, y_train)
X_test_new = ch2.transform(X_test)

print('Tuning parameters and fitting best model...\n')
start = time.time()

# NOTE(review): this is an exhaustive grid on the order of 1.5 million
# parameter combinations (times the number of CV folds). It is unlikely to
# finish in practical time; consider shrinking the grid or using a randomized
# search before running this cell end-to-end.
parameters = {'max_depth': range(7,11), 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
              'n_estimators': range(50,300,50), 'booster': ['gbtree', 'gblinear', 'dart'],
              'gamma': np.arange(0,0.1,0.02), 'min_child_weight': [0.5,1,1.5],
              'max_delta_step': [0,0.1,0.5], 'subsample':[1,.9], 'colsample_bytree': [1,.9],
              'colsample_bylevel': [1,.9], 'reg_alpha': [0,1], 'reg_lambda': [0,1],
              'scale_pos_weight': [1,.9,.5]}

xgb = XGBClassifier()
# refit=True retrains the best configuration on all of X_train_new so `clf`
# can be used directly for prediction afterwards.
clf = GridSearchCV(xgb, parameters, scoring='roc_auc', refit=True)
clf.fit(X_train_new, y_train)

print('Making predictions...\n')
# ROC AUC ranks examples by score, so use the predicted probability rather
# than hard 0/1 labels. Column 1 is the second entry of `classes_`
# (presumably is_female == 1 -- confirm the label encoding).
preds = clf.predict_proba(X_test_new)[:, 1]
# The validation target is `y_test` (lower-case), as produced by
# train_test_split; the previous `Y_test` was an undefined name.
score = roc_auc_score(y_test, preds)

end = time.time()
run_time = float(end - start)/60  # elapsed wall-clock time in minutes

print('Modeling complete!\n')

print('XGBoost Classifier with SelectKBest, {} Features'.format(k))
print('Best parameters: {}'.format(clf.best_params_))
print('Best GridSearchCV ROC AUC score: {}'.format(clf.best_score_))
print('ROC AUC Score: {}'.format(score))
print('Run time: {:0.2f} minutes\n'.format(run_time))

Selecting features with SelectKBest...

Tuning parameters and fitting best model...



KeyboardInterrupt: 