# WiDS 2018 Datathon

Predictive Analytics for Social Impact

Competition on [Kaggle](https://www.kaggle.com/c/wids2018datathon)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print 'Train Shape: {}'.format(train.shape)
print 'Test Shape: {}'.format(test.shape)

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)


In [4]:
label = train['is_female']
del train['is_female']

del train['train_id']
del test['test_id']

In [5]:
train.head()

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,3,32,3.0,,323011,3854,481,1975,3,4,...,99.0,,99,,99,,99,,99,
1,2,26,,8.0,268131,2441,344,1981,8,4,...,,,1,,2,,2,,2,
2,1,16,,7.0,167581,754,143,1995,3,2,...,1.0,,2,,2,,2,,2,
3,4,44,5.0,,445071,5705,604,1980,3,4,...,,,2,,2,,99,,99,
4,4,43,,6.0,436161,5645,592,1958,3,4,...,,,1,,1,,1,,1,


In [6]:
# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')
train.shape

(18255, 1183)

In [7]:
# Convert to categorical, split into train and validation set
train_str = train.applymap(str)
train_dum = pd.get_dummies(train_str)

X_train, X_test, y_train, y_test = train_test_split(train_dum, label, test_size=0.2, random_state=42)

In [8]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Logistic Regression'
print 'ROC AUC Score: {}'.format(score)

Logistic Regression
ROC AUC Score: 0.884670439173


In [9]:
# Random Forest
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Random Forest'
print 'ROC AUC Score: {}'.format(score)

Random Forest
ROC AUC Score: 0.856606769533


In [10]:
# Multi-layer Perceptron
clf = MLPClassifier()
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
score = roc_auc_score(y_test, preds)

print 'Multi-layer Perceptron'
print 'ROC AUC Score: {}'.format(score)

Multi-layer Perceptron
ROC AUC Score: 0.889215622446


In [11]:
# Fit best model to test set, make predictions
test_str = test.applymap(str)
total = pd.concat([train_str, test_str], ignore_index=True)
total_dummies = pd.get_dummies(total)
train_dummies = total_dummies.head(len(train))
test_dummies = total_dummies.tail(len(test))

clf = MLPClassifier()
clf.fit(train_dummies, label)

preds = clf.predict_proba(test_dummies)
preds = np.round([p[1] for p in preds], 1)

In [12]:
# Format submission 
test_id = range(0,len(test))
d = {'test_id': test_id, 'is_female': preds}
df = pd.DataFrame(d)
df = df[['test_id', 'is_female']]
df.head()

Unnamed: 0,test_id,is_female
0,0,1.0
1,1,0.0
2,2,0.9
3,3,1.0
4,4,0.8


In [14]:
sub_number = 2
df.to_csv('submissions/submission{}.csv'.format(sub_number), index=False)