In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from helpers import format_submission

In [29]:
# Load the raw CSVs, one-hot encode the training features, and hold out a
# validation split. Produces: seed, train, test, label, train_dummies,
# X_train, X_test, y_train, y_test.
seed = 37

print('Loading data...\n')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print('Train Shape: {}'.format(train.shape))
print('Test Shape: {}\n'.format(test.shape))

# Pull the target out of the feature frame. It is stored under `label` (the
# name the split below expects) so the cell runs on a fresh kernel instead of
# relying on a variable left over from an earlier session.
label = train['is_female']
del train['is_female']

# Drop the identifier columns so they are not encoded as features.
del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')

# Keep the target aligned with whichever rows survived the row-wise dropna;
# otherwise train_test_split fails on inconsistent lengths.
label = label.loc[train.index]

# Convert to dummy variables (every value is stringified first, so each
# distinct value of each column becomes its own indicator column).
print('Converting to dummy variables...\n')
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# Split into train and validation set
print('Splitting into train and validation set...\n')
X_train, X_test, y_train, y_test = train_test_split(train_dummies, label, test_size=0.2, random_state=seed)

# # Format train and test set 
# print 'Converting to dummy variables...\n'
# train_str = train.applymap(str)
# test_str = test.applymap(str)
# total = pd.concat([train_str, test_str], ignore_index=True)
# total_dummies = pd.get_dummies(total)
# X_train = total_dummies.head(len(train))
# X_test = total_dummies.tail(len(test))

print('Data preprocessing complete.')

Loading data...

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Converting to dummy variables...

Splitting into train and validation set...

Data preprocessing complete.


#### Feature Selection

In [35]:
# Feature-selection stage. Exactly one variant is meant to be active; the two
# alternatives that were tried are kept below, commented out.

# # XGB with SelectKBest (k=1003)
# k=1003
# ch2 = SelectKBest(chi2, k=k)
# X_train_new = ch2.fit_transform(X_train, y_train)
# X_test_new = ch2.transform(X_test)

# # Logistic Regression with Variance Threshold (t=0)
# t=0
# sel = VarianceThreshold(threshold=t)
# X_train_new = sel.fit_transform(X_train)
# X_test_new = sel.transform(X_test)

# Active variant: no feature selection, the split is passed through unchanged.
X_train_new, X_test_new = X_train, X_test

#### Voting Classifier

In [36]:
# Soft-voting ensemble of an L1-regularised logistic regression and an XGBoost
# classifier, fitted on the (optionally feature-selected) training split.
# Both base estimators are seeded: the liblinear solver uses random_state when
# shuffling the data, so without it repeated runs need not give the same model.
logreg = LogisticRegression(C=0.2, penalty='l1', solver='liblinear', random_state=seed)
xgb = XGBClassifier(max_depth=7, n_estimators=100, random_state=seed)
clf = VotingClassifier(estimators=[('logreg', logreg), ('xgb', xgb)], voting='soft')
clf = clf.fit(X_train_new, y_train)

In [37]:
# Per-class probability estimates from the fitted ensemble for X_test_new.
predictions = clf.predict_proba(X_test_new)

In [38]:
# No Feature Selection
# Take column 1 of the predicted probabilities, round to one decimal place,
# and report the ROC AUC against the held-out labels as the cell output.
second_column = np.asarray(predictions)[:, 1]
preds = np.round(second_column, 1)
roc_auc_score(y_test, preds)

0.96452121201165375

In [34]:
# SelectKBest
# Same scoring as the other variants: column 1 of the probability estimates,
# rounded to one decimal place, evaluated with ROC AUC.
preds = np.round(np.asarray(predictions)[:, 1], 1)
roc_auc_score(y_test, preds)

0.96195245110888161

In [10]:
# VarianceThreshold
# Score column 1 of the probability estimates, rounded to one decimal place.
# The values are read from `predictions` (the predict_proba output), not from
# `preds`: once an earlier scoring cell has run, `preds` is already a 1-D
# array of rounded scores and indexing its elements with [1] raises.
preds = np.round(np.asarray(predictions)[:, 1], 1)
roc_auc_score(y_test, preds)

0.9644537240404808

#### Submission

In [28]:
# Write the formatted predictions to a numbered submission file.
# NOTE(review): with the preprocessing cell as it currently stands,
# `predictions` are computed on the validation split of train.csv rather than
# on data/test.csv -- confirm the intended inputs before submitting.
sub_number = 14
submission_path = 'submissions/submission{}.csv'.format(sub_number)
df_preds = format_submission(predictions)
df_preds.to_csv(submission_path, index=False)

In [None]:
# TODO: Voting classifier with Neural Net, RF, NB.