# Benchmark Balancing

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from helpers import format_submission

In [None]:
# Random seed shared by the sampling, train/validation split and model cells below.
seed = 37

## Balanced

In [24]:
# Load the raw data, balance the two `is_female` classes by downsampling,
# one-hot encode every column and hold out 20% of the rows for validation.
print('Loading data...\n')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print('Raw')
print('Train Shape: {}'.format(train.shape))
print('Test Shape: {}\n'.format(test.shape))

# Balance the data: downsample BOTH classes to the size of the rarer one, so
# the result is balanced no matter which class happens to be the majority.
# (Sampling the minority class at n == its own size simply keeps all its rows.)
minimum = train['is_female'].value_counts().min()
train_female = train[train.is_female == 1]
train_male = train[train.is_female == 0]
train_female_sample = train_female.sample(n=minimum, random_state=seed)
train_male_sample = train_male.sample(n=minimum, random_state=seed)

# Shuffle with the shared seed; an unseeded shuffle would change the row order,
# and therefore the contents of the validation split, on every run.
train = pd.concat(
    [train_female_sample, train_male_sample], ignore_index=True
).sample(frac=1, random_state=seed)

print('Balanced')
print('Train Shape: {}\n'.format(train.shape))

# Separate the target from the features and drop the identifier columns.
label = train['is_female']
del train['is_female']
del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')
# Keep the target aligned with the surviving rows in case any row was dropped.
label = label.loc[train.index]

# Convert to dummy variables (every value is stringified first, so each
# distinct value of each column becomes its own indicator column).
print('Converting to dummy variables...\n')
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# Split into train and validation set
print('Splitting into train and validation set...\n')
X_train, X_test, y_train, y_test = train_test_split(
    train_dummies, label, test_size=0.2, random_state=seed)

# Disabled experiment: variance-threshold feature selection.
# print 'Performing feature selection...\n'
# t=0
# sel = VarianceThreshold(threshold=t)
# X_train_new = sel.fit_transform(X_train)
# X_test_new = sel.transform(X_test)

print('Data preprocessing complete.')

Loading data...

Raw
Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Balanced
Train Shape: (16900, 1235)

Converting to dummy variables...

Splitting into train and validation set...

Data preprocessing complete.


In [26]:
# Fit a soft-voting ensemble (L1 logistic regression + gradient-boosted trees)
# on the balanced training split and score it on the validation split.
print('Fitting model...\n')
logreg = LogisticRegression(
    C=0.2,
    penalty='l1',
    solver='liblinear',
    random_state=seed,
)
xgb = XGBClassifier(
    max_depth=7,
    n_estimators=100,
    random_state=seed,
)

clf = VotingClassifier(
    estimators=[('logreg', logreg), ('xgb', xgb)],
    voting='soft',
)
clf.fit(X_train, y_train)

print('Making predictions...')
predictions = clf.predict_proba(X_test)

# Column 1 holds the probability of the positive class (is_female == 1).
# NOTE(review): the scores are rounded to one decimal before computing AUC,
# which creates ties between validation rows -- confirm this is intentional.
preds = np.round(predictions[:, 1], 1)
roc_auc_score(y_test, preds)

Fitting model...

Making predictions...


0.96580785172878969

## Non-Balanced

In [27]:
# Load the raw data WITHOUT class balancing, one-hot encode every column and
# hold out 20% of the rows for validation.
print('Loading data...\n')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print('Raw')
print('Train Shape: {}'.format(train.shape))
print('Test Shape: {}\n'.format(test.shape))

# Separate the target from the features and drop the identifier columns.
label = train['is_female']
del train['is_female']
del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')
# Keep the target aligned with the surviving rows in case any row was dropped.
label = label.loc[train.index]

# Convert to dummy variables (every value is stringified first, so each
# distinct value of each column becomes its own indicator column).
print('Converting to dummy variables...\n')
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# Split into train and validation set
print('Splitting into train and validation set...\n')
X_train, X_test, y_train, y_test = train_test_split(
    train_dummies, label, test_size=0.2, random_state=seed)

# Disabled experiment: variance-threshold feature selection.
# print 'Performing feature selection...\n'
# t=0
# sel = VarianceThreshold(threshold=t)
# X_train_new = sel.fit_transform(X_train)
# X_test_new = sel.transform(X_test)

print('Data preprocessing complete.')

Loading data...

Raw
Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Converting to dummy variables...

Splitting into train and validation set...

Data preprocessing complete.


In [28]:
# Fit the same soft-voting ensemble on the non-balanced training split and
# score it on the validation split, for comparison with the balanced run.
print('Fitting model...\n')
logreg = LogisticRegression(
    C=0.2,
    penalty='l1',
    solver='liblinear',
    random_state=seed,
)
xgb = XGBClassifier(
    max_depth=7,
    n_estimators=100,
    random_state=seed,
)

clf = VotingClassifier(
    estimators=[('logreg', logreg), ('xgb', xgb)],
    voting='soft',
)
clf.fit(X_train, y_train)

print('Making predictions...')
predictions = clf.predict_proba(X_test)

# Column 1 holds the probability of the positive class (is_female == 1).
# NOTE(review): the scores are rounded to one decimal before computing AUC,
# which creates ties between validation rows -- confirm this is intentional.
preds = np.round(predictions[:, 1], 1)
roc_auc_score(y_test, preds)

Fitting model...

Making predictions...


0.96454802642877158

## Predictions with the Best

In [29]:
# Rebuild the balanced training set and one-hot encode it TOGETHER with the
# competition test set, so both matrices share exactly the same dummy columns.
print('Loading data...\n')
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print('Raw')
print('Train Shape: {}'.format(train.shape))
print('Test Shape: {}\n'.format(test.shape))

# Balance the data: downsample BOTH classes to the size of the rarer one, so
# the result is balanced no matter which class happens to be the majority.
# (Sampling the minority class at n == its own size simply keeps all its rows.)
minimum = train['is_female'].value_counts().min()
train_female = train[train.is_female == 1]
train_male = train[train.is_female == 0]
train_female_sample = train_female.sample(n=minimum, random_state=seed)
train_male_sample = train_male.sample(n=minimum, random_state=seed)

# Shuffle with the shared seed so the training row order is reproducible.
train = pd.concat(
    [train_female_sample, train_male_sample], ignore_index=True
).sample(frac=1, random_state=seed)

print('Balanced')
print('Train Shape: {}\n'.format(train.shape))

# Separate the target from the features and drop the identifier columns.
label = train['is_female']
del train['is_female']
del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')
# `label` is later passed to fit() next to X_train and matched by position,
# so it must contain exactly the rows that survived the drop above.
label = label.loc[train.index]

# Convert to dummy variables (format train and test set)
print('Converting to dummy variables...\n')
train_str = train.applymap(str)
test_str = test.applymap(str)
# Train rows come first and test rows last, which is what lets head()/tail()
# split the encoded frame back apart below.
total = pd.concat([train_str, test_str], ignore_index=True)
total_dummies = pd.get_dummies(total)
X_train = total_dummies.head(len(train))
X_test = total_dummies.tail(len(test))

print('Data preprocessing complete.')

Loading data...

Raw
Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Balanced
Train Shape: (16900, 1235)

Converting to dummy variables...

Data preprocessing complete.


In [33]:
# Train the soft-voting ensemble on the full balanced training set and
# predict class probabilities for the competition test set.
print('Fitting model...\n')
logreg = LogisticRegression(
    C=0.2,
    penalty='l1',
    solver='liblinear',
    random_state=seed,
)
xgb = XGBClassifier(
    max_depth=7,
    n_estimators=100,
    random_state=seed,
)

clf = VotingClassifier(
    estimators=[('logreg', logreg), ('xgb', xgb)],
    voting='soft',
)
clf.fit(X_train, label)

print('Making predictions...\n')
predictions = clf.predict_proba(X_test)

print('Modeling complete.')

Fitting model...

Making predictions...


In [34]:
# Write the test-set predictions to a numbered submission CSV.
# NOTE(review): format_submission comes from the local `helpers` module; it is
# presumably responsible for shaping the probabilities into the required
# submission layout -- confirm against helpers.py.
sub_number = 16
df_preds = format_submission(predictions)
submission_path = 'submissions/submission{}.csv'.format(sub_number)
df_preds.to_csv(submission_path, index=False)

Note: the Kaggle submission built from the balanced dataset did not score as highly.