# Compare the results of logistic regression with boosted classifier

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from helpers import format_submission

In [2]:
seed = 37

print 'Loading data...\n'
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

print 'Train Shape: {}'.format(train.shape)
print 'Test Shape: {}\n'.format(test.shape)

label = train['is_female']
del train['is_female']

del train['train_id']
del test['test_id']

# Remove rows/columns that are missing all data
train = train.dropna(axis=0, how='all')
train = train.dropna(axis=1, how='all')

# Convert to dummy variables
print 'Converting to dummy variables...\n'
train_str = train.applymap(str)
train_dummies = pd.get_dummies(train_str)

# Split into train and validation set
print 'Splitting into train and validation set...\n'
X_train, X_test, y_train, y_test = train_test_split(train_dummies, label, test_size=0.2, random_state=seed)

print 'Complete.'

Loading data...

Train Shape: (18255, 1235)
Test Shape: (27285, 1234)

Converting to dummy variables...

Splitting into train and validation set...

Complete.


In [3]:
def log_reg(trainX, trainY, testX, testY):
    print 'Fitting logistic regression...\n'
    clf = LogisticRegression(C=0.2, penalty='l1', solver='liblinear')
    clf.fit(trainX, trainY)
    preds = clf.predict_proba(testX)
    #score = roc_auc_score(testY, preds)
    #return [score, preds]
    return preds

def xgb_clf(trainX, trainY, testX, testY, seed):
    print 'Fitting XGB Classifier...\n'
    clf = XGBClassifier(max_depth=7, n_estimators=100, random_state=seed)
    clf.fit(trainX, trainY, eval_metric='auc')
    preds = clf.predict_proba(testX)
    #score = roc_auc_score(testY, preds)
    #return [score, preds]
    return preds

In [4]:
# XGB with SelectKBest (k=1003)
k=1003
ch2 = SelectKBest(chi2, k=k)
X_train_new = ch2.fit_transform(X_train, y_train)
X_test_new = ch2.transform(X_test)

start = time.time()
results = xgb_clf(X_train_new, y_train, X_test_new, y_test, seed)
end = time.time()
run_time = float(end - start)/60

#score_xgb = results[0]
prediction_xgb = results

print 'XGBoost Classifier with SelectKBest, {} Features'.format(k)
#print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)

# Logistic Regression with Variance Threshold (t=0)
t=0
sel = VarianceThreshold(threshold=t)
X_train_new = sel.fit_transform(X_train)
X_test_new = sel.transform(X_test)

start = time.time()
results = log_reg(X_train_new, y_train, X_test_new, y_test)
end = time.time()
run_time = float(end - start)/60

#score_logreg = results[0]
prediction_logreg = results

print 'Logistic Regression with VarianceThreshold, {} Features'.format(t)
#print 'ROC AUC Score: {}'.format(score)
print 'Run time: {:0.2f} minutes\n'.format(run_time)

Fitting XGB Classifier...

XGBoost Classifier with SelectKBest, 1003 Features
Run time: 1.89 minutes

Fitting logistic regression...

Logistic Regression with VarianceThreshold, 0 Features
Run time: 0.06 minutes



In [5]:
prediction_xgb

array([[ 0.00752997,  0.99247003],
       [ 0.0202654 ,  0.9797346 ],
       [ 0.24321741,  0.75678259],
       ..., 
       [ 0.92927682,  0.07072315],
       [ 0.98029286,  0.01970716],
       [ 0.00589359,  0.99410641]], dtype=float32)

In [6]:
prediction_logreg

array([[  3.33781804e-03,   9.96662182e-01],
       [  1.13154377e-02,   9.88684562e-01],
       [  1.74824701e-01,   8.25175299e-01],
       ..., 
       [  9.11800818e-01,   8.81991818e-02],
       [  9.74782385e-01,   2.52176153e-02],
       [  5.07535513e-04,   9.99492464e-01]])

In [7]:
df_xgb = format_submission(prediction_xgb)
df_logreg = format_submission(prediction_logreg)

In [8]:
df_xgb.head()

Unnamed: 0,test_id,is_female
0,0,1.0
1,1,1.0
2,2,0.8
3,3,0.0
4,4,1.0


In [9]:
df_logreg.head()

Unnamed: 0,test_id,is_female
0,0,1.0
1,1,1.0
2,2,0.8
3,3,0.0
4,4,1.0


In [13]:
df = df_xgb.merge(df_logreg, how='inner', on='test_id',
                 suffixes=('_xgb', '_logreg'))
df.head()

Unnamed: 0,test_id,is_female_xgb,is_female_logreg
0,0,1.0,1.0
1,1,1.0,1.0
2,2,0.8,0.8
3,3,0.0,0.0
4,4,1.0,1.0


In [20]:
df_disagree = df[df.is_female_xgb != df.is_female_logreg]
df_disagree.shape

(1656, 3)

In [27]:
p = (float(len(df_disagree))/len(df))*100
print 'Percentage of labels on which the classifiers disagree: {:.2f}%'.format(p)

Percentage of labels on which the classifiers disagree: 45.36%


In [64]:
df_logreg_greater = df_disagree[df.is_female_logreg > df.is_female_xgb]
df_logreg_lesser = df_disagree[df.is_female_logreg < df.is_female_xgb]

print 'There are {} different predictions between logreg and xgb.'.format(len(df_disagree))
print 'Logreg prediction > xgb prediction: {}'.format(len(df_logreg_greater))
print 'Logreg prediction < xgb prediction: {}'.format(len(df_logreg_lesser))

There are 1656 different predictions between logreg and xgb.
Logreg prediction > xgb prediction: 678
Logreg prediction < xgb prediction: 978


  """Entry point for launching an IPython kernel.
  


In [77]:
df_logreg_lesser.head()

Unnamed: 0,test_id,is_female_xgb,is_female_logreg
2,2,0.8,0.8
6,6,0.8,0.3
11,11,0.1,0.1
12,12,0.1,0.1
14,14,0.1,0.1


If the predicted probabilities are different, the log reg prediction is usually lower. 

In [71]:
# Add the true labels into the mix
df_true = pd.DataFrame(y_test)
df_true.columns = ['TrueLabel']
df_true['test_id'] = range(len(df_true))
df_true.head()
df_true.reset_index(inplace=True)
df_joined = df.merge(df_true, how='inner', on='test_id')
df_joined.columns = ['TestID', 'XGB', 'LogReg', 'TestIndex', 'TrueLabel']
df_joined = df_joined[['XGB', 'LogReg', 'TrueLabel', 'TestID', 'TestIndex']]
df_joined.head()

Unnamed: 0,XGB,LogReg,TrueLabel,TestID,TestIndex
0,1.0,1.0,1,0,18008
1,1.0,1.0,1,1,11362
2,0.8,0.8,1,2,2269
3,0.0,0.0,0,3,10380
4,1.0,1.0,1,4,240


In [80]:
df_greater = df_joined[df_joined.LogReg > df_joined.XGB]
df_greater.head()

Unnamed: 0,XGB,LogReg,TrueLabel,TestID,TestIndex
9,0.0,0.1,0,9,2625
10,0.3,0.4,0,10,16193
18,0.0,0.2,0,18,8908
24,0.3,0.6,0,24,6213
25,0.0,0.2,0,25,10973


In [81]:
df_greater.shape

(678, 5)

In [83]:
df_miss = df_joined[df_joined.TrueLabel != df_joined.LogReg]
df_miss = df_miss[df_joined.TrueLabel != df_joined.XGB]
df_miss.head()

  


Unnamed: 0,XGB,LogReg,TrueLabel,TestID,TestIndex
2,0.8,0.8,1,2,2269
6,0.8,0.3,1,6,2278
10,0.3,0.4,0,10,16193
11,0.1,0.1,0,11,3846
12,0.1,0.1,0,12,5093


In [87]:
df_miss0 = df_miss[df_miss.TrueLabel == 0]
df_miss0.head(50)

Unnamed: 0,XGB,LogReg,TrueLabel,TestID,TestIndex
10,0.3,0.4,0,10,16193
11,0.1,0.1,0,11,3846
12,0.1,0.1,0,12,5093
14,0.1,0.1,0,14,12223
17,0.1,0.1,0,17,16224
24,0.3,0.6,0,24,6213
35,0.5,0.8,0,35,12912
52,0.1,0.2,0,52,15270
69,0.3,0.1,0,69,15263
73,0.4,0.5,0,73,10377
