In [1]:
import pandas as pd
import scipy as sp
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
import xgboost as xgb
import pprint as pp
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
import bnp_helper
import common_helper
import random
import os
import datetime
from sklearn.externals import joblib
from matplotlib.pylab import rcParams

%matplotlib inline
rcParams['figure.figsize'] = 16, 4
pd.set_option('display.max_columns', 18)



In [2]:
################### Combine the training set and test set to fix the unmatched categories ############
# Read both raw files, then stack them row-wise so every later step sees
# the same set of columns / category levels for train and test.
train_df_raw = pd.read_csv("../data/train.csv")
test_df_raw = pd.read_csv("../data/test.csv")
df_raw_combined = pd.concat([train_df_raw, test_df_raw], axis = 0)

########### Clean and Impute (Combined) #############
# NOTE(review): inplace = True suggests clean() may also modify df_raw_combined
# itself -- confirm in bnp_helper, since a later cell reads its 'target' column.
df_combinded = bnp_helper.clean(df_raw_combined, drop_collinearity = True, inplace = True)
df_combinded = bnp_helper.impute_cate_with_na_numeric_with_outlier(df_combinded)

################ Convert text to number (Combined) ################
# Look up the categorical columns first, then hand them to dummify().
categorical_columns = bnp_helper.get_categorical_variables(df_combinded)
df_combinded = common_helper.dummify(df_combinded, categorical_columns)

# Report the resulting shape and preview the first rows.
print(df_combinded.shape)
df_combinded.head()

variables to be dropped:
['v12', 'v128', 'v104', 'v25', 'v15', 'v111', 'v121', 'v34', 'v40', 'v29', 'v116', 'v26', 'v105', 'v25', 'v48', 'v41', 'v11', 'v54', 'v33', 'v100', 'v26', 'v105', 'v25', 'v46', 'v54', 'v64', 'v20', 'v29', 'v41', 'v39', 'v115', 'v15', 'v32', 'v17', 'v64', 'v29', 'v67', 'v8', 'v5', 'v83', 'v32', 'v89', 'v92', 'v29', 'v41', 'v97']
(228714, 449)


Unnamed: 0,ID,target,v1,v10,v101,v102,v103,v106,v108,...,v79_Q,v79_R,v91_B,v91_C,v91_D,v91_E,v91_F,v91_G,v91_NA
0,3,1,1.335739,0.503281,8.389237,2.757375,4.374296,12.579184,2.382692,...,0,0,0,0,0,0,0,0,0
1,4,1,-999.0,1.31291,-999.0,-999.0,-999.0,-999.0,1.825361,...,0,0,1,0,0,0,0,0,0
2,5,1,0.943877,0.765864,5.879353,3.292788,5.924457,11.670572,1.375753,...,0,0,0,0,0,0,0,1,0
3,6,1,0.797415,6.542669,8.507281,2.503055,4.872157,12.554274,2.230754,...,0,0,1,0,0,0,0,0,0
4,8,1,-999.0,1.050328,-999.0,-999.0,-999.0,-999.0,-999.0,...,0,0,0,0,0,0,0,1,0


In [3]:
# Split the combined, preprocessed frame back into train and test parts.
# Rows whose raw target is null are treated as test rows (presumably the rows
# that came from test.csv -- verify); all other rows are training rows.
# NOTE(review): the mask is taken from df_raw_combined, so this relies on
# df_combinded keeping the same row index/order after the helper calls -- confirm.
is_test_row = df_raw_combined['target'].isnull()
train_df = df_combinded[~is_test_row]   # `~` is the logical-not operator for boolean masks
test_df = df_combinded[is_test_row]

# Draw a fixed-seed subsample of the training rows; the cap avoids an error
# from sample() when fewer than 40000 training rows are available.
sample_size = min(40000, len(train_df))
train_df_sample = train_df.sample(sample_size, random_state = 0)

# Separate the label from the features; 'ID' is an identifier, not a feature.
target_train = train_df_sample['target']
X_train = train_df_sample.drop(['ID', 'target'], axis = 1)

In [18]:
# Base estimator for the search, with a fixed random_state.
logit = LogisticRegression(random_state=27, n_jobs = -1)

# Search space: both penalties, with and without an intercept, and
# 10 log-spaced values of C between 1e-5 and 1e5.
c_values = np.logspace(-5, 5, 10)
para_grid = [{'penalty': ['l1', 'l2'],
              'fit_intercept': [False, True],
              'C': c_values}]

# Time the whole search: building the GridSearchCV object plus the 5-fold fit.
start = datetime.datetime.now()
grid_search = GridSearchCV(logit, para_grid, scoring='log_loss', cv =5)
para_search = grid_search.fit(X_train, target_train)
end = datetime.datetime.now()
print("model training time: {}".format(end - start))

# Per-combination cross-validation scores.
pp.pprint(para_search.grid_scores_)

# best combination
print('Best combination: ' + ' ' + str(para_search.best_params_))

# best score
print('Best Score' + ' ' + str(para_search.best_score_))

model training time: 1:19:51.735297
[mean: -0.62436, std: 0.00339, params: {'penalty': 'l1', 'C': 1.0000000000000001e-05, 'fit_intercept': False},
 mean: -0.54557, std: 0.00089, params: {'penalty': 'l2', 'C': 1.0000000000000001e-05, 'fit_intercept': False},
 mean: -0.62436, std: 0.00339, params: {'penalty': 'l1', 'C': 1.0000000000000001e-05, 'fit_intercept': True},
 mean: -0.54555, std: 0.00091, params: {'penalty': 'l2', 'C': 1.0000000000000001e-05, 'fit_intercept': True},
 mean: -0.54657, std: 0.00222, params: {'penalty': 'l1', 'C': 0.00012915496650148841, 'fit_intercept': False},
 mean: -0.53514, std: 0.00482, params: {'penalty': 'l2', 'C': 0.00012915496650148841, 'fit_intercept': False},
 mean: -0.54658, std: 0.00222, params: {'penalty': 'l1', 'C': 0.00012915496650148841, 'fit_intercept': True},
 mean: -0.53562, std: 0.00508, params: {'penalty': 'l2', 'C': 0.00012915496650148841, 'fit_intercept': True},
 mean: -0.52944, std: 0.00196, params: {'penalty': 'l1', 'C': 0.0016681005372000

In [21]:
# Switch from the subsample to the full training set.
# The same three names are rebound, so later cells pick up the full data.
train_df_sample = train_df
X_train = train_df_sample.drop(labels = ['ID', 'target'], axis = 1)
target_train = train_df_sample['target']

In [22]:
# Train the best configuration found by the grid search on the current
# X_train / target_train.
# clone() returns a new, unfitted estimator with the same hyper-parameters, so
# the fitted model stored in para_search.best_estimator_ is not overwritten
# by this refit.
from sklearn.base import clone

logit_best = clone(para_search.best_estimator_)
logit_best.fit(X_train, target_train)

<bound method LogisticRegression.score of LogisticRegression(C=0.27825594022071259, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l1', random_state=27,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)>

In [24]:
# Log loss of logit_best on X_train / target_train, i.e. on the data passed
# to fit() in the refit cell -- an in-sample figure, not a validation score.
predprob_train = logit_best.predict_proba(X_train)
train_log_loss = metrics.log_loss(target_train, predprob_train)
train_log_loss

0.4933370539949325

In [27]:
# Persist the fitted model to disk.
# Create the target directory first so the dump does not fail when
# 'save/' does not exist yet.
model_path = 'save/logit_model.pkl'
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
joblib.dump(logit_best, model_path)

['save/logit_model.pkl',
 'save/logit_model.pkl_01.npy',
 'save/logit_model.pkl_02.npy',
 'save/logit_model.pkl_03.npy',
 'save/logit_model.pkl_04.npy']

In [5]:
################# Predict the test set and save result to file ############
# Load the persisted model from disk instead of relying on an in-memory one.
logit_best = joblib.load('save/logit_model.pkl')

# Score the whole test frame (a test_df.sample(...) could be swapped in for a quick run).
test_df_sample = test_df
ids = test_df_sample['ID']
X_test = test_df_sample.drop(['ID', 'target'], axis = 1)

# Predict test set.
predprob_test = logit_best.predict_proba(X_test)

# Keep column index 1 of every probability row
# (presumably P(target == 1) -- the classes_ order is not shown here).
predprob_one_test = list(predprob_test[:, 1])

# Save results
bnp_helper.save_result(ids, predprob_one_test, "results/logit_model_result.csv")

print('Done.')

Done.
