# Description

You are provided with an anonymized dataset containing numeric feature variables, the binary target column, and a string ID_code column. The task is to predict the value of target column in the test set. To download the dataset make sure you have kaggle api installed on your machine.

    kaggle competitions download -c santander-customer-transaction-prediction


In [1]:
from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, recall_score, roc_auc_score
import pandas as pd
import numpy as np
import pickle

## Read train data

In [3]:
train = pd.read_csv('./data/train.csv.zip',
                    compression='zip',
                    header=0,
                    sep=',',
                    quotechar='"')

m,n = train.shape
print("shape of train dataframe {}, {}".format(m,n))

shape of train dataframe 200000, 202


In [15]:
test = pd.read_csv('./data/test.csv.zip',compression='zip',
                    header=0,
                    sep=',',
                    quotechar='"')
m,n = test.shape
print("shape of test dataframe {}, {}".format(m,n))

shape of test dataframe 200000, 201


# Distribution of target classes

In [4]:
train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [5]:
# Identify feature columns
columns_to_select = [cols for cols in train.columns.values.tolist() if cols not in ['ID_code','target']]

In [6]:
%%time
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train[columns_to_select] = scaler.fit_transform(train[columns_to_select])

CPU times: user 5.8 s, sys: 1.34 s, total: 7.15 s
Wall time: 4.31 s


In [7]:
xs = train[columns_to_select]
ys = train['target']

# Smote analysis

In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
%%time
smt = SMOTE()
X_train, Y_train = smt.fit_sample(xs, ys,)

CPU times: user 3min 2s, sys: 683 ms, total: 3min 3s
Wall time: 3min 1s


In [10]:
np.bincount(Y_train)

array([179902, 179902])

---

# Using RF

In [12]:
%%time
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=500,
                             max_depth =10,
                             random_state=0,
                             n_jobs=-1)

clf.fit(X_train,Y_train)

CPU times: user 1h 8min 53s, sys: 15.8 s, total: 1h 9min 9s
Wall time: 18min 33s


## Saving the model

In [20]:
with open('./model/rf.pkl','wb') as f:
    pickle.dump(clf, f) 

## Loading the model

In [12]:
with open('./model/rf.pkl','rb') as f:
    clf = pickle.load(f)


## Investigating metrics

In [None]:
y_pred = clf.predict(xs)
y_proba = clf.predict_proba(xs)[:,1]
print("accuracy: {}".format(accuracy_score(ys, y_pred)))
print("precision score: {}".format(average_precision_score(ys,y_pred)))
print("roc auc score: {}".format(roc_auc_score(ys,y_proba)))

accuracy: 0.826715
precision score: 0.23707970842771345
roc auc score: 0.8520430516587387


In [None]:
confusion_matrix(ys, y_pred)

array([[152688,  27214],
       [  7443,  12655]])

---
# Using XGBoost

In [11]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [12]:
xgb = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 verbosity=3,
 scale_pos_weight=1,
 seed=27)

## training with smote dataset

In [13]:
df_=pd.DataFrame(X_train,columns=columns_to_select)
df_['label']= Y_train

In [14]:
%%time
# Training with smote results
xgb.fit(df_[columns_to_select],df_['label'],eval_metric='auc')

CPU times: user 1h 41min, sys: 1.3 s, total: 1h 41min 1s
Wall time: 1h 40min 59s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8, verbosity=3)

In [17]:
with open('./model/xgb_smote.pkl','wb') as f:
    pickle.dump(xgb, f) 

In [None]:
with open('./model/xgb_smote.pkl','rb') as f:
    xgb = pickle.load(f)

In [19]:
train['pred']= xgb.predict(train[columns_to_select])
train['proba'] = xgb.predict_proba(train[columns_to_select])[:,1]

print("accuracy: {}".format(accuracy_score(train['target'], train['pred'])))
print("precision score: {}".format(average_precision_score(train['pred'],train['target'])))
print("roc auc score: {}".format(roc_auc_score(train['target'],train['proba'])))

accuracy: 0.937695
precision score: 0.5158105070824383
roc auc score: 0.9528613760843482


In [20]:
confusion_matrix(train['target'], train['pred'])

array([[173347,   6555],
       [  5906,  14192]])

---
## training on the orignal dataset

In [14]:
%%time
# Training on the original dataset
xgb.fit(train[columns_to_select], train['target'],eval_metric='auc')

CPU times: user 52min 32s, sys: 912 ms, total: 52min 33s
Wall time: 52min 32s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8, verbosity=3)

In [17]:
with open('./model/xgb.pkl','wb') as f:
    pickle.dump(xgb, f) 

In [None]:
with open('./model/xgb_model.pkl','rb') as f:
    xgb = pickle.load(f)

In [18]:
testdf_ = pd.DataFrame(x_test,columns=columns_to_select)
testdf_['label'] = y_test
testdf_['prediction']= xgb.predict(testdf_[columns_to_select])
testdf_['probability'] = xgb.predict_proba(testdf_[columns_to_select])[:,1]

NameError: name 'x_test' is not defined

In [20]:
xgb.predict(train[columns_to_select])

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
train['prediction'] = xgb.predict(train[columns_to_select])
train['proba'] = xgb.predict_proba(train[columns_to_select])[:,1]

print("accuracy: {}".format(accuracy_score(train['target'], train['prediction'])))
print("precision score: {}".format(average_precision_score(train['prediction'],train['target'])))
print("roc auc score: {}".format(roc_auc_score(train['target'],train['proba'])))

accuracy: 0.970765
precision score: 0.7104240736998411
roc auc score: 0.990743698585738


In [24]:
confusion_matrix(train['target'], train['prediction'])

array([[179669,    233],
       [  5614,  14484]])

---
# Applying model on test dataset

In [25]:
test = pd.read_csv('./data/test.csv.zip',compression='zip',
                    header=0,
                    sep=',',
                    quotechar='"')

m,n = test.shape
print("shape of test dataframe {}, {}".format(m,n))

shape of test dataframe 200000, 201


In [26]:
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [27]:
# Transform the test data set with similar transformation on the train

test[columns_to_select] = scaler.fit_transform(test[columns_to_select])

# Apply predict on data
test['target'] = xgb.predict(test[columns_to_select])

In [28]:
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,target
0,test_0,0.491396,0.934286,0.647868,0.717967,0.563676,0.563885,0.596918,0.555992,0.616146,...,0.745496,0.168422,0.477313,0.233148,0.780223,0.613081,0.74521,0.445426,0.45629,0
1,test_1,0.376861,0.667159,0.547072,0.395628,0.351853,0.527653,0.624899,0.572118,0.282491,...,0.587858,0.377335,0.741635,0.333622,0.569846,0.417179,0.606236,0.628698,0.274352,0
2,test_2,0.239172,0.191796,0.475918,0.537077,0.452813,0.834554,0.440118,0.643958,0.585037,...,0.700735,0.468998,0.46799,0.193832,0.744399,0.228884,0.146114,0.667101,0.241644,0
3,test_3,0.377177,0.561688,0.59092,0.501147,0.318535,0.687271,0.447462,0.65779,0.679433,...,0.600497,0.440591,0.516235,0.312664,0.854102,0.599396,0.501114,0.322375,0.523223,0
4,test_4,0.520319,0.610381,0.719749,0.590456,0.342955,0.426075,0.762902,0.216621,0.659735,...,0.605501,0.407583,0.509229,0.54788,0.489135,0.294091,0.183191,0.367939,0.449389,0


In [29]:
test[['ID_code','target']].to_csv('./output/xgb_smote.csv',index = False)

In [None]:
print("accuracy: {}".format(accuracy_score(testdf_['label'], testdf_['prediction'])))
print("precision score: {}".format(average_precision_score(testdf_['prediction'],testdf_['label'])))
print("roc auc score: {}".format(roc_auc_score(testdf_['label'],testdf_['probability'])))


In [None]:
confusion_matrix(testdf_['label'], testdf_['prediction'])

Confusion matrix for `XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)` without being normalized

        array([[50516,  3455],
           [ 3941,  2088]])