# Description

You are provided with an anonymized dataset containing numeric feature variables, the binary target column, and a string ID_code column. The task is to predict the value of target column in the test set. To download the dataset make sure you have kaggle api installed on your machine.

    kaggle competitions download -c santander-customer-transaction-prediction


In [1]:
from sklearn.metrics import accuracy_score, average_precision_score, confusion_matrix, recall_score, roc_auc_score
import pandas as pd
import numpy as np

## Read train data

In [2]:
train = pd.read_csv('./data/train.csv.zip',
                    compression='zip',
                    header=0,
                    sep=',',
                    quotechar='"')

In [3]:
m,n = train.shape
print("shape of train dataframe {}, {}".format(m,n))

shape of train dataframe 200000, 202


# Distribution of target classes

In [4]:
train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

# Describe 

In [5]:
# Identify feature columns
columns_to_select = [cols for cols in train.columns.values.tolist() if cols not in ['ID_code','target']]

In [6]:
%%time
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train[columns_to_select] = scaler.fit_transform(train[columns_to_select])

CPU times: user 6.56 s, sys: 5.11 s, total: 11.7 s
Wall time: 10.5 s


In [7]:
xs = train[columns_to_select]
ys = train['target']

# Smote analysis

In [8]:
from imblearn.over_sampling import SMOTE

In [9]:
%%time
smt = SMOTE()
X_train, Y_train = smt.fit_sample(xs, ys,)

CPU times: user 3min 36s, sys: 2.06 s, total: 3min 38s
Wall time: 3min 40s


In [10]:
np.bincount(Y_train)

array([179902, 179902])

## Model fitting

In [11]:
from sklearn.ensemble import RandomForestClassifier
import pickle

In [12]:
%%time
clf = RandomForestClassifier(n_estimators=500,
                             max_depth =10,
                             random_state=0,
                             n_jobs=-1)

clf.fit(X_train,Y_train)

CPU times: user 1h 8min 53s, sys: 15.8 s, total: 1h 9min 9s
Wall time: 18min 33s


## Saving the model

In [20]:
with open('./model/rf.pkl','wb') as f:
    pickle.dump(clf, f) 

## Loading the model

In [12]:
with open('./model/rf.pkl','rb') as f:
    clf = pickle.load(f)


# Investigating metrics

In [None]:
y_pred = clf.predict(xs)
y_proba = clf.predict_proba(xs)[:,1]
print("accuracy: {}".format(accuracy_score(ys, y_pred)))
print("precision score: {}".format(average_precision_score(ys,y_pred)))
print("roc auc score: {}".format(roc_auc_score(ys,y_proba)))

accuracy: 0.826715
precision score: 0.23707970842771345
roc auc score: 0.8520430516587387


In [None]:
confusion_matrix(ys, y_pred)

array([[152688,  27214],
       [  7443,  12655]])

---
# Using XGBoost

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier


In [None]:
xgb = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [None]:
df_=pd.DataFrame(X_train,columns=columns_to_select)
df_['label']= Y_train

In [None]:
%%time
xgb.fit(df_[columns_to_select],df_['label'],eval_metric='auc')

In [None]:
with open('./model/xgb.pkl','wb') as f:
    pickle.dump(xgb, f) 

In [None]:
with open('./model/xgb_model.pkl','rb') as f:
    xgb = pickle.load(f)

In [None]:
testdf_ = pd.DataFrame(x_test,columns=columns_to_select)
testdf_['label'] = y_test
testdf_['prediction']= xgb.predict(testdf_[columns_to_select])
testdf_['probability'] = xgb.predict_proba(testdf_[columns_to_select])[:,1]

In [None]:
print("accuracy: {}".format(accuracy_score(testdf_['label'], testdf_['prediction'])))
print("precision score: {}".format(average_precision_score(testdf_['prediction'],testdf_['label'])))
print("roc auc score: {}".format(roc_auc_score(testdf_['label'],testdf_['probability'])))


In [None]:
confusion_matrix(testdf_['label'], testdf_['prediction'])

Confusion matrix for `XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8)` without being normalized

        array([[50516,  3455],
           [ 3941,  2088]])