In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn import datasets, neighbors, linear_model, preprocessing
from sklearn.model_selection import learning_curve, ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Read the training set from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="train.csv")
data.head(n=5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [3]:
# Mean of the target column; presumably the share of positive rows if the
# target is coded 0/1 (the preview above only shows zeros) -- confirm.
data.loc[:, "target"].mean()

0.036447517859182946

# Classification

- Data normalization
- data split into train and cv set

Code for gini (evaluation per the problem description). 
- For the best model, we want the normalized gini score to be as close as possible to 1 (the score of a perfect ranking).
- The gini score for a random classifier is 0. 
- Also, if we get a negative gini score, we can just reverse the outcome to get a positive gini score of the same magnitude. 
- Gini score is a good metric for a skewed dataset such as this one. 
- For the evaluation, it needs the 'probability' from the classifier, not just the prediction (0 or 1)! 

In [4]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the unnormalized Gini score of ``pred`` as a ranking of ``actual``.

    Rows are ordered by descending ``pred``; ties are broken by original
    position so the result is deterministic. Only the ordering induced by
    ``pred`` matters, not its scale.

    Parameters
    ----------
    actual : array-like of numbers
        Observed outcomes, one per row.
    pred : array-like of numbers
        Scores used to rank the rows; must have the same length as ``actual``.
    cmpcol, sortcol : int
        Unused. Kept only so existing callers that pass them keep working.

    Returns
    -------
    float
        The Gini score. If ``actual`` sums to zero the division below is
        undefined and the result is not a finite number.

    Raises
    ------
    AssertionError
        If ``actual`` and ``pred`` differ in length.
    """
    assert( len(actual) == len(pred) )
    n_rows = len(actual)
    # Columns: 0 = actual, 1 = pred, 2 = original position (tie-breaker).
    # The builtin ``float`` is used because the ``np.float`` alias was removed
    # from NumPy (1.24) and raises AttributeError there.
    table = np.asarray(np.c_[ actual, pred, np.arange(n_rows) ], dtype=float)
    # lexsort treats the LAST key as primary: descending pred, then position.
    order = np.lexsort((table[:,2], -1*table[:,1]))
    table = table[order]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses
    giniSum -= (n_rows + 1) / 2.
    return giniSum / n_rows
 
def gini_normalized(a, p):
    """Gini score of predictions ``p`` against outcomes ``a``, divided by the
    Gini score obtained when ``a`` is ranked by itself."""
    model_gini = gini(a, p)
    reference_gini = gini(a, a)
    return model_gini / reference_gini

Let's normalize the data using scikit-learn's preprocessing module. After scaling, each column will have a mean of 0 and a variance of 1.
The scaler returns a numpy array, so we convert it back into a pandas dataframe. Note that we do not normalize the 'y' (target) values.


In [5]:
# Standardize every non-target column, then carve out a stratified 10% CV set.
# StandardScaler and train_test_split come from the notebook's import cell.
features = data.drop("target", axis=1)

# NOTE(review): the scaler is fit on ALL rows before the split, so the CV rows
# influence the scaling statistics. Consider fitting on the training rows only.
# NOTE(review): the data preview shows an `id` column, which is scaled and kept
# as a feature here -- confirm that this is intended.
scaler = StandardScaler()
X_np = scaler.fit_transform(features)
print(X_np.shape)

# Rebuild a DataFrame (same column names and row index) and re-attach the
# unscaled target so the split below can stratify on it.
normalized_data = pd.DataFrame(data=X_np, columns=features.columns, index=features.index)
normalized_data = pd.concat([normalized_data, data["target"]], axis=1)

train,cv=train_test_split(normalized_data,test_size = 0.1, random_state=0,stratify=normalized_data['target'])
X_train = train.drop("target", axis=1)
y_train = train["target"]
X_cv = cv.drop("target", axis = 1)
y_cv = cv["target"]

(595212L, 58L)


1. Vanilla logistic regression has a gini score of 0.2066 - much better than a random classifier (which scores 0).
2. The first strategy for dealing with an imbalanced dataset is to set class_weight = "balanced" in the classifier. This automatically gives more weight to the few positives in the data. It results in a gini score of about 0.226 on the CV set - better than the vanilla model.

In [6]:
# Trial run: class-weighted logistic regression, scored on the CV split.
estimator = LogisticRegression(class_weight="balanced", C=1)
# Unweighted baseline for comparison: LogisticRegression(C=1)
fit = estimator.fit(X_train, y_train)
score = fit.score(X_cv, y_cv)
print(score)

cv_predictions = estimator.predict(X_cv)
print(classification_report(y_cv, cv_predictions))

# The gini metric ranks rows by score, so it is fed column 1 of predict_proba
# rather than the hard 0/1 predictions.
cv_scores = estimator.predict_proba(X_cv)[:, 1]
train_scores = estimator.predict_proba(X_train)[:, 1]
gini_cv = gini_normalized(y_cv, cv_scores)
gini_train = gini_normalized(y_train, train_scores)
print([gini_train, gini_cv])

0.623970968717
             precision    recall  f1-score   support

          0       0.97      0.63      0.76     57353
          1       0.05      0.53      0.09      2169

avg / total       0.94      0.62      0.74     59522

[0.25243584029479305, 0.22575467997214804]


In [7]:
# evaluate_clf returns ONE result row as a flat list, in the order:
# estimator name, pickle path, accuracy, gini (cv), gini (train),
# precision, recall, f-score.
def _positive_class_scores(estimator, X):
    """Return one ranking score per row of ``X`` from a fitted binary classifier.

    Uses column 1 of ``predict_proba`` when the estimator provides it and
    falls back to ``decision_function`` otherwise (e.g. ``svm.LinearSVC``,
    which has no ``predict_proba``). The gini metric only depends on the
    ordering of the scores, so either output is usable.
    """
    if hasattr(estimator, "predict_proba"):
        return estimator.predict_proba(X)[:, 1]
    return estimator.decision_function(X)


def evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename):
    """Fit ``estimator``, score it on the CV split, and pickle it to disk.

    Parameters
    ----------
    estimator : scikit-learn style classifier
        Must implement ``fit``, ``score``, ``predict`` and either
        ``predict_proba`` or ``decision_function``. It is fitted in place.
    X_train, y_train : training features and labels.
    X_cv, y_cv : held-out features and labels used for the reported metrics.
    filename : str
        Path stem for the saved model; ``'.pkl'`` is appended.

    Returns
    -------
    list
        ``[estimator_name, pickle_path, accuracy, gini_cv, gini_train,
        precision, recall, fscore]`` where precision / recall / f-score are
        computed for the positive class (label 1) on the CV split.
    """
    estimator_name = type(estimator).__name__
    fitted = estimator.fit(X_train, y_train)
    accuracy = fitted.score(X_cv, y_cv)

    gini_cv = gini_normalized(y_cv, _positive_class_scores(estimator, X_cv))
    gini_train = gini_normalized(y_train, _positive_class_scores(estimator, X_train))

    # `support` is None when average='binary'; it is unpacked only to match
    # the 4-tuple the function returns.
    precision, recall, fscore, support = precision_recall_fscore_support(
        pd.Series(y_cv).values, estimator.predict(X_cv), pos_label=1, average='binary')

    # Persist the fitted model so it can be reloaded without refitting.
    model_path = filename + '.pkl'
    joblib.dump(estimator, model_path)

    results_list = [estimator_name, model_path, accuracy, gini_cv, gini_train, precision, recall, fscore]
    return results_list

In [None]:
# Column order matches the list returned by evaluate_clf.
cols = ['estimator_name', 'filename', 'accuracy', 'gini_cv', 'gini_train', 'precision', 'recall', 'fscore']

# Candidate classifiers and the file stem each fitted model is saved under.
# The two lists are positional pairs and are also indexed by later cells.
estimators = [
    LogisticRegression(C = 1, class_weight = "balanced"),
    svm.LinearSVC(class_weight = "balanced", verbose = 2),
    DecisionTreeClassifier(max_depth = 3, class_weight = "balanced"),
    DecisionTreeClassifier(max_depth = 5, class_weight = "balanced"),
    RandomForestClassifier(max_depth=2, random_state=0, class_weight = "balanced", verbose = 2),
    KNeighborsClassifier(n_neighbors=3),
]
filenames = ['logistic_C1', 'SVC_C1_linear', 'DecTree_3','DecTree_5','RandForest', 'KNeigh_3']
assert len(estimators) == len(filenames)

# Evaluation starts at the first decision tree. The logistic model was already
# fit in the trial run and LinearSVC is evaluated in its own cell below.
# NOTE(review): presumably a deliberate skip -- confirm before setting this to 0.
FIRST_ESTIMATOR_INDEX = 2

# Rows are collected in a plain list and turned into a DataFrame once, instead
# of concatenating onto an initially empty frame on every iteration. The
# `finally` keeps whatever finished if a slow fit is interrupted.
result_rows = []
try:
    for estimator, filename in list(zip(estimators, filenames))[FIRST_ESTIMATOR_INDEX:]:
        results_list = evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename)
        print(results_list)
        result_rows.append(results_list)
finally:
    results = pd.DataFrame(result_rows, columns = cols)

print(results)

['DecisionTreeClassifier', 'DecTree_3.pkl', 0.57506468196633176, 0.15029719332098571, 0.17967155074892233, 0.045375904372444167, 0.53204241585984324, 0.083620158689902538]
['DecisionTreeClassifier', 'DecTree_5.pkl', 0.51404522697490007, 0.17410971727773555, 0.22177546492945799, 0.045058831530980073, 0.61088059013370222, 0.083927157561361834]
building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.8s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


['RandomForestClassifier', 'RandForest.pkl', 0.59080676052551995, 0.18203813084573728, 0.21257874341241639, 0.047518048700901414, 0.53711387736284, 0.0873116990182118]


In [None]:
# Evaluate the estimator at position 1 of `estimators` on its own and append
# its row to the running `results` table.
# NOTE(review): position 1 is svm.LinearSVC, which does not implement
# predict_proba -- verify that evaluate_clf can score it before running this.
i = 1
estimator, filename = estimators[i], filenames[i]
results_list = evaluate_clf(estimator, X_train, y_train, X_cv, y_cv, filename)
print(results_list)
df = pd.DataFrame(data=[results_list], columns=cols)
results = pd.concat([results, df], ignore_index=True)

In [None]:
# Reload the model saved for the first row of `results`.
# joblib.load unpickles the file, so only point it at files this notebook wrote.
first_model_file = results['filename'].iloc[0]
clf = joblib.load(first_model_file)