In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm, model_selection, ensemble, metrics
import itertools
import time
%matplotlib inline

In [2]:
# Load the raw training table from the working directory.
T = pd.read_csv(filepath_or_buffer='./training.csv')

In [3]:
# Label-encode every column that is not stored as int64/float64 so that the
# whole frame becomes numeric.
text_cols = [column for column in T.columns if T[column].dtype not in ['int64', 'float64']]
for col in text_cols:
    lb = preprocessing.LabelEncoder()
    # Missing values become the '-' placeholder and every entry is cast to str
    # so the encoder only ever sees one type. The result is assigned back
    # explicitly: in-place methods called on a column selection are not
    # guaranteed to write through to T, and a bare .apply(str) returns a new
    # Series without modifying the column.
    T[col] = lb.fit_transform(T[col].fillna('-').astype(str))

In [4]:
# Feature matrix: every column except the label. Target vector: 'IsBadBuy'.
X = np.array(T.drop(labels='IsBadBuy', axis=1))
y = np.array(T['IsBadBuy'])

In [5]:
# Baseline to beat: the share of rows with label 0, i.e. the accuracy obtained
# by always predicting "good buy". Written as a print() call with a single
# argument so it runs unchanged on both Python 2 and Python 3.
print('At least, we need %.2f%% accuracy (predict always good buy)' % (np.sum(y == 0) / y.shape[0] * 100))

At least, we need 87.70% accuracy (predict always good buy)


In [6]:
# Mean imputation: each NaN is replaced by the mean of the non-NaN entries of
# its own column (the masked array excludes NaNs from the mean).
missing = np.isnan(X)
column_means = np.ma.array(X, mask=missing).mean(axis=0)
X = np.where(missing, column_means, X)

In [None]:
# Nested cross-validation: the outer KFold yields the held-out folds used for
# the confusion matrix, the inner GridSearchCV picks hyper-parameters on the
# outer-training rows only.
#
# Alternative grid for ensemble.RandomForestClassifier:
# parameters = {'n_estimators':[5,10,25], 'random_state':[42]}
parameters = {'kernel':['rbf'], 'C':[1,2],'random_state':[42]}

outer_splits = 5
inner_splits = 5
# Accumulates the per-fold 2x2 confusion matrices (rows: true, cols: predicted).
confussion_matrix = np.zeros((2,2))
kf = model_selection.KFold(n_splits=outer_splits)

for train, test in kf.split(X):
    t1 = time.time()
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
    # Standardise with statistics of the outer-training rows only, then apply
    # the same transform to the held-out rows.
    # NOTE(review): the scaler is fitted before the inner grid search, so the
    # inner validation folds share scaling statistics with their training folds.
    sc = preprocessing.StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # clf = ensemble.RandomForestClassifier()
    clf = svm.SVC()
    clf = model_selection.GridSearchCV(clf, parameters, cv=inner_splits, n_jobs=-1)
    clf.fit(X_train, y_train)
    # labels=[0, 1] pins the result to 2x2 with a fixed class order. Without
    # it, a fold containing a single class returns a 1x1 matrix that would
    # silently broadcast into every cell of the accumulator.
    confussion_matrix += metrics.confusion_matrix(y_test, clf.predict(X_test), labels=[0, 1])
    print('Validation score: %.3f \nBest params: %s' % (clf.best_score_, clf.best_params_))
    print('########################')
    # Wall-clock seconds spent on this outer fold.
    print(time.time() - t1)

confussion_matrix /= outer_splits # take the average

In [None]:
# source http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix. Rows are drawn along the y axis ('True label'),
        columns along the x axis ('Predicted label').
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, optional
        When True, every row is divided by its sum before the matrix is
        printed and drawn. Rows that sum to zero are left as zeros.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap used for the image.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide empty rows by 1 instead of 0 so they stay zero rather than NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Draw only after the optional normalisation so that the image, the colour
    # bar, the cell text and the text-colour threshold all use the same values.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are shown as-is; fractional values (normalised or
    # averaged matrices) are rounded to two decimals to keep cells readable.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Unpack the averaged 2x2 matrix row by row (rows: true label, columns:
# predicted label). Class 0 ("Good buy") is treated as the positive class, so
# row 0 holds (TP, FN) and row 1 holds (FP, TN).
(TP, FN), (FP, TN) = confussion_matrix
plot_confusion_matrix(confussion_matrix, ['Good buy','Bad buy'])

In [None]:
# Precision, recall and their harmonic mean (F1), computed from the
# TP / FP / FN counts of the averaged confusion matrix.
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1 = 2 * precision * recall / (precision + recall)
# Single-argument print() call: identical output on Python 2 and Python 3.
print('Precision = %.3f \nRecall = %.3f \nF1 score = %.3f\n' % (precision, recall, F1))

In [None]:
# Predictions on the held-out rows of the last outer fold: `clf` and `X_test`
# are whatever the cross-validation loop left behind on its final iteration.
pred = clf.predict(X=X_test)

In [None]:
# Fraction of the true class-1 rows in the last fold that were predicted as 1.
bad_mask = (y_test == 1)
np.sum(pred[bad_mask] == 1) / bad_mask.sum()