# SVM

## Read data and split into training and validation sets

In [12]:
import pandas as pd

from sklearn.cross_validation import StratifiedShuffleSplit

from sklearn.svm import SVC

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from matplotlib import pyplot as plt
%matplotlib inline

In [13]:
# Load the raw training data from train.csv. The file comes from a real Kaggle
# competition (Santander, May 2016). The competition also ships a dedicated test set,
# which is not used here.
orig_data = pd.read_csv('train.csv', sep=',', header=0)

In [14]:
# Split the table into model inputs and the target column.
# ID is dropped together with TARGET because it is not relevant for training.
features = orig_data.drop(['ID', 'TARGET'], axis=1)

# According to the competition's data description, the label values are
# 0 = satisfied and 1 = unsatisfied customer.
labels = orig_data['TARGET']

In [15]:
# Split the original data into a training part (X_train, Y_train) and a validation part
# (X_valid, Y_valid). The classifier is fitted on the training part and its predictions
# are checked on the validation part.
#
# StratifiedShuffleSplit keeps the ratio of 0s and 1s (satisfied/unsatisfied customers)
# the same in both parts, so it cannot happen that all 1s land in Y_train and only 0s
# in Y_valid, or the other way round.
#
# test_size=0.2:  the validation part gets 20% and the training part 80% of the rows.
# n_iter=1:       only one split is needed, so only one is generated. The previous loop
#                 walked through every generated split and silently kept just the last.
# random_state=0: fixed seed, so the split and every score derived from it can be
#                 reproduced between runs.
#
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20. On newer versions import StratifiedShuffleSplit from sklearn.model_selection,
# construct it with n_splits=1 and iterate over .split(features, labels) instead.

sfs = StratifiedShuffleSplit(labels, n_iter=1, test_size=0.2, random_state=0)
train_index, test_index = next(iter(sfs))

# The splitter yields positional row numbers, so features AND labels are selected with
# .iloc; plain labels[...] would look the numbers up as index labels instead.
X_train, X_valid = features.iloc[train_index], features.iloc[test_index]
Y_train, Y_valid = labels.iloc[train_index], labels.iloc[test_index]

## Train the model and predict

In [16]:
# Both fitting and predicting use at most the first 20000 rows of their data set.
row_limit = slice(None, 20000)

# TRAIN
# cache_size is the SVM kernel cache size (in MB according to the scikit-learn docs).
# probability=True is left off: it is only required for predict_proba();
# decision_function() works without it.
clf = SVC(cache_size=7000)
clf.fit(X_train[row_limit], Y_train[row_limit])

# PREDICT - this step really consumes time
pred_labels = clf.predict(X_valid[row_limit])
# Continuous scores (e.g. for a ROC curve) would come from
# clf.decision_function(X_valid[row_limit]); they are not computed here.

debug output ... predict
debug output ... predicted


## Evaluate

In [19]:
# Score the predictions that are already stored in pred_labels instead of calling
# clf.score(), which would run the slow SVM prediction on the same rows a second time.
# For a classifier, score() is the mean accuracy, i.e. the value accuracy_score() returns.
from sklearn.metrics import accuracy_score

y_true = Y_valid[:20000]  # the same rows that were passed to clf.predict()
pred_score = accuracy_score(y_true, pred_labels)
print('... prediction score: {0:0.2f}%'.format(pred_score * 100))
print('')

# Accuracy alone says little when one class dominates the labels; the confusion matrix
# (rows = true label, columns = predicted label) shows how each class was classified.
confusion_matrix(y_true, pred_labels)

... prediction score: 95.99%



array([[14595,     7],
       [  602,     0]])