## SVM + RBF Kernel (no Hyper-Parameter Tuning)

Import the packages.

In [8]:
import os
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score

Read in the data and extract features from labels. 

In [9]:
df = pd.read_csv('./Data/munged_df.csv', index_col='account_id')
feature_cols = list(df.columns[:-1])
target_col = df.columns[-1] 
X_all = df[feature_cols]
y_all = df[target_col] 

Scale the features.

In [10]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_all = scaler.fit_transform(X_all)

Split the data into training and testing.

In [11]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all , 
                                                    y_all, 
                                                    stratify=y_all,
                                                    test_size=0.1, 
                                                    random_state=1)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 14946 samples.
Testing set has 1661 samples.


Create the classifier.

In [12]:
from sklearn.svm import LinearSVC
clf = svm.LinearSVC()

In [13]:
from sklearn.metrics import precision_recall_curve
model = clf.fit(X_train, y_train)
y_score = model.decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

In [14]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_score)

In [17]:
lw = 2
plt.figure(figsize=(5,5))
plt.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
plt.legend(loc="upper right")
plt.xlabel('Recall: Share of Customer Conversions Captured')
plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
plt.title('Linear SVM with Bayesian Optimization', y=1.05, fontsize=16)
plt.suptitle('Precision-Recall AUC={0:0.2f}'.format(average_precision), y=0.92, fontsize=12)
# plt.show()
path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
       'Capstone_Project/Images'
savepath = os.path.join(path, 'non-optimized_linear_svm.png')
plt.savefig(savepath)
plt.clf()

Derive the F2 score.

In [34]:
from sklearn.metrics import fbeta_score
y_pred = model.predict(X_test)
fbeta_score(y_test, y_pred, average='binary', beta=2)

0.056689342403628114

## Hyper-Parameter Tuning

Create the custom scorer.

In [20]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [21]:
def svccv(C):
    return cross_val_score(svm.LinearSVC(C=C, random_state=1),
                           X_train, y_train, ftwo_scorer, cv=20, n_jobs=-1).mean()

if __name__ == "__main__":
    gp_params = {"alpha": 1e5}

    svcBO = BayesianOptimization(svccv, {'C': (0.0001, 1000)})
    svcBO.explore({'C': [0.00001, 0.01, 0.1]})

    svcBO.maximize(n_iter=10, **gp_params)
    print('SVC: %11.9f' % svcBO.res['max']['max_val'])

[31mInitialization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    1 | 00m50s | [35m   0.03351[0m | [32m   0.0000[0m | 
    2 | 00m50s | [35m   0.04953[0m | [32m   0.0100[0m | 
    3 | 00m50s | [35m   0.08954[0m | [32m   0.1000[0m | 
    4 | 00m50s | [35m   0.25884[0m | [32m 395.6528[0m | 
    5 | 00m41s |    0.20976 |  441.4598 | 
    6 | 00m49s |    0.20961 |  198.0175 | 
    7 | 00m50s |    0.22517 |  755.3793 | 
    8 | 00m50s |    0.24851 |  184.3270 | 
[31mBayesian Optimization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    9 | 00m49s |    0.22687 |  999.9947 | 
   10 | 00m48s |    0.22687 |  999.9903 | 
   11 | 00m48s |    0.22687 |  999.9999 | 
   12 | 00m49s |    0.22687 |  999.9980 | 
   13 | 00m49s |    0.22687 |  999.9710 | 
   14 | 00m48s |    0.04562 |    0.0079 | 
   15 | 00m47s |    0.22687 |  999.9944 | 
   16 | 00m52s |    0.04821 |    0.0064 

In [22]:
results = svcBO.res['max']
svm_C = results['max_params']['C']
print(svm_C)

395.652796995


In [23]:
clf = svm.LinearSVC(C=svm_C)

In [24]:
from sklearn.metrics import precision_recall_curve
model = clf.fit(X_train, y_train)
y_score = model.decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

In [25]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_score)

In [27]:
lw = 2
plt.figure(figsize=(5,5))
plt.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
plt.legend(loc="upper right")
plt.xlabel('Recall: Share of Customer Conversions Captured')
plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
plt.title('Linear SVM with Bayesian Optimization', y=1.05, fontsize=16)
plt.suptitle('Precision-Recall AUC={0:0.2f}'.format(average_precision), y=0.92, fontsize=12)
# plt.show()
path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
       'Capstone_Project/Images'
savepath = os.path.join(path, 'optimized_Linear_SVM.png')
plt.savefig(savepath)
plt.clf()

Save the precision-recall curve results.

In [11]:
# a = pd.DataFrame(recall)
# a.rename(columns={0: 'recall'}, inplace=True)
# b = pd.DataFrame(precision)
# b.rename(columns={0: 'precision'}, inplace=True)
# results = pd.concat([a, b], axis=1) 
# path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
#        'Capstone_Project'
# results.to_csv("Optimized_Linear_SVM_Results.csv")

Derive the F2 score.

In [33]:
from sklearn.metrics import fbeta_score
y_pred = model.predict(X_test)
fbeta_score(y_test, y_pred, average='binary', beta=2)

0.056689342403628114