## SVM + RBF Kernel (no Hyper-Parameter Tuning)

Import the packages.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score



Read in the data and separate the features from the label.

In [2]:
# Load the pre-munged dataset, indexed by account id.
df = pd.read_csv('./Data/munged_df.csv', index_col='account_id')

# By position: the last column is the label, every column before it is a feature.
target_col = df.columns[-1]
feature_cols = list(df.columns)[:-1]

y_all = df[target_col]
X_all = df[feature_cols]

Scale the features.

In [3]:
from sklearn.preprocessing import RobustScaler

# NOTE(review): the scaler is fitted on every row before the train/test split
# that follows, so the held-out rows also contribute to the scaling statistics.
scaler = RobustScaler()
scaler.fit(X_all)
X_all = scaler.transform(X_all)

Split the data into training and testing.

In [4]:
from sklearn.cross_validation import train_test_split

# Hold out 10% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.1, stratify=y_all, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, **split_options)

# Report how many rows landed in each partition.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print("Training set has {} samples.".format(n_train))
print("Testing set has {} samples.".format(n_test))

Training set has 14946 samples.
Testing set has 1661 samples.


Create the classifier.

In [5]:
from sklearn.svm import SVC
# Baseline classifier: every SVC hyper-parameter is left at the library default.
clf = SVC()

In [6]:
from sklearn.metrics import precision_recall_curve

# Fit the baseline classifier on the training partition.
model = clf.fit(X_train, y_train)

# Score the held-out rows with the decision function, then compute the
# precision/recall pairs over its thresholds.
y_score = model.decision_function(X_test)
pr_curve = precision_recall_curve(y_test, y_score)
precision, recall, thresholds = pr_curve

In [7]:
from sklearn.metrics import average_precision_score

# Average precision of the baseline decision scores on the test partition;
# it is the number shown in the plot title.
average_precision = average_precision_score(y_true=y_test, y_score=y_score)

Plot the precision-recall curve.

In [8]:
# Draw the baseline precision-recall curve on the current figure's axes.
lw = 2
fig = plt.gcf()
ax = fig.gca()
ax.plot(recall, precision, lw=lw, color='navy',
        label='Precision-Recall Curve')
ax.legend(loc="upper right")
ax.set_xlabel('Recall: Share of Customer Conversions Captured')
ax.set_ylabel('Precision: Accuracy of Predicted Customer Conversions')
ax.set_title('RBF Kernel: Precision-Recall AUC={0:0.2f}'.format(average_precision))

# Write the figure into the image directory, then clear it so that the next
# plot starts from an empty figure.
path = ('/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/'
        'Capstone_Project/Images')
savepath = os.path.join(path, 'non-optimized_rbf_kernel.png')
fig.savefig(savepath)
fig.clf()

Derive the F2 score.

In [9]:
from sklearn.metrics import fbeta_score

# Hard class predictions from the baseline model on the held-out rows.
y_pred = model.predict(X_test)

# F2 score of those predictions. beta must be 2 here: beta=0.5 computes F0.5,
# which matches neither this section's "F2" heading nor the beta=2 used for
# the tuning scorer and the tuned model's final score.
fbeta_score(y_test, y_pred, average='binary', beta=2)

  'precision', 'predicted', average, warn_for)


0.0

## Hyper-Parameter Tuning

Create the custom scorer.

In [None]:
from sklearn.metrics import fbeta_score, make_scorer

# Wrap the F-beta metric with beta=2 as a scorer object, so it can be handed
# to cross-validation as the scoring function.
ftwo_scorer = make_scorer(score_func=fbeta_score, beta=2)

Use the F2 scorer as the objective for Bayesian optimization of `C` and `gamma`.

In [None]:
def svccv(C, gamma):
    """Return the mean cross-validated F2 score of an RBF-kernel SVC.

    This is the objective function handed to the Bayesian optimizer.

    Args:
        C: value passed to ``SVC`` as ``C``.
        gamma: value passed to ``SVC`` as ``gamma``.

    Returns:
        The mean of the per-fold scores produced by ``ftwo_scorer`` in a
        20-fold cross-validation over ``X_train`` / ``y_train``.
    """
    estimator = SVC(C=C, gamma=gamma, random_state=1, kernel='rbf')
    # Pass the scorer by keyword. The fourth positional slot is `scoring` only
    # in the legacy `sklearn.cross_validation` signature; in the newer
    # `sklearn.model_selection` version of this function it is not, so a
    # positional scorer would break if the import were ever switched.
    fold_scores = cross_val_score(estimator, X_train, y_train,
                                  scoring=ftwo_scorer, cv=20, n_jobs=-1)
    return fold_scores.mean()

if __name__ == "__main__":
    # Extra keyword arguments forwarded through maximize().
    # NOTE(review): alpha is presumably passed on to the optimizer's
    # Gaussian-process regressor as its noise/jitter term -- confirm against
    # the installed bayes_opt version. The previous value of 1e5 would swamp
    # an objective whose values lie in [0, 1]; 1e-5 is the intended magnitude.
    gp_params = {"alpha": 1e-5}

    # Search box for the two hyper-parameters being tuned.
    svcBO = BayesianOptimization(svccv, {'C': (0.0001, 1000), 'gamma': (0.0001, 1000)})
    # Seed the search with three hand-picked (C, gamma) points.
    svcBO.explore({'C': [0.001, 0.01, 0.1], 'gamma': [0.001, 0.01, 0.1]})

    svcBO.maximize(n_iter=10, **gp_params)
    # Best objective value found so far.
    print('SVC: %11.9f' % svcBO.res['max']['max_val'])

[31mInitialization[0m
[94m-----------------------------------------------------[0m
 Step |   Time |      Value |         C |     gamma | 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


    1 | 02m03s | [35m   0.00000[0m | [32m   0.0010[0m | [32m   0.0010[0m | 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


    2 | 05m13s |    0.00000 |    0.0100 |    0.0100 | 


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


    3 | 21m35s |    0.00000 |    0.1000 |    0.1000 | 


In [None]:
# Pull the best point recorded by the optimizer and keep its two
# hyper-parameter values for the final model.
results = svcBO.res['max']
best_params = results['max_params']
rbf_C, rbf_gamma = best_params['C'], best_params['gamma']
print(rbf_C)
print(rbf_gamma)

In [None]:
# Rebuild the RBF-kernel SVC with the tuned C and gamma; every other argument
# is spelled out explicitly.
clf = SVC(
    C=rbf_C,
    gamma=rbf_gamma,
    kernel='rbf',
    cache_size=200,
    max_iter=-1,
    probability=False,
    random_state=1,
    shrinking=True,
    tol=0.001,
    verbose=False
)

In [None]:
from sklearn.metrics import precision_recall_curve

# Fit the tuned classifier on the training partition.
model = clf.fit(X_train, y_train)

# Score the held-out rows with the decision function, then compute the
# precision/recall pairs over its thresholds.
y_score = model.decision_function(X_test)
pr_curve = precision_recall_curve(y_test, y_score)
precision, recall, thresholds = pr_curve

In [None]:
from sklearn.metrics import average_precision_score

# Average precision of the tuned model's decision scores on the test
# partition; it is the number shown in the plot subtitle.
average_precision = average_precision_score(y_true=y_test, y_score=y_score)

In [None]:
# Draw the tuned model's precision-recall curve on a fresh 5x5 figure.
lw = 2
fig = plt.figure(figsize=(5,5))
ax = fig.gca()
ax.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
ax.legend(loc="upper right")
ax.set_xlabel('Recall: Share of Customer Conversions Captured')
ax.set_ylabel('Precision: Accuracy of Predicted Customer Conversions')
# Main title sits above the axes; the score goes in a smaller figure-level subtitle.
ax.set_title('RBF Kernel with Bayesian Optimization', y=1.05, fontsize=16)
fig.suptitle('Precision-Recall AUC={0:0.2f}'.format(average_precision), y=0.92, fontsize=12)

# Write the figure into the image directory, then clear it.
path = ('/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/'
        'Capstone_Project/Images')
savepath = os.path.join(path, 'optimized_SVM_with_RBF.png')
fig.savefig(savepath)
fig.clf()

Save the precision-recall curve results (the export code below is currently commented out).

In [None]:
# a = pd.DataFrame(recall)
# a.rename(columns={0: 'recall'}, inplace=True)
# b = pd.DataFrame(precision)
# b.rename(columns={0: 'precision'}, inplace=True)
# results = pd.concat([a, b], axis=1) 
# path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
#        'Capstone_Project'
# results.to_csv("Optimized_RBF_Kernel_Results.csv")

Derive the F2 score.

In [None]:
from sklearn.metrics import fbeta_score

# Hard class predictions from the tuned model on the held-out rows.
y_pred = model.predict(X_test)

# F2 score of those predictions; left as the last expression so the notebook
# displays it.
fbeta_score(y_true=y_test, y_pred=y_pred, beta=2, average='binary')