Import the packages.

In [139]:
import os
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score

Read in the data and extract features from labels. 

In [140]:
df = pd.read_csv('./Data/munged_df.csv', index_col='account_id')
# df = pd.read_csv('./Data/vif_pruned_df.csv', index_col='account_id')
feature_cols = list(df.columns[:-1])
target_col = df.columns[-1] 
X_all = df[feature_cols]
y_all = df[target_col] 

In [141]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_all = scaler.fit_transform(X_all)

In [142]:
from sklearn.cross_validation import train_test_split

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all , 
                                                    y_all, 
                                                    stratify=y_all,
                                                    test_size=0.1, 
                                                    random_state=1)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 14946 samples.
Testing set has 1661 samples.


In [149]:
def svccv(C):
    return cross_val_score(svm.LinearSVC(C=C, random_state=1),
                           X_train, y_train, 'f1', cv=10, n_jobs=-1).mean()

if __name__ == "__main__":
    gp_params = {"alpha": 1e5}

    svcBO = BayesianOptimization(svccv, {'C': (0.0001, 100)})
    svcBO.explore({'C': [0.001, 0.01, 0.1]})

    svcBO.maximize(n_iter=10, **gp_params)
    print('SVC: %11.9f' % svcBO.res['max']['max_val'])

[31mInitialization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    1 | 00m07s | [35m   0.05099[0m | [32m   0.0010[0m | 
    2 | 00m07s | [35m   0.08496[0m | [32m   0.0100[0m | 
    3 | 00m07s | [35m   0.13348[0m | [32m   0.1000[0m | 
    4 | 00m07s | [35m   0.14835[0m | [32m  43.5641[0m | 
    5 | 00m07s | [35m   0.18189[0m | [32m   4.6702[0m | 
    6 | 00m07s |    0.16333 |   46.4198 | 
    7 | 00m07s | [35m   0.21848[0m | [32m  78.7624[0m | 
    8 | 00m07s |    0.18721 |   26.2623 | 
[31mBayesian Optimization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    9 | 00m08s |    0.18964 |   99.9999 | 
   10 | 00m08s |    0.18964 |   99.9994 | 
   11 | 00m07s |    0.18964 |   99.9993 | 
   12 | 00m08s |    0.18964 |   99.9973 | 
   13 | 00m08s |    0.18964 |   99.9988 | 
   14 | 00m07s |    0.04912 |    0.0006 | 
   15 | 00m06s |    0.18964 |   99.9996 | 
   1

In [150]:
results = svcBO.res['max']
svm_C = results['max_params']['C']
print(svm_C)

78.7624498925


In [151]:
clf = svm.LinearSVC(C=svm_C)

In [152]:
from sklearn.metrics import precision_recall_curve
y_score = clf.fit(X_train, y_train).decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

In [153]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_score)

In [154]:
lw = 2
plt.figure(figsize=(5,5))
plt.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
plt.legend(loc="upper right")
plt.xlabel('Recall: Share of Customer Conversions Captured')
plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
plt.title('Linear SVM with Bayesian Optimization', y=1.05, fontsize=16)
plt.suptitle('Precision-Recall AUC={0:0.2f}'.format(average_precision), y=0.92, fontsize=12)
# plt.show()
path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
       'Capstone_Project/Images'
savepath = os.path.join(path, 'optimized_Linear_SVM.png')
plt.savefig(savepath)
plt.clf()

