Import the packages.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score



Read in the data and extract features from labels. 

In [2]:
df = pd.read_csv('./Data/munged_df.csv', index_col='account_id')
# df = pd.read_csv('./Data/vif_pruned_df.csv', index_col='account_id')
feature_cols = list(df.columns[:-1])
target_col = df.columns[-1] 
X_all = df[feature_cols]
y_all = df[target_col] 

In [3]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_all = scaler.fit_transform(X_all)

In [4]:
from sklearn.cross_validation import train_test_split

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = train_test_split(X_all , 
                                                    y_all, 
                                                    stratify=y_all,
                                                    test_size=0.2, 
                                                    random_state=1)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 13285 samples.
Testing set has 3322 samples.


In [5]:
def svccv(C):
    return cross_val_score(svm.LinearSVC(C=C, random_state=1),
                           X_train, y_train, 'f1', cv=10, n_jobs=-1).mean()

if __name__ == "__main__":
    gp_params = {"alpha": 1e5}

    svcBO = BayesianOptimization(svccv, {'C': (0.0001, 100)})
    svcBO.explore({'C': [0.001, 0.01, 0.1]})

    svcBO.maximize(n_iter=10, **gp_params)
    print('SVC: %11.9f' % svcBO.res['max']['max_val'])

[31mInitialization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    1 | 00m04s | [35m   0.05028[0m | [32m   0.0010[0m | 
    2 | 00m03s | [35m   0.06911[0m | [32m   0.0100[0m | 
    3 | 00m03s | [35m   0.13493[0m | [32m   0.1000[0m | 
    4 | 00m03s | [35m   0.19566[0m | [32m  99.8994[0m | 
    5 | 00m03s |    0.16728 |   14.8722 | 
    6 | 00m03s |    0.18636 |   65.4668 | 
    7 | 00m03s | [35m   0.21570[0m | [32m  34.9912[0m | 
    8 | 00m03s | [35m   0.23197[0m | [32m  49.4438[0m | 
[31mBayesian Optimization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    9 | 00m04s |    0.20582 |   99.9985 | 
   10 | 00m03s |    0.20582 |   99.9991 | 
   11 | 00m03s |    0.20582 |   99.9995 | 
   12 | 00m03s |    0.06003 |    0.0031 | 
   13 | 00m03s |    0.20582 |   99.9998 | 
   14 | 00m04s |    0.05188 |    0.0001 | 
   15 | 00m04s |    0.20582 |   99.9979 | 
   1

In [6]:
results = svcBO.res['max']
svm_C = results['max_params']['C']
print(svm_C)

49.4438101562


In [7]:
clf = svm.LinearSVC(C=svm_C)

In [8]:
from sklearn.metrics import precision_recall_curve
y_score = clf.fit(X_train, y_train).decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_score)

In [9]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_score)

In [10]:
lw = 2
plt.figure(figsize=(5,5))
plt.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
plt.legend(loc="upper right")
plt.xlabel('Recall: Share of Customer Conversions Captured')
plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
plt.title('Linear SVM with Bayesian Optimization', y=1.05, fontsize=16)
plt.suptitle('Precision-Recall AUC={0:0.2f}'.format(average_precision), y=0.92, fontsize=12)
# plt.show()
path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
       'Capstone_Project/Images'
savepath = os.path.join(path, 'optimized_Linear_SVM.png')
plt.savefig(savepath)
plt.clf()