## SVM + RBF Kernel (no Hyper-Parameter Tuning)

Import the packages.

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score



Read in the data and extract features from labels. 

In [2]:
df = pd.read_csv('./Data/munged_df.csv', index_col='account_id')
feature_cols = list(df.columns[:-1])
target_col = df.columns[-1] 
X_all = df[feature_cols]
y_all = df[target_col] 

Scale the features.

In [3]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_all = scaler.fit_transform(X_all)

Split the data into training and testing.

In [4]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all , 
                                                    y_all, 
                                                    stratify=y_all,
                                                    test_size=0.1, 
                                                    random_state=1)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 14946 samples.
Testing set has 1661 samples.


Create the classifier.

In [7]:
from sklearn.linear_model import LinearRegression
clf = LogisticRegression()

In [8]:
from sklearn.metrics import precision_recall_curve
model = clf.fit(X_train, y_train)
default_y_score = model.decision_function(X_test)
default_precision, default_recall, default_thresholds = precision_recall_curve(y_test, default_y_score)

In [9]:
from sklearn.metrics import average_precision_score
default_average_precision = average_precision_score(y_test, default_y_score)

In [8]:
# lw = 2
# plt.figure(figsize=(5,5))
# plt.plot(recall, precision, lw=lw, color='navy', label='Precision-Recall Curve')
# plt.legend(loc="upper right")
# plt.xlabel('Recall: Share of Customer Conversions Captured')
# plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
# plt.title('Linear SVM with Default Settings', y=1.05, fontsize=16)
# plt.suptitle('Precision-Recall AUC={0:0.2f}'.format(default_average_precision), y=0.92, fontsize=12)
# plt.show()
# # path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
# #        'Capstone_Project/Images'
# # savepath = os.path.join(path, 'non-optimized_linear_svm.png')
# # plt.savefig(savepath)
# # plt.clf()

Derive the F2 score.

In [10]:
# from sklearn.metrics import fbeta_score
# y_pred = model.predict(X_test)
# fbeta_score(y_test, y_pred, average='binary', beta=2)

0.094786729857819912

In [11]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)
from sklearn.cross_validation import cross_val_score
cv_results = pd.DataFrame(cross_val_score(model, X_test, y_test, cv=100, scoring=ftwo_scorer))
cv_results.mean()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

0    0.169762
dtype: float64

In [12]:
from sklearn.cross_validation import cross_val_score
cv_results = pd.DataFrame(cross_val_score(model, X_test, y_test, cv=100, scoring='recall'))
cv_results.mean()

0    0.19
dtype: float64

In [13]:
from sklearn.cross_validation import cross_val_score
cv_results = pd.DataFrame(cross_val_score(model, X_test, y_test, cv=100, scoring='precision'))
cv_results.mean()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

0    0.135
dtype: float64

## Hyper-Parameter Tuning

Create the custom scorer.

In [14]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [15]:
def svccv(C):
    return cross_val_score(svm.LinearSVC(C=C, random_state=1),
                           X_train, y_train, ftwo_scorer, cv=20, n_jobs=-1).mean()

if __name__ == "__main__":
    gp_params = {"alpha": 1e5}

    svcBO = BayesianOptimization(svccv, {'C': (0.0001, 1000)})
    svcBO.explore({'C': [0.00001, 0.01, 0.1]})

    svcBO.maximize(n_iter=10, **gp_params)
    print('SVC: %11.9f' % svcBO.res['max']['max_val'])

[31mInitialization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    1 | 00m07s | [35m   0.03351[0m | [32m   0.0000[0m | 
    2 | 00m08s | [35m   0.04953[0m | [32m   0.0100[0m | 
    3 | 00m08s | [35m   0.08954[0m | [32m   0.1000[0m | 
    4 | 00m08s | [35m   0.24709[0m | [32m 585.6010[0m | 
    5 | 00m08s |    0.23717 |  678.3932 | 
    6 | 00m08s |    0.23250 |  729.4563 | 
    7 | 00m08s |    0.24519 |  749.0101 | 
    8 | 00m08s |    0.21569 |  280.4378 | 
[31mBayesian Optimization[0m
[94m-----------------------------------------[0m
 Step |   Time |      Value |         C | 
    9 | 00m09s |    0.22687 |  999.9853 | 
   10 | 00m08s |    0.22687 |  999.9966 | 
   11 | 00m08s |    0.22687 |  999.9971 | 
   12 | 00m08s |    0.04427 |    0.0052 | 
   13 | 00m08s |    0.22687 |  999.9829 | 
   14 | 00m08s |    0.04430 |    0.0065 | 
   15 | 00m08s |    0.22687 |  999.9926 | 
   16 | 00m08s |    0.02987 |    0.0007 

In [16]:
results = svcBO.res['max']
svm_C = results['max_params']['C']
print(svm_C)

585.601036074


In [14]:
# svm_C = 334.794537868

In [17]:
clf = LogisticRegression(C=svm_C)

In [18]:
from sklearn.metrics import precision_recall_curve
optimized_model = clf.fit(X_train, y_train)
optimized_y_score = optimized_model.decision_function(X_test)
optimized_precision, optimized_recall, optimized_thresholds = precision_recall_curve(y_test, optimized_y_score)

In [19]:
from sklearn.metrics import average_precision_score
optimized_average_precision = average_precision_score(y_test, optimized_y_score)

In [26]:
lw = 2
plt.figure(figsize=(6,5))
plt.plot(default_recall, default_precision, 
         lw=lw, color='#006600', label='AUC with No Hyper-Parameter Tuning={0:0.2f}'.format(default_average_precision))
plt.plot(optimized_recall, optimized_precision, lw=lw, color='#79ff4d', 
         label='AUC with Bayesian Optimization={0:0.2f}'.format(optimized_average_precision))
plt.legend(loc="upper right")
plt.xlabel('Recall: Share of Customer Conversions Captured')
plt.ylabel('Precision: Accuracy of Predicted Customer Conversions')
plt.title('Precision-Recall Curve: Logistic Regression', y=1, fontsize=16)
# plt.show()
path = '/home/bsknight/Documents/Personal_Training_Git/Udacity/Udacity-Machine_Learning_Nanodegree/' + \
       'Capstone_Project/Images'
savepath = os.path.join(path, 'Logistic_Regression.png')
plt.savefig(savepath)
plt.clf()

Derive the F2 score.

In [19]:
# from sklearn.metrics import fbeta_score
# y_pred = optimized_model.predict(X_test)
# fbeta_score(y_test, y_pred, average='binary', beta=2)

0.10869565217391304

In [24]:
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)
from sklearn.cross_validation import cross_val_score
cv_results = pd.DataFrame(cross_val_score(optimized_model, X_test, y_test, cv=100, scoring=ftwo_scorer))
cv_results.mean()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

0    0.155595
dtype: float64

In [25]:
cv_results = pd.DataFrame(cross_val_score(optimized_model, X_test, y_test, cv=100, scoring='recall'))
cv_results.mean()

0    0.18
dtype: float64

In [23]:
cv_results = pd.DataFrame(cross_val_score(optimized_model, X_test, y_test, cv=100, scoring='precision'))
cv_results.mean()

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0    0.143333
dtype: float64