## Python 9.8

In [87]:
import numpy as np
import matplotlib.pyplot as plt

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection as skm
from sklearn import svm
from sklearn.metrics import f1_score, make_scorer
import ISLP
from ISLP.svm import plot as plot_svm
from sklearn.model_selection import GridSearchCV
import pandas as pd


## 9.8

### (a) Loading and preparing the data

In [89]:
data = ISLP.load_data("OJ")
print(data.shape)
print(data.columns)
print(data.dtypes)

(1070, 18)
Index(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM', 'DiscCH',
       'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceMM',
       'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM', 'PctDiscCH',
       'ListPriceDiff', 'STORE'],
      dtype='object')
Purchase           object
WeekofPurchase      int64
StoreID             int64
PriceCH           float64
PriceMM           float64
DiscCH            float64
DiscMM            float64
SpecialCH           int64
SpecialMM           int64
LoyalCH           float64
SalePriceMM       float64
SalePriceCH       float64
PriceDiff         float64
Store7             object
PctDiscMM         float64
PctDiscCH         float64
ListPriceDiff     float64
STORE               int64
dtype: object


As you can see, there are categorial variables in the data set, so I will convert them to dummies

In [90]:
X = data.drop(columns="Purchase")
X = pd.get_dummies(X)
y = data["Purchase"]

Train/Test separation as instructed in the assignment

In [91]:
X_test = X.iloc[:800]
y_test = y.iloc[:800]
X_train = X.iloc[800:]
y_train = y.iloc[800:]

In the follwing $f_1$ metrics will be used to score the performance

In [92]:
my_f1_scorer = make_scorer(f1_score, pos_label="CH")

All the results will be saved in a data frame, here is a short function to do it

In [93]:
dfResults = pd.DataFrame()
def storeResults(model, comment=""):
    global dfResults
    df_ = pd.DataFrame({
        "C":model.get_params()["C"],
        "kernel":model.get_params()["kernel"],
        "test_score": f1_score(y_test, model.predict(X_test), pos_label = "CH"),
        "train_score": f1_score(y_train, model.predict(X_train), pos_label = "CH"),
        "n_support": model.support_vectors_.shape[0],
        "comment":comment
        }, index = [0])
    dfResults = pd.concat([dfResults, df_]).drop_duplicates()
    return df_

### (b), (c) Linear Kernel witj C = 0.01

The kernel to use was not specified in the assignment, to I will use linear

In [94]:
kernel_ = "linear"
model = svm.SVC(C=0.01, kernel=kernel_)
model.fit(X_train, y_train)
storeResults(model)

print("C = ", model.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))


C =  0.01 # of support vectors:  205
 F1 score: Train: 0.700, Test: 0.614


Accuracies are reasonable

###  (d)

For some reason my system hangs when I am using CV with C>4, so only values up to 3 will be tested

In [96]:
grid_params = {
    'C': [0.01, 0.1, 1, 1.5, 2, 2.5, 3]
}
model = svm.SVC(kernel=kernel_)
grid = GridSearchCV(model, grid_params, cv = 5, scoring=my_f1_scorer)
grid.fit(X, y)
print(grid.best_params_)


{'C': 1}


As you can see, C=1 was selected to give optimal score

### (e)

In [97]:
model = svm.SVC(C=grid.best_params_["C"], kernel=kernel_)
model.fit(X_train, y_train)
storeResults(model, comment="bestGrid")
print("C = ", model.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))

C =  1 # of support vectors:  142
 F1 score: Train: 0.840, Test: 0.873


This time score is even better

### (f) Radial Kernel

In [98]:
kernel_ = "rbf"
model = svm.SVC(C=0.01, kernel=kernel_)
modelR = model
model.fit(X_train, y_train)
storeResults(model)
print("C = ", model.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))

C =  0.01 # of support vectors:  228
 F1 score: Train: 0.732, Test: 0.766


With RBF kernel there are no problems with high $C$, we will check some values up to 10

In [101]:
grid_params = {
    'C': [0.01, 0.1, 1, 1.5, 2, 2.5, 3, 4, 8, 10]
}
model = svm.SVC(kernel=kernel_)
grid = GridSearchCV(model, grid_params, cv = 5, scoring=my_f1_scorer)
grid.fit(X, y)
print(grid.best_params_)


{'C': 0.01}


The optimal parameter is on the edge

In [103]:
model = svm.SVC(C=grid.best_params_["C"], kernel=kernel_)
model.fit(X_train, y_train)
storeResults(model, comment="bestGrid")
print("C = ", model.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))

C =  0.01 # of support vectors:  228
 F1 score: Train: 0.732, Test: 0.766


For this model optimal accuracy is better then first linear, but smaller then optimal linear

### (g)

Let us consider polynomial with `degree=2`

In [105]:
kernel_ = "poly"
model = svm.SVC(C=0.01, kernel=kernel_, degree=2)
modelP = model
modelP.fit(X_train, y_train)
storeResults(modelP)
print("C = ", modelP.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))
del model

C =  0.01 # of support vectors:  228
 F1 score: Train: 0.732, Test: 0.766


Surprisingly, results are exactly the same as in the case of RBF kernel

Again no problems with high C

In [107]:
grid_params = {
    'C': [0.01, 0.1, 1, 1.5, 2, 2.5, 3, 4, 8, 10]
}
model = svm.SVC(kernel=kernel_, degree= 2)
grid = GridSearchCV(model, grid_params, cv = 5, scoring=my_f1_scorer)
grid.fit(X, y)
print(grid.best_params_)


{'C': 0.01}


As you can see, C=0.01 was selected to give optimal score

In [108]:
model = svm.SVC(C=grid.best_params_["C"], kernel=kernel_, degree = 2)
model.fit(X_train, y_train)
storeResults(model, comment="bestGrid")
print("C = ", model.get_params()["C"], "# of support vectors: ", model.support_vectors_.shape[0])
print(" F1 score: Train: {:.3f}, Test: {:.3f}".format( 
    f1_score(y_train, model.predict(X_train), pos_label="CH"), 
    f1_score(y_test, model.predict(X_test), pos_label="CH")
    ))

C =  0.01 # of support vectors:  228
 F1 score: Train: 0.732, Test: 0.766


### (h)

You can see that in this case SVM with linear kernel and C=1 gives the best result

In [109]:
dfResults.sort_values("test_score", ascending=False)

Unnamed: 0,C,kernel,test_score,train_score,n_support,comment
0,1.0,linear,0.873494,0.840125,142,bestGrid
0,0.01,rbf,0.766384,0.732394,228,
0,0.01,rbf,0.766384,0.732394,228,bestGrid
0,0.01,poly,0.766384,0.732394,228,
0,0.01,poly,0.766384,0.732394,228,bestGrid
0,0.01,linear,0.614057,0.699647,205,


It is interesting to note that both `RBF` and `Poly` kernels give exactly the same result.

It is easy to check that their presictions are the same both on train and test subsets

In [110]:
(
np.all(modelR.predict(X_train) == modelP.predict(X_train)),
np.all(modelR.predict(X_test) == modelP.predict(X_test))
)

(True, True)

The actual models, however, are different. For example, they are using different sets of support vectors

In [111]:
np.all(
    np.sort(modelR.support_) == np.sort(modelP.support_)
)


False