In [1]:
# k-fold cross validation

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline

In [3]:
# Load the Social Network Ads dataset from a path relative to this notebook
# and preview the first rows to check the columns.
df = pd.read_csv('../../archive/Social_Network_Ads.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
# Target: the 'Purchased' column
y = df['Purchased']

# Encode Gender as a single 0/1 indicator column. Only the 'Female' dummy is
# kept; the other dummy column would carry the same information.
# The explicit astype(int) guarantees a numeric 0/1 column whatever dtype
# get_dummies produces (the previous pd.to_numeric call discarded its return
# value, so it never changed anything).
gender_dummies = pd.get_dummies(df['Gender'])['Female'].astype(int)

# Features: everything except the identifier, the target and the raw Gender
# text column, plus the encoded indicator. Built in one expression with no
# in-place mutation so the cell can be re-run safely.
Xs = pd.concat(
    [df.drop(['User ID', 'Purchased', 'Gender'], axis=1), gender_dummies],
    axis=1,
)
Xs.head()

Unnamed: 0,Age,EstimatedSalary,Female
0,19.0,19000.0,0
1,35.0,20000.0,0
2,26.0,43000.0,1
3,27.0,57000.0,1
4,19.0,76000.0,0


In [5]:
# Split the data set into training and testing portions.
# test_size=0.25 holds out a quarter of the rows for testing;
# random_state=0 fixes the shuffle so the split is reproducible.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.25, random_state=0)

In [6]:
# Feature scaling: standardise the features using statistics learned from the
# training set only.

from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)

# Only transform (not fit_transform) here: the scaler was already fit on the
# training data above, and the test set must be scaled with those same
# training statistics rather than its own.
X_test = sc_X.transform(X_test)

In [7]:
# Fit an RBF-kernel support vector classifier on the scaled training set.
# No other hyperparameters are passed, so the library defaults apply.

from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict labels for the scaled test set with the fitted classifier

y_pred = classifier.predict(X_test)

In [9]:
# Compare predictions against the true test labels with a confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Unpacking into four counts assumes a binary target (2x2 matrix);
# flattened row by row, the order is tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

In [10]:
# 10-fold cross-validation of the classifier on the training set;
# n_jobs=-1 lets scikit-learn run the folds in parallel.
# NOTE(review): X_train was standardised on the full training set before this
# call, so each fold's validation rows already influenced the scaler.
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator=classifier, X = X_train, y = y_train, cv=10, n_jobs=-1)
acc

array([0.80645161, 0.96666667, 0.83333333, 0.9       , 0.9       ,
       0.8       , 0.93333333, 0.93333333, 0.96666667, 0.96551724])

In [11]:
acc.mean()
# The individual fold scores above span a wide range (roughly 0.80 to 0.97),
# so any single score is a noisy estimate.
# Averaging the folds gives a steadier picture of how the model is expected
# to perform on data it has not seen; note these folds come from the
# training set, not the held-out test set.

0.9005302187615868

- You should consider two things:
    - What type of problem are you trying to solve: clustering, regression, or classification?
    - Is the problem linear or non-linear? This is hard to tell by inspection when you have a lot of data.
        - Grid search helps answer this by comparing linear and non-linear kernels.

In [12]:
# Applying grid search to find the best model and hyperparameters
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C x gamma)
parameters = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], kernel=['rbf'], gamma=[0.5, 0.1, 0.01, 0.001, 0.0001]),
]

# 10-fold cross-validation scored by accuracy. fit() returns the fitted
# search object, so construction and fitting are chained into one statement.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

In [13]:
# Display the fitted search object to review its configuration
grid_search

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [14]:
# Hyperparameter combination that scored best in the grid search.
# The value in the trailing comment is what an earlier run reported.
best_parameters = grid_search.best_params_ # {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}

In [15]:
# Cross-validated score achieved by the best parameter combination.
# The value in the trailing comment is what an earlier run reported.
best_accuracy = grid_search.best_score_ # 90%

In [16]:
# Second pass: repeat the grid search with the RBF kernel only and a
# denser set of gamma values.

parameters = [
    dict(
        C=[1, 10, 100, 1000],
        kernel=['rbf'],
        gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    ),
]

# Same search settings as before; fit() returns the fitted search object
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Winning parameter combination and the score it achieved
best_parameters, best_accuracy = grid_search.best_params_, grid_search.best_score_

print(best_parameters, best_accuracy)

{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'} 0.9033333333333333


In [29]:
# NOTE(review): leftover scratch cell. The result is not assigned, nothing
# visible in this notebook uses it, and its execution count (29) is out of
# sequence with the cells above. Consider removing it before sharing.
np.linspace(50, 200, num=10)

array([ 50.        ,  66.66666667,  83.33333333, 100.        ,
       116.66666667, 133.33333333, 150.        , 166.66666667,
       183.33333333, 200.        ])