# Cross Validation
### https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [0]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

## Datasets

In [0]:
# Heart disease dataset used for the classification examples below
data = pd.read_csv("heart.csv")

# Features: every column except the label column 'target'
X_heart = data.loc[:, data.columns != 'target']
# Labels: the 'target' column on its own
y_heart = data['target']

print(X_heart.shape)
print(y_heart.shape)

(303, 13)
(303,)


In [0]:
# Preview the first five rows of the feature matrix
X_heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [0]:
# Preview the first five labels
y_heart.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

### Application of SVMs to normalized data with feature preprocessing using minmax scaling

In [0]:
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# Naive baseline: min-max scale the whole dataset once, then cross-validate
# an SVC on the scaled features.
# NOTE: the scaler is fit on all rows here, so the held-out part of every
# fold influences the scaling (data leakage). The pipeline-based cell later
# in this notebook shows the leak-free way.
svm_clf = SVC()
scaler = MinMaxScaler()
X_heart_scaled = scaler.fit_transform(X_heart)

# Ask for 3 folds explicitly: the default `cv` of cross_val_score changed
# from 3 to 5 in scikit-learn 0.22, which would silently contradict the
# "(3-fold)" labels printed below.
cv_scores = cross_val_score(svm_clf, X_heart_scaled, y_heart, cv=3)
print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))


Cross-validation scores (3-fold): [0.83168317 0.85148515 0.79207921]
Mean cross-validation score (3-fold): 0.825


In [0]:
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified 3-way splitter; the fixed seed keeps the folds
# identical between runs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=41)

svm_clf = SVC()

# Same whole-dataset min-max scaling as the previous cell (fit, then apply).
scaler = MinMaxScaler().fit(X_heart)
X_heart_scaled = scaler.transform(X_heart)

# Score the SVC on each of the three stratified folds.
cv_scores = cross_val_score(svm_clf, X_heart_scaled, y_heart, cv=skf)
print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'.format(cv_scores.mean()))

Cross-validation scores (3-fold): [0.82178218 0.83168317 0.78217822]
Mean cross-validation score (3-fold): 0.812


### A note on performing cross-validation for more advanced scenarios.

The proper way to do cross-validation when the data needs scaling is *not* to scale the entire dataset with a single transform: the scaling parameters would then be computed from all rows, including those later held out for testing, so information about the test data indirectly leaks into training (see the lecture on data leakage later in the course).  Instead, the scaler must be fit on the training portion of each cross-validation fold only and then applied to that fold's held-out portion.  The easiest way to do this in scikit-learn is to use *pipelines*.  
http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Scaler and classifier are bundled into one estimator, so during
# cross-validation the scaler is re-fit on the training part of each fold
# and only applied to the held-out part (no leakage from the test rows).
# The step is called "linear_svc", but SVC() is created with its default
# settings here; no kernel is chosen explicitly.
svm_clf = Pipeline([
        ("scaler", MinMaxScaler()),
        ("linear_svc", SVC()),
    ])

# The raw (unscaled) features go in; scaling happens inside the pipeline.
# Ask for 3 folds explicitly: the default `cv` of cross_val_score changed
# from 3 to 5 in scikit-learn 0.22, which would silently contradict the
# "(3-fold)" labels printed below.
cv_scores = cross_val_score(svm_clf, X_heart, y_heart, cv=3)
print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))

Cross-validation scores (3-fold): [0.83168317 0.87128713 0.77227723]
Mean cross-validation score (3-fold): 0.825


# Grid Search
### Optimizing a classifier

In [0]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
clf = SVC()

# Every (gamma, kernel) combination below is evaluated.
# NOTE: per the SVC documentation gamma is the coefficient of the 'rbf',
# 'poly' and 'sigmoid' kernels, so for kernel='linear' the gamma value
# reported in best_params_ has no effect on the model.
grid_values = {
    'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
}

# No `scoring` is passed, so the estimator's own score (accuracy for a
# classifier) is the metric optimized over the grid; 3-fold CV per candidate.
grid_clf_acc = GridSearchCV(estimator=clf, param_grid=grid_values, cv=3)
grid_clf_acc.fit(X_heart, y_heart)

print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)
print('Grid best score (accuracy): ', grid_clf_acc.best_score_)

Grid best parameter (max. accuracy):  {'gamma': 0.001, 'kernel': 'linear'}
Grid best score (accuracy):  0.8382838283828383


In [0]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Min-max scaling followed by an SVC, tuned as a single estimator so the
# scaler is re-fit inside every cross-validation split.
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("linear_svc", SVC()),
])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'.
param_grid = {
    'linear_svc__gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100],
    'linear_svc__kernel': ['linear', 'rbf'],
}

# 3-fold grid search over all gamma/kernel combinations.
search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3)
search.fit(X_heart, y_heart)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)


Best parameter (CV score=0.838):
{'linear_svc__gamma': 0.001, 'linear_svc__kernel': 'linear'}
