<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Practice" data-toc-modified-id="Practice-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Practice</a></span></li><li><span><a href="#Grid-Search-Cross-Validation" data-toc-modified-id="Grid-Search-Cross-Validation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Grid Search Cross Validation</a></span></li><li><span><a href="#Random-Search-Cross-Validation" data-toc-modified-id="Random-Search-Cross-Validation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Random Search Cross Validation</a></span></li><li><span><a href="#Excercise" data-toc-modified-id="Excercise-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Excercise</a></span></li></ul></div>

In [1]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC
import pandas as pd

### Practice

In [2]:
# Load the iris dataset and put the four measurements into a DataFrame.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Add a readable species column by translating each integer class code
# into its name from iris.target_names.
df['flower'] = pd.Series(iris.target).apply(lambda code: iris.target_names[code])

# Show a few rows around position 50, where the species label changes.
df.iloc[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing and train on the remaining 70%.
# random_state pins the shuffle: without it every run produces a different
# split, so the score reported by the next cell could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, train_size=0.7, random_state=42
)

In [6]:
# Train an RBF-kernel SVM on the training split, then score it on the
# held-out test split.
model = SVC(C=30, kernel='rbf', gamma='auto')
model.fit(X=X_train, y=y_train)
model.score(X=X_test, y=y_test)

0.9777777777777777

In [7]:
from sklearn.model_selection import cross_val_score

In [8]:
# 5-fold cross-validation of a linear-kernel SVM with C=10 on the full iris
# data; the result holds one score per fold.
cross_val_score(
    estimator=SVC(C=10, kernel='linear', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [9]:
# Same 5-fold evaluation, this time with the RBF kernel at C=10.
cross_val_score(
    estimator=SVC(C=10, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [10]:
# RBF kernel again, with C raised to 20, to see how the fold scores change.
cross_val_score(
    estimator=SVC(C=20, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [11]:
# Manual grid search: evaluate every (kernel, C) pair with 5-fold
# cross-validation and keep the average fold score under the key
# '<kernel>_<C>'.
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target, cv=5)
        # cross_val_score returns a NumPy array, so use its own .mean().
        # The previous `np.average(...)` relied on `np`, which this notebook
        # never imports, and raised NameError on a fresh kernel.
        avg_scores[kval + '_' + str(cval)] = cv_scores.mean()
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

### Grid Search Cross Validation

In [13]:
from sklearn.model_selection import GridSearchCV

# Let GridSearchCV do what the manual loop did: try every combination of
# C and kernel, scoring each one with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid=dict(C=[1, 10, 20], kernel=['linear', 'rbf']),
    cv=5,
    return_train_score=False,
)
clf.fit(X=iris.data, y=iris.target)

# Raw results: timings, parameters and per-fold scores for each candidate.
clf.cv_results_

{'mean_fit_time': array([0.0004003 , 0.00060034, 0.00020037, 0.0004003 , 0.00020018,
        0.        ]),
 'std_fit_time': array([0.00049027, 0.00049017, 0.00040073, 0.00049027, 0.00040035,
        0.        ]),
 'mean_score_time': array([0.00020022, 0.00020027, 0.00020013, 0.00020022, 0.00040026,
        0.00020022]),
 'std_score_time': array([0.00040045, 0.00040054, 0.00040026, 0.00040045, 0.00049021,
        0.00040045]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'}],


In [15]:
# The raw cv_results_ dict is hard to read; view it as a table with one row
# per parameter combination.
df_clf = pd.DataFrame(data=clf.cv_results_)
df_clf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0004,0.00049,0.0002,0.0004,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.0006,0.00049,0.0002,0.000401,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.0002,0.000401,0.0002,0.0004,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.0004,0.00049,0.0002,0.0004,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.0002,0.0004,0.0004,0.00049,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.0,0.0,0.0002,0.0004,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [16]:
# Keep only the searched parameters and the mean cross-validated score.
df_clf.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.98
2,10,linear,0.973333
3,10,rbf,0.98
4,20,linear,0.966667
5,20,rbf,0.966667


In [17]:
# Highest mean cross-validated score found by the grid search
# (matches the top mean_test_score in the table above).
clf.best_score_

0.9800000000000001

In [18]:
# Parameter combination that achieved best_score_.
# NOTE: in the results table several candidates share rank 1 with the same
# mean score; only one of the tied combinations is reported here.
clf.best_params_

{'C': 1, 'kernel': 'linear'}

### Random Search Cross Validation

In [32]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search evaluates only `n_iter` randomly drawn parameter
# combinations instead of the whole grid, which is cheaper when the grid is
# large. Each sampled candidate is still scored with 5-fold cross-validation.
rs = RandomizedSearchCV(SVC(gamma='auto'), {
    'C' : [1, 10, 20],
    'kernel' : ['linear', 'rbf']},
    cv=5, return_train_score=False,
    n_iter = 2,
    # Pin the sampling; otherwise a different pair of candidates is drawn on
    # every run and the table below cannot be reproduced.
    random_state=42
)
rs.fit(iris.data, iris.target)

# One row per sampled candidate with its mean cross-validated score.
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.966667
1,10,rbf,0.98


In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [38]:
# Candidate models to compare on iris. Each entry pairs an unfitted
# estimator ('model') with the hyper-parameter grid ('params') that
# GridSearchCV will search for it.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the comparison is reproducible: an unseeded forest is
        # built differently on every run, which can change its best score
        # and even the winning n_estimators.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [39]:
# Grid-search every candidate model on iris with 5-fold cross-validation
# and collect one summary record per model.
scores = []

for name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X=iris.data, y=iris.target)
    scores.append(dict(
        model=name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

In [41]:
# Summarise the comparison: one row per model with its best score and the
# parameters that produced it.
df = pd.DataFrame(
    data=scores,
    columns=['model', 'best_score', 'best_params'],
)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


### Excercise

For the digits dataset, use the following classifiers and find out which one gives the best performance. Also find the optimal parameters for that classifier.

1. Support Vector Machine
2. Random Forest
3. Logistic Regression
4. Gaussian Naive Bayes
5. Multinomial Naive Bayes
6. Decision Tree

In [44]:
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [49]:
# Load the digits dataset used for the exercise; the cells below read its
# features from digits.data and its labels from digits.target.
digits = load_digits()

In [57]:
# Candidate models for the digits exercise. Each entry pairs an unfitted
# estimator ('model') with the hyper-parameter grid ('params') that
# GridSearchCV will search for it.
model_params = {
    'svm': {
        'model': SVC(),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear', 'poly'],
            'gamma': ['scale', 'auto']
        }
    },
    'random_forest': {
        # Seeded so the reported score and best parameters are reproducible.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
            'criterion': ['gini', 'entropy']
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-9, 1e-10, 1e-11]
        }
    },
    'multinomial_nb': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [1, 5, 10],
        }
    },
    'decision_tree': {
        # Seeded as well: the grid includes splitter='random', so an unseeded
        # tree gives a different result on every run.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random']
        }
    }
}

In [58]:
# Grid-search every candidate model on the digits data with 5-fold
# cross-validation and collect one summary record per model.
scores = []

for name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X=digits.data, y=digits.target)
    scores.append(dict(
        model=name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

In [59]:
# Comparison table for the digits exercise: best score and best parameters
# for each model.
df = pd.DataFrame(
    data=scores,
    columns=['model', 'best_score', 'best_params'],
)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.97385,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
1,random_forest,0.906543,"{'criterion': 'entropy', 'n_estimators': 10}"
2,logistic_regression,0.922114,{'C': 1}
3,gaussian_nb,0.806928,{'var_smoothing': 1e-09}
4,multinomial_nb,0.874246,{'alpha': 10}
5,decision_tree,0.811941,"{'criterion': 'entropy', 'splitter': 'random'}"


Our best model, best score, best params:
>svm 	0.973850 	{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}