### In this notebook you will find:
##### - K Fold Cross Validation ( KNN Algorithm )
##### - Grid Search Cross Validation ( KNN Algorithm)
##### - Grid Search Cross Validation ( Logistic Regression)

In [19]:
import pandas as pd
import numpy as np


# Install a global filter that ignores every FutureWarning so that cell outputs stay readable.
# NOTE(review): this also hides deprecation notices about upcoming behavior changes in the
# libraries used by later cells - consider removing it when upgrading dependencies.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## K Fold Cross Validation (KNN)

### Let's use the iris dataset

In [20]:
# Load the iris dataset that ships with scikit-learn. Later cells read its
# .data, .target, .feature_names and .target_names attributes.
from sklearn.datasets import load_iris
iris_data = load_iris()

### First, we have to convert the dataset into a DataFrame

In [21]:
# Feature matrix as a DataFrame: one row per flower, one column per measurement.
iris = pd.DataFrame(iris_data.data, columns=iris_data.feature_names, index=range(150))

# Attach the numeric label and, next to it, the matching species name.
iris['class'] = iris_data.target
class_lookup = dict(enumerate(iris_data.target_names))  # {0: name0, 1: name1, 2: name2}
iris['class_name'] = iris['class'].map(class_lookup)

# Preview the first rows.
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class,class_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [22]:
# Features: the first four columns. Target: the fifth column (numeric class label).
x = iris.iloc[:, :4]
y = iris.iloc[:, 4]

shape_report = 'x_shape: {}\ny_shape: {}'.format(x.shape, y.shape)
print(shape_report)

x_shape: (150, 4)
y_shape: (150,)


The shape of y above is (150,), which is the format we need, so there is no need to reshape it.

### Normalize features first
This type of (min-max) normalization is acceptable since it scales the values into the range 0 to 1. Better normalization techniques exist; you can find them in detail in another repository.

In [23]:
# Min-max scale every feature column to [0, 1] independently.
# The column-wise reduction is requested explicitly (axis=0): np.min/np.max on a
# DataFrame forward axis=None to pandas, and pandas >= 2.0 interprets that as a
# reduction over the whole frame, which would silently turn this into a single
# global min/max shared by all features.
feature_min = x.min(axis=0)
feature_max = x.max(axis=0)
x = (x - feature_min) / (feature_max - feature_min)

### KNN (Let's just assume optimal K = 3)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 35% of the samples for the final test; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.35,
    random_state=42,
)

# KNN classifier with the assumed K = 3 (not fitted yet).
knn_01 = KNeighborsClassifier(n_neighbors=3)

### K Fold CV with cv=10

In [25]:
from sklearn.model_selection import cross_val_score

# 10-fold cross validation on the training split only: one score per fold.
accuracy_list = cross_val_score(knn_01, x_train, y_train, cv=10)

# Summarize the fold scores by their mean and spread.
mean_accuracy = np.mean(accuracy_list)
accuracy_std = np.std(accuracy_list)
print('average accuracy: {}'.format(mean_accuracy))
print('stdev: {}'.format(accuracy_std))

average accuracy: 0.9393939393939394
stdev: 0.09968832154727536


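#### Results above show that our model reaches a mean accuracy of about 94% across the 10 folds; the standard deviation of about 0.10 reflects how small each fold is (roughly 10 samples).
#### Next, we can see the real score with the initial test data.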

In [26]:
# Train on the whole training split, then score on the held-out test split.
knn_01.fit(x_train, y_train)
test_accuracy = knn_01.score(x_test, y_test)
print('test accuracy: {}'.format(test_accuracy))

test accuracy: 0.9811320754716981


Note: Such accuracy makes sense for the iris data and its small test set size. Normally, getting almost 100% prediction accuracy is rare with other real-life data.

# Grid Search Cross Validation  (KNN)

In [27]:
from sklearn.model_selection import GridSearchCV

### Trying 49 different K values (K = 1 to 49) with cv=10

In [28]:
# Candidate neighbourhood sizes: every integer K from 1 up to and including 49.
grid_01 = {'n_neighbors': np.arange(1, 50)}
knn_02 = KNeighborsClassifier()

# Exhaustive search over the grid, evaluating each K with 10-fold cross validation
# on the full (normalized) data set.
knn_cv = GridSearchCV(estimator=knn_02, param_grid=grid_01, cv=10)
knn_cv.fit(x, y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

### The results are better, and K=30 seems to be the optimal K value

In [29]:
# Report the winning K and the mean cross-validated score it achieved.
best_knn_params = knn_cv.best_params_
best_knn_score = knn_cv.best_score_
print('tuned hyperparameter K:', best_knn_params)
print('Best Accuracy according to {}: {}'.format(best_knn_params, best_knn_score))

tuned hyperparameter K: {'n_neighbors': 30}
Best Accuracy according to {'n_neighbors': 30}: 0.9733333333333334


# Grid Search Cross Validation (Logistic Regression)

To apply Grid Search CV to logistic regression, we first reduce the number of classes from 3 to 2. The first 100 rows of the data belong to class 0 and class 1, and the last 50 rows belong to class 2, so we simply cut the last 50 rows. Logistic regression will perform better with binary classification problems.

In [30]:
# Keep only the first 100 samples (classes 0 and 1) to obtain a binary problem.
x = x.iloc[:100]
y = y.iloc[:100]

# Normalization is already done. No need to repeat.

In [31]:
# Train/test split of the binary subset: 30% of the rows are held out,
# with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.3,
    random_state=42,
)

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Search space: 7 values of C (1e-3 ... 1e3, the regularization parameter) combined
# with both penalty types (L1 = Lasso, L2 = Ridge) -> 14 candidates.
grid_02 = {'C':np.logspace(-3,3,7), 'penalty':['l1','l2']}

# The solver is pinned to 'liblinear' because the grid contains penalty='l1':
# liblinear supports both L1 and L2, whereas the default solver of current
# scikit-learn releases (lbfgs) rejects L1, so half of the grid would fail to fit.
# This is also the solver the original run effectively used (solver='warn').
logreg = LogisticRegression(solver='liblinear')
logreg_cv = GridSearchCV(logreg, grid_02, cv=10)

# NOTE(review): the search is fitted on all 100 rows, which include the rows held
# out as x_test above; fit on (x_train, y_train) instead if the test score of the
# tuned model should be an unbiased estimate.
logreg_cv.fit(x,y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
# Report the best C / penalty combination and its mean cross-validated score.
best_logreg_params = logreg_cv.best_params_
best_logreg_score = logreg_cv.best_score_
print('Hyperparameters tuned: ', best_logreg_params)
print('Accuracy: ', best_logreg_score)

Hyperparameters tuned:  {'C': 0.01, 'penalty': 'l2'}
Accuracy:  1.0


In [34]:
# Refit a logistic regression with the tuned values (C=0.01, L2 penalty) on the
# training split and check it against the held-out test split.
logreg2 = LogisticRegression(penalty='l2', C=0.01, solver='lbfgs')
logreg2.fit(x_train, y_train)
test_score = logreg2.score(x_test, y_test)
print('score: ', test_score)

score:  1.0


More logistic regression examples and other classification algorithms (also with grid search CV) can be found in other notebooks.