### Cross validation and grid search - iris dataset, logistic regression

In [93]:
# need some imports 
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [94]:
# Load iris and estimate logistic-regression accuracy with shuffled 5-fold CV.
iris = load_iris()

# Other values worth trying: C=1, max_iter=100,
# solver in {'newton-cg', 'lbfgs', 'liblinear', 'sag'}.
# NOTE: per the scikit-learn docs, n_jobs is ignored when solver='liblinear'.
logreg = LogisticRegression(C=10, max_iter=200, n_jobs=3, solver='liblinear')

# Seed the fold shuffling directly through random_state. The previous version
# called np.random.seed(79), but numpy is never imported in this notebook, so
# the cell failed with a NameError on a fresh kernel.
# KFold itself does not stratify; shuffle=True (False by default) is what
# mixes the classes across the folds here.
kf = KFold(n_splits=5, shuffle=True, random_state=79)

# Passing an integer instead (e.g. cv=3) would skip the explicit KFold object
# and use stratified folds for a classifier.
scores = cross_val_score(logreg, iris.data, iris.target, cv=kf)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f} +/- {:.2f}".format(scores.mean(), scores.std()))

Cross-validation scores: [ 0.96666667  0.96666667  1.          0.96666667  0.9       ]
Average cross-validation score: 0.96 +/- 0.03


### Grid search
Using the iris dataset, we split the data into training and test sets, run a grid search on the training data, predict the test data using the optimized classifier (best parameters), and evaluate the results.

In [95]:
# Hold out 30% of iris for testing; the remaining 70% is used for the search.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.7,
    random_state=0,
)

# Base logistic-regression classifier whose hyper-parameters will be tuned.
clf = LogisticRegression(solver='liblinear', n_jobs=3)

# Parameter grid: every combination of C and max_iter is evaluated.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 200, 300, 400],
}

# 3-fold cross-validated grid search over the grid above.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)

# Fitting runs the search on the training data; afterwards grid_search can be
# used directly for prediction and scoring (see the calls below).
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score.
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Score on the held-out test set (score() predicts X_test internally).
test_score = grid_search.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_score))

Best parameters: {'C': 100, 'max_iter': 100}
Best cross-validation score: 0.97
Test set score: 0.96


In [96]:
# Label the held-out samples with the tuned model from the grid search.
preds = grid_search.predict(X_test)

# Show the confusion matrix followed by the per-class
# precision / recall / f1 report for the test set.
print(
    confusion_matrix(y_test, preds),
    classification_report(y_test, preds),
    sep="\n",
)

[[16  0  0]
 [ 0 16  2]
 [ 0  0 11]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        16
          1       1.00      0.89      0.94        18
          2       0.85      1.00      0.92        11

avg / total       0.96      0.96      0.96        45



In [97]:
# Display the estimator selected by the grid search, with all its settings.
best_estimator_repr = "Best estimator:\n{}".format(grid_search.best_estimator_)
print(best_estimator_repr)

Best estimator:
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
