In [8]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Load the data set
cancer = load_breast_cancer()

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=19)

In [9]:
lr = LogisticRegression(solver='liblinear', max_iter=1000)

print(lr.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 1000, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [10]:
parameters = {
    'penalty':['l1', 'l2'],
    'C':[1, 10, 100]
}

In [11]:
clf = GridSearchCV(estimator=lr, param_grid=parameters)

print(clf.get_params())

{'cv': None, 'error_score': nan, 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__l1_ratio': None, 'estimator__max_iter': 1000, 'estimator__multi_class': 'deprecated', 'estimator__n_jobs': None, 'estimator__penalty': 'l2', 'estimator__random_state': None, 'estimator__solver': 'liblinear', 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': LogisticRegression(max_iter=1000, solver='liblinear'), 'n_jobs': None, 'param_grid': {'penalty': ['l1', 'l2'], 'C': [1, 10, 100]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 0}


The “CV” in GridSearchCV is an acronym for cross-validation. It’s best practice in 
Preview: Docs Machine learning is a branch of artificial intelligence that enables systems to learn from data and make predictions or decisions without explicit programming.
machine learning to go beyond the usual train-test split and have a holdout or validation dataset. Specifically, GridSearhCV uses a technique known as k-fold cross-validation. This works as follows.

## Cross-validation in GridSearchCV
GridSearchCV subdivides the training data further into another training and test data set. It fits the model on this new training data and evaluates the model on the new test data. But to make sure that we don’t accidentally have good performance in only one part of our dataset, GridSearchCV will do this process multiple times on different cross-validation splits so that every point in the data gets to be tested on at least once! The number of times this split happens is the “k” in “k-fold”. For instance, in a 10-fold cross-validation, our data would be split into a 90:10 train-test split 10 times and GridSearchCV would evaluate the model on each fold.

Evaluating GridSearchCV results
After fitting a GridSearchCV model we can find out the results using the following attributes of the clf argument:

- .best_estimator_ gives us the best estimator
- .best_score_ gives us the mean cross-validated score corresponding to the best estimator
- .best_params_ gives us the set of hyperparameters that correspond to the best estimator
Additionally, the .cv_results_ attribute gives us the scores for each hyperparamter combination in the grid. We’re now ready to evaluate the grid search we set up earlier and we’ve preloaded the code from the previous exercise in the Setup cell.

In [12]:
clf.fit(X_train, y_train)
best_model = clf.best_estimator_


print(best_model)
print(clf.best_params_)

LogisticRegression(C=10, max_iter=1000, penalty='l1', solver='liblinear')
{'C': 10, 'penalty': 'l1'}


In [None]:
from sklearn.metrics import accuracy_score

# Store the best score from the grid search
best_score = clf.best_score_

# Get the best estimator and evaluate on the test set
best_model = clf.best_estimator_
test_predictions = best_model.predict(X_test)

# Calculate accuracy
test_score = accuracy_score(y_test, test_predictions)


print(best_score)
print(test_score)

In [19]:
import pandas as pd
hyperparameter_grid = pd.DataFrame(clf.cv_results_['params'])
grid_scores = pd.DataFrame(clf.cv_results_['mean_test_score'])
# grid_scores['score'] = grid_scores['0']

df = pd.concat([hyperparameter_grid, grid_scores], axis = 1)
print(df)
grid_scores.head()

     C penalty         0
0    1      l1  0.957702
1    1      l2  0.952996
2   10      l1  0.967114
3   10      l2  0.957702
4  100      l1  0.955321
5  100      l2  0.960027


Unnamed: 0,0
0,0.957702
1,0.952996
2,0.967114
3,0.957702
4,0.955321
