# Hyperparameter tuning
- model parameters are learned from training
- hyperparameters are set prior to training
- hyperparameters should be tuned to obtain the best model score

In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Load data from file

In [5]:
# Read the dataset from 'wbc.csv', using its 'id' column as the row index.
df = pd.read_csv(filepath_or_buffer='wbc.csv', index_col='id')

# Preview the first five rows; the bare last expression becomes the cell output.
df.head(5)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Split data into training and test set

In [8]:
# Target vector: the 'diagnosis' column; feature matrix: every other column.
y = df['diagnosis']
X = df.drop(columns='diagnosis')

# Hold out 20% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Gridsearch
- manually set grid of discrete hyperparameter values
- set a metric for scoring model performance
- search exhaustively through the grid
- for each combination of hyperparameters, evaluate the model's cross-validation (CV) score
- optimal hyperparameters are those for which the model achieves the best CV score

In [16]:
from sklearn.model_selection import GridSearchCV

# Instantiate the model.
# The seed is fixed because the grid searches max_features < 1.0: the tree
# then considers a random subset of features at each split, so an unseeded
# estimator can yield different best hyperparameters and scores on every run.
dt = DecisionTreeClassifier(random_state=42)

# Grid of hyperparameters: keys are parameter names of the estimator, values
# are the candidate settings to try. Float values for min_samples_leaf and
# max_features are interpreted by scikit-learn as fractions (of the samples
# and of the features, respectively).
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8]
}

# Exhaustive search over all 4 * 3 * 4 = 48 combinations, each scored by
# 10-fold cross-validated accuracy; n_jobs=-1 uses all available CPU cores.
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=params_dt,
    scoring='accuracy',
    cv=10,
    n_jobs=-1)
grid_dt.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print('Best hyperparameters:\n', grid_dt.best_params_)
print('Best CV score: {:.3f}'.format(grid_dt.best_score_))

Best hyperparameters:
 {'max_depth': 3, 'max_features': 0.6, 'min_samples_leaf': 0.04}
Best CV score: 0.936


## Evaluate grid model

In [17]:
# Take the estimator that the grid search selected as best.
best_model = grid_dt.best_estimator_

# Score it on the held-out test set, which played no part in the search.
test_acc = best_model.score(X=X_test, y=y_test)
print('Test accuracy of best model: {:.3f}'.format(test_acc))

Test accuracy of best model: 0.939
