In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
df = pd.read_csv('data/titanic.csv')

In [4]:
X_train = df[['Pclass', 'SibSp', 'Parch', 'Fare']]
Y_train = df['Survived']

### GridSearchCV for hyperparameter tuning

In [5]:
# Set up possible values of parameters to optimize over
p_grid = {"n_neighbors": [5,20,50],
        "weights": ['uniform', 'distance']}   # 6 total combinations

# Define two stratkfoldCV splits
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)
outer_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Define target model and tune hyperparam
knn = KNeighborsClassifier()
clf = GridSearchCV(estimator=knn, param_grid=p_grid, cv=inner_cv)   # grid search CV setup
# find best hyperparam for kNN using inner_cv (5 splits)
gscv = clf.fit(X_train, Y_train)
print(gscv.best_estimator_)
print(gscv.best_params_)
print('\n')
print("------------ All parameter combinations tested ------------")
print(gscv.cv_results_['params'])
print('\n')
print("------------ Results for each combination (6) given split 1 ------------")
print(gscv.cv_results_['split1_test_score'])
print('\n')
print("------------ Average result for each combination ------------")
print("------------ (mean of score for all combinations over each split) ------------")
print(gscv.cv_results_['mean_test_score'])
print('\n')
print("We can see that the 6-th combination is the best because the highest accuracy is the last element of the mean_test_score array")

KNeighborsClassifier(n_neighbors=50, weights='distance')
{'n_neighbors': 50, 'weights': 'distance'}


------------ All parameter combinations tested ------------
[{'n_neighbors': 5, 'weights': 'uniform'}, {'n_neighbors': 5, 'weights': 'distance'}, {'n_neighbors': 20, 'weights': 'uniform'}, {'n_neighbors': 20, 'weights': 'distance'}, {'n_neighbors': 50, 'weights': 'uniform'}, {'n_neighbors': 50, 'weights': 'distance'}]


------------ Results for each combination (6) given split 1 ------------
[0.67977528 0.66853933 0.69662921 0.71348315 0.69662921 0.71348315]


------------ Average result for each combination ------------
------------ (mean of score for all combinations over each split) ------------
[0.67113803 0.66552633 0.68240537 0.68464629 0.67791099 0.68576361]


We can see that the 6-th combination is the best because the highest accuracy is the last element of the mean_test_score array


### RandomizedSearchCV

In [6]:
from scipy.stats import uniform
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
p_distrib = {'C': uniform(loc=0, scale=3),
            'penalty': ['l2'], 
            'fit_intercept': [True, False]}

clf = RandomizedSearchCV(estimator=logistic, 
                        param_distributions=p_distrib, # parameters options and/or distributions
                        cv=inner_cv, # cv splits to use
                        n_iter=8)   # number of random parameters combinations
# find best hyperparam for kNN using inner_cv (5 splits)
Rgscv = clf.fit(X_train, Y_train)
df = pd.DataFrame(Rgscv.cv_results_)
df1 = df["params"].apply(pd.Series)
df = pd.merge(df.drop("params", axis=1), df1, left_index=True, right_index=True)


print(f"Best parameters: {Rgscv.best_params_}")
print("\n")
print(df[['C', 'fit_intercept', 'penalty', 'mean_test_score']])

Best parameters: {'C': 1.9793618166780922, 'fit_intercept': True, 'penalty': 'l2'}


          C  fit_intercept penalty  mean_test_score
0  0.763540          False      l2         0.674559
1  1.979362           True      l2         0.683542
2  2.247328          False      l2         0.674559
3  0.966326          False      l2         0.674559
4  2.086444           True      l2         0.683542
5  2.529099          False      l2         0.674559
6  1.210238           True      l2         0.683542
7  2.514719           True      l2         0.683542


In [7]:
# DOCS EXAMPLE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

p_grid = {"C": [1, 10, 100], "gamma": [0.01, 0.1]}
svm = SVC(kernel="rbf")

inner_cv = KFold(n_splits=5, shuffle=True, random_state=123)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=123)

# Non_nested parameter search and scoring (only hyper tuning)
clf1 = GridSearchCV(estimator=svm, param_grid=p_grid, cv=outer_cv) 
clf1.fit(X_train, Y_train)
non_nested_score = clf.best_score_

# Nested CV with parameter optimization
clf2 = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)       # hyper tuning, score is the one used to select the best model
nested_score = cross_val_score(clf2, X=X_train, y=Y_train, cv=outer_cv)  # CV score for the best combination, obtained from a different random split of the dataset
nested_score = nested_score.mean()

print(f"Only hyperparameters tuning: final score for best combination of param = {non_nested_score}")
print(f"Nested CV: final score for best combination of param = {nested_score}")

Only hyperparameters tuning: final score for best combination of param = 0.683541522817149
Nested CV: final score for best combination of param = 0.693567446370137
