<a href="https://colab.research.google.com/github/anna985/pml-1/blob/master/course/Problem029_MLFlow/029_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://github.com/rawata/pml/blob/master/img/mldlc2.png?raw=1" width="900">

## KFold and StratifiedKFold train test split

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import collections

# Toy dataset: nine 2-D samples, three samples for each of the labels 0, 1, 2.
X = np.array(
    [[1, 2], [3, 4], [4, 5], [4, 4], [5, 6], [6, 7], [8, 9], [9, 10], [11, 12]]
)
Y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

# Plain 3-fold split: samples are shuffled (seeded) and dealt into folds
# without looking at the labels.
print("====K-Fold Split====")
kf = KFold(n_splits=3, shuffle=True, random_state=101)
for train_indices, test_indices in kf.split(X, Y):
    print(f"train indices: {train_indices} test indices: {test_indices}")
    # Materialise the fold by fancy-indexing with the returned index arrays.
    X_train = X[train_indices]
    X_test = X[test_indices]
    Y_train = Y[train_indices]
    Y_test = Y[test_indices]


# StratifiedKFold distributes the target labels within each fold in the same
# ratio in which they appear in the main dataset.

# TBD: Use StratifiedKFold to split X and Y into 3 fold train test set and verify that all the three target labels (0, 1, 2) are present in each fold
from sklearn.model_selection import StratifiedKFold

print("\n====Stratified K-Fold Split====")
# shuffle=True needs a fixed random_state to give the same folds on every run;
# 101 matches the seed used for the plain KFold split.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=101)
for train_indices, test_indices in skf.split(X, Y):
    print("train indices:", train_indices, "test indices:", test_indices)
    X_train, X_test = X[train_indices], X[test_indices]
    Y_train, Y_test = Y[train_indices], Y[test_indices]
    # Fraction of each label inside the train part and the test part of the fold.
    train_label_ratio = {k: v / len(Y_train) for (k, v) in collections.Counter(Y_train).items()}
    print(f'Label ratio in train = {train_label_ratio}')
    test_label_ratio = {k: v / len(Y_test) for (k, v) in collections.Counter(Y_test).items()}
    print(f'Label ratio in test = {test_label_ratio}')
    # Equal dicts mean every label occurs in both parts, in the same proportion.
    print(train_label_ratio == test_label_ratio)


## GridSearchCV (search for the best set of hyperparameters for a given model)

In [None]:
## Load Boston housing dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

try:
    # Only available in scikit-learn < 1.2.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

if load_boston is not None:
    X, Y = load_boston(return_X_y=True)
else:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same feature
    # matrix / target vector from the original StatLib file, following the
    # recipe given in scikit-learn's deprecation notice (requires network).
    import numpy as np
    import pandas as pd

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record spans two physical lines in the raw file: the even rows
    # hold the first 11 features, the odd rows hold 2 more features + target.
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    Y = raw_df.values[1::2, 2]

# Hold out 25% of the samples for the final evaluation (seeded split).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=101)

# Untuned k-NN regressor used as a reference point for the grid searches.
model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train, Y_train)
Y_hat = model.predict(X_test)

# Some baseline performance
print(mean_squared_error(Y_test, Y_hat))

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

def grid_summary(grid_result):
    """Print a report for a fitted hyper-parameter search.

    The first line (preceded by a blank line) shows the best score and the
    parameter set that achieved it; each following line shows
    ``mean,std with: params`` for one evaluated candidate.

    ``grid_result`` must expose ``best_score_``, ``best_params_`` and a
    ``cv_results_`` mapping with the keys ``mean_test_score``,
    ``std_test_score`` and ``params``.
    """
    print(f"\nBest: {grid_result.best_score_} using {grid_result.best_params_}")
    cv = grid_result.cv_results_
    # The three sequences are parallel: one entry per candidate.
    per_candidate = zip(cv['mean_test_score'], cv['std_test_score'], cv['params'])
    for avg, spread, candidate in per_candidate:
        print(f"{avg},{spread} with: {candidate}")

## TBD Using GridSearchCV check which value of param n_neighbors [2,3,4,5,6,7,8] gives the best results
## TBD Using GridSearchCV check which value of param combination n_neighbors [2,3,4,5,6,7,8], p [1, 2] gives the best result
from sklearn.neighbors import KNeighborsRegressor

n_neighbors = [2, 3, 4, 5, 6, 7, 8]
p = [1, 2]

# The first search space is a subset of the second one: it simply has one
# parameter less to vary. Both are run with the same search settings.
search_spaces = (
    {'n_neighbors': n_neighbors},
    {'n_neighbors': n_neighbors, 'p': p},
)
for param_grid in search_spaces:
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    res = grid.fit(X_train, Y_train)
    grid_summary(res)

# `grid` now refers to the last (n_neighbors x p) search; score its
# predictions on the held-out test set.
Y_hat = grid.predict(X_test)
print(mean_squared_error(Y_test, Y_hat))

## Gridsearch across different algorithms

## TBD Create an ML pipeline that selects the best model-param combination among the given set of models and params
* LinearRegression, No params
* KNeighborsRegressor, params: {n_neighbors : [4,5,6], p: [1,2]}
* XGBoost, params : {n_estimators: [100,200,300], max_depth: [3,4,5,6,7,8], subsample: [0.9, 1.0, 1.1] }

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import  Pipeline
from sklearn.linear_model import LinearRegression


# Single-step pipeline; the 'model' step is only a placeholder that the grid
# search replaces with each candidate estimator.
pipe = Pipeline([('model', LinearRegression())])

# One dict per algorithm: the 'model' key selects the estimator for the step,
# and the 'model__<name>' keys set hyper-parameters on that estimator.
param_grid = [
    {'model': [LinearRegression()]},
    {'model': [KNeighborsRegressor()], 'model__n_neighbors': [2, 3, 4], 'model__p': [1, 2]},
    {
        # Seeded (same seed as the other splits in this file) so that the
        # ranking of candidates and the final score are reproducible.
        'model': [GradientBoostingRegressor(random_state=101)],
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [2, 3, 4, 5, 6, 7],
    },
]

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

res = grid.fit(X_train, Y_train)
grid_summary(res)

# Evaluate the winning model/parameter combination on the held-out test set.
Y_hat = grid.predict(X_test)
print(mean_squared_error(Y_test, Y_hat))

