# Cross Validation

In [1]:
import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, make_scorer

In [2]:
# Load the "tips" dataset through pydataset and preview its first rows.
tips = data("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Target: the `time` column; features: three columns of the tips data.
y = tips["time"]
X = tips[["tip", "total_bill", "size"]]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [4]:
# Decision tree limited to a depth of 4; reused by the cells below.
# NOTE(review): no random_state is passed. Per the scikit-learn docs, ties between
# equally good splits are broken randomly, so scores may vary between runs —
# consider fixing a seed for reproducibility.
tree = DecisionTreeClassifier(max_depth=4)

In [5]:
# Average of the estimator's default score over 4-fold cross validation
# on the training split.
fold_scores = cross_val_score(tree, X_train, y_train, cv=4)
fold_scores.mean()

0.7024872448979592

In [6]:
# Fit on the full training split so `tree` can make the predictions used below.
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [7]:
# In-sample precision: the tree is evaluated on the same rows it was fit on,
# with "Dinner" treated as the positive class.
actual = y_train
predicted = tree.predict(X_train)

precision_score(y_true=actual, y_pred=predicted, pos_label="Dinner")

0.8571428571428571

In [8]:
# Wrap precision_score (positive class "Dinner") so it can be passed as `scoring`.
precision_scorer = make_scorer(precision_score, pos_label="Dinner")

# decision tree with max depth of 4: mean precision over 4 folds
depth4_precision = cross_val_score(
    estimator=tree,
    X=X_train,
    y=y_train,
    scoring=precision_scorer,
    cv=4,
)
depth4_precision.mean()

0.7371597454477888

In [9]:
# Same precision scorer, this time for a tree with max depth of 3.
# NOTE(review): no `cv` is passed here, so cross_val_score uses its default number
# of folds rather than the cv=4 used in the cells above — confirm whether the
# comparison is meant to use different fold counts.
cross_val_score(DecisionTreeClassifier(max_depth=3), X_train, y_train, scoring=precision_scorer).mean()

0.7396049896049897

---
## Grid Search

In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from sklearn.model_selection import GridSearchCV

# Grid of hyperparameters to search:
#   - keys are names of hyperparameters
#   - values are the candidate values to try for that hyperparameter
# (other models would use other keys, e.g. C for logistic regression or k for KNN)
params = {
    "max_depth": range(1, 11),
    "criterion": ["gini", "entropy"]
}

# cv=4 means four-fold cross validation, i.e. k=4
grid = GridSearchCV(tree, params, cv=4)
grid.fit(X_train, y_train)

# the hyperparameter combination that achieved the best cross-validated score
grid.best_params_

{'criterion': 'entropy', 'max_depth': 3}

In [11]:
# .best_estimator_ gives us a model that is prefit with the best hyperparameters
model = grid.best_estimator_
model.score(X_test, y_test)

0.6530612244897959

In [12]:
# Mean cross-validated score of the best hyperparameter combination found by the grid search.
grid.best_score_

0.7387329931972789

In [13]:
# cv_results_ gives us a dictionary with a "params" key that contains a list of dictionaries;
# each one holds the hyperparameters that were used for one candidate model
# NOTE(review): displaying the whole dictionary produces a very long output; consider showing only its keys.

results = grid.cv_results_
results

{'mean_fit_time': array([0.00246769, 0.00180036, 0.00210756, 0.00197673, 0.0019722 ,
        0.00197279, 0.00173891, 0.00185245, 0.00190479, 0.00185728,
        0.00171095, 0.00175893, 0.00170445, 0.00179547, 0.00172669,
        0.0017947 , 0.00186735, 0.00182438, 0.00185919, 0.00186032]),
 'std_fit_time': array([6.63427262e-04, 2.23459228e-04, 6.37209894e-05, 2.94945802e-04,
        1.75426424e-04, 3.00743397e-04, 2.97872982e-05, 1.14874141e-04,
        1.50665204e-04, 1.43941552e-04, 9.99098134e-05, 1.56865701e-04,
        8.16643671e-05, 7.96834297e-05, 2.18620704e-05, 5.40228565e-05,
        9.63337010e-05, 2.58489065e-05, 3.94656572e-05, 6.00858279e-05]),
 'mean_score_time': array([0.00130647, 0.00087696, 0.00102597, 0.00092477, 0.00097406,
        0.00090504, 0.000907  , 0.00087303, 0.00088012, 0.00088274,
        0.00088012, 0.00084525, 0.00085127, 0.00088018, 0.00091308,
        0.00085163, 0.00087804, 0.00086236, 0.00095576, 0.00089848]),
 'std_score_time': array([2.01851031e-

In [14]:
# Build one row per hyperparameter combination and attach the model's average
# performance after cross validation as a `score` column.
# .assign() adds the column to a new DataFrame, so the parameter dictionaries held in
# results["params"] (and therefore in grid.cv_results_) are not mutated by this cell.
#
# score is the mean cross-validated performance, measured on the validation folds of
# the training data; the test split plays no part in these numbers
df = pd.DataFrame(results["params"]).assign(score=results["mean_test_score"])
df

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.733418
1,gini,2,0.733418
2,gini,3,0.718112
3,gini,4,0.697385
4,gini,5,0.630846
5,gini,6,0.666454
6,gini,7,0.661352
7,gini,8,0.65625
8,gini,9,0.656144
9,gini,10,0.661352


---
## Use Cases

- Probably won't use this for:
    - MVP
    
- Would use this:
    - When focusing on modeling
    - In a good spot with feature engineering
    - When the best bang for your buck is going to be tweaking hyperparameters