# Cross Validation

In [1]:
import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, make_scorer

In [2]:
tips = data("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
X = tips[["tip", "total_bill", "size"]]
y = tips.time

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123, test_size=.2)

In [4]:
tree = DecisionTreeClassifier(max_depth=4)

In [5]:
cross_val_score(tree, X_train, y_train, cv=4).mean()

0.6922831632653061

In [6]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [7]:
predicted = tree.predict(X_train)
actual = y_train

precision_score(actual, predicted, pos_label="Dinner")

0.8571428571428571

In [8]:
precision_scorer = make_scorer(precision_score, pos_label="Dinner")

# decision tree with max depth of 4
cross_val_score(tree, X_train, y_train, cv=4, scoring=precision_scorer).mean()

0.7402330314997984

In [9]:
cross_val_score(DecisionTreeClassifier(max_depth=3), X_train, y_train, scoring=precision_scorer).mean()

0.7396049896049897

---
## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV

# keys are names of hyperparameters
# values are a list of values to try with hyperparameters
# different values of c for logistic regression
# different value of k for KNN
params = {
    "max_depth": range(1, 11),
    "criterion": ["gini", "entropy"]
}

# cv=4 means four-fold cross validation, i.e. k=4
grid = GridSearchCV(tree, params, cv=4)
grid.fit(X_train, y_train)

# best accuracy for out-of-sample data
grid.best_params_

{'criterion': 'entropy', 'max_depth': 3}

In [11]:
# .best_estimator_ gives us a model that is prefit with the best hyperparameters
model = grid.best_estimator_
model.score(X_test, y_test)

0.6530612244897959

In [12]:
grid.best_score_

0.7438350340136054

In [13]:
# cv_results_ gives us a dictionary with a params key that containts a list of dictionaries that represent the
# params that were used for the model

results = grid.cv_results_
results

{'mean_fit_time': array([0.00252116, 0.0027312 , 0.00205004, 0.0022043 , 0.00190723,
        0.00253379, 0.00195837, 0.0019753 , 0.00184453, 0.00179058,
        0.00183976, 0.00191653, 0.00204217, 0.0017398 , 0.00188369,
        0.00186181, 0.00181395, 0.00181973, 0.00193441, 0.00184423]),
 'std_fit_time': array([4.16219657e-04, 9.12074341e-04, 2.03157988e-04, 2.62950861e-04,
        3.42967944e-04, 5.59325477e-04, 2.45879209e-04, 1.27768225e-04,
        1.81718639e-04, 1.10765045e-04, 3.61674167e-04, 1.52557014e-04,
        3.62641223e-04, 1.66751118e-04, 1.28968516e-04, 1.10115706e-04,
        9.35841327e-05, 8.72606300e-05, 5.51769053e-05, 9.29028786e-05]),
 'mean_score_time': array([0.00141656, 0.00159138, 0.00111395, 0.00124359, 0.00104696,
        0.00102031, 0.0010559 , 0.00101221, 0.00089252, 0.00098199,
        0.00086421, 0.00089949, 0.00109279, 0.00082773, 0.00101954,
        0.00084978, 0.00087023, 0.00098449, 0.00099707, 0.00082999]),
 'std_score_time': array([4.96415837e-

In [14]:
# modify each parameter dictionary such that it also contains the model's average performance after cross validation
for score, p in zip(results["mean_test_score"], results["params"]):
    p["score"] = score

# score is the model's performance on unseen data (validate split); we have never touched test
df = pd.DataFrame(results["params"])
df

Unnamed: 0,criterion,max_depth,score
0,gini,1,0.733418
1,gini,2,0.733418
2,gini,3,0.723214
3,gini,4,0.697385
4,gini,5,0.625744
5,gini,6,0.671556
6,gini,7,0.65625
7,gini,8,0.671556
8,gini,9,0.646046
9,gini,10,0.651148


---
## Use Cases

- Probably won't use this for:
    - MVP
    
- Would to use this:
    - When focusing on modeling
    - In a good spot with feature engineering
    - When the best bang for your buck is going to be tweaking hyperparameters