---
# Cross Validation Exercises
---

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data

from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# no yelling in the library
import warnings
warnings.filterwarnings("ignore")

---
## `mpg`

Use the cross validation techniques described in the lesson to find the best model for predicting transmission type with the mpg dataset.

In [30]:
# get the data
mpg = data('mpg')
mpg.trans = np.where(mpg.trans.str.startswith('auto'), 'auto', 'manual')

In [31]:
# create X and y
X, y = mpg[['displ', 'cyl', 'cty', 'hwy']], mpg.trans

In [32]:
# split the data into train and test (no validate needed here due to use of K-Fold Cross Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=527)

### DT

In [36]:
# Grid Search
dt = DecisionTreeClassifier(random_state=527)
grid = GridSearchCV(dt, {'max_depth': range(1, 21), 'min_samples_split': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11),
                         'min_samples_split': range(1, 11)})

In [42]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,score
801,9,1,2,0.709804
802,9,1,3,0.70915
1801,19,1,2,0.698693
1601,17,1,2,0.698693
1301,14,1,2,0.698693


In [41]:
best_dt = grid.best_estimator_
best_dt.fit(X_train, y_train)
best_dt.score(X_test, y_test)

0.6440677966101694

### RF

In [46]:
# Grid Search
rf = RandomForestClassifier(random_state=527)
grid = GridSearchCV(rf, {'max_depth': range(1, 21)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21)})

In [47]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,score
9,10,0.716013
12,13,0.710131
11,12,0.704575
8,9,0.69902
10,11,0.698693


In [48]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.6949152542372882

### KNN

In [60]:
# Grid Search
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,n_neighbors,score
17,18,0.702857
11,12,0.697143
13,14,0.697143
7,8,0.691429
14,15,0.691429


In [61]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.6949152542372882

### LR

In [56]:
# Grid Search
clf = LogisticRegression()
grid = GridSearchCV(clf, {'C': np.arange(0.1, 10, 0.1)}, cv=10)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,C,score
49,5.0,0.647059
74,7.5,0.647059
72,7.3,0.647059
71,7.2,0.647059
70,7.1,0.647059


In [59]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.6440677966101694

---
## `titanic`

Use cross validation techniques to determine the best model for predicting survival with the titanic dataset.

In [63]:
titanic = data('titanic')

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [73]:
titanic_enc = pd.get_dummies(titanic, drop_first=True)
titanic_enc.head()

Unnamed: 0,class_2nd class,class_3rd class,age_child,sex_women,survived_yes
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1
5,0,0,0,0,1


In [74]:
# create X and y
X, y = titanic_enc.drop(columns='survived_yes'), titanic_enc.survived_yes

In [75]:
# split the data into train and test (no validate needed here due to use of K-Fold Cross Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=527)

### DT

In [76]:
# Grid Search
dt = DecisionTreeClassifier(random_state=527)
grid = GridSearchCV(dt, {'max_depth': range(1, 21), 'min_samples_split': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11),
                         'min_samples_split': range(1, 11)})

In [77]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,score
1001,11,1,2,0.799433
1714,18,2,5,0.799433
1106,12,1,7,0.799433
1105,12,1,6,0.799433
1104,12,1,5,0.799433


In [78]:
best_dt = grid.best_estimator_
best_dt.fit(X_train, y_train)
best_dt.score(X_test, y_test)

0.78419452887538

### RF

In [79]:
# Grid Search
rf = RandomForestClassifier(random_state=527)
grid = GridSearchCV(rf, {'max_depth': range(1, 21)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21)})

In [80]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,score
2,3,0.799433
10,11,0.798423
11,12,0.798423
18,19,0.798423
17,18,0.798423


In [81]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.78419452887538

### KNN

In [82]:
# Grid Search
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,n_neighbors,score
3,4,0.761873
4,5,0.74969
5,6,0.74969
2,3,0.748675
7,8,0.738579


In [83]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.7355623100303952

### LR

In [84]:
# Grid Search
clf = LogisticRegression()
grid = GridSearchCV(clf, {'C': np.arange(0.1, 10, 0.1)}, cv=10)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,C,score
49,5.0,0.77719
74,7.5,0.77719
72,7.3,0.77719
71,7.2,0.77719
70,7.1,0.77719


In [85]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.7781155015197568

---
## `tips`

Use cross validation techniques to determine the best model for predicting tip amount with the tips dataset.

In [86]:
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [87]:
pd.get_dummies(tips)

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
2,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
3,21.01,3.50,3,0,1,1,0,0,0,1,0,1,0
4,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
5,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,29.03,5.92,3,0,1,1,0,0,1,0,0,1,0
241,27.18,2.00,2,1,0,0,1,0,1,0,0,1,0
242,22.67,2.00,2,0,1,0,1,0,1,0,0,1,0
243,17.82,1.75,2,0,1,1,0,0,1,0,0,1,0


In [74]:
# create X and y
X, y = titanic_enc.drop(columns='survived_yes'), titanic_enc.survived_yes

In [75]:
# split the data into train and test (no validate needed here due to use of K-Fold Cross Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=527)

### DT

In [76]:
# Grid Search
dt = DecisionTreeClassifier(random_state=527)
grid = GridSearchCV(dt, {'max_depth': range(1, 21), 'min_samples_split': range(1, 11), 'min_samples_leaf': range(1, 11)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21),
                         'min_samples_leaf': range(1, 11),
                         'min_samples_split': range(1, 11)})

In [77]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,min_samples_leaf,min_samples_split,score
1001,11,1,2,0.799433
1714,18,2,5,0.799433
1106,12,1,7,0.799433
1105,12,1,6,0.799433
1104,12,1,5,0.799433


In [78]:
best_dt = grid.best_estimator_
best_dt.fit(X_train, y_train)
best_dt.score(X_test, y_test)

0.78419452887538

### RF

In [79]:
# Grid Search
rf = RandomForestClassifier(random_state=527)
grid = GridSearchCV(rf, {'max_depth': range(1, 21)}, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=527),
             param_grid={'max_depth': range(1, 21)})

In [80]:
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,max_depth,score
2,3,0.799433
10,11,0.798423
11,12,0.798423
18,19,0.798423
17,18,0.798423


In [81]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.78419452887538

### KNN

In [82]:
# Grid Search
clf = KNeighborsClassifier()
grid = GridSearchCV(clf, {'n_neighbors': range(1, 21)}, cv=5)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,n_neighbors,score
3,4,0.761873
4,5,0.74969
5,6,0.74969
2,3,0.748675
7,8,0.738579


In [83]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.7355623100303952

### LR

In [84]:
# Grid Search
clf = LogisticRegression()
grid = GridSearchCV(clf, {'C': np.arange(0.1, 10, 0.1)}, cv=10)
grid.fit(X_train, y_train)
results = pd.DataFrame(grid.cv_results_['params'])
results['score'] = grid.cv_results_['mean_test_score']
results.sort_values(by='score', ascending=False).head(5)

Unnamed: 0,C,score
49,5.0,0.77719
74,7.5,0.77719
72,7.3,0.77719
71,7.2,0.77719
70,7.1,0.77719


In [85]:
best_clf = grid.best_estimator_
best_clf.fit(X_train, y_train)
best_clf.score(X_test, y_test)

0.7781155015197568