Barrett Downs
02/11/2021

# Cross Validation and model Selection

## imports


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics

## Load Data

In [2]:
iris = datasets.load_iris()
tmp = {name: iris.data[:, i] for i, name in enumerate(iris.feature_names)}  # dictionary comprehension
tmp["target"] = [iris.target_names[i] for i in iris.target]
iris_df = pd.DataFrame(tmp)
iris_df.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "target"]
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
X = iris_df.drop(["target"], axis=1)
y = iris_df["target"]
clf = tree.DecisionTreeClassifier()
clf

DecisionTreeClassifier()

## Train/test Split

In [4]:
(X_train, X_test, y_train, y_test) = model_selection.train_test_split(X, y, test_size=1/3) #splitting data into thirds
print(X.shape, X_train.shape, X_test.shape)#confirm test set is 1/3 of data set

(150, 4) (100, 4) (50, 4)


In [5]:
#train 
clf.fit(X_train, y_train)
#predict
y_pred = clf.predict(X_test)

In [6]:
metrics.precision_score(y_test, y_pred, average="weighted")

0.9653333333333333

In [7]:
metrics.recall_score(y_test, y_pred, average="weighted")

0.96

In [8]:
metrics.f1_score(y_test, y_pred, average="weighted")

0.9600952380952381

In [9]:
# score
print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))


precision=0.9653333333333333
recall=   0.96
f1=       0.9600952380952381


## KFold Cross Validation

In [12]:
kf = model_selection.KFold(n_splits=10, random_state=30, shuffle=True) #Using 10 Folds
for train_index, test_index in kf.split(iris_df):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()
    

TRAIN: [0 1 2 4 6] TEST: [ 3  5  8 20 38]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [3 4 5 6 7] TEST: [ 0  1  2 24 32]
precision=0.8666666666666667
recall=   0.8666666666666667
f1=       0.8666666666666667

TRAIN: [0 1 2 3 4] TEST: [14 68 71 81 98]
precision=0.9466666666666667
recall=   0.9333333333333333
f1=       0.9352826510721248

TRAIN: [0 1 2 3 4] TEST: [ 6 12 21 59 63]
precision=0.9666666666666667
recall=   0.9333333333333333
f1=       0.9428571428571428

TRAIN: [0 1 2 3 4] TEST: [10 17 23 29 31]
precision=0.9444444444444445
recall=   0.9333333333333333
f1=       0.9316017316017315

TRAIN: [0 1 2 3 5] TEST: [ 4 16 34 41 43]
precision=0.9444444444444445
recall=   0.9333333333333333
f1=       0.9326599326599326

TRAIN: [0 1 2 3 4] TEST: [19 22 25 26 30]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [ 9 36 39 44 47]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [11 13 15 27 28]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 

[Scoring Parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [13]:
#Same method as above just much less code to write
scores = model_selection.cross_validate(clf, X, y, cv=10, # using 10 folds
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
scores

{'fit_time': array([0.00310016, 0.00251913, 0.00245595, 0.00254583, 0.00248003,
        0.00291586, 0.00220799, 0.00238276, 0.00217915, 0.00201201]),
 'score_time': array([0.00490189, 0.00436592, 0.00501513, 0.00448012, 0.00398612,
        0.00417209, 0.00391579, 0.00414515, 0.00368905, 0.00382996]),
 'test_precision_weighted': array([1.        , 0.94444444, 1.        , 0.94444444, 0.94444444,
        0.86666667, 0.94444444, 1.        , 1.        , 1.        ]),
 'test_recall_weighted': array([1.        , 0.93333333, 1.        , 0.93333333, 0.93333333,
        0.86666667, 0.93333333, 1.        , 1.        , 1.        ]),
 'test_f1_weighted': array([1.        , 0.93265993, 1.        , 0.93265993, 0.93265993,
        0.86666667, 0.93265993, 1.        , 1.        , 1.        ])}

In [14]:
np.mean(scores["test_f1_weighted"])

0.9597306397306398