Barrett Downs
01/14/2021

# Cross Validation and model Selection

## imports


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics

## Load Data

In [2]:
iris = datasets.load_iris()
tmp = {name: iris.data[:, i] for i, name in enumerate(iris.feature_names)}  # dictionary comprehension
tmp["target"] = [iris.target_names[i] for i in iris.target]
iris_df = pd.DataFrame(tmp)
iris_df.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "target"]
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
X = iris_df.drop(["target"], axis=1)
y = iris_df["target"]
clf = tree.DecisionTreeClassifier()
clf

DecisionTreeClassifier()

## Train/test Split

In [8]:
(X_train, X_test, y_train, y_test) = model_selection.train_test_split(X, y, test_size=1/3) #splitting data into thirds
print(X.shape, X_train.shape, X_test.shape)#confirm test set is 1/3 of data set

(150, 4) (100, 4) (50, 4)


In [9]:
#train 
clf.fit(X_train, y_train)
#predict
y_pred = clf.predict(X_test)

In [10]:
metrics.precision_score(y_test, y_pred, average="weighted")

0.8876190476190476

In [11]:
metrics.recall_score(y_test, y_pred, average="weighted")

0.88

In [12]:
metrics.f1_score(y_test, y_pred, average="weighted")

0.8818881118881117

In [13]:
# score
print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))


precision=0.8876190476190476
recall=   0.88
f1=       0.8818881118881117


## KFold Cross Validation

In [14]:
kf = model_selection.KFold(n_splits=5, shuffle=True) #Using 5 Folds
for train_index, test_index in kf.split(iris_df):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("precision={}".format(metrics.precision_score(y_test, y_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_test, y_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_test, y_pred, average="weighted")))
    
    print()
    

TRAIN: [0 1 2 3 5] TEST: [ 4  9 13 18 23]
precision=0.9018939393939394
recall=   0.9
f1=       0.8994397759103642

TRAIN: [0 1 3 4 5] TEST: [ 2 16 20 21 27]
precision=0.9333333333333333
recall=   0.9333333333333333
f1=       0.9333333333333333

TRAIN: [0 1 2 4 5] TEST: [ 3  6 10 11 22]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 2 3 4 5] TEST: [ 1  8 12 14 19]
precision=0.8333333333333334
recall=   0.8333333333333334
f1=       0.8318250377073907

TRAIN: [1 2 3 4 6] TEST: [ 0  5  7 15 17]
precision=0.9700000000000001
recall=   0.9666666666666667
f1=       0.9668771929824562



[Scoring Parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [15]:
#Same method as above just much less code to write
scores = model_selection.cross_validate(clf, X, y, cv=5, # using 5 folds
                                        scoring=["precision_weighted", "recall_weighted", "f1_weighted"])
scores

{'fit_time': array([0.0026722 , 0.00261784, 0.00284123, 0.00249505, 0.00229716]),
 'score_time': array([0.00516868, 0.00412512, 0.0044899 , 0.0042932 , 0.00376487]),
 'test_precision_weighted': array([0.96969697, 0.96969697, 0.9023569 , 0.93333333, 1.        ]),
 'test_recall_weighted': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'test_f1_weighted': array([0.96658312, 0.96658312, 0.89974937, 0.93333333, 1.        ])}

In [16]:
np.mean(scores["test_f1_weighted"])

0.9532497911445279