# k Fold nested Cross Validation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from dt import DecisionTreeClassifier # my implementation
from sklearn.model_selection import train_test_split

In [3]:
data_loc = "../data/iris/"
%ls {data_loc}

Index  iris.csv  iris.names


In [4]:
data = pd.read_csv(data_loc + "iris.csv", header=None)
data.head(6)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa


In [5]:
from sklearn.utils import shuffle

def nested_cross(dTrain, depths, num_valid):
    folds = make_folds(dTrain, num_valid)
    # holding out test
    passed_folds = folds[:-1]
    k = best_k(passed_folds, depths)
    acc = 0
    for i in range(num_valid):
        curr_fold_train = [x for j,x in enumerate(folds) if j!=i] 
        curr_fold_test = folds[i]
        curr_fold_train = pd.concat(curr_fold_train)
        mydt = DecisionTreeClassifier(method='gini', max_depth=k)
        mydt.train(curr_fold_train)
        my_preds = np.squeeze(mydt.predict(curr_fold_test.iloc[:, :-1]).values)
        true = curr_fold_test.iloc[:, -1].values
        my_acc = ((true == my_preds).sum())/curr_fold_test.shape[0]
        acc += my_acc/num_valid
    return {"depth": k, "acc": acc}   

def make_folds(dTrain, num_valid):
    '''Make a number of folds with diven pd'''
    train = shuffle(dTrain)
    last = train.shape[0]
    folds = []
    for i in range(num_valid):
        start = i*int(last/num_valid)
        end = (i+1)*int(last/num_valid)
        if end >= last:
            end = -1
        folds.append(train.iloc[start:end, :])
    return folds

def best_k(folds, depths):
    '''return depth that maximizes the avg accuracy'''
    num_valid = len(folds)
    acc = {}
    for i in range(num_valid):
        curr_fold_train = [x for j,x in enumerate(folds) if j!=i] 
        curr_fold_validation = folds[i]
        curr_fold_train = pd.concat(curr_fold_train)
        for k in depths:
            mydt = DecisionTreeClassifier(method='gini', max_depth=k)
            mydt.train(curr_fold_train)
            my_preds = np.squeeze(mydt.predict(curr_fold_validation.iloc[:, :-1]).values)
            true = curr_fold_validation.iloc[:, -1].values
            my_acc = ((true == my_preds).sum())/curr_fold_validation.shape[0]
            if k in acc.keys():
                acc[k] += my_acc
            else:
                acc[k] = my_acc
    acc = {k: acc[k]/num_valid for k in acc.keys()}
    
    x = acc
    print (acc) # for showcasing
    sorted_by_value = sorted(x.items(), key=lambda kv: kv[1])
    return sorted(x[0] for x in sorted_by_value if sorted_by_value[-1][1] == x[1])[0]

In [6]:
dic = nested_cross(data, [1, 3, 5, 7, 9, 11], 4)

{1: 0.7027027027027027, 3: 0.918918918918919, 5: 0.9279279279279281, 7: 0.9279279279279281, 9: 0.9279279279279281, 11: 0.9279279279279281}


In [7]:
print ("optimal depth:", dic["depth"], "| with accuracy:", dic["acc"]) 

optimal depth: 5 | with accuracy: 0.9391891891891893
