In [5]:
%matplotlib inline

In [21]:
import numpy as np
import pandas as pd
from sklearn import datasets, svm
from sklearn.model_selection import KFold, cross_val_score


In [7]:
# Initial data load: scikit-learn's bundled iris dataset.
# Later cells read the feature rows from `iris.data` and the labels from `iris.target`.
iris = datasets.load_iris()


In [16]:
# Set up the KFold splitter: four folds, shuffled, with a fixed seed so the
# same folds are produced on every run.
#
# VERY IMPORTANT: keep shuffle=True and include a random_state. The file is
# sorted, and without these attributes set properly KFold will simply divide
# the file without changing its order. With these settings in place the
# author observed accuracy going from the low 30% range to the high 90% range.
kf = KFold(n_splits=4, shuffle=True, random_state=88)



In [17]:
# helper function to compute accuracy of model
def get_accuracy(preds, targetvalues):
    """Return the fraction of predictions that equal their target label.

    Parameters
    ----------
    preds : array-like
        Predicted labels.
    targetvalues : array-like
        True labels; must have exactly the same shape as ``preds``.

    Returns
    -------
    float
        Number of matching positions divided by the total number of
        positions, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If the two inputs differ in shape, or if they are empty.
    """
    preds = np.asarray(preds)
    targetvalues = np.asarray(targetvalues)

    # Refuse to broadcast: comparing shape (n,) against (n, 1) would silently
    # expand to an n x n grid and produce a meaningless (even negative) score.
    if preds.shape != targetvalues.shape:
        raise ValueError(
            'preds and targetvalues must have the same shape, got {} and {}'
            .format(preds.shape, targetvalues.shape))
    if preds.size == 0:
        raise ValueError('cannot compute accuracy of empty inputs')

    # Compare with == rather than subtracting, so non-numeric labels
    # (e.g. class-name strings) work as well as integer codes.
    return float(np.mean(preds == targetvalues))

In [85]:

# Fit one linear SVC per KFold split, score it on the held-out fold, and keep
# everything needed to inspect each fold afterwards in `modeldict`.
modeldict = {}
modelcount = 0

# These two summaries use the full dataset with cross_val_score's own cv=4
# splitting, so they do not depend on the current fold. Compute them once
# rather than refitting on every iteration; the printed values are unchanged.
reference_svc = svm.SVC(kernel='linear')
scores = cross_val_score(reference_svc, iris.data, iris.target, cv=4)
# Two standard deviations of the fold scores, reported below as a rough 95% interval.
scoresconf95 = scores.std() * 2
f1scores = cross_val_score(reference_svc, iris.data, iris.target, cv=4, scoring='f1_macro')

# Split on the feature matrix itself. The previous `kf.split(X)` relied on a
# name `X` that this notebook never defines, so it failed on a fresh kernel.
for train_index, test_index in kf.split(iris.data):
    train_data = iris.data[train_index]
    train_target = iris.target[train_index]

    # create instance of svm model
    svc = svm.SVC(kernel='linear')

    model = svc.fit(train_data, train_target)
    preds = model.predict(iris.data[test_index])
    targetvalues = iris.target[test_index]
    accuracy = get_accuracy(preds, targetvalues)
    print('Accuracy of model {} manually calculated is {:1.5f}'.format(modelcount, accuracy))

    # Nested 4-fold cross-validation restricted to this fold's training rows.
    # Note this is a separate estimate from `accuracy` above, which is the
    # held-out score of the model fitted on all of the training rows.
    train_cv_scores = cross_val_score(svc, train_data, train_target, cv=4)
    print('Accuracy of model {} using cross_val_score function : {}'
          .format(modelcount, train_cv_scores))
    print("Model mean is {:1.2f} - 95% confidence interval is +/-{:1.2f} : ".format(scores.mean(), scoresconf95))

    print("F1 scores for model {} are {}".format(modelcount, f1scores))

    # Record the fitted model with the indices, predictions and score for this fold.
    modeldict[modelcount] = {
        'model': model,
        'train_index': train_index,
        'test_index': test_index,
        'preds': preds,
        'targetvalues': targetvalues,
        'accuracy': accuracy,
    }
    modelcount += 1




Accuracy of model 0 manually calculated is 1.00000
Accuracy of model 0 using cross_val_score function : [ 1.          0.96428571  0.96428571  1.        ]
Model mean is 0.98 - 95% confidence interval is +/-0.02 : 
F1 scores for model 0 are [ 1.          0.97432099  0.97217391  0.97217391]
Accuracy of model 1 manually calculated is 0.92105
Accuracy of model 1 using cross_val_score function : [ 1.  1.  1.  1.]
Model mean is 0.98 - 95% confidence interval is +/-0.02 : 
F1 scores for model 1 are [ 1.          0.97432099  0.97217391  0.97217391]
Accuracy of model 2 manually calculated is 1.00000
Accuracy of model 2 using cross_val_score function : [ 1.          0.96428571  0.96428571  1.        ]
Model mean is 0.98 - 95% confidence interval is +/-0.02 : 
F1 scores for model 2 are [ 1.          0.97432099  0.97217391  0.97217391]
Accuracy of model 3 manually calculated is 0.91892
Accuracy of model 3 using cross_val_score function : [ 1.          1.          0.92592593  0.96296296]
Model mean 

In [86]:
# TODO: think about the confusion matrix and classification report capabilities in scikit-learn


In [87]:
# Display the per-fold results collected by the cross-validation loop, keyed by fold number.
modeldict

{0: {'accuracy': 1.0,
  'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
  'preds': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
  'targetvalues': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
  'test_index': array([ 14,  15,  17,  19,  20,  22,  24,  29,  30,  31,  33,  39,  45,
          52,  53,  55,  57,  59,  60,  66,  73,  84,  95, 100, 104, 109,
         111, 113, 116, 117, 120, 121, 123, 129, 130, 133, 134, 140], dtype=int64),
  'train_index': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  16,  18,  21,  23,  25,  26,  27,  28,  32,  34,  35,  36,
          37,  38,  40,  41,  4