In [1]:
import sklearn
import numpy as np
import pandas as pd
from scipy import stats
import json
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, cross_val_score, LeaveOneOut, KFold
from sklearn.naive_bayes import GaussianNB
from IPython.display import display

# Machine Learning Final Project
**Student name:** Mohammad Amin Dadgar

**Student Id:** 4003624016

**Instructor:** Dr. Peyman Adibi

**Date:** Tir 1401 | June, July 2022

The Final Project is the implementation of [dependency analysis of accuracy estimates in k-fold cross validation](https://ieeexplore.ieee.org/document/8012491) article.

**Abstract:** K-fold cross-validation is a method to evaluate the performance of classification algorithms. In this report, we are going to show the appropriateness of K-fold cross-validation for the dependence of fold accuracies. 

**Introduction**
Several studies have shown that in k-fold cross-validation, fold accuracies are dependent, but there is no formal definition for this fact. This report is the reproduction results of referenced article that in section 2, we introduced the statistical methods needed for this experiment. In section 3 sampling the distribution from fold accuracies is introduced and using this in section 4, a statistical method is shown to evaluate the fold accuracies independence. In section 5, the method introduced in section 4 is tested on 20 UCI datasets, and in section 6 we’ve concluded the work done before.

To inference the idea behind this article, in section 4 the variances are given as below

sample variance:
\begin{equation}
s^2 = \frac{\sum_{i=0}^{k} (\bar p_i - \bar{\bar{x}})^2}{k-1}
\end{equation}
And the variance for leave-one-out cross validation
\begin{equation}
\sigma_I^2 = \frac{p(1-p)}{n}
\end{equation}

to explain the equations above we can say that $p$ is the accuracy of the classifier and to find it, we can devide the correct classifications by the sample size.

$\bar p_i$ refers to the accuracy estimate in fold i, and can be written as below
\begin{equation}
\bar{p_i} = \frac{\sum_{j=1}^{m} x_{ij}}{m}
\end{equation}
where $m$ is the count of each fold sample. For example if we had $200$ sample in our dataset and using $5$ folds, then we would have $m=\frac{200}{5}$.

where $x_{ij}$ is a function that outputs $1$ where the classification is correct and outputs $0$ when the classification is wrong.

The $\bar{\bar{x}}$ represents the sample mean and it's possible to find it using the equation below
\begin{equation}
\bar{\bar{x}} = \frac{\sum_{i=1}^{k} \bar{p_i}}{k}
\end{equation}

We will start the dependency test of folds with a hypothesis $H_0: s^2/k=\sigma_l^2$. the test statistics can be found as $\chi^2 = \frac{(k-1) s^2}{k \sigma_l^2}$ when $\chi$ has $k-1$ degrees of freedom.

In [2]:
######### Let's re-calculate the example 2 and 3 in the article
total = 200
classification_res = np.array([32, 30, 27, 30, 25])
sample_mean = np.sum(classification_res / 200) 

for true_classified in classification_res:
    print(f"acc: {true_classified / 40}")

print(f"sample mean is {sample_mean}")

acc: 0.8
acc: 0.75
acc: 0.675
acc: 0.75
acc: 0.625
sample mean is 0.72


In [3]:
##### implementing equation 1
def find_sample_variance(true_classification, fold_samples_count):
    """
    Find sample variance of k-fold cross validation
    
    Parameters:
    -----------
    true_classification : array of floats
        the portion of true classified samples in each fold
        note that the folds count is the length of this array
    fold_samples_count : positive integer
        the count of folds data samples
        
    Returns:
    ---------
    sample_variance : float
        the sample variance of all the folds
    """
    ## find the total sample count
    total_sample_count = fold_samples_count * len(true_classification)
    
    sample_mean = find_sample_mean(true_classification, total_sample_count)
    
    ## the subtraction of true classified portion from sample mean
    subtraction_arr = np.subtract(true_classification / fold_samples_count, sample_mean)

    ## to divide the found value by `k-1`
    sample_variance = np.sum(np.power(subtraction_arr, 2)) / (len(true_classification) - 1)
    
    return sample_variance
    
def find_sample_mean(true_classification, total_samples_count):
    """
    Find the sample mean of k-fold cross validation using the true classification results
    
    Parameters:
    -----------
    true_classification : array of floats
        the portion of true classified samples in each fold
        note that the folds count is the length of this array
    total_samples_count : positive integer
        the count of total sample size (dataset length maybe)
    
    Returns:
    ---------
    sample_mean : float
        the sample mean calculated for all the dataset
    """
    sample_mean = np.sum(true_classification / total_samples_count)
    
    return sample_mean

In [4]:
m_sample_variance = find_sample_variance(classification_res, 40)
m_sample_variance

0.004812500000000001

In [5]:
def find_total_variance(accuracy, total_sample_count):
    """
    Find the variance of a set using its accuracy and samples count
    the equation is `accuracy(1-accuracy)/total_sample_count`
    
    Parameters:
    ------------
    accuracy : float between 0 and 1
        the floating value that rerpresent the portion of true classified samples over all samples
    total_sample_count : integer
        the total samples in a dataset
    
    Returns:
    ---------
    variance : float
        the calculated variance for the dataset
    """
    
    variance = (accuracy * (1 - accuracy)) / total_sample_count
    
    return variance

In [6]:
m_variance = find_total_variance(0.7, 200)
m_variance

0.0010500000000000002

In [7]:
def chi_independence_test(k_folds, total_variance, sample_variance):
    """
    the chi square independence test introduced in the article
    equation is `(`k-1` folds * sample_varience) / `k` folds  * total_variance`
    
    Parameters:
    ------------
    k_folds : positive integer
        the count of folds applied for a model
    total_variance : float
        the variance represented by leave-one-out cross validation
    sample_variance : float
        the variance that is found by aggregation of fold accuracies
        
    Returns:
    ---------
    chi_square : float
        the chi square value 
    """
    
    chi_square = ((k_folds - 1) * sample_variance) / (k_folds * total_variance)
    
    return chi_square

In [8]:
chisquare_value = chi_independence_test(5, m_variance, m_sample_variance)
chisquare_value

3.6666666666666665

And finding the exact values for `Example 3.`, we can now go on to real tests for real datasets, but Before going to experiments in real datasets another thing is to find out the p-value for our test.

So we are going to find the p-value first

In [9]:
stats.chi2.cdf(chisquare_value, 4)

0.5470073861075335

# 1-NN Liver disorders dataset
Let's try KNN method with K=1 with different datasets. minkowski distance with $p=2$ is the euclidean distance and we are using euclidean distance as our nearest neighbour metric.

In [10]:
ds_liver = pd.read_csv('Datasets/liver-disorders/bupa.data', names=['mcv', 'alkphos', 'sgpt', 'sgot', 'gammagt', 'drinks', 'selector'])
ds_liver.head()

Unnamed: 0,mcv,alkphos,sgpt,sgot,gammagt,drinks,selector
0,85,92,45,27,31,0.0,1
1,85,64,59,32,23,0.0,2
2,86,54,33,16,54,0.0,2
3,91,78,34,24,36,0.0,2
4,87,70,12,28,10,0.0,2


In [11]:
ds_liver.selector.unique()

array([1, 2], dtype=int64)

we have two classess for each data, selector is the label in our dataset.

In [12]:
KNN_1_liver = KNeighborsClassifier(n_neighbors=1, p=2)
# KNN_1_liver.fit(ds_liver[ds_liver.columns[:-1]], ds_liver.selector)

In [13]:
ds_liver_X = ds_liver[ds_liver.columns[:-1]]
ds_liver_Y = ds_liver.selector 

In [14]:
KNN_1_5cv_scores = cross_validate(KNN_1_liver, 
                                  ds_liver_X,
                                  ds_liver_Y,
                                  cv=5,
                                  return_train_score=True,
                                 return_estimator=True)
KNN_1_5cv_scores

{'fit_time': array([0.00602102, 0.00300574, 0.00299954, 0.00199652, 0.00199032]),
 'score_time': array([0.00399899, 0.00500011, 0.00299978, 0.00300145, 0.00299811]),
 'estimator': [KNeighborsClassifier(n_neighbors=1),
  KNeighborsClassifier(n_neighbors=1),
  KNeighborsClassifier(n_neighbors=1),
  KNeighborsClassifier(n_neighbors=1),
  KNeighborsClassifier(n_neighbors=1)],
 'test_score': array([0.65217391, 0.76811594, 0.62318841, 0.66666667, 0.53623188]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [15]:
m_sample_mean = np.mean(KNN_1_5cv_scores['test_score'])
m_sample_mean

0.6492753623188406

In [16]:
## find sample variance
def find_sample_variance_using_accuracies(fold_accuracies):
    """
    Find variances using the accuracies got from k-fold cross validation
    
    Parameters:
    ------------
    fold_accuracies : float between 0 and 1
        the accuracies in each fold of k-fold cross validation
        note that the k folds count will be computed using the length of this parameter  
    
    Returns:
    ---------
    variance : float
        the variance of accuracies
    """
    sample_mean = np.mean(fold_accuracies)
    
    ## the subtraction of true classified portion from sample mean
    subtraction = np.subtract(fold_accuracies, sample_mean)
    
    variance = np.sum(np.power(subtraction, 2)) / (len(fold_accuracies) - 1)
    
    return variance

In [17]:
sample_variance = find_sample_variance_using_accuracies(KNN_1_5cv_scores['test_score'])
sample_variance

0.006973324931737026

## Leave one out method
In this method the folds count is eqaul to the data size and test size in each fold is 1.https://www.statology.org/leave-one-out-cross-validation/

In [18]:
KNN_leaveOneOut = KNeighborsClassifier(n_neighbors=1, p=2)
KNN_leaveOneOut_result = cross_validate(KNN_leaveOneOut, 
                                  ds_liver_X,
                                  ds_liver_Y,
                                  cv=LeaveOneOut(),
                                  return_train_score=True,
                                 return_estimator=False)

In [19]:
KNN_leaveOneOut_result['test_score']

array([0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1.,
       1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0.,
       1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
       1., 0., 0., 1., 1.

In [20]:
leaveOneOut_acc = sum(KNN_leaveOneOut_result['test_score']) / len(ds_liver)
leaveOneOut_acc

0.6202898550724638

In [21]:
## implementing leave-one-out method by hand
kf = KFold(n_splits=len(ds_liver))

KNN_leaveOneOut = KNeighborsClassifier(n_neighbors=1, p=2)


test_result = []
for train, test in kf.split(ds_liver):
    ## predict using the classifier
    KNN_leaveOneOut.fit(ds_liver_X.loc[train], ds_liver_Y.loc[train])
    ## predict the test data
    y_pred = KNN_leaveOneOut.predict(ds_liver_X.loc[test])
    
    test_result.append(y_pred == ds_liver_Y.loc[test].values)

In [22]:
sum(test_result) / len(ds_liver)

array([0.62028986])

We can see that the accuracy of leave-one-out compited by hand is the same as the `LeaveOneOut()` method in sklearn. So we will continue with sklearn method because of its simplicity. (We just implement it by hand to see we are going in the right direction)

Now as Example 3, we can compute the chi value

In [23]:
sigma = (leaveOneOut_acc * (1 - leaveOneOut_acc)) / len(ds_liver)
sigma

0.000682696668888828

In [24]:
## the folds was chosen 5 above
chi_square = ((5-1) * sample_variance) / (5* sigma)
chi_square

8.171506028394088

In [25]:
## number of freedom is one less than 5
stats.chi2.cdf(chi_square, 4)

0.9145060709028664

With significance level $\alpha = 0.05$, our null hypothesis cannot be reject because the p-value is $0.91$. And we can say that here 5 fold accuracies are independent.


# Table 2 Reproduction
Here we will reproduce table 2 results. To do this we need to find the leave-one-out accuracies of `1-NN`, `3-NN`, `5-NN`, `7-NN` and `NBC` of different datasets.

## Livers-Disorders

In [26]:
ds_liver = pd.read_csv('Datasets/liver-disorders/bupa.data', names=['mcv', 'alkphos', 'sgpt', 'sgot', 'gammagt', 'drinks', 'selector'])

ds_liver_X = ds_liver[ds_liver.columns[:-1]]
ds_liver_Y = ds_liver.selector

In [27]:
def find_leave_one_out_results(dataset_X, dataset_Y, verbose=False):
    """
    Find the leave-one-out results of a dataset for 1NN, 3NN, 5NN, 7NN and Naive Bayes algorithm
    
    Parameters:
    ------------
    dataset_X : array_like
        the features vector
    dataset_Y : array_like
        the label for each feature
    verbose : bool
        print the progress if True
        default is False
        
    Returns:
    ---------
    acc_dict : dictionary
        the leave-one-out method applied on the dataset and the results of each method accuracy is returned
    """
    ## initialize a dicitonary
    acc_dict = {}
    
    ## 1-NN, 3-NN, 5-NN, 7-NN
    for neighbours_count in [1, 3, 5, 7]:
        ## initialize KNN classifier with neighbours_count value
        KNN_Classifier = KNeighborsClassifier(n_neighbors=neighbours_count, p=2)
        ## find the results of leave-one-out cross validation
        KNN_leaveOneOut_result = cross_validate(KNN_Classifier, 
                                      dataset_X,
                                      dataset_Y,
                                      cv=LeaveOneOut(),
                                      return_train_score=False,
                                      return_estimator=False)

        ## both dataset_X and dataset_Y have the same length
        ## here just use one of it to find the accuracy of classification
        acc_score = sum(KNN_leaveOneOut_result['test_score']) / len(dataset_X)
        
        if verbose:
            print(f'{neighbours_count}-NN finished with accuracy score: {acc_score}')
        
        ## add the accuracy of the classifier to our results
        acc_dict[f'{neighbours_count}NN'] = acc_score
    
    ## Appying Naive Bayes afterwards
    
    ## Naive Bayes
    NB_Classifier = GaussianNB()
    NB_leaveOneOut_result = cross_validate(NB_Classifier, 
                                  dataset_X,
                                  dataset_Y,
                                  cv=LeaveOneOut(),
                                  return_train_score=False,
                                  return_estimator=False)
    ## get the accuracy of Nive Bayes and save it in the dictionary
    NB_acc_score = sum(KNN_leaveOneOut_result['test_score']) / len(dataset_X)
    acc_dict['NBC'] = NB_acc_score
    
    if verbose:
        print(f'Naive Bayes finished with accuracy score: {NB_acc_score}')
    
    return acc_dict

Because of multiple runs of the project, We've created this function just to save the produced results and read them in the next runs.

In [28]:
def read_if_available_else_produce(dataset_X, dataset_Y, file_name):
    """
    Read the data if available and else run the leave-one-out method on dataset
    
    Parameters:
    ------------
    dataset_X : array_like
        the features vector
    dataset_Y : array_like
        the label for each feature
    file_name : string
        where to read or save the data
    
    Returns:
    ---------
    acc_dict : dictionary
        the leave-one-out method applied on the dataset and the results of each method accuracy is returned
    """
    ## initialize out of the conditions to have access to it
    table2_results = {}
    ## check if the results is not available then run the method to produce results
    if os.path.isfile(file_name) == False:  
        print("Results not available, Producing them\n")
        table2_results = find_leave_one_out_results(dataset_X,
                                                    dataset_Y,
                                                    verbose=True)

        ## save the results to use later
        file = open(file_name, 'w')
        json.dump(table2_results, file)
        file.close()
    else:
        print(f"Reading From previous data\nResults from file {file_name}")
        file = open(file_name, 'r')
        table2_results = file.read()
        ## convert string to dictionary
        table2_results = json.loads(table2_results)
        file.close()

    return table2_results


In [29]:
livers_fileName = 'results/LeaveOneOut_liverDisorders.txt'
table2_results_livers_disorder = read_if_available_else_produce(ds_liver_X, ds_liver_Y, livers_fileName)
table2_results_livers_disorder

Reading From previous data
Results from file results/LeaveOneOut_liverDisorders.txt


{'1NN': 0.6202898550724638,
 '3NN': 0.6376811594202898,
 '5NN': 0.6608695652173913,
 '7NN': 0.6869565217391305,
 'NBC': 0.6869565217391305}

## Letter-Recognition dataset
dataset link: https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/

In [30]:
letter_recognition_cols = ['letter', 'x-box', 'y-box', 
                            'width', 'height', 'onpix', 'x-bar', 'y-bar',
                            'x2-bar', 'y2-bar', 'xy-bar', 'x2y-bar','xy2-bar',
                            'x-ege', 'xegvy', 'y-ege','yegvx']
ds_letter_reco = pd.read_csv('Datasets/letter recognition/letter-recognition.data',
                            names=letter_recognition_cols)

In [31]:
ds_letter_reco.head()

Unnamed: 0,letter,x-box,y-box,width,height,onpix,x-bar,y-bar,x2-bar,y2-bar,xy-bar,x2y-bar,xy2-bar,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [32]:
ds_letter_reco_X = ds_letter_reco[letter_recognition_cols[1:]]
ds_letter_reco_Y = ds_letter_reco[letter_recognition_cols[0]]

In [33]:
letter_recognition_result_fileName = 'results/LeaveOneOut_letter_recognition.txt'
table2_results_letter_recognition = read_if_available_else_produce(ds_letter_reco_X,
                               ds_letter_reco_Y
                              ,letter_recognition_result_fileName)
table2_results_letter_recognition

Reading From previous data
Results from file results/LeaveOneOut_letter_recognition.txt


{'1NN': 0.96245, '3NN': 0.9595, '5NN': 0.958, '7NN': 0.95585, 'NBC': 0.95585}

## MAGIC gamma telescope data
Link: https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope

In [34]:
ds_magic_gamma = pd.read_csv('Datasets/magic/magic04.data', 
                             names=['fLength', 'fWidth', 'fSize', 'fConc',
                                   'fConc1', 'fAsym', 'fM3Long', 'fM3Trans',
                                   'fAlpha', 'fDist', 'class'])

In [35]:
ds_magic_gamma.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [36]:
ds_magic_gamma_X = ds_magic_gamma[ds_magic_gamma.columns[:-1]]
ds_magic_gamma_Y = ds_magic_gamma['class']

In [37]:
magic_gamma_FileName = 'results/LeaveOneOut_magic_gamma.txt'
table2_results_magic_gamma = read_if_available_else_produce(ds_letter_reco_X,
                                                            ds_letter_reco_Y,
                                                            magic_gamma_FileName)

Reading From previous data
Results from file results/LeaveOneOut_magic_gamma.txt


In [38]:
table2_results_magic_gamma

{'1NN': 0.96245, '3NN': 0.9595, '5NN': 0.958, '7NN': 0.95585, 'NBC': 0.95585}

## Page-Blocks dataset
Link: https://archive.ics.uci.edu/ml/datasets/Page+Blocks+Classification

In [39]:
ds_page_blocks = pd.read_csv('Datasets/page-blocks/page-blocks.data', 
                             sep=' +',
                             engine='python',
                             names=['height', 'length', 'area', 'eccen',
                                   'p_black', 'p_and', 'mean_tr', 'blackpix',
                                   'blackand', 'wb_trans'])

In [40]:
ds_page_blocks.head()


Unnamed: 0,height,length,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans
5,7,35,1.4,0.4,0.657,2.33,14,23,6,1
6,7,42,1.167,0.429,0.881,3.6,18,37,5,1
6,18,108,3.0,0.287,0.741,4.43,31,80,7,1
5,7,35,1.4,0.371,0.743,4.33,13,26,3,1
6,3,18,0.5,0.5,0.944,2.25,9,17,4,1


In [41]:
ds_page_blocks_X = ds_page_blocks[ds_page_blocks.columns[:-1]]
ds_page_blocks_Y = ds_page_blocks.wb_trans

In [42]:
page_blocks_FileName = 'results/LeaveOneOut_page_blocks.txt'
table2_results_page_blocks = read_if_available_else_produce(ds_page_blocks_X,
                                                            ds_page_blocks_Y,
                                                            page_blocks_FileName)
table2_results_page_blocks

Reading From previous data
Results from file results/LeaveOneOut_page_blocks.txt


{'1NN': 0.9572446555819477,
 '3NN': 0.9574273707290334,
 '5NN': 0.957061940434862,
 '7NN': 0.9552347889640052,
 'NBC': 0.9552347889640052}

## Wine Quality dataset
Link: https://archive.ics.uci.edu/ml/datasets/wine+quality

### White Wine

In [43]:
ds_redWine = pd.read_csv('Datasets/wine-quality/winequality-red.csv', sep=';')
ds_redWine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [44]:
ds_redWine_X = ds_redWine[ds_redWine.columns[:-1]]
ds_redWine_Y = ds_redWine.quality

In [45]:
redWine_FileName = 'results/LeaveOneOut_redWine.txt'
table2_results_redWine = read_if_available_else_produce(ds_redWine_X,
                                                        ds_redWine_Y,
                                                        redWine_FileName)
table2_results_redWine

Reading From previous data
Results from file results/LeaveOneOut_redWine.txt


{'1NN': 0.6153846153846154,
 '3NN': 0.5234521575984991,
 '5NN': 0.5103189493433395,
 '7NN': 0.4996873045653533,
 'NBC': 0.4996873045653533}

### Red Wine

In [46]:
ds_whiteWine = pd.read_csv('Datasets/wine-quality/winequality-white.csv', sep=';')
ds_whiteWine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [47]:
ds_whiteWine_X = ds_whiteWine[ds_whiteWine.columns[:-1]]
ds_whiteWine_Y = ds_whiteWine.quality

In [48]:
white_Wine_FileName = 'results/LeaveOneOut_white_Wine.txt'
table2_results_White_Wine = read_if_available_else_produce(ds_whiteWine_X,
                                                        ds_whiteWine_Y,
                                                        white_Wine_FileName)
table2_results_White_Wine

Reading From previous data
Results from file results/LeaveOneOut_white_Wine.txt


{'1NN': 0.6161698652511229,
 '3NN': 0.5065332788893426,
 '5NN': 0.4959167006941609,
 '7NN': 0.4881584320130666,
 'NBC': 0.4881584320130666}

## Haberman's survival dataset
Link: https://archive.ics.uci.edu/ml/datasets/haberman's+survival

In [49]:
ds_haberman = pd.read_csv('Datasets/haberman/haberman.data', 
                          names=['Age', 'op_year', 'auxilary_node', 'survival'])
ds_haberman.head()

Unnamed: 0,Age,op_year,auxilary_node,survival
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [50]:
ds_haberman_X = ds_haberman[ds_haberman.columns[:-1]]
ds_haberman_Y = ds_haberman.survival

In [51]:
haberman_fileName = 'results/LeaveOneOut_haberman.txt'
table2_results_haberman = read_if_available_else_produce(ds_haberman_X, 
                           ds_haberman_Y,
                            haberman_fileName)
table2_results_haberman

Reading From previous data
Results from file results/LeaveOneOut_haberman.txt


{'1NN': 0.6764705882352942,
 '3NN': 0.7026143790849673,
 '5NN': 0.7222222222222222,
 '7NN': 0.7254901960784313,
 'NBC': 0.7254901960784313}

And now we can show the table two results as below

In [52]:
table2_dict = {'Haberman': table2_results_haberman, 
            'Magic': table2_results_magic_gamma,
           'Livers': table2_results_livers_disorder, 
           'Letter': table2_results_letter_recognition,
           'Page-Blocks':table2_results_page_blocks,
           'Red-wine':table2_results_redWine,
           'White-wine':table2_results_White_Wine}
table2_df = pd.DataFrame.from_dict(table2_dict, orient='index')
table2_df

Unnamed: 0,1NN,3NN,5NN,7NN,NBC
Haberman,0.676471,0.702614,0.722222,0.72549,0.72549
Magic,0.96245,0.9595,0.958,0.95585,0.95585
Livers,0.62029,0.637681,0.66087,0.686957,0.686957
Letter,0.96245,0.9595,0.958,0.95585,0.95585
Page-Blocks,0.957245,0.957427,0.957062,0.955235,0.955235
Red-wine,0.615385,0.523452,0.510319,0.499687,0.499687
White-wine,0.61617,0.506533,0.495917,0.488158,0.488158


# Table 3 Reproduction
Table 3, represents the p-values for our null hypothesis $H_0: \frac{s^2}{k} = \sigma_I^2$. To reproduce the p-values in table 3, we need to first find each folds accuracies and compute the $\chi$ values for each algorithm.

In [53]:
def find_p_value(acc_cross_val, leaveOneOut_acc, dataset_length, df):
    """
    Find the p-value, by computing the chi-score
    
    Parameters:
    -----------
    acc_cross_val : float_array
        cross validation test scores
    leaveOneOut_acc : float
        the accuracy found by leaveOneOut cross validation method
    dataset_length : positive integer
        dataset length, representing the count of data in dataset
    df : positive integer
        the degrees of freedom to find p-value
        
    Returns:
    ---------
    p_value : float
        the p_value score
    """
    variance = np.sum(np.power(acc_cross_val - np.mean(acc_cross_val), 2))
    sigma = ((leaveOneOut_acc) * (1 - leaveOneOut_acc )) / dataset_length
    chi_value = variance / (2*sigma)
    
    ## get the p_value
    p_value = stats.chi2.cdf(chi_value, df)
    
    return p_value

def find_p_values_multiple_methods(dataset_X, dataset_Y, leaveOneOut_acc, fileName, verbose=False):
    """
    read the p-values or compute it for the dataset if not available
    Using the test introduced in the article
    the p-values are obtained for methods: `1NN`, `3NN`, `5NN`, `7NN` and `NBC` using 2 to 10 folds
    
    Parameters:
    ------------
    dataset_X : array_like
        the features vector
    dataset_Y : array_like
        the label for each feature
    leaveOneOut_acc : dictionary of floats
        the accuracy represented for leaveOneOut cross validation, on methods `1NN`, `3NN`, `5NN`, `7NN` and `NBC`
    file_name : string
        where to read or save the data
    verbose : bool
        print the progress if True
        default is False
    
    Returns:
    --------
    p_values_dict : dictionary
        the dictionary contans the p-values for `1NN`, `3NN`, `5NN`, `7NN` and `NBC` and 2 to 10 folds
    """
    ## if the results file is not available, then calculate the results
    if os.path.isfile(fileName) == False:
        print("Results not available, Producing them\n")

        ## initialize the dictionary
        p_values_dict = {}

        ## Find p-values for 
        ## 1-NN, 3-NN, 5-NN and 7-NN
        for neighbours_count in [1, 3, 5, 7]:
            ## to create a nested dictionary
            ## initialize the folds dictionary
            p_values_folds = {}
            ## 2,3, ..., 10 folds
            for folds_count in range(2,11):
                ## KNN with K neighbours count
                KNN_method = KNeighborsClassifier(n_neighbors=neighbours_count)
                ## cross validation test score
                acc_cross_val = cross_val_score(KNN_method, 
                                dataset_X, 
                                dataset_Y,
                                cv=folds_count)

                ## degrees of freedom is folds minus one
                p_value = find_p_value(acc_cross_val, leaveOneOut_acc[f'{neighbours_count}NN'], len(dataset_X), folds_count - 1)

                if verbose:
                    print(f'{neighbours_count}NN method with fold count {folds_count}, p-value: {p_value}')

                p_values_folds[f'{folds_count}'] = p_value

            ## add all the folds p_values
            p_values_dict[f'{neighbours_count}NN'] = p_values_folds

        ## For Naive Bayes p-value We need to find repeat the codes above again
        NB_p_values_folds = {}
        ## 2,3, ..., 10 folds
        for folds_count in range(2,11):
            NB_method = GaussianNB()
            ## cross validation test score
            acc_cross_val = cross_val_score(KNN_method, 
                            dataset_X, 
                            dataset_Y,
                            cv=folds_count)

            ## degrees of freedom is folds minus one
            p_value = find_p_value(acc_cross_val, leaveOneOut_acc['NBC'], len(dataset_X), folds_count - 1)

            if verbose:
                print(f'Naive Bayes method with fold count {folds_count}, p-value: {p_value}')

            NB_p_values_folds[f'{folds_count}'] = p_value

        p_values_dict['NB'] = NB_p_values_folds
        
        ## save the results
        file = open(fileName, 'w')
        json.dump(p_values_dict, file)
        file.close()
        
        print(f"Results Produced and saved in {fileName}")
    
    ## else if the results file is available, then load it
    else:
        print(f"Reading From previous data\nResults from file {fileName}")

        file = open(fileName, 'r')
        p_values_dict = file.read()
        ## convert string to dictionary
        p_values_dict = json.loads(p_values_dict)
        file.close()
        
    return p_values_dict

In [54]:
table3_livers_file = 'results/pValues_livers_disorder.txt'
table3_results_livers_disorder = find_p_values_multiple_methods(ds_liver_X,
                                                                ds_liver_Y,
                                                                table2_df.loc['Livers'],
                                                                table3_livers_file)

Reading From previous data
Results from file results/pValues_livers_disorder.txt


In [55]:
table3_letters_reco_file = 'results/pValues_letter_recognition.txt'
table3_results_letters_reco = find_p_values_multiple_methods(ds_letter_reco_X,
                                                             ds_letter_reco_Y,
                                                                table2_df.loc['Letter'],
                                                                table3_letters_reco_file)

Reading From previous data
Results from file results/pValues_letter_recognition.txt


In [56]:
table3_magic_file = 'results/pValues_magic.txt'
table3_results_magic = find_p_values_multiple_methods(ds_magic_gamma_X,
                                                             ds_magic_gamma_Y,
                                                             table2_df.loc['Magic'],
                                                             table3_magic_file)

Reading From previous data
Results from file results/pValues_magic.txt


In [57]:
table3_haberman_file = 'results/pValues_haberman.txt'
table3_results_haberman = find_p_values_multiple_methods(ds_haberman_X,
                                                             ds_haberman_Y,
                                                             table2_df.loc['Haberman'],
                                                             table3_haberman_file)

Reading From previous data
Results from file results/pValues_haberman.txt


In [58]:
table3_pageBlocks_file = 'results/pValues_pageBlocks.txt'
table3_results_page_blocks = find_p_values_multiple_methods(ds_page_blocks_X,
                                                             ds_page_blocks_Y,
                                                             table2_df.loc['Page-Blocks'],
                                                             table3_pageBlocks_file)

Reading From previous data
Results from file results/pValues_pageBlocks.txt


In [59]:
table3_redWine_file = 'results/pValues_redWine.txt'
table3_results_redWine = find_p_values_multiple_methods(ds_redWine_X,
                                                             ds_redWine_Y,
                                                             table2_df.loc['Red-wine'],
                                                             table3_redWine_file)

Reading From previous data
Results from file results/pValues_redWine.txt


In [60]:
table3_whiteWine_file = 'results/pValues_whiteWine.txt'
table3_results_whiteWine = find_p_values_multiple_methods(ds_whiteWine_X,
                                                             ds_whiteWine_Y,
                                                             table2_df.loc['White-wine'],
                                                             table3_whiteWine_file)

Reading From previous data
Results from file results/pValues_whiteWine.txt


In [74]:
def highlight_accepted_null_hypothesis(val):
    """
    styler function for pandas dataframe
    if the null hypothesis probability is less than 0.05, then color the background as yellow
    """
    
    background = 'yellow' if val < 0.05 else ''
    
    return 'background-color: %s' % background

## save all the tex versions of table 3 in the array
table3_df_tex_versions = []

for method in ['1NN', '3NN', '5NN', '7NN', 'NB']:
    
    new_dict = {'Haberman': table3_results_haberman[method],
                'Magic': table3_results_magic[method],
                'Livers': table3_results_livers_disorder[method],
                'Letters': table3_results_letters_reco[method],
                'Page-Blocks' : table3_results_page_blocks[method],
                'Red-wine' : table3_results_redWine[method],
                'White-wine' : table3_results_whiteWine[method]
               }
    table3_df = pd.DataFrame.from_dict(new_dict, orient='index')
    table3_df = table3_df.style.applymap(highlight_accepted_null_hypothesis)
    
    table3_df.columns.name = f'({method}) Dataset/folds'
    table3_df_tex_versions.append(table3_df.to_latex())
    display(table3_df)

(1NN) Dataset/folds,2,3,4,5,6,7,8,9,10
Haberman,0.887838,0.869738,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Magic,0.983736,0.965209,0.999995,0.999973,1.0,0.999977,1.0,1.0,1.0
Livers,0.518691,0.933701,0.879641,0.999589,0.848436,0.962822,0.999235,0.999999,0.99994
Letters,0.496836,0.956897,0.99999,0.743307,0.999994,0.705099,0.999994,1.0,1.0
Page-Blocks,0.999995,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Red-wine,0.952585,0.979516,0.999148,0.951465,0.999861,0.99999,1.0,1.0,1.0
White-wine,0.046859,0.670176,0.99962,0.999601,1.0,0.99997,1.0,1.0,1.0


(3NN) Dataset/folds,2,3,4,5,6,7,8,9,10
Haberman,0.920028,0.968212,1.0,1.0,1.0,0.999999,1.0,1.0,1.0
Magic,0.99275,0.697403,0.960222,1.0,1.0,1.0,1.0,1.0,1.0
Livers,0.79617,0.300609,0.024092,0.820314,0.952167,0.061336,0.145584,0.983429,0.999709
Letters,0.77738,0.528967,0.888941,0.631956,0.931341,0.831017,0.999939,0.999993,1.0
Page-Blocks,0.999998,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Red-wine,0.593357,0.989397,1.0,1.0,1.0,1.0,1.0,1.0,1.0
White-wine,0.525077,0.919169,0.999971,0.990614,1.0,0.999308,1.0,1.0,1.0


(5NN) Dataset/folds,2,3,4,5,6,7,8,9,10
Haberman,0.2982,0.970358,0.840694,0.999909,1.0,0.932226,0.998999,1.0,1.0
Magic,0.99983,0.996228,0.553456,1.0,0.998854,1.0,1.0,1.0,1.0
Livers,0.527209,0.705409,0.209247,0.869971,0.975632,0.881018,0.984343,0.999965,0.999945
Letters,0.540871,0.982361,0.947001,0.984432,0.979942,0.996975,0.984166,0.987542,1.0
Page-Blocks,0.997872,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Red-wine,0.467064,0.239697,0.938136,0.990823,0.990119,1.0,0.999992,1.0,1.0
White-wine,0.412861,0.9932,0.794083,0.806029,0.999111,0.989864,0.99898,0.989034,1.0


(7NN) Dataset/folds,2,3,4,5,6,7,8,9,10
Haberman,0.0,0.373536,0.984275,0.99996,1.0,0.350122,0.987512,1.0,1.0
Magic,0.997599,0.998862,0.984154,0.999926,1.0,1.0,1.0,1.0,1.0
Livers,0.414356,0.893988,0.703853,0.805683,0.999665,0.999758,0.998214,1.0,0.985949
Letters,0.37012,0.475599,0.092167,0.61118,0.475987,0.999957,0.999992,0.984248,1.0
Page-Blocks,0.997398,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0
Red-wine,0.330275,0.837347,0.888244,0.824378,0.956167,1.0,0.999988,1.0,0.999783
White-wine,0.668899,0.932369,0.996489,0.999024,0.999998,0.999808,1.0,1.0,1.0


(NB) Dataset/folds,2,3,4,5,6,7,8,9,10
Haberman,0.0,0.373536,0.984275,0.99996,1.0,0.350122,0.987512,1.0,1.0
Magic,0.997599,0.998862,0.984154,0.999926,1.0,1.0,1.0,1.0,1.0
Livers,0.414356,0.893988,0.703853,0.805683,0.999665,0.999758,0.998214,1.0,0.985949
Letters,0.37012,0.475599,0.092167,0.61118,0.475987,0.999957,0.999992,0.984248,1.0
Page-Blocks,0.997398,1.0,0.999999,1.0,1.0,1.0,1.0,1.0,1.0
Red-wine,0.330275,0.837347,0.888244,0.824378,0.956167,1.0,0.999988,1.0,0.999783
White-wine,0.668899,0.932369,0.996489,0.999024,0.999998,0.999808,1.0,1.0,1.0


# Table 4 Reproduction
Now lets find the frequency of significance for the independence test.

In [62]:
## initialize a dictionary for it
table4_frequency_significance_dict = {}

for method in ['1NN', '3NN', '5NN', '7NN', 'NB']:
    
    new_dict = {'Haberman': table3_results_haberman[method],
                'Magic': table3_results_magic[method],
                'Livers': table3_results_livers_disorder[method],
                'Letters': table3_results_letters_reco[method],
                'Page-Blocks' : table3_results_page_blocks[method],
                'Red-wine' : table3_results_redWine[method],
                'White-wine' : table3_results_whiteWine[method]
               }
    table3_df = pd.DataFrame.from_dict(new_dict, orient='index')
    
    
    ## find each methods null hypothesis acceptence count
    table4_folds_frequency_dict = {}
    total = 0
    for folds in range(2, 10):
        ## find how many null hypothesis was not rejected
        ## threshold is 0.05
        table4_frequency_significance_count = sum(table3_df[str(folds)].values < 0.05)
        
        ## append the null hypothesis acceptence count to frequency dict
        table4_folds_frequency_dict[f'{folds}'] = table4_frequency_significance_count
        
        total += table4_frequency_significance_count
    table4_folds_frequency_dict['Total'] = total
    
    ## append to the results
    table4_frequency_significance_dict[method] = table4_folds_frequency_dict
    

## not containing the totals row
temp_df = pd.DataFrame.from_dict(table4_frequency_significance_dict, orient='index')


####### the last row of table 4
## table 4 is created and just one more thing to add
## the total counts row
table4_totals_dict = {}
for total_column in range(2, 10):
    table4_totals_dict[str(total_column)] = sum(temp_df[str(total_column)])
table4_totals_dict['Total'] = sum(temp_df['Total'])

## Add the last row to the table4 dict
table4_frequency_significance_dict['Total'] = table4_totals_dict

## create the dataframe of table 4
table4_df = pd.DataFrame.from_dict(table4_frequency_significance_dict, orient='index')


table4_df.columns.name = 'Method / Folds'
table4_df.style.set_properties(**{'text-align': 'center'})

Method / Folds,2,3,4,5,6,7,8,9,Total
1NN,1,0,0,0,0,0,0,0,1
3NN,0,0,1,0,0,0,0,0,1
5NN,0,0,0,0,0,0,0,0,0
7NN,1,0,0,0,0,0,0,0,1
NB,1,0,0,0,0,0,0,0,1
Total,3,0,1,0,0,0,0,0,4


In [63]:
## File methods, `1NN`, `3NN`, `5NN`, `7NN, `NB`
## `NB` is the abbreviation for `Naive Bayes` 

## see the rejection rate for null hypothesis
table4_df['Total']['Total'] / (5 * table3_df.shape[0] * (table3_df.shape[1]))

0.012698412698412698

As we can see the rejection rate for null hypothesis is about 1%, So the null hypothesis (folds dependece) is rejected and we can say that fold accuracies are independent.

# Saving the tables for the report

In [68]:
Table2_tex = table2_df.style.to_latex()
print(Table2_tex)

\begin{tabular}{lrrrrr}
 & 1NN & 3NN & 5NN & 7NN & NBC \\
Haberman & 0.676471 & 0.702614 & 0.722222 & 0.725490 & 0.725490 \\
Magic & 0.962450 & 0.959500 & 0.958000 & 0.955850 & 0.955850 \\
Livers & 0.620290 & 0.637681 & 0.660870 & 0.686957 & 0.686957 \\
Letter & 0.962450 & 0.959500 & 0.958000 & 0.955850 & 0.955850 \\
Page-Blocks & 0.957245 & 0.957427 & 0.957062 & 0.955235 & 0.955235 \\
Red-wine & 0.615385 & 0.523452 & 0.510319 & 0.499687 & 0.499687 \\
White-wine & 0.616170 & 0.506533 & 0.495917 & 0.488158 & 0.488158 \\
\end{tabular}



In [69]:
table4_tex = table4_df.style.to_latex()
print(table4_tex)

\begin{tabular}{lrrrrrrrrr}
Method / Folds & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & Total \\
1NN & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 \\
3NN & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 1 \\
5NN & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\
7NN & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 \\
NB & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 \\
Total & 3 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 4 \\
\end{tabular}



In [75]:
for table3_tex in table3_df_tex_versions:
    print(table3_tex)
    print('\n\n')

\begin{tabular}{lrrrrrrrrr}
(1NN) Dataset/folds & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 \\
Haberman & \background-color 0.887838 & \background-color 0.869738 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 \\
Magic & \background-color 0.983736 & \background-color 0.965209 & \background-color 0.999995 & \background-color 0.999973 & \background-color 1.000000 & \background-color 0.999977 & \background-color 1.000000 & \background-color 1.000000 & \background-color 1.000000 \\
Livers & \background-color 0.518691 & \background-color 0.933701 & \background-color 0.879641 & \background-color 0.999589 & \background-color 0.848436 & \background-color 0.962822 & \background-color 0.999235 & \background-color 0.999999 & \background-color 0.999940 \\
Letters & \background-color 0.496836 & \background-color 0.956897 & \background-color 0.999990 & \