In [1]:
import numpy as np
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sys

def run(x_train, y_train, x_test, y_test, clf):
    """Train ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    x_train, y_train
        Passed unchanged to ``clf.fit``.
    x_test, y_test
        Passed unchanged to ``clf.score``.
    clf
        Any object exposing ``fit(x, y)`` and ``score(x, y)``.  It is fitted
        in place.

    Returns
    -------
    Whatever ``clf.score(x_test, y_test)`` returns.
    """
    clf.fit(x_train, y_train)
    test_score = clf.score(x_test, y_test)
    return test_score


In [8]:

def split(x,y,k,m):
    """Return the train/test partition for fold ``k`` of an ``m``-fold split.

    The first ``m * ns`` samples, with ``ns = y.shape[0] // m``, are cut into
    ``m`` consecutive folds of ``ns`` samples each; samples left over at the
    end are not used.  Fold ``k`` becomes the test set and the remaining
    folds, kept in their original order, are merged into the training set.

    Parameters
    ----------
    x : array-like
        Samples, indexed along the first axis.  Any number of feature
        columns is supported.
    y : numpy.ndarray
        Labels, one per sample along the first axis.
    k : int
        Index of the fold used as the test set; must be in ``range(m)``.
    m : int
        Number of folds.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``.  ``y_train`` is reshaped to
        one dimension of length ``(m - 1) * ns``.

    Raises
    ------
    IndexError
        If ``k`` is not in ``range(m)``.
    """
    # ns is the number of elements in each fold
    ns = y.shape[0] // m
    if not 0 <= k < m:
        raise IndexError("fold index k=%d is out of range for m=%d folds" % (k, m))

    # Boundaries of the test fold, and the end of the last complete fold.
    start = ns * k
    stop = start + ns
    used = ns * m

    x_test = x[start:stop]
    y_test = y[start:stop]

    # Everything before and after the test fold (up to the last complete
    # fold) is training data.  Concatenating along the sample axis keeps the
    # feature dimension of ``x`` whatever its size is.
    x_train = np.concatenate([x[:start], x[stop:used]])
    y_train = np.concatenate([y[:start], y[stop:used]]).reshape((m - 1) * ns)

    return [x_train, y_train, x_test, y_test]


The selected code defines a function called `split` that takes four arguments: `x`, `y`, `k`, and `m`. `x` is a NumPy array representing the feature matrix of a dataset, `y` is a NumPy array representing the target vector of the dataset, `k` is an integer representing the index of the test set, and `m` is an integer representing the number of folds in the cross-validation.

The `split` function builds the train/test partition for one round of m-fold cross-validation; it does not run the whole cross-validation by itself. It cuts the dataset into `m` equally sized, consecutive folds of `floor(n/m)` samples each (any samples left over at the end are not used), where each fold contains a subset of the samples in `x` and `y`. The function then selects the `k`-th fold as the test set, and combines the remaining folds into a single training set.

The function returns a list of four NumPy arrays, in this order: `x_train`, `y_train`, `x_test`, and `y_test`. `x_train` and `y_train` represent the feature matrix and target vector of the training set, respectively, and are created by combining all of the folds except for the `k`-th fold. `x_test` and `y_test` represent the feature matrix and target vector of the test set, respectively, and are created by selecting the `k`-th fold.

Overall, the `split` function is a helper function for k-fold cross-validation: it splits the dataset into `m` folds and selects one fold as the test set. Calling it once for every `k` from `0` to `m-1` uses each fold as the test set exactly once. It returns the training and test sets as NumPy arrays that can be used for training and evaluating a machine learning model.

In [3]:

def pp(z,k,s):
    """Print one row of a 2-D score array as a single formatted line.

    The line contains the label ``s`` (left-justified to 19 characters), the
    mean of row ``k``, the row's standard deviation divided by the square
    root of the number of columns, and then every value of the row.

    Parameters
    ----------
    z : numpy.ndarray
        2-D array; ``z.shape[1]`` is the number of values per row.
    k : int
        Index of the row to print.
    s : str
        Label printed at the start of the line.
    """
    n_cols = z.shape[1]
    row = z[k]
    summary = "%-19s: %0.4f +/- %0.4f | " % (s, row.mean(), row.std()/np.sqrt(n_cols))
    values = "".join("%0.4f " % z[k, col] for col in range(n_cols))
    print(summary + values)


The selected code defines a function called `pp` that takes three arguments: `z`, `k`, and `s`. `z` is a 2-D NumPy array holding the results of a cross-validation experiment, with one row per model and one column per fold, `k` is an integer giving the row of `z` (that is, the model) to report, and `s` is a string label for that model, printed at the start of the line.

The `pp` function prints a one-line summary of the cross-validation results for the model in row `k`. Specifically, the function prints the mean and the standard error of the mean (the standard deviation divided by the square root of the number of folds) of that model's test scores, followed by the individual test score obtained on each fold. The mean and standard error are printed in a formatted string that includes the label `s`, and the per-fold test scores are printed as a space-separated list of floating-point numbers.

The function does not return any values, but instead prints the summary to the console using the `print()` function.

Overall, the `pp` function is a helper function that prints a summary of the cross-validation results for a single model (one row of `z`) across all folds. It is typically used in conjunction with a cross-validation function to evaluate the performance of a machine learning model.

In [11]:


# Load features and labels, then shuffle both with one shared random
# ordering so every sample stays paired with its label.
x = np.load("../data/breast/bc_features_standard.npy")
y = np.load("../data/breast/bc_labels.npy")
idx = np.argsort(np.random.random(y.shape[0]))
x, y = x[idx], y[idx]
m = 5 #int(sys.argv[1])

# One (label, factory) pair per model.  A factory is used so that every fold
# gets a freshly constructed classifier; the list order fixes both the row
# of ``z`` and the order in which the models are trained within a fold.
models = [
    ("Nearest", lambda: NearestCentroid()),
    ("3-NN", lambda: KNeighborsClassifier(n_neighbors=3)),
    ("7-NN", lambda: KNeighborsClassifier(n_neighbors=7)),
    ("Naive Bayes", lambda: GaussianNB()),
    ("Decision Tree", lambda: DecisionTreeClassifier()),
    ("Random Forest (5)", lambda: RandomForestClassifier(n_estimators=5)),
    ("Random Forest (50)", lambda: RandomForestClassifier(n_estimators=50)),
    ("SVM (linear)", lambda: SVC(kernel="linear", C=1.0)),
]

# z[row, k] is the test score of model ``row`` on fold ``k``.
z = np.zeros((len(models), m))

for k in range(m):
    x_train, y_train, x_test, y_test = split(x,y,k,m)
    for row, (label, make_clf) in enumerate(models):
        z[row, k] = run(x_train, y_train, x_test, y_test, make_clf())

# One summary line per model, in the same order as the table above.
for row, (label, make_clf) in enumerate(models):
    pp(z, row, label)




Nearest            : 0.9327 +/- 0.0032 | 0.9204 0.9381 0.9292 0.9381 0.9381 
3-NN               : 0.9646 +/- 0.0079 | 0.9381 0.9646 0.9735 0.9558 0.9912 
7-NN               : 0.9681 +/- 0.0081 | 0.9558 0.9469 0.9646 0.9735 1.0000 
Naive Bayes        : 0.9363 +/- 0.0039 | 0.9381 0.9381 0.9204 0.9381 0.9469 
Decision Tree      : 0.9239 +/- 0.0040 | 0.9204 0.9115 0.9204 0.9381 0.9292 
Random Forest (5)  : 0.9487 +/- 0.0058 | 0.9381 0.9469 0.9381 0.9469 0.9735 
Random Forest (50) : 0.9558 +/- 0.0079 | 0.9646 0.9469 0.9292 0.9558 0.9823 
SVM (linear)       : 0.9664 +/- 0.0068 | 0.9381 0.9735 0.9823 0.9646 0.9735 


In [19]:
def pp2(z,s):
    """Print the mean and standard error of a 1-D score array on one line.

    Parameters
    ----------
    z : numpy.ndarray
        Array of scores; ``z.shape[0]`` is used as the sample count.
    s : str
        Label printed at the start of the line, left-justified to 19
        characters.

    The printed error term is ``z.std()`` divided by the square root of
    ``z.shape[0]``.
    """
    count = z.shape[0]
    std_err = z.std()/np.sqrt(count)
    print("%-19s: %0.4f +/- %0.4f " % (s, z.mean(), std_err))

In [20]:

# Repeated m-fold cross-validation: reshuffle the data ``iteration`` times,
# run a full m-fold cross-validation after each shuffle, and keep the mean
# score of every model for every repetition.
iteration = 50
# ``m`` is assigned before anything in this cell reads it, so the cell does
# not depend on a value left behind by an earlier cell.
m = 5 #int(sys.argv[1])

x = np.load("../data/breast/bc_features_standard.npy")
y = np.load("../data/breast/bc_labels.npy")

# (table label, final-summary label, factory) for each model.  A factory is
# used so every fold gets a freshly constructed classifier; the list order
# fixes the row index in ``z`` and ``final`` and the training order.
labelled_models = [
    ("Nearest", "Nearest Centroid", lambda: NearestCentroid()),
    ("3-NN", "3-NN", lambda: KNeighborsClassifier(n_neighbors=3)),
    ("7-NN", "7-NN", lambda: KNeighborsClassifier(n_neighbors=7)),
    ("Naive Bayes", "Naive Bayes", lambda: GaussianNB()),
    ("Decision Tree", "Decision Tree", lambda: DecisionTreeClassifier()),
    ("Random Forest (5)", "Random Forest (5)", lambda: RandomForestClassifier(n_estimators=5)),
    ("Random Forest (50)", "Random Forest (50)", lambda: RandomForestClassifier(n_estimators=50)),
    ("SVM (linear)", "SVM (linear)", lambda: SVC(kernel="linear", C=1.0)),
]

# z[row, k]: score of model ``row`` on fold ``k`` for the current repetition.
# final[row, i]: mean score over the folds of model ``row`` in repetition ``i``.
z = np.zeros((len(labelled_models), m))
final = np.zeros((len(labelled_models), iteration))

# loop for the number of iterations
for i in range(iteration):
    # Reshuffle samples and labels with the same random ordering.
    idx = np.argsort(np.random.random(y.shape[0]))
    x = x[idx]
    y = y[idx]

    for k in range(m):
        x_train, y_train, x_test, y_test = split(x,y,k,m)
        for row, (table_label, summary_label, make_clf) in enumerate(labelled_models):
            z[row, k] = run(x_train, y_train, x_test, y_test, make_clf())

    # Mean over the folds for every model at once.
    final[:, i] = z.mean(axis=1)

print(final[0])

for row, (table_label, summary_label, make_clf) in enumerate(labelled_models):
    pp2(final[row], table_label)

print("Final Results")
for row, (table_label, summary_label, make_clf) in enumerate(labelled_models):
    print("%s: " % summary_label, final[row].mean())




[0.92920354 0.92566372 0.93097345 0.92920354 0.92920354 0.92743363
 0.93097345 0.92920354 0.93097345 0.93274336 0.92920354 0.92920354
 0.93097345 0.92920354 0.93097345 0.93097345 0.92920354 0.92743363
 0.93097345 0.93274336 0.93097345 0.93097345 0.92920354 0.92743363
 0.93097345 0.92743363 0.93097345 0.92743363 0.92920354 0.93097345
 0.93097345 0.92743363 0.92920354 0.93097345 0.92743363 0.92920354
 0.93274336 0.92743363 0.93097345 0.93097345 0.93097345 0.92743363
 0.93274336 0.92920354 0.92743363 0.93097345 0.93097345 0.92920354
 0.93274336 0.93097345]
Nearest            : 0.9298 +/- 0.0002 
3-NN               : 0.9658 +/- 0.0005 
7-NN               : 0.9653 +/- 0.0005 
Naive Bayes        : 0.9328 +/- 0.0005 
Decision Tree      : 0.9250 +/- 0.0012 
Random Forest (5)  : 0.9471 +/- 0.0009 
Random Forest (50) : 0.9589 +/- 0.0006 
SVM (linear)       : 0.9725 +/- 0.0005 
Final Results
Nearest Centroid:  0.9298407079646017
3-NN:  0.9658053097345133
7-NN:  0.9653097345132744
Naive Bayes:  0.