# Experimentación

In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import metnum
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from time import time

# Load the training set into a DataFrame; the path is relative to the
# notebook's working directory.
df_train = pd.read_csv("../data/train.csv")

In [None]:
# Optionally reduce the data set (uncomment to keep only the first 10000 rows)
#df_train = df_train[:10000]

En la primera columna (`label`) está el dígito a reconocer. Llamamos a esto `y`.

In [None]:
# Convert everything to NumPy arrays via `.values`: every column after the
# first becomes a feature, and the "label" column becomes a column vector.
feature_columns = df_train.columns[1:]
X = df_train[feature_columns].values
y = np.reshape(df_train["label"].values, (-1, 1))

X.shape, y.shape

## K-Fold Cross-Validation

In [None]:
def splitTrainSet(X, y, K, i):
    """Split ``X`` and ``y`` into the training and validation parts of fold ``i``.

    The rows are cut, in their existing order (no shuffling), into ``K``
    contiguous folds of ``X.shape[0] // K`` rows each. Fold ``i`` becomes the
    validation set and every other row becomes the training set. When the
    number of rows is not a multiple of ``K``, the leftover rows at the end
    always land in the training set and are never used for validation.

    Args:
        X: Array of samples, one row per sample (needs ``shape`` and slicing
            along the first axis).
        y: Labels aligned row-by-row with ``X``.
        K: Total number of folds; must satisfy ``1 <= K <= X.shape[0]``.
        i: Zero-based index of the validation fold; must satisfy
            ``0 <= i < K``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val)``.

    Raises:
        ValueError: If ``K`` or ``i`` is out of range. Without this check a
            too-large ``K`` silently yields empty validation folds and a
            negative ``i`` yields an empty validation set plus a training set
            with duplicated rows.
    """
    n_samples = X.shape[0]
    if K < 1 or K > n_samples:
        raise ValueError(
            "K must be between 1 and the number of samples ({}), got {}".format(n_samples, K)
        )
    if not 0 <= i < K:
        raise ValueError("fold index i must satisfy 0 <= i < K={}, got {}".format(K, i))

    # Integer division avoids the float round-trip of int(n * 1/K).
    size = int(n_samples // K)
    start, stop = i * size, (i + 1) * size

    X_val, y_val = X[start:stop], y[start:stop]

    # Training data is everything before the validation fold plus everything after it.
    X_train = np.concatenate((X[:start], X[stop:]))
    y_train = np.concatenate((y[:start], y[stop:]))

    return X_train, y_train, X_val, y_val

In [None]:
def KFold(X, y, K, classifier):
    """Run K-fold cross-validation of ``classifier`` and average its metrics.

    For every fold index in ``range(K)`` the data is split with
    ``splitTrainSet``, ``classifier(X_train, y_train, X_val)`` is invoked, and
    its predictions are scored against the fold's validation labels with
    ``accuracy_score``.

    Args:
        X: Samples, forwarded to ``splitTrainSet``.
        y: Labels, forwarded to ``splitTrainSet``.
        K: Number of folds.
        classifier: Callable taking ``(X_train, y_train, X_val)`` and returning
            a 3-tuple ``(y_pred, pred_time, fit_time)``.

    Returns:
        Tuple ``(accuracy, pred_time, fit_time)``, each the mean over the
        ``K`` folds of the corresponding per-fold value.
    """
    fold_accuracies, fold_pred_times, fold_fit_times = [], [], []

    for fold in range(K):
        X_train, y_train, X_val, y_val = splitTrainSet(X, y, K, fold)

        y_pred, fold_pred_time, fold_fit_time = classifier(X_train, y_train, X_val)

        fold_accuracies.append(accuracy_score(y_val, y_pred))
        fold_pred_times.append(fold_pred_time)
        fold_fit_times.append(fold_fit_time)

    return np.mean(fold_accuracies), np.mean(fold_pred_times), np.mean(fold_fit_times)

In [None]:
def KFoldCohenKappa(X, y, K, classifier1, classifier2):
    """Average, over K folds, the Cohen's kappa between two classifiers' predictions.

    Each fold is produced by ``splitTrainSet``. Both classifiers are called
    with the same ``(X_train, y_train, X_val)`` and the first element of each
    return value is taken as that classifier's predictions. The two prediction
    sets are compared with ``cohen_kappa_score``; the validation labels
    themselves are not used.

    Args:
        X: Samples, forwarded to ``splitTrainSet``.
        y: Labels, forwarded to ``splitTrainSet``.
        K: Number of folds.
        classifier1: Callable ``(X_train, y_train, X_val)`` whose result is
            indexable, with the predictions at index 0.
        classifier2: Second callable with the same contract.

    Returns:
        Mean of the per-fold kappa scores.
    """
    fold_kappas = []

    for fold in range(K):
        X_train, y_train, X_val, _ = splitTrainSet(X, y, K, fold)

        # Index 0 of each classifier's result holds its predictions.
        predictions1 = classifier1(X_train, y_train, X_val)[0]
        predictions2 = classifier2(X_train, y_train, X_val)[0]

        fold_kappas.append(cohen_kappa_score(predictions1, predictions2))

    return np.mean(fold_kappas)

## Dynamic Classifiers

In [None]:
def classifierKNNDynamic(neighbors):
    """Build a timed KNN classifier callable bound to ``neighbors``.

    Args:
        neighbors: Value passed to the ``metnum.KNNClassifier`` constructor
            (presumably the number of neighbors k; confirm against metnum).

    Returns:
        A function ``classifierKNN(X_train, y_train, X_val)`` that fits a
        fresh ``metnum.KNNClassifier`` on the training data, predicts
        ``X_val`` and returns ``(y_pred, pred_time, fit_time)`` where:

        * ``y_pred`` is whatever ``clf.predict(X_val)`` returns,
        * ``pred_time`` is the elapsed prediction time in seconds divided by
          ``len(X_val)``, i.e. the average per validation sample,
        * ``fit_time`` is the total elapsed time of ``clf.fit`` in seconds.
    """
    # perf_counter is the clock intended for measuring short durations;
    # time() follows the system clock, which can be adjusted during a run and
    # would then distort the benchmark.
    from time import perf_counter

    def classifierKNN(X_train, y_train, X_val):
        clf = metnum.KNNClassifier(neighbors)

        fit_start = perf_counter()
        clf.fit(X_train, y_train)
        fit_time = perf_counter() - fit_start

        pred_start = perf_counter()
        y_pred = clf.predict(X_val)
        pred_elapsed = perf_counter() - pred_start

        # Normalise by the number of validation samples so that folds of
        # different sizes are comparable.
        pred_time = pred_elapsed / len(X_val)

        return y_pred, pred_time, fit_time

    return classifierKNN

In [None]:
def classifierPCADynamic(neighbors, components):
    """Build a timed PCA + KNN classifier callable bound to its hyper-parameters.

    Args:
        neighbors: Value passed to the ``metnum.KNNClassifier`` constructor
            (presumably the number of neighbors k; confirm against metnum).
        components: Value passed to the ``metnum.PCA`` constructor
            (presumably the number of principal components; confirm against
            metnum).

    Returns:
        A function ``classifierPCA(X_train, y_train, X_val)`` that fits the
        PCA on ``X_train``, fits the KNN classifier on the transformed
        training data, then transforms and predicts ``X_val``. It returns
        ``(y_pred, pred_time, fit_time)`` where:

        * ``y_pred`` is whatever ``clf.predict`` returns,
        * ``pred_time`` is the elapsed time in seconds of transforming plus
          predicting the validation set, divided by the length of the
          transformed validation set,
        * ``fit_time`` is the total elapsed time in seconds of the PCA fit,
          the training-set transform and the KNN fit.
    """
    # perf_counter is the clock intended for measuring short durations;
    # time() follows the system clock, which can be adjusted during a run and
    # would then distort the benchmark.
    from time import perf_counter

    def classifierPCA(X_train, y_train, X_val):
        clf = metnum.KNNClassifier(neighbors)
        pca = metnum.PCA(components)

        # Fit phase: the PCA fit and training-set projection count as fitting cost.
        fit_start = perf_counter()
        pca.fit(X_train)
        X_train_transformed = pca.transform(X_train)
        clf.fit(X_train_transformed, y_train)
        fit_time = perf_counter() - fit_start

        # Prediction phase: projecting the validation set counts as prediction cost.
        pred_start = perf_counter()
        X_val_transformed = pca.transform(X_val)
        y_pred = clf.predict(X_val_transformed)
        pred_elapsed = perf_counter() - pred_start

        # Normalise by the number of validation samples so that folds of
        # different sizes are comparable.
        pred_time = pred_elapsed / len(X_val_transformed)

        return y_pred, pred_time, fit_time

    return classifierPCA

## Test 1: KNN variando la cantidad de vecinos

In [None]:
# Neighbor counts evaluated in Test 1 (one cross-validation run per value).
neighbors_range = [1, 5, 10, 20, 40, 60, 100]

In [None]:
%%time

# Test 1: 10-fold cross-validation of plain KNN for each neighbor count in
# neighbors_range, collecting the mean accuracy and timings per setting.
accuracy, pred_time, fit_time = [], [], []

for i in neighbors_range:
    K, neighbors = 10, i

    print("Neighbors: {}".format(neighbors))
    acc, pt, ft = KFold(X, y, K, classifierKNNDynamic(neighbors))

    accuracy.append(acc)
    pred_time.append(pt)
    fit_time.append(ft)

In [None]:
# Tabulate the Test 1 measurements (one row per neighbor count) and write
# them to CSV; the scalar 'Method' value is broadcast to every row.
test1_columns = {
    'Neighbors': neighbors_range,
    'Accuracy': accuracy,
    'Prediction time': pred_time,
    'Fit time': fit_time,
    'Method': 'KNN',
}
data = pd.DataFrame(test1_columns)

data.to_csv('tests_csv/test1.csv', index=False)

## Test 2: PCA + KNN variando la cantidad de componentes principales y vecinos

In [None]:
# Hyper-parameter grid for Test 2: every neighbor count is combined with
# every number of components.
# NOTE: this rebinds neighbors_range, which Test 1 also defines and uses; the
# Test 1 cells must not be re-run after this one without restoring its list.
components_range = [10, 15, 25, 50, 100]
neighbors_range = [5, 10, 20, 50]

In [None]:
%%time

# Test 2: 10-fold cross-validation of PCA + KNN over the full grid.
# Each outer list holds one inner list per neighbor count; each inner list
# holds one value per entry of components_range.
neighbors_accuracy, neighbors_pred_time, neighbors_fit_time = [], [], []

for k in neighbors_range:
    # Results for the current neighbor count across all component counts.
    accuracy, pred_time, fit_time = [], [], []

    for c in components_range:
        K = 10
        neighbors, components = k, c

        print("Components: {} Neighbors: {}".format(components, neighbors))

        pca_knn = classifierPCADynamic(neighbors, components)
        acc, pt, ft = KFold(X, y, K, pca_knn)

        accuracy.append(acc)
        pred_time.append(pt)
        fit_time.append(ft)

    neighbors_accuracy.append(accuracy)
    neighbors_pred_time.append(pred_time)
    neighbors_fit_time.append(fit_time)

In [None]:
# Build one table per neighbor count (rows = component counts), stack them
# and write the combined Test 2 results to CSV.
neighbors_data = []

for i, k in enumerate(neighbors_range):
    data = pd.DataFrame({
        'Components': components_range,
        'Accuracy': neighbors_accuracy[i],
        'Prediction time': neighbors_pred_time[i],
        'Fit time': neighbors_fit_time[i],
        'Method': 'PCA + KNN k=' + str(k),
    })
    neighbors_data.append(data)

data = pd.concat(neighbors_data)

data.to_csv('tests_csv/test2.csv', index=False)

## Test 3: Comparación de accuracy y time

In [None]:
# Numbers of cross-validation folds (K) compared in Test 3.
K_range = [2, 5, 10, 25, 50, 100]

In [None]:
%%time

# Test 3: for each fold count in K_range, cross-validate plain KNN and
# PCA + KNN with fixed hyper-parameters and record accuracy and timings.
knn_accuracy, knn_pred_time, knn_fit_time = [], [], []
pca_accuracy, pca_pred_time, pca_fit_time = [], [], []

for i in K_range:
    K, neighbors, components = i, 5, 30

    print("K: {}".format(K))

    # Plain KNN.
    acc, pt, ft = KFold(X, y, K, classifierKNNDynamic(neighbors))
    knn_accuracy.append(acc)
    knn_pred_time.append(pt)
    knn_fit_time.append(ft)

    # PCA + KNN with the same number of folds.
    acc, pt, ft = KFold(X, y, K, classifierPCADynamic(neighbors, components))
    pca_accuracy.append(acc)
    pca_pred_time.append(pt)
    pca_fit_time.append(ft)

In [None]:
# Tabulate the Test 3 results per method (one row per K), stack both tables
# and write them to CSV.
# NOTE: the timing columns here are capitalised 'Prediction Time' / 'Fit Time',
# unlike 'Prediction time' / 'Fit time' in test1.csv and test2.csv.
knn_columns = {
    'K': K_range,
    'Accuracy': knn_accuracy,
    'Prediction Time': knn_pred_time,
    'Fit Time': knn_fit_time,
    'Method': 'KNN',
}
pca_columns = {
    'K': K_range,
    'Accuracy': pca_accuracy,
    'Prediction Time': pca_pred_time,
    'Fit Time': pca_fit_time,
    'Method': 'KNN + PCA',
}

dataKNN = pd.DataFrame(knn_columns)
dataPCA = pd.DataFrame(pca_columns)

data = pd.concat([dataKNN, dataPCA])

data.to_csv('tests_csv/test3.csv', index=False)

## Test 4: Kappa de Cohen de los clasificadores

In [None]:
# Numbers of cross-validation folds (K) used in Test 4.
K_range = [2, 5, 10, 25, 50, 100]

In [None]:
%%time

# Test 4: for each fold count in K_range, measure the mean Cohen's kappa
# between the predictions of plain KNN and PCA + KNN.
cohen_kappa = []

for i in K_range:
    K, neighbors, components = i, 5, 30

    print("K: {}".format(K))

    classifier1 = classifierKNNDynamic(neighbors)
    classifier2 = classifierPCADynamic(neighbors, components)

    ck = KFoldCohenKappa(X, y, K, classifier1, classifier2)
    cohen_kappa.append(ck)

In [None]:
# Tabulate the Test 4 kappa scores (one row per K) and write them to CSV;
# the scalar 'Classifiers' value is broadcast to every row.
test4_columns = {
    'K': K_range,
    'Cohen Kappa': cohen_kappa,
    'Classifiers': 'KNN and KNN + PCA',
}
data = pd.DataFrame(test4_columns)

data.to_csv('tests_csv/test4.csv', index=False)