# Experimentación 1

Analizar la calidad de los resultados obtenidos al combinar kNN con y sin PCA, para
un rango amplio de combinaciones de valores de k y α. Llamamos k a la cantidad de
vecinos a considerar en el algoritmo kNN y α a la cantidad de componentes principales
a tomar.

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import metnum
from sklearn.metrics import accuracy_score
from time import time

# Load the training set from a path relative to the notebook's directory.
# Later cells read the "label" column as the target and every column after
# the first one as the features.
df_train = pd.read_csv("../data/train.csv")

In [2]:
# Shrink the data set: keep only the first 1000 rows so the experiments
# below finish in a reasonable time.
df_train = df_train.iloc[:1000]

En la primera columna (`label`) está el dígito a reconocer. Llamamos a esto `y`; el resto de las columnas forman `X`.

In [3]:
# Move everything into plain numpy arrays: every column after the first one
# is a feature, and the "label" column is the target.
X = df_train.iloc[:, 1:].to_numpy()
# Keep the labels as a column vector, one row per sample.
y = df_train["label"].to_numpy().reshape(-1, 1)

X.shape, y.shape

((1000, 784), (1000, 1))

## K-Fold Cross-Validation

In [4]:
def splitTrainSet(X, y, K, i):
    """Split ``X`` and ``y`` into the training and validation parts of fold ``i``.

    The rows are cut, in their existing order (no shuffling), into ``K``
    consecutive folds of ``X.shape[0] // K`` rows each. Fold ``i`` becomes the
    validation set and every other row becomes the training set. When the
    number of rows is not a multiple of ``K``, the trailing remainder rows are
    never used for validation; they always stay in the training set.

    Parameters
    ----------
    X : numpy.ndarray
        Samples, one per row.
    y : numpy.ndarray
        Labels, with the same number of rows as ``X``.
    K : int
        Number of folds; must satisfy ``1 <= K <= X.shape[0]``.
    i : int
        Index of the validation fold; must satisfy ``0 <= i < K``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``K`` or ``i`` is out of range.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same number of rows, got {} and {}".format(
                n_samples, len(y)))
    # K > n_samples would give a fold size of 0 and empty validation sets.
    if not 1 <= K <= n_samples:
        raise ValueError(
            "K must be between 1 and the number of samples ({}), got {}".format(
                n_samples, K))
    # An out-of-range fold index would silently produce a wrong split.
    if not 0 <= i < K:
        raise ValueError(
            "i must satisfy 0 <= i < K (K={}), got {}".format(K, i))

    # Integer floor division avoids the float round trip of int(n * 1 / K).
    fold_size = int(n_samples // K)
    start, stop = i * fold_size, (i + 1) * fold_size

    X_val, y_val = X[start:stop], y[start:stop]
    # Training data is everything before and after the validation fold.
    X_train = np.concatenate((X[:start], X[stop:]))
    y_train = np.concatenate((y[:start], y[stop:]))

    return X_train, y_train, X_val, y_val

In [5]:
def KFold(X, y, K, classifier):
    """Run K-fold cross-validation and average the per-fold measurements.

    For each fold index in ``range(K)`` the data is split with
    ``splitTrainSet``, ``classifier(X_train, y_train, X_val)`` is called, and
    its predictions are scored against the fold's validation labels with
    ``accuracy_score``.

    Parameters
    ----------
    X, y
        Samples and labels, forwarded to ``splitTrainSet``.
    K : int
        Number of folds.
    classifier : callable
        Called as ``classifier(X_train, y_train, X_val)``; must return a
        ``(y_pred, pred_time)`` pair.

    Returns
    -------
    tuple
        ``(accuracy, total_time, pred_time)``, each the mean over the folds.
        ``total_time`` is the wall-clock duration of the whole classifier
        call; ``pred_time`` is whatever duration the classifier reports.
    """
    fold_scores = []
    fold_total_times = []
    fold_pred_times = []

    for fold in range(K):
        X_train, y_train, X_val, y_val = splitTrainSet(X, y, K, fold)

        # Time the complete classifier call, not just its prediction step.
        started = time()
        y_pred, reported_pred_time = classifier(X_train, y_train, X_val)
        elapsed = time() - started

        fold_scores.append(accuracy_score(y_val, y_pred))
        fold_total_times.append(elapsed)
        fold_pred_times.append(reported_pred_time)

    return (
        np.mean(fold_scores),
        np.mean(fold_total_times),
        np.mean(fold_pred_times),
    )

## Dynamic Classifiers

In [6]:
def classifierKNNDynamic(neighbors):
    """Build a kNN classifier callable with a fixed neighbour count.

    Parameters
    ----------
    neighbors
        Value passed to ``metnum.KNNClassifier``.

    Returns
    -------
    callable
        ``classifierKNN(X_train, y_train, X_val)``, in the form ``KFold``
        expects for its ``classifier`` argument.
    """
    def classifierKNN(X_train, y_train, X_val):
        """Fit on the training data and predict labels for ``X_val``.

        Returns ``(y_pred, pred_time)``, where ``pred_time`` is the
        wall-clock duration, in seconds, of the ``predict`` call alone.
        """
        knn = metnum.KNNClassifier(neighbors)
        knn.fit(X_train, y_train)

        # Only the prediction step is timed; fitting is left out.
        started = time()
        predictions = knn.predict(X_val)
        elapsed = time() - started

        return predictions, elapsed

    return classifierKNN

In [7]:
def classifierPCADynamic(neighbors, components):
    """Build a PCA + kNN classifier callable with fixed hyper-parameters.

    Parameters
    ----------
    neighbors
        Value passed to ``metnum.KNNClassifier``.
    components
        Value passed to ``metnum.PCA``.

    Returns
    -------
    callable
        ``classifierPCA(X_train, y_train, X_val)``, in the form ``KFold``
        expects for its ``classifier`` argument.
    """
    def classifierPCA(X_train, y_train, X_val):
        """Fit PCA and kNN on the training data and predict for ``X_val``.

        Returns ``(y_pred, pred_time)``, where ``pred_time`` is the
        wall-clock duration, in seconds, of the kNN ``predict`` call alone.
        """
        knn = metnum.KNNClassifier(neighbors)
        reducer = metnum.PCA(components)

        # The PCA is fitted on the training data only and then applied to
        # both the training and the validation samples.
        reducer.fit(X_train)
        train_reduced = reducer.transform(X_train)
        knn.fit(train_reduced, y_train)
        val_reduced = reducer.transform(X_val)

        # Only the kNN prediction is timed; the PCA fit and both transforms
        # are left out of pred_time.
        started = time()
        predictions = knn.predict(val_reduced)
        elapsed = time() - started

        return predictions, elapsed

    return classifierPCA

## Test 1: kNN variando la cantidad de vecinos

In [8]:
# Neighbour counts evaluated in Test 1: 1 through 29.
neighbors_range = [*range(1, 30)]

In [9]:
%%time

# Test 1: 10-fold cross-validation of plain kNN for every neighbour count in
# neighbors_range. Each list gets one entry per neighbour count.
accuracy, total_time, pred_time = [], [], []

for i in neighbors_range:
    K, neighbors = 10, i

    print(f"Neighbors: {neighbors}")
    acc, tt, pt = KFold(X, y, K, classifierKNNDynamic(neighbors))

    accuracy.append(acc)
    total_time.append(tt)
    pred_time.append(pt)

Neighbors: 1
Neighbors: 2
Neighbors: 3
Neighbors: 4
Neighbors: 5
Neighbors: 6
Neighbors: 7
Neighbors: 8
Neighbors: 9
Neighbors: 10
Neighbors: 11
Neighbors: 12
Neighbors: 13
Neighbors: 14
Neighbors: 15
Neighbors: 16
Neighbors: 17
Neighbors: 18
Neighbors: 19
Neighbors: 20
Neighbors: 21
Neighbors: 22
Neighbors: 23
Neighbors: 24
Neighbors: 25
Neighbors: 26
Neighbors: 27
Neighbors: 28
Neighbors: 29
CPU times: user 10.6 s, sys: 203 ms, total: 10.8 s
Wall time: 10.9 s


In [10]:
from pathlib import Path

# Collect the Test 1 measurements, one row per neighbour count. The scalar
# 'Method' value is repeated on every row.
data = pd.DataFrame({
    'Neighbors': neighbors_range, 
    'Accuracy': accuracy,
    'Total time': total_time,
    'Pred time': pred_time,
    'Method': 'KNN'
})

# to_csv does not create missing directories, so make sure the output folder
# exists rather than failing after the experiment has already been run.
Path('tests_csv').mkdir(parents=True, exist_ok=True)
data.to_csv('tests_csv/test1.csv', index=False)

## Test 2: PCA + KNN variando la cantidad de componentes principales

In [11]:
# Principal-component counts evaluated in Test 2: 1 through 29.
components_range = [*range(1, 30)]

In [12]:
%%time

# Test 2: 10-fold cross-validation of PCA + kNN for every component count in
# components_range, with the neighbour count held at 5.
accuracy, total_time, pred_time = [], [], []

for i in components_range:
    K, neighbors, components = 10, 5, i

    print(f"Components: {components}")

    acc, tt, pt = KFold(X, y, K, classifierPCADynamic(neighbors, components))

    accuracy.append(acc)
    total_time.append(tt)
    pred_time.append(pt)

Components: 1
Components: 2
Components: 3
Components: 4
Components: 5
Components: 6
Components: 7
Components: 8
Components: 9
Components: 10
Components: 11
Components: 12
Components: 13
Components: 14
Components: 15
Components: 16
Components: 17
Components: 18
Components: 19
Components: 20
Components: 21
Components: 22
Components: 23
Components: 24
Components: 25
Components: 26
Components: 27
Components: 28
Components: 29
CPU times: user 2min 42s, sys: 816 ms, total: 2min 43s
Wall time: 2min 43s


In [13]:
from pathlib import Path

# Collect the Test 2 measurements, one row per component count. The scalar
# 'Method' value is repeated on every row.
data = pd.DataFrame({
    'Components': components_range, 
    'Accuracy': accuracy,
    'Total time': total_time,
    'Pred time': pred_time,
    'Method': 'PCA + KNN'
})

# to_csv does not create missing directories, so make sure the output folder
# exists rather than failing after the experiment has already been run.
Path('tests_csv').mkdir(parents=True, exist_ok=True)
data.to_csv('tests_csv/test2.csv', index=False)

## Test 3: Comparación de accuracy y tiempo entre kNN y PCA + kNN variando la cantidad de folds K

In [14]:
# Fold counts (the K of K-fold cross-validation) evaluated in Test 3: 3 through 19.
K_range = [*range(3, 20)]

In [15]:
%%time

# Test 3, plain kNN: cross-validate with 5 neighbours for every fold count in
# K_range. Each list gets one entry per fold count.
knn_accuracy, knn_total_time, knn_pred_time = [], [], []

for i in K_range:
    # components is not used by plain kNN; it is still assigned here, as in
    # the PCA run of this test.
    K, neighbors, components = i, 5, 15

    print(f"K knn: {K}")

    acc, tt, pt = KFold(X, y, K, classifierKNNDynamic(neighbors))

    knn_accuracy.append(acc)
    knn_total_time.append(tt)
    knn_pred_time.append(pt)

K knn: 3
K knn: 4
K knn: 5
K knn: 6
K knn: 7
K knn: 8
K knn: 9
K knn: 10
K knn: 11
K knn: 12
K knn: 13
K knn: 14
K knn: 15
K knn: 16
K knn: 17
K knn: 18
K knn: 19
CPU times: user 6.06 s, sys: 233 ms, total: 6.3 s
Wall time: 6.29 s


In [16]:
%%time

# Test 3, PCA + kNN: cross-validate with 5 neighbours and 15 components for
# every fold count in K_range. Each list gets one entry per fold count.
pca_accuracy, pca_total_time, pca_pred_time = [], [], []

for i in K_range:
    K, neighbors, components = i, 5, 15

    print(f"K knn + pca: {K}")

    acc, tt, pt = KFold(X, y, K, classifierPCADynamic(neighbors, components))

    pca_accuracy.append(acc)
    pca_total_time.append(tt)
    pca_pred_time.append(pt)

K knn + pca: 3
K knn + pca: 4
K knn + pca: 5
K knn + pca: 6
K knn + pca: 7
K knn + pca: 8
K knn + pca: 9
K knn + pca: 10
K knn + pca: 11
K knn + pca: 12
K knn + pca: 13
K knn + pca: 14
K knn + pca: 15
K knn + pca: 16
K knn + pca: 17
K knn + pca: 18
K knn + pca: 19
CPU times: user 1min 29s, sys: 563 ms, total: 1min 29s
Wall time: 1min 30s


In [17]:
from pathlib import Path

# Test 3 measurements for plain kNN, one row per fold count.
# NOTE: the time columns here are spelled 'Total Time' / 'Pred Time', unlike
# the 'Total time' / 'Pred time' columns written for tests 1 and 2. They are
# kept as they are so the CSV schema does not change.
dataKNN = pd.DataFrame({
    'K': K_range, 
    'Accuracy': knn_accuracy,
    'Total Time': knn_total_time,
    'Pred Time': knn_pred_time,
    'Method': 'KNN'
})

# Test 3 measurements for PCA + kNN, with the same columns.
dataPCA = pd.DataFrame({
    'K': K_range, 
    'Accuracy': pca_accuracy,
    'Total Time': pca_total_time,
    'Pred Time': pca_pred_time,
    'Method': 'KNN + PCA'
})

# Stack both methods in one long table; the 'Method' column tells them apart.
data = pd.concat([dataKNN, dataPCA])

# to_csv does not create missing directories, so make sure the output folder
# exists rather than failing after the experiment has already been run.
Path('tests_csv').mkdir(parents=True, exist_ok=True)
data.to_csv('tests_csv/test3.csv', index=False)