In [19]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import os
from os import listdir
import pickle as pk
import json
import numpy as np

In [34]:
# Function to load a batch into memory
def load_batch(data_dir, batch_id):
    with open(os.path.join(data_dir, 'data_batch_%i' % batch_id), mode='rb') as file:
        batch = pk.load(file, encoding='latin1')
    feats = batch['data'].reshape((len(batch['data']), 3, 32, 32)).transpose(0, 2, 3, 1)
    lbls = batch['labels']
    return feats, lbls

# Load the first batch
feats, labels = load_batch( 'cifar10', 1)
for i in range(2,6):
    feats_temp, labels_temp = load_batch( 'cifar10', i)
    feats = np.append(feats,feats_temp, axis=0)
    labels = np.append(labels,labels_temp, axis=0)
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [None]:
feats_

In [11]:
def load_from_json(file:str):
    return json.loads(open(f'embeddings/{file}').read())

In [25]:
from sklearn.preprocessing import scale
def split_set(embeddings, labels, n):
    feats= scale(embeddings)
    indices = np.random.permutation(n)
    training_idx, dev_idx,test_idx = indices[:int(0.9*n)], indices[int(0.9*n):int(0.95*n)], indices[int(0.95*n):]
    feats_train,feats_dev, feats_test = feats[training_idx,:],feats[dev_idx,:], feats[test_idx,:]
    labels_train,labels_dev, labels_test = np.array([labels[i] for i in training_idx]),np.array([labels[i] for i in dev_idx]), np.array([labels[i] for i in test_idx])
    return [feats_train,feats_dev, feats_test],[labels_train,labels_dev, labels_test]

In [13]:
from sklearn.neighbors import KNeighborsClassifier
def train_knn_and_test(train,dev,labels_train,labels_dev):
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(train,labels_train)
    return knn,knn.score(train, labels_train), knn.score(dev, labels_dev)

In [31]:
from sklearn.decomposition import PCA
def train_pca(train_set, percentage=90):
    for i in range(50,min(sample_size,4096),50):
        pca = PCA(n_components=i)
        pca.fit(train_set)
        if np.sum(100*pca.explained_variance_ratio_) > percentage:
            return i,pca

In [15]:
def train_tsne(train_set):
    tsne = TSNE()
    tsne.fit(embeddings)
    return tsne

## Training with and without pca

In [32]:
for file in listdir('./embeddings'):
    if '.json' in file :
        df = pd.DataFrame(columns=['PCA','Train accuracy', 'Dev accuracy'])
        sample_size = int(file[file.index('_')+1:file.index('.')])
        print(sample_size)
        embeddings = load_from_json(file)
        
        # Spit the data
        [feats_train,feats_dev, feats_test],[labels_train,labels_dev, labels_test] = split_set(
            embeddings, labels[:sample_size], sample_size)

        # Train knn
        knn, knn_train_score, knn_test_score = train_knn_and_test(feats_train,feats_dev,labels_train,labels_dev)
        
        df.loc[0] = [0,knn_train_score,knn_test_score ]
        
        # Fit pca
        i,pca = train_pca(feats_train)
        print(f'Trained pca with {i} components')
        
        # Train knn
        knn, knn_train_score, knn_test_score = train_knn_and_test(pca.transform(feats_train),pca.transform(feats_dev),labels_train,labels_dev)
        df.loc[1] = [1,knn_train_score,knn_test_score]
        print(df)

30000
(27000, 10)


ValueError: n_components=50 must be between 0 and n_features=10 with svd_solver='full'

In [207]:
confusion_matrix(knn.predict(pca.transform(feats_dev)), labels_dev)
#['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

array([[23,  6,  4,  3,  2,  2,  3,  2,  6,  4],
       [ 4, 23,  6,  2,  2,  3,  3,  4,  4,  8],
       [ 3,  2, 13,  3,  4,  2,  2,  2,  0,  0],
       [ 2,  4,  7, 15,  3, 12,  3,  2,  4,  2],
       [ 1,  0, 12, 11, 24,  9, 17, 10,  4,  0],
       [ 5,  5,  3, 10,  4, 15,  6,  9,  1,  2],
       [ 1,  5,  9,  8,  5,  4, 28,  0,  1,  2],
       [ 2,  5,  3,  0,  8,  3,  2, 13,  2,  7],
       [17,  1,  0,  0,  1,  1,  1,  1, 40,  1],
       [ 6, 12,  1,  3,  3,  2,  4,  3,  5, 43]])

In [68]:
from sklearn.manifold import TSNE
embeddings = np.array(load_from_json('embeddings_3000.json'))


## Training with and without t-SNE

In [87]:
for file in listdir('./embeddings'):
    if '.json' in file:
        df = pd.DataFrame(columns=['tSNE','Train accuracy', 'Dev accuracy'])
        sample_size = int(file[file.index('vgg16_')+6:file.index('.')])
        embeddings = load_from_json(file)
        
        # Spit the data
        [feats_train,feats_dev, feats_test],[labels_train,labels_dev, labels_test] = split_set(
            embeddings, labels[:sample_size], sample_size)

        # Train knn
        knn, knn_train_score, knn_test_score = train_knn_and_test(feats_train,feats_dev,labels_train,labels_dev)
        
        df.loc[0] = [0,knn_train_score,knn_test_score ]
        
        # Train knn with TSNE
        knn, knn_train_score, knn_test_score = train_knn_and_test(TSNE().fit_transform(feats_train),TSNE().fit_transform(feats_dev),labels_train,labels_dev)
        df.loc[1] = [1,knn_train_score,knn_test_score]
        print(df)

   tSNE  Train accuracy  Dev accuracy
0   0.0        0.493333          0.37
1   1.0        0.438333          0.10
   tSNE  Train accuracy  Dev accuracy
0   0.0        0.546667      0.431667
1   1.0        0.459444      0.093333
