# Rudimentary t-SNE Pipeline Classification

Authors: Athan Zhang and Jean Lavgine du Cadet

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.linear_model import LogisticRegression as LR
from sklearn.svm import SVC as SVM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

from tsne import tsne

SEED = 1234
np.random.seed=(SEED)

In [8]:
def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='latin1')
    return dict

In [11]:
infile = r'cifar-10-batches-py/data_batch_1'
data_batch_1 = unpickle(infile)

In [21]:
meta_file = r'cifar-10-batches-py/batches.meta'
meta_data = unpickle(meta_file)

In [22]:
cifar_data = np.array(data_batch_1['data'])

In [23]:
cifar_labels = np.array(data_batch_1['labels'])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(cifar_data, cifar_labels, test_size=0.2, random_state=SEED)
classifiers = dict()
classifiers["DT"]   = DT(random_state=SEED)
classifiers["LR"]   = LR(random_state=SEED, max_iter=5000)
classifiers["SVM"]  = SVM(random_state=SEED)
classifiers["KNN"]  = KNN()

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print("CIFAR10 with %s Acc:\t %s" % (name, round(accuracy_score(y_test, preds),3)))
    print("CIFAR10 with %s F1:\t %s" % (name, round(f1_score(y_test, preds,average='weighted'),3)))

CIFAR10 with DT Acc:	 0.225
CIFAR10 with DT F1:	 0.226


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CIFAR10 with LR Acc:	 0.265
CIFAR10 with LR F1:	 0.265


KeyboardInterrupt: 

## With t-SNE Classification

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cifar_data, cifar_labels, test_size=0.2, random_state=SEED)

classifiers = dict()
classifiers["DT"]   = DT(random_state=SEED)
classifiers["LR"]   = LR(random_state=SEED, max_iter=5000)
classifiers["SVM"]  = SVM(random_state=SEED)
classifiers["KNN"]  = KNN()

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print("CIFAR10 with tsne+%s Acc:\t %s" % (name, round(accuracy_score(y_test, preds),3)))
    print("CIFAR10 with tsne+%s F1:\t %s" % (name, round(f1_score(y_test, preds,average='weighted'),3)))

MNIST with tsne+DT Acc:	 0.9
MNIST with tsne+DT F1:	 0.901
MNIST with tsne+LR Acc:	 0.88
MNIST with tsne+LR F1:	 0.879
MNIST with tsne+SVM Acc:	 0.902
MNIST with tsne+SVM F1:	 0.901
MNIST with tsne+KNN Acc:	 0.92
MNIST with tsne+KNN F1:	 0.92
