In [1]:
# Importing essential libraries
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Report accuracy, confusion matrix and weighted F1 for the train and test predictions
def get_accuracy(y_train, y_pred, y_test, y_test_pred):
    """
    Print evaluation metrics for a fitted classifier.
    Input:
        - y_train: True labels of the training split
        - y_pred: Labels predicted for the training split
        - y_test: True labels of the test split
        - y_test_pred: Labels predicted for the test split
    Output:
        - None. Accuracy, confusion matrix and weighted F1 score
          are printed for both splits.
    """
    weighted = {'average': 'weighted'}
    # Each row: (label, metric function, true labels, predicted labels, extra kwargs).
    # Metrics are evaluated lazily, one per printed line, in this exact order.
    report = (
        ("Train Accuracy:", accuracy_score, y_train, y_pred, {}),
        ("Train Confusion Matrix:\n", confusion_matrix, y_train, y_pred, {}),
        ("Test Accuracy:", accuracy_score, y_test, y_test_pred, {}),
        ("Confusion Matrix:\n", confusion_matrix, y_test, y_test_pred, {}),
        ("F1 Train:", f1_score, y_train, y_pred, weighted),
        ("F1 Test:", f1_score, y_test, y_test_pred, weighted),
    )
    for label, metric, truth, predicted, options in report:
        print(label, metric(truth, predicted, **options))

In [3]:
# Loads the public test set and transposes every image in it
def get_public_test():
    """
    Read 'public_test.csv' (no header row), treat every CSV column as one
    flattened 28*28 image, transpose each image and flatten it again.
    Output:
        - X_public_test: float array with one transposed, flattened
                         image per row.
    """
    raw = pd.read_csv('public_test.csv', header=None)
    # After the transpose, each row of X_pt is one flattened image
    X_pt = raw.to_numpy().T
    X_public_test = np.zeros(X_pt.shape)
    n_images = X_pt.shape[0]
    for i in tqdm(range(n_images)):
        picture = X_pt[i, :].reshape(28, 28)
        X_public_test[i, :] = picture.T.flatten()

    return X_public_test

In [4]:
# Function to perform interpolation on the training Dataset

# Offsets of the 3*3 window centred on a pixel. The centre offset [0, 0] is
# part of the list; it never contributes because the pixel being filled is NaN.
neighbors = [[i,j] for i in range(-1,2) for j in range(-1,2)]
neighbors = np.array(neighbors)

def interpolate_data_threshold(df, threshold=20, max_passes=15):
    """
    The interpolate_data_threshold function performs thresholded
    interpolation on the Training Dataset.

    Every NaN pixel is replaced by the rounded-up mean of the non-NaN
    pixels in its 3*3 neighbourhood; means that do not exceed `threshold`
    are written as 0. Pixels are filled in place during a pass, so a value
    filled earlier in a pass can feed a pixel filled later in the same pass.
    Passes are repeated until no NaN is left, a pass makes no progress, or
    `max_passes` passes have run.

    Input:
        - df: Image as a DataFrame (any 2-D shape; missing pixels are NaN)
        - threshold: Threshold value below which 
                      the average is set to 0
        - max_passes: Maximum number of sweeps over the missing pixels
    Output:
        - df_threshold: Interpolated image. Pixels that have no valid
                        neighbour after the final pass are left as NaN.
    """

    # Work on a private copy so the caller's DataFrame is never modified
    data = df.to_numpy(copy=True)
    n_rows, n_cols = data.shape
    positions = np.argwhere(np.isnan(data))
    num_loops = 0
    while positions.size > 0 and num_loops < max_passes:
        num_loops += 1
        filled_any = False
        for pos in positions:
            consider = []
            for neigh in neighbors:
                r, c = pos + neigh
                # Border rows/columns (index 0 and the last index) are valid
                # neighbours; only positions outside the image are rejected.
                inside = 0 <= r < n_rows and 0 <= c < n_cols
                if inside and not np.isnan(data[r, c]):
                    consider.append(data[r, c])
            if consider:
                average = int(np.ceil(np.mean(consider)))
                data[pos[0], pos[1]] = average if average > threshold else 0
                filled_any = True
        if not filled_any:
            # Nothing could be filled: further passes would be identical
            break
        positions = np.argwhere(np.isnan(data))

    # No special case is needed for pixel (0, 0): it now sees its in-bounds
    # neighbours like every other pixel, so it can only stay NaN when all of
    # them are NaN as well, in which case there is nothing to average.
    df_threshold = pd.DataFrame(data)
    return df_threshold

In [7]:
# Build X (one flattened, interpolated and thresholded image per row)
# together with the matching label vector y
X = np.zeros((10000, 784))
y = np.zeros(10000)

for char in range(10):
    # Files 1.csv ... 1000.csv of each character folder are used
    for i in tqdm(range(1, 1001), desc="Character {}".format(char)):
        fname = "Training Dataset/character_{}/{}.csv".format(char, i)
        # Images are stored class after class, 1000 rows per character
        row = char * 1000 + i - 1
        df = pd.read_csv(fname, header=None)
        df_threshold = interpolate_data_threshold(df, threshold=20).astype(int)
        image = df_threshold.to_numpy().reshape(-1,)
        X[row, :] = image
        y[row] = char

# Create backup data
X_backup, y_backup = X.copy(), y.copy()

Character 0: 100%|██████████| 1000/1000 [00:16<00:00, 59.11it/s]
Character 1: 100%|██████████| 1000/1000 [00:16<00:00, 59.92it/s]
Character 2: 100%|██████████| 1000/1000 [00:16<00:00, 60.87it/s]
Character 3: 100%|██████████| 1000/1000 [00:16<00:00, 61.42it/s]
Character 4: 100%|██████████| 1000/1000 [00:15<00:00, 63.88it/s]
Character 5: 100%|██████████| 1000/1000 [00:15<00:00, 64.96it/s]
Character 6: 100%|██████████| 1000/1000 [00:15<00:00, 65.05it/s]
Character 7: 100%|██████████| 1000/1000 [00:15<00:00, 65.04it/s]
Character 8: 100%|██████████| 1000/1000 [00:15<00:00, 63.05it/s]
Character 9: 100%|██████████| 1000/1000 [00:15<00:00, 64.27it/s]


In [8]:
# Using stratified split to ensure equal class distribution
stratSplit = StratifiedShuffleSplit(test_size=0.2, random_state=42)

# n_splits is not set, so the splitter yields several train/test splits
# (scikit-learn's default is 10). Only the last one is kept.
splits = list(stratSplit.split(X, y))
train_idx, test_idx = splits[-1]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

X_public_test = get_public_test()

100%|██████████| 1000/1000 [00:00<00:00, 209558.03it/s]


In [9]:
# PCA with 75 components followed by an SVM with regularisation parameter C = 15
# The parameters were obtained after performing a GridSearchCV

pipeline_steps = [
    ('pca', PCA(n_components=75, random_state=69)),
    ('svm', SVC(C=15, random_state=69)),
]
pipe = Pipeline(pipeline_steps)
pipe.fit(X_train, y_train)

# Predictions on both splits, kept for get_accuracy below
y_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# get_accuracy(y_train, y_pred, y_test, y_test_pred)

# Labels for the public test set, cast to integers
y_public_pred = pipe.predict(X_public_test).astype(int)
y_public_pred

array([6, 4, 9, 7, 7, 8, 7, 3, 7, 9, 1, 9, 4, 8, 2, 5, 9, 9, 5, 0, 4, 7,
       4, 9, 4, 3, 6, 6, 9, 9, 6, 7, 0, 5, 0, 7, 6, 1, 5, 9, 4, 4, 9, 9,
       0, 1, 7, 9, 2, 2, 0, 5, 5, 5, 6, 1, 0, 3, 9, 7, 5, 3, 7, 6, 3, 6,
       8, 4, 6, 8, 8, 1, 2, 7, 5, 7, 8, 7, 3, 6, 8, 4, 8, 3, 0, 8, 6, 7,
       6, 0, 7, 0, 3, 0, 7, 5, 3, 0, 2, 7, 7, 6, 4, 9, 6, 4, 1, 3, 8, 8,
       6, 6, 1, 0, 5, 2, 7, 6, 6, 2, 3, 3, 2, 4, 3, 2, 8, 3, 6, 0, 4, 4,
       2, 7, 2, 4, 1, 4, 3, 2, 4, 9, 5, 5, 1, 9, 2, 2, 5, 0, 6, 5, 8, 4,
       6, 0, 1, 2, 1, 5, 8, 7, 6, 6, 4, 8, 1, 5, 1, 1, 3, 2, 2, 9, 2, 0,
       9, 0, 6, 1, 6, 1, 2, 4, 8, 9, 1, 2, 8, 4, 1, 2, 2, 8, 4, 7, 2, 9,
       4, 0, 5, 9, 8, 6, 2, 8, 0, 7, 5, 9, 3, 9, 2, 9, 7, 4, 5, 9, 8, 7,
       5, 0, 0, 5, 4, 4, 4, 5, 1, 0, 9, 9, 4, 6, 6, 8, 9, 3, 2, 8, 1, 1,
       1, 7, 4, 8, 2, 9, 2, 5, 0, 0, 4, 3, 6, 3, 2, 7, 0, 7, 3, 8, 7, 5,
       1, 5, 9, 8, 3, 3, 9, 3, 8, 5, 9, 1, 4, 1, 0, 6, 0, 0, 8, 5, 3, 4,
       9, 2, 2, 7, 5, 8, 0, 3, 1, 1, 6, 6, 4, 3, 2,

In [10]:
# Persist the fitted pipeline to disk. The context manager closes the file
# even when pickling raises, so no handle is leaked.
with open("pipe.pickle", "wb") as save_classifier:
    pickle.dump(pipe, save_classifier)

In [11]:
def get_private_test():
    """
    Read 'private_test.csv' (no header row), treat every CSV column as one
    flattened 28*28 image, transpose each image and flatten it again.
    Output:
        - X_private_test: float array with one transposed, flattened
                          image per row.
    """
    X_pt = pd.read_csv('private_test.csv', header=None).to_numpy().T
    X_private_test = np.zeros(X_pt.shape)
    for i in tqdm(range(X_pt.shape[0])):
        # Column-major flattening equals transposing and then flattening row-major
        X_private_test[i] = X_pt[i].reshape(28, 28).ravel(order='F')

    return X_private_test

In [12]:
def predict_private_dataset():
    """
    Load the pickled pipeline from 'pipe.pickle' and predict the classes
    of the private test set returned by get_private_test().
    Output:
        - predicted_class: Integer column vector of shape (n_samples, 1)
                           holding the predicted labels.
    """
    # NOTE: unpickling can execute arbitrary code. Only load a 'pipe.pickle'
    # that was produced by this notebook, never one from an untrusted source.
    # The context manager makes sure the file handle is closed after loading.
    with open("pipe.pickle", 'rb') as saved_classifier:
        pipe = pickle.load(saved_classifier)
    X_private = get_private_test()
    predicted_class = pipe.predict(X_private)
    predicted_class = predicted_class.reshape(-1,1).astype(int)
    
    return predicted_class

In [13]:
# # Combination 
# pca = PCA(n_components=75, random_state=69)
# pca = pca.fit(X_train, y_train)
# X_train_pca = pca.transform(X_train)
# X_test_pca = pca.transform(X_test)
# X_public_test_pca = pca.transform(X_public_test)

# y_pred_c = {}
# y_test_pred_c = {}
# clfs = {}
# y_pub_pred_c = {}

# # Constructing classifiers for every class and saving the classwise predictions in 10 arrays
# for c in tqdm(range(10)):
#     clfs[c] = SVC(C=15, random_state=69, probability=True)
# #     clfs[c] = SVC(C=15, probability=True)
#     y_c = (y_train == c).astype(int)
#     clfs[c].fit(X_train_pca, y_c)
#     y_pred_c[c] = clfs[c].predict(X_train_pca)
#     y_test_pred_c[c] = clfs[c].predict(X_test_pca)
#     y_pub_pred_c[c] = clfs[c].predict(X_public_test_pca)

# # Classifier for a separate step
# clf_l = KNeighborsClassifier(3)
# clf_l.fit(X_train_pca, y_train)

# # classify X_pca first based on clf_l.
# # The ones with probability lower than 0.6 in clf_l are classified again, classwise, using the various clfs
# def probwise_predict(clfs, X_pca, clf_l):
#     y_pred_c = {}
#     prob_c = {}

#     # predict the labels and probabilities of classwise classifiers
#     for c in range(10):
#         y_pred_c[c] = clfs[c].predict(X_pca)
#         prob = clfs[c].predict_proba(X_pca)
#         prob_c[c] = prob.max(axis = 1)
#     prob_c = pd.DataFrame(prob_c)

#     prob_clf_c = clf_l.predict_proba(X_train_pca)
#     eg = pd.DataFrame(prob_clf_c)
#     # display(eg)
#     prob_clf_c = clf_l.predict_proba(X_train_pca).max(axis = 1)
#     # print(prob_clf_c)
#     # print(prob_clf_c.shape)

#     #create a boolean array that takes the value 1 where the classwise classification yields multiple or zero labels
#     y_pred_c = pd.DataFrame(y_pred_c)
#     summed = y_pred_c.sum(axis = 1)
#     boo = (summed != 1).astype(int).to_numpy()
#     # extract X and y values where boo = 1
#     if 1 in boo:
#         X_residue = X_pca[boo == 1]
#         y_res = clf_l.predict(X_residue)

#     y_pred = np.zeros(len(y_pred_c))

#     count_p = 0
#     count_r = 0

#     # First, for each datapoint, check if classwise classification yielded a single answer.
#     # If yes, assign the predicted label
#     # If not, check the classification probability yielded by clf_l.
#     # If greater than 0.6, assign the label predicted by clf_l
#     # If less than or equal to 0.6, assign the label with the maximum probability amongst the classwise predictors
#     for i,v in enumerate(y_pred_c.values):
#         if boo[i]:
#             p = prob_clf_c[i]
#             if p > 0.6:
#                 y_pred[i] = clf_l.predict(X_pca[i].reshape(1, -1))
#                 count_r += 1
#             else:
#                 row = prob_c.iloc[i]
#                 p_max = row.max()
#                 pos = np.where(row == p_max)[0][0]
#                 y_pred[i] = pos
#                 count_p += 1
#         else:
#             y_pred[i] = np.where(v == 1)[0][0]

#     return(y_pred, boo, count_p, count_r)




# y_test_prob_pred, boo, count_p, count_r = probwise_predict(clfs, X_test_pca, clf_l)
# y_public_test_prob_pred, boo_public, count_p_public, count_r_public = probwise_predict(clfs, X_public_test_pca, clf_l)

# s8 = np.array([6, 4, 9, 7, 7, 8, 7, 3, 7, 9, 1, 9, 4, 8, 2, 5, 9, 9, 5, 0, 4, 7, 4, 9, 4, 3, 6, 6, 9, 9, 6, 7, 0, 5, 0, 7, 6, 1, 5, 9, 4, 4, 9, 9, 0, 1, 7, 9, 2, 2, 0, 5, 5, 5, 6, 1, 0, 3, 9, 7, 5, 3, 7, 6, 3, 6, 8, 4, 6, 8, 8, 1, 2, 7, 5, 7, 8, 7, 3, 6, 8, 4, 8, 3, 0, 8, 6, 7, 6, 0, 7, 0, 3, 0, 7, 5, 3, 0, 2, 7, 7, 6, 4, 9, 6, 4, 1, 3, 8, 8, 6, 6, 1, 0, 5, 2, 7, 6, 6, 2, 3, 3, 2, 4, 3, 2, 8, 3, 6, 0, 4, 4, 2, 7, 2, 4, 1, 4, 3, 2, 4, 9, 5, 5, 1, 9, 2, 2, 5, 0, 6, 5, 8, 4, 6, 0, 1, 2, 1, 5, 8, 7, 6, 6, 4, 8, 1, 5, 1, 1, 3, 2, 2, 9, 2, 0, 9, 0, 6, 1, 6, 1, 2, 4, 8, 9, 1, 2, 8, 4, 1, 2, 2, 8, 4, 7, 2, 9, 4, 0, 5, 9, 8, 6, 2, 8, 0, 7, 5, 9, 3, 9, 2, 9, 7, 4, 5, 9, 8, 7, 5, 0, 0, 5, 4, 4, 4, 5, 1, 0, 9, 9, 4, 6, 6, 8, 9, 3, 2, 8, 1, 1, 1, 7, 4, 8, 2, 9, 2, 5, 0, 0, 4, 3, 6, 3, 2, 7, 0, 7, 3, 8, 7, 5, 1, 5, 9, 8, 3, 3, 9, 3, 8, 5, 9, 1, 4, 1, 0, 6, 0, 0, 8, 5, 3, 4, 9, 2, 2, 7, 5, 8, 0, 3, 1, 1, 6, 6, 4, 3, 2, 4, 6, 4, 3, 2, 0, 6, 3, 3, 5, 0, 1, 1, 0, 1, 1, 8, 4, 8, 6, 8, 4, 2, 8, 3, 6, 2, 7, 4, 2, 5, 0, 2, 1, 2, 0, 3, 7, 2, 3, 8, 9, 7, 0, 2, 2, 5, 6, 3, 8, 3, 8, 9, 3, 2, 4, 1, 9, 0, 6, 4, 3, 5, 1, 2, 0, 8, 2, 2, 0, 9, 6, 9, 0, 0, 2, 4, 3, 9, 9, 5, 0, 8, 8, 4, 9, 9, 2, 1, 3, 0, 0, 1, 8, 5, 7, 3, 2, 6, 8, 5, 7, 0, 6, 9, 0, 1, 5, 9, 2, 2, 9, 2, 4, 5, 8, 4, 4, 7, 1, 9, 5, 1, 5, 8, 3, 7, 6, 6, 9, 1, 3, 7, 8, 0, 6, 6, 9, 4, 0, 3, 3, 1, 3, 1, 8, 9, 4, 5, 7, 2, 4, 4, 1, 6, 7, 3, 6, 2, 9, 1, 6, 9, 2, 5, 9, 7, 3, 0, 9, 9, 9, 1, 4, 4, 4, 7, 5, 7, 5, 1, 9, 0, 0, 7, 7, 6, 4, 7, 8, 2, 9, 5, 5, 9, 1, 9, 7, 3, 3, 8, 6, 1, 5, 5, 0, 9, 8, 0, 1, 3, 9, 2, 3, 8, 9, 8, 2, 8, 8, 3, 7, 8, 5, 7, 1, 1, 9, 7, 8, 5, 8, 9, 1, 4, 1, 0, 4, 5, 3, 0, 0, 2, 3, 8, 8, 4, 6, 9, 6, 3, 8, 7, 9, 6, 6, 2, 5, 0, 4, 9, 6, 9, 6, 1, 9, 2, 2, 4, 9, 2, 0, 7, 6, 7, 4, 0, 5, 3, 8, 3, 4, 6, 6, 2, 0, 0, 5, 7, 8, 8, 1, 1, 2, 4, 9, 7, 0, 5, 7, 9, 5, 1, 0, 2, 5, 1, 2, 4, 0, 9, 4, 4, 1, 0, 0, 1, 8, 4, 3, 1, 3, 1, 1, 4, 6, 8, 2, 8, 5, 2, 9, 8, 6, 3, 9, 9, 1, 5, 0, 3, 4, 0, 3, 6, 4, 9, 6, 3, 2, 4, 8, 7, 0, 0, 4, 9, 5, 5, 7, 
9, 3, 2, 0, 9, 8, 5, 7, 7, 9, 9, 8, 5, 5, 8, 6, 3, 1, 0, 1, 1, 3, 5, 5, 9, 0, 9, 8, 7, 5, 3, 7, 5, 7, 0, 3, 7, 1, 3, 3, 4, 3, 8, 6, 8, 4, 6, 3, 7, 6, 1, 3, 4, 9, 7, 4, 4, 4, 1, 2, 4, 2, 2, 5, 8, 9, 5, 2, 7, 1, 3, 4, 1, 6, 3, 9, 9, 6, 3, 5, 0, 1, 2, 4, 1, 8, 4, 3, 1, 0, 2, 3, 8, 7, 6, 7, 5, 0, 3, 0, 2, 3, 6, 1, 2, 4, 2, 4, 1, 2, 1, 6, 7, 2, 7, 2, 1, 5, 7, 1, 9, 0, 4, 8, 2, 7, 0, 1, 6, 2, 3, 7, 7, 9, 4, 8, 8, 7, 2, 0, 8, 1, 0, 6, 1, 0, 5, 6, 3, 8, 0, 7, 0, 0, 1, 7, 2, 1, 8, 4, 1, 1, 5, 6, 5, 7, 7, 0, 9, 6, 3, 4, 1, 2, 8, 1, 3, 6, 6, 1, 3, 8, 2, 8, 2, 4, 3, 8, 0, 8, 3, 1, 9, 8, 9, 6, 6, 5, 6, 0, 5, 6, 0, 4, 7, 1, 3, 1, 2, 6, 4, 7, 4, 5, 6, 7, 4, 4, 7, 3, 7, 2, 8, 2, 0, 9, 5, 1, 7, 6, 0, 6, 3, 1, 5, 0, 7, 3, 6, 0, 8, 0, 8, 7, 6, 2, 0, 7, 6, 5, 4, 5, 5, 3, 5, 4, 7, 5, 8, 2, 1, 2, 9, 5, 5, 7, 5, 9, 4, 5, 7, 7, 3, 4, 0, 4, 5, 7, 7, 0, 6, 6, 1, 4, 6, 8, 5, 8, 9, 5, 1, 3, 5, 4, 8, 2, 7, 5, 6, 2, 8, 1, 8, 3, 5, 9, 7, 5, 2, 8, 6, 6, 0, 0, 4, 1, 5, 2, 1, 5, 9, 5, 5, 7, 6, 7, 7, 8, 7, 4, 5, 5, 1, 9, 8, 8, 0, 6, 4])
# pos = np.where(y_public_test_prob_pred!=s8)
# print("Mismatch:", pos[0].size)

In [14]:
# X_train.shape
# x = X_train.T
# x = x-np.mean(x, axis=1).reshape(-1,1)
# N = x.shape[1]
# S = (1/N)*(x@x.T)
# w, v = np.linalg.eig(S)
# plt.plot(w[:80], '.-')

In [15]:
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RationalQuadratic

# # clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
# clf1 = KNeighborsClassifier(weights='distance')
# clf2 = KNeighborsClassifier(n_neighbors=10, weights='distance')
# clf3 = RandomForestClassifier(n_estimators=50, random_state=1)
# clf4 = RandomForestClassifier(n_estimators=100, random_state=1)
# clf5 = SVC(C=15, random_state=1)
# clf7 = SVC(C=10, random_state=1)
# kernel = RationalQuadratic(length_scale=1000, alpha=20)
# gpc = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X_train_pca, y_train)

# eclf1 = VotingClassifier(estimators=[('knn', clf1), ('knn2', clf2), ('rf', clf3), 
#                                      ('rf2', clf4), ('svm1', clf5), ('gpc', gpc), 
#                                      ('svm3', clf7)], voting='hard')
# eclf1 = eclf1.fit(X_train_pca, y_train)

# y_pred = eclf1.predict(X_train_pca)
# y_test_pred = eclf1.predict(X_test_pca)

# get_accuracy(y_train, y_pred, y_test, y_test_pred)

# y_public_pred = eclf1.predict(X_public_test_pca)

In [16]:
# from sklearn.ensemble import BaggingClassifier

# pca = PCA(n_components=75)
# pca = pca.fit(X_train, y_train)
# X_train_pca = pca.transform(X_train)
# X_test_pca = pca.transform(X_test)

# clf = BaggingClassifier(base_estimator=SVC(C=15), n_estimators=200, random_state=0, bootstrap=False)
# clf.fit(X_train_pca, y_train)
# y_pred = clf.predict(X_train_pca)
# y_test_pred = clf.predict(X_test_pca)

# get_accuracy(y_train, y_pred, y_test, y_test_pred)

# # F1 Train: 0.9896183054246059
# # F1 Test: 0.9613735905734541
# # 100, bootstrap=False
# # F1 Train: 0.9892414065774306
# # F1 Test: 0.9594006816763111
# # 100, bootstrap=True

In [17]:
# # Commented Out. Because it takes long time to run
# # SVM with RBF
# clf = SVC()
# clf.fit(X_train, y_train)
# y_pred_SVM = clf.predict(X_train)
# y_test_pred_SVM = clf.predict(X_test)

# get_accuracy(y_train, y_pred_SVM, y_test, y_test_pred_SVM)

In [18]:
# # Random Forest Classifier
# model = RandomForestClassifier(max_depth=11, criterion='entropy')
# model.fit(X_train, y_train)
# y_pred_RF = model.predict(X_train)
# y_test_pred_RF = model.predict(X_test)

# get_accuracy(y_train, y_pred_RF, y_test, y_test_pred_RF)

In [19]:
# # Decision Tree Classifier
# clf = tree.DecisionTreeClassifier(min_samples_split=5)
# clf.fit(X_train, y_train)
# y_pred_DT = clf.predict(X_train)
# y_test_pred_DT = clf.predict(X_test)

# get_accuracy(y_train, y_pred_DT, y_test, y_test_pred_DT)

In [20]:
# # SVM with Poly
# clf = SVC(kernel='poly', degree=9)
# clf.fit(X_train, y_train)
# y_pred_SVMP = clf.predict(X_train)
# y_test_pred_SVMP = clf.predict(X_test)

# get_accuracy(y_train, y_pred_SVMP, y_test, y_test_pred_SVMP)

In [21]:
# # SVM with RBF; gamma-auto
# clf = SVC(gamma='auto')
# clf.fit(X_train, y_train)
# y_pred_SVMA = clf.predict(X_train)
# y_test_pred_SVMA = clf.predict(X_test)

# get_accuracy(y_train, y_pred_SVMA, y_test, y_test_pred_SVMA)

In [22]:
# # SVM with Sigmoid
# clf = SVC(kernel='sigmoid')
# clf.fit(X_train, y_train)
# y_pred_LO = clf.predict(X_train)
# y_test_pred_LO = clf.predict(X_test)

# get_accuracy(y_train, y_pred_LO, y_test, y_test_pred_LO)

In [23]:
# # Commented Out. Because it takes long time to run
# # Submission 3
# # Apply PCA and then SVM
# num_components = 150
# pca = PCA(n_components=num_components)
# pca.fit(X_train)

# X_train_pca = pca.transform(X_train)
# X_test_pca = pca.transform(X_test)

# clf = SVC()
# clf = clf.fit(X_train_pca, y_train)

# y_pred_PCA = clf.predict(X_train_pca)
# y_test_pred_PCA = clf.predict(X_test_pca)

# get_accuracy(y_train, y_pred_PCA, y_test, y_test_pred_PCA)

# df_test = pd.read_csv("processed_test.csv")
# X_public_test = df_test.to_numpy().T
# X_public_test_pca = pca.transform(X_public_test)
# y_public_pred = clf.predict(X_public_test_pca)
# y_public_pred

In [24]:
# # Submission 5
# # PCA - 75 Components followed by SVM

# num_components = 75
# pca = PCA(n_components=num_components)
# pca.fit(X_train)

# X_train_pca = pca.transform(X_train)
# X_test_pca = pca.transform(X_test)

# clf = SVC()
# clf = clf.fit(X_train_pca, y_train)

# y_pred_PCA = clf.predict(X_train_pca)
# y_test_pred_PCA = clf.predict(X_test_pca)

# get_accuracy(y_train, y_pred_PCA, y_test, y_test_pred_PCA)

# df_test = pd.read_csv("processed_test.csv")
# X_public_test = df_test.to_numpy().T
# X_public_test_pca = pca.transform(X_public_test)
# y_public_pred = clf.predict(X_public_test_pca)
# y_public_pred

In [25]:
# # Normalising database
# X = np.load('interpolated_traindata.npz')["X_train"].T.astype(int)
# y = np.load('interpolated_traindata.npz')["y_train"].reshape(-1,).astype(int)

# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# stratSplit = StratifiedShuffleSplit(test_size=0.2, random_state=42)

# for train_idx, test_idx in stratSplit.split(X, y):
#     X_train = X[train_idx]
#     y_train = y[train_idx]
#     X_test = X[test_idx]
#     y_test = y[test_idx]

In [26]:
# # Next Attempt - LDA
# clf = LinearDiscriminantAnalysis()
# clf = clf.fit(X_train, y_train)

# y_pred = clf.predict(X_train)
# y_test_pred = clf.predict(X_test)

# df_test = pd.read_csv("processed_test.csv")
# X_public_test = df_test.to_numpy().T
# y_public_pred = clf.predict(X_public_test)
# y_public_pred

# pos = np.where((s5!=y_public_pred))

In [27]:
# y_pred_c = {}
# y_test_pred_c = {}
# y_public_test_pred_c = {}

# for c in tqdm(range(50)):
#     pca = PCA(n_components=75, random_state=10*c)
#     svm = SVC(C=15, random_state=10*c, probability=True)
#     pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])

#     pipe.fit(X_train, y_train)
#     y_pred_c[c] = pipe.predict(X_train)
#     y_test_pred_c[c] = pipe.predict(X_test)
#     y_public_test_pred_c[c] = pipe.predict(X_public_test)

In [28]:
# # Attempt - HOG transformation
# pca = PCA(random_state=10)
# # set the tolerance to a large value to make the example faster
# # 3
# svm = SVC(random_state=10)
# pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])

# n_components = list(range(10, 50, 2))
# n_components.append('mle')

# # Parameters of pipelines can be set using ‘__’ separated parameter names:
# param_grid = {
#     'svm__C': [0, 1, 10, 15, 30, 50],
#     'pca__n_components': n_components 
# }

# search = GridSearchCV(pipe, param_grid, n_jobs=-1)
# search = search.fit(X_train_hog, y_train)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)

In [29]:
# # Attempt scaling
# X_train_scale = X_train/255
# X_test_scale = X_test/255
# X_public_test_scale = X_public_test/255

# pca = PCA(random_state=10)
# # set the tolerance to a large value to make the example faster
# svm = SVC(random_state=10)
# pipe = Pipeline(steps=[('pca', pca), ('svm', svm)])

# n_components = list(range(10, 50, 2))
# n_components.append('mle')

# # Parameters of pipelines can be set using ‘__’ separated parameter names:
# param_grid = {
#     'svm__C': [0, 1, 10, 15, 30, 50],
#     'pca__n_components': n_components,
#     'svm__gamma': ['auto', 0.001, 0.01, 0.1]
# }

# search = GridSearchCV(pipe, param_grid, n_jobs=-1)
# search = search.fit(X_train_scale, y_train)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)