Machine Learning Mini-project:

Importing libraries and unpickling dataset:

In [None]:
import numpy as np
from numpy import linalg as LA
import pandas as pd
import scipy.io as sp
from scipy.stats import multivariate_normal as norm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier as DTC



def unpickle(file):
    """Load and return the object stored in the pickle file at `file`.

    The file is opened in binary mode and read with ``encoding='bytes'``,
    which is why the callers in this notebook index the result with
    ``b'...'`` keys.

    NOTE(review): unpickling can execute arbitrary code; only call this on
    files that come from a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Read the five training batches, the test batch and the metadata file.
# Paths are relative to the notebook's working directory.
data_batch_1 = unpickle('cifar-10-batches-py/data_batch_1')
data_batch_2 = unpickle('cifar-10-batches-py/data_batch_2')
data_batch_3 = unpickle('cifar-10-batches-py/data_batch_3')
data_batch_4 = unpickle('cifar-10-batches-py/data_batch_4')
data_batch_5 = unpickle('cifar-10-batches-py/data_batch_5')
test_batch = unpickle('cifar-10-batches-py/test_batch')
batches = unpickle('cifar-10-batches-py/batches.meta')

# Join the per-batch entries, in batch order, into one training set.
train_labels = np.concatenate([
    batch[b'labels']
    for batch in (data_batch_1, data_batch_2, data_batch_3, data_batch_4, data_batch_5)
])
train_data = np.concatenate([
    batch[b'data']
    for batch in (data_batch_1, data_batch_2, data_batch_3, data_batch_4, data_batch_5)
])

# The test split comes from its own single batch.
test_data = test_batch[b'data']
test_labels = np.array(test_batch[b'labels'])

Finding the number of principal components that explain 90% of the variance (proportion of variance, PoV) and fitting PCA:

In [None]:
# Eigenvalues of the feature covariance matrix measure the variance captured
# along each principal direction. A covariance matrix is symmetric, so use the
# symmetric solver: it always returns real eigenvalues, whereas the general
# solver may return complex values.
cov = np.cov(train_data.T)
eigvals = LA.eigvalsh(cov)[::-1]  # eigvalsh sorts ascending; flip to descending

# Cumulative proportion of variance (PoV) of the first k components, k = 1..D,
# computed in a single pass instead of re-summing a growing prefix per k.
eig_D = np.sum(eigvals)
pov = np.cumsum(eigvals) / eig_D

# Smallest number of components whose PoV exceeds 90%.
# argmax on a boolean array returns the index of the first True.
eig_M = int(np.argmax(pov > 0.9)) + 1
print(eig_M)

pca = PCA(n_components=eig_M)
pca.fit(train_data)

Fitting and predicting with LDA:

In [None]:
# Project both splits onto the principal components fitted in the PCA cell.
new_train_data = pca.transform(train_data)
new_test_data = pca.transform(test_data)

# Linear discriminant analysis on the reduced features; fit() returns the
# estimator itself, so construction and fitting can be chained.
clf_lda = LDA().fit(new_train_data, train_labels)

lda_predictions = clf_lda.predict(new_test_data)

# Mean accuracy on the test split.
print(clf_lda.score(new_test_data, test_labels))

Fitting and predicting with Linear SVM:

In [None]:
# Support-vector classifier with a linear kernel on the PCA-reduced features.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples, so this cell is likely slow on the full training set; LinearSVC may
# be worth evaluating, but it is a different optimiser and would change results.
clf_svm = svm.SVC(kernel='linear').fit(new_train_data, train_labels)

svm_predictions = clf_svm.predict(new_test_data)

# Mean accuracy on the test split.
print(clf_svm.score(new_test_data, test_labels))

Convolutional neural network with TensorFlow:

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
print("TensorFlow version:", tf.__version__)

# Load CIFAR-10 through Keras; images come as image tensors (not the flat rows
# used by the scikit-learn cells) and labels as column vectors of shape (N, 1).
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()

# This rebinds train_labels/test_labels, names the scikit-learn cells of this
# notebook also use together with the PCA features. Flatten them to 1-D so
# those cells keep receiving label vectors of the shape they were written for
# (column-vector targets trigger conversion warnings and break element-wise
# comparisons against 1-D predictions).
# NOTE(review): presumably Keras returns the same samples in the same order as
# the pickled batches loaded at the top of the notebook - TODO confirm.
train_labels, test_labels = train_labels.ravel(), test_labels.ravel()

# Normalize pixel values to be between 0 and 1
train_images, test_images = train_images / 255.0, test_images / 255.0

# Three convolution stages (the first two followed by 2x2 max-pooling), then a
# dense head. The last layer has no activation: it outputs one raw score per
# class for the 10 classes.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(10),
])
model.summary()

In [None]:
# The labels are integer class ids and the model's last Dense layer has no
# activation, hence sparse categorical cross-entropy with from_logits=True.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# NOTE(review): the test split doubles as the validation set here, so the
# validation curve is not an independent estimate - confirm this is intended.
history = model.fit(
    train_images,
    train_labels,
    epochs=6,
    validation_data=(test_images, test_labels),
)

# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.5, 1])
plt.legend(loc='lower right')

# Final loss and accuracy on the test split.
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)

Fitting and predicting with k-nearest neighbors:

In [None]:
# k-nearest-neighbours classifier on the PCA-reduced features, with k fixed at 10.
# (An exhaustive sweep over every k up to the training-set size used to be
# sketched here as commented-out code; it needs one full fit/predict per
# training sample, so it was dropped.)
# np.ravel makes the labels 1-D regardless of whether they currently come from
# the pickled batches or from the Keras loader (which yields column vectors).
neigh2 = KNC(n_neighbors=10)
neigh2.fit(new_train_data, np.ravel(train_labels))
knn_predict = neigh2.predict(new_test_data)

# Accuracy computed from the predictions already in hand: calling score()
# would repeat the whole nearest-neighbour search over the test set.
print(np.mean(knn_predict == np.ravel(test_labels)))


Decision Tree:

In [None]:
import graphviz
from sklearn import tree
#from sklearn.tree import plot_tree
#import matplotlib.pyplot as plt

# Candidate tree depths. The sweep is bounded: trying every depth up to the
# number of training samples would mean one full fit per sample.
# NOTE(review): the upper bound of 20 is a judgement call - adjust as needed.
max_depth_candidates = range(1, 21)

# 1-D label vectors, whatever shape earlier cells left them in.
train_labels_1d = np.ravel(train_labels)
test_labels_1d = np.ravel(test_labels)

predictions = []    # test-set predictions, one array per candidate depth
depth_scores = []   # test accuracy, one float per candidate depth
fitted_trees = []   # fitted classifier, one per candidate depth

for depth in max_depth_candidates:
    # Fixed random_state so ties between equally good splits are broken the
    # same way on every run.
    candidate = DTC(max_depth=depth, splitter="best", random_state=0)
    print("Fitting: {0}".format(depth))
    candidate.fit(new_train_data, train_labels_1d)
    #dot_data = tree.export_graphviz(candidate, out_file = None)
    #graph = graphviz.Source(dot_data)
    #graph.render("CLF_DTC")
    candidate_predictions = candidate.predict(new_test_data)
    predictions.append(candidate_predictions)
    depth_scores.append(float(np.mean(candidate_predictions == test_labels_1d)))
    fitted_trees.append(candidate)

# Pick the depth by accuracy. Comparing the raw prediction arrays with max()
# is not meaningful and raises ValueError on multi-element numpy arrays.
# NOTE(review): the depth is selected on the test split, so the reported
# accuracy is optimistic; a separate validation split would avoid that.
max_d_index = int(np.argmax(depth_scores))
best_prediction = predictions[max_d_index]
dtc_predictions = best_prediction
clf_dtc = fitted_trees[max_d_index]

# Report the selected depth (not its list index) and that tree's accuracy.
print(max_depth_candidates[max_d_index])
print(depth_scores[max_d_index])