In [1]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(suppress = True)

#### Get train and test data from CSV files

In [2]:
def get_train_test_split(csv_file,test_percent):
    with open(csv_file, 'r') as f:
        results = []
        next(f)
        for line in f:
            words = line.split(',')
            for i in range(len(words)):
                words[i] = float(words[i])
            results.append(words)
    Data = np.array(results)   
    
    val_size = test_percent/100
    # calculate the number of validation data rows
    val_rows = int(val_size * Data.shape[0])


    # split the data into train and validation sets
    Data_train = Data[val_rows:]
    Data_train=np.array(Data_train)
    Data_Y_train = Data_train[:,0].astype(int)
    Data_X_train = np.delete(Data_train, 0, axis = 1)
    #
    Data_val = Data[:val_rows]
    Data_val=np.array(Data_val)
    Data_Y_val = Data_val[:,0].astype(int)
    Data_X_val = np.delete(Data_val, 0, axis = 1)
    return Data_X_train,Data_Y_train,Data_X_val,Data_Y_val
    

### FLDA 

In [3]:
def FLDA(X, y, n_components=None):
    classes = np.unique(y)
    total_mean=np.mean(X)
    if n_components is None:
        n_components = len(classes) - 1

    means = []
    scatter = np.zeros((X.shape[1], X.shape[1]))
    for c in classes:
        Xc = X[y == c]
        means.append(np.mean(Xc, axis=0))
        scatter += np.cov(Xc.T, ddof=1)

    S_W = scatter / len(classes)
    S_B = np.zeros((X.shape[1], X.shape[1]))
    for i, mean in enumerate(means):
        n = X[y == classes[i]].shape[0]
        mean = mean.reshape(-1, 1)
        S_B += n * (mean - total_mean).dot((mean - total_mean).T)

    eigenvalues, eigenvectors = np.linalg.eig(np.linalg.pinv(S_W).dot(S_B))
    #eigenvalues, eigenvectors=eigenvalues.real,eigenvectors.real
    top_indices = np.argsort(eigenvalues)[::-1][:n_components]
    top_eigenvectors = eigenvectors[:, top_indices]
    X_lda = X.dot(top_eigenvectors)

    return X_lda, top_eigenvectors

#### FLDA with nearest mean classfier

In [4]:
def FLDA_classifier(X_train, y_train, X_test, n_components=None):
    X_train_lda,eigenvectors = FLDA(X_train, y_train, n_components)
    means = []
    for c in np.unique(y_train):
        means.append(np.mean(X_train_lda[y_train == c], axis=0))

    X_test_lda = X_test.dot(eigenvectors)
    y_pred = []
    for x in X_test_lda:
        distances = [np.linalg.norm(x - mean) for mean in means]
        y_pred.append(np.argmin(distances))

    return np.array(y_pred)

#### Metrics

In [5]:
def metrics(predicted_labels, test_labels):
    
    # Accuracy
    
    Accuracy = 0
    for i in range(len(test_labels)):
        if predicted_labels[i] == test_labels[i]:
            Accuracy += 1
    Accuracy /= len(predicted_labels)
    Accuracy *= 100
    print('Classification Accuracy  on Test Data is: ', Accuracy, '\n')
    
    # Confusion Matrix
    classes=len(np.unique(test_labels))
    ConfMatrix = np.zeros([classes, classes])
    
    for i in range(len(test_labels)):
        ConfMatrix[test_labels[i] - 1, predicted_labels[i] - 1] += 1
    print('Confusion Matrix is: \n', ConfMatrix, '\n')
    
    # Precision
    
    Precision = []
    for i in range(classes):
        Precision.append(ConfMatrix[i][i] / np.sum(ConfMatrix[:,i]))
    
    # Recall
    
    Recall = []
    for i in range(classes):
        Recall.append(ConfMatrix[i][i] / np.sum(ConfMatrix[i,:]))
    
    # f1 Score
    
    f1Score = []
    for i in range(classes):
        f1Score.append(2 * Recall[i] * Precision[i] / (Recall[i] + Precision[i]))
        print('f1 Score of Class ', i + 1 , ' is: ', f1Score[i])

# A1_p3

In [6]:
def get_train_test(train_csv_file,test_csv_file):
    with open(train_csv_file, 'r') as f:
        results = []
        for line in f:
            words = line.split(',')
            for i in range(len(words)):
                words[i] = float(words[i])
            results.append(words)
    Data = np.array(results)
    Data_Y_train=Data[:,10].astype(int)-1
    Data_X_train = np.delete(Data, 10, axis = 1)
    with open(test_csv_file, 'r') as f:
        results = []
        for line in f:
            words = line.split(',')
            for i in range(len(words)):
                words[i] = float(words[i])
            results.append(words)
    Data_test = np.array(results)
    Data_Y_test = Data_test[:,10].astype(int)-1
    Data_X_test = np.delete(Data_test, 10, axis = 1)
    return Data_X_train,Data_Y_train,Data_X_test,Data_Y_test


In [7]:
Data_X_train,Data_Y_train,Data_X_test,Data_Y_test=get_train_test('p3_train.csv','p3_test.csv')

In [8]:
y_pred=FLDA_classifier(Data_X_train,Data_Y_train,Data_X_test)

In [9]:
print("Metrics for p3 in A1:")
metrics(y_pred,Data_Y_test)


Metrics for p3 in A1:
Classification Accuracy  on Test Data is:  57.14666666666667 

Confusion Matrix is: 
 [[1700.  292.  269.  333.  387.]
 [ 317. 1866.  253.  290.  314.]
 [ 349.  256. 1727.  290.  353.]
 [ 326.  245.  334. 1774.  354.]
 [ 394.  303.  369.  400. 1505.]] 

f1 Score of Class  1  is:  0.5604087687489698
f1 Score of Class  2  is:  0.6217927357547485
f1 Score of Class  3  is:  0.582756875316349
f1 Score of Class  4  is:  0.5797385620915034
f1 Score of Class  5  is:  0.5115567641060503


# A1_p4

In [10]:
Data_X_train,Data_Y_train,Data_X_test,Data_Y_test=get_train_test_split('p4_train.csv',30)

In [11]:
y_pred=FLDA_classifier(Data_X_train,Data_Y_train,Data_X_test)

In [12]:
print("Metrics for p4 in A1:")
metrics(y_pred,Data_Y_test)


Metrics for p4 in A1:
Classification Accuracy  on Test Data is:  88.49444444444444 

Confusion Matrix is: 
 [[1742.    3.   13.   11.    9.    3.    0.    0.    6.   13.]
 [   3. 1712.   14.    2.   42.    0.    1.    1.    1.   24.]
 [   2.    0. 1478.  154.   50.    0.   53.    0.    0.   63.]
 [   1.    0.    7. 1730.   52.    0.    5.    0.    1.    4.]
 [   9.   11.   11.   67. 1684.    1.    3.   13.    1.    0.]
 [   3.    0.   67.   25.    3. 1547.  153.    0.    2.    0.]
 [  10.    0.  181.  137.   12.  302. 1130.    0.    3.   25.]
 [   6.    2.    3.   15.    6.    0.    0. 1669.   48.   51.]
 [   1.    0.    4.   76.    6.    9.   13.   40. 1642.    9.]
 [ 148.    1.   27.   12.    7.    2.    3.    3.    2. 1595.]] 

f1 Score of Class  1  is:  0.9353020134228188
f1 Score of Class  2  is:  0.9702465287616889
f1 Score of Class  3  is:  0.8199722607489597
f1 Score of Class  4  is:  0.8587738893025565
f1 Score of Class  5  is:  0.917461182239172
f1 Score of Class  6  is:  0.8

# <b>A1_p5</b>

In [13]:
Data_X_train,Data_Y_train,Data_X_val,Data_Y_val=get_train_test_split('PCA_MNIST.csv',30)

In [14]:
y_pred=FLDA_classifier(Data_X_train,Data_Y_train,Data_X_val)

In [15]:
print("Metrics for p5 in A1:")
metrics(y_pred,Data_Y_val)

Metrics for p5 in A1:
Classification Accuracy  on Test Data is:  81.74444444444444 

Confusion Matrix is: 
 [[1642.    1.   62.    7.    4.    0.    1.   20.   21.   42.]
 [  21. 1557.   20.    1.  136.    0.   11.    2.    0.   52.]
 [   2.    2. 1473.  147.   28.    3.   75.    0.    2.   68.]
 [   0.    0.   22. 1703.   58.    3.    4.    4.    1.    5.]
 [   0.    7.   92.  107. 1580.    2.    5.    7.    0.    0.]
 [   0.    3.   48.   51.    4. 1169.  521.    0.    1.    3.]
 [  10.    0.  185.  203.    6.  311. 1053.    3.    2.   27.]
 [  10.    3.   10.   29.   19.    2.    1. 1586.   86.   54.]
 [   6.    0.   30.  114.    4.    6.   33.   61. 1512.   34.]
 [ 183.    0.   90.   21.   10.    0.    2.   36.   19. 1439.]] 

f1 Score of Class  1  is:  0.8938486663037561
f1 Score of Class  2  is:  0.9232137563000297
f1 Score of Class  3  is:  0.7687891440501045
f1 Score of Class  4  is:  0.8142481472627301
f1 Score of Class  5  is:  0.8659906823787339
f1 Score of Class  6  is:  0.