# Lab 3: LDA and QDA Classifier with SKLearn
In this lab, I'll use the SKLearn toolkit to implement a Linear Discriminant Analysis Classifier and a Quadratic Discriminant Analysis Classifier. The train error and test errors are printed below. I will also run additional tests to find which variables are least important in classifying the data. 

In [155]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

'''
Much help:
https://stackabuse.com/implementing-lda-in-python-with-scikit-learn/
'''

def test_dataset(data):
    if len(data) != 150:
        print(1)
        return False
    
    for row in data:
        if len(row) != 5:
            print('len', len(row))
            return False
        
        for column in row[:-1]:
            if type(column) != np.float64:
                print(type(column))
                return False
            
        if type(row[-1]) != str:
            print(4)
            return False
    
    return True

def read_data():
    data = pd.read_csv('iris.data', names=['sl','sw','pl','pw','class'], dtype={'sl':np.float64,'sw':np.float64,'pl':np.float64,'pw':np.float64,'class':str})
    return data

def train_test_split(data, x_col, y_col):
    x_all = data.iloc[:,:x_col] if isinstance(x_col, int) else data.iloc[:,x_col]
    y_all = data.iloc[:,y_col]
    X_train = pd.concat([x_all.iloc[0:40], x_all.iloc[50:90], x_all.iloc[100:140]])
    X_test = pd.concat([x_all.iloc[40:50], x_all.iloc[90:100], x_all.iloc[140:150]])
    y_train = pd.concat([y_all.iloc[0:40], y_all.iloc[50:90], y_all.iloc[100:140]])
    y_test = pd.concat([y_all.iloc[40:50], y_all.iloc[90:100], y_all.iloc[140:150]])
    return X_train, X_test, y_train, y_test
    
def lda_classifier(X_train, y_train):
    # create LDA Classifier
    lda = LinearDiscriminantAnalysis(n_components=1)
    
    # train over training data
    lda.fit(X_train, y_train)
    
    # return trained classifier
    return lda

def qda_classifier(X_train, y_train):
    # create QDA Classifier
    qda = QuadraticDiscriminantAnalysis()
    
    # train over training data
    qda.fit(X_train, y_train)
    
    # return trained classifier
    return qda

def prediction_results(pred, test):
    n = len(test)
    assert len(pred) == n
    correct = 0
    for i in range(n):
        if (pred[i] == test[i]):
            correct = correct + 1
    return (1-correct/n)*100

def main():

    # get data as dataframe
    data = read_data()

    # create test and training sets
    X_train, X_test, y_train, y_test = train_test_split(data, 4, 4)

    # create classifier
    lda = lda_classifier(X_train.values, y_train.values)
    qda = qda_classifier(X_train.values, y_train.values)

    # predict outcomes for test data
    lda_train_pred = lda.predict(X_test.values)
    lda_test_pred = lda.predict(X_train.values)
    qda_train_pred = qda.predict(X_test.values)
    qda_test_pred = qda.predict(X_train.values)
    
    print('>========= LDA Results ========<')
    p1 = prediction_results(lda_train_pred, y_test.values)
    p2 = prediction_results(lda_test_pred, y_train.values)
    print('Training error: {:.2f}%\nTesting error: {:.2f}%\n'.format(p1, p2))

    print('>========= QDA Results ========<')
    p3 = prediction_results(qda_train_pred, y_test.values)
    p4 = prediction_results(qda_test_pred, y_train.values)
    print('Training error: {:.2f}%\nTesting error: {:.2f}%\n'.format(p3, p4))
    
    # test excluding each variable
    rng = [0,1,2,3]
    pr = []
    for i in range(4):
        X_train, X_test, y_train, y_test = train_test_split(data, rng[:i]+rng[i+1:], 4)
        clfr = qda_classifier(X_train.values, y_train.values)
        pred = (clfr.predict(X_train.values), clfr.predict(X_test.values))
        err = (prediction_results(pred[0], y_train.values), prediction_results(pred[1], y_test.values))
        print('Training error while excluding variable {}: {:.2f}%'.format(i+1,err[0]))
        print('Testing error while excluding variable {}: {:.2f}%'.format(i+1,err[1])) 
    
main()

Training error: 0.00%
Testing error: 2.50%

Training error: 0.00%
Testing error: 1.67%

Training error while excluding variable 1: 2.50%
Testing error while excluding variable 1: 0.00%
Training error while excluding variable 2: 1.67%
Testing error while excluding variable 2: 0.00%
Training error while excluding variable 3: 4.17%
Testing error while excluding variable 3: 0.00%
Training error while excluding variable 4: 5.83%
Testing error while excluding variable 4: 3.33%


Based on my testing, I've concluded that sepal length and sepal width individually non-important to classification, since their exclusion only caused a minimal training error and a 0% testing error. Petal width is the most important variable, since the training error spiked up to 5.83% and the testing error to 3.33% when it was excluded.