***Lab 3 Austin Nguyen***

Friday, October 11, 2024, 2:15–5:00 PM


In [36]:
#Imports
import numpy as np
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.metrics import accuracy_score

# Load the dataset from given file
def load_csv(filename):
    """Read a comma-separated data file into a list of rows.

    Blank lines are skipped. For every other line the first four fields are
    converted to ``np.float64`` and the fifth field is kept as the class
    label string; any further fields are ignored.

    The features are ``np.float64`` (not built-in ``float``) because
    ``test_dataset`` only accepts ``np.float64`` feature values. Since
    ``np.float64`` subclasses ``float``, callers that expected plain floats
    keep working.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each ``[f0, f1, f2, f3, label]``.

    Raises:
        ValueError: If one of the first four fields is not a number.
        IndexError: If a non-blank line has fewer than five fields.
    """
    data = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline
            split_line = line.split(',')
            # Parse via float() first so the accepted number syntax is
            # exactly what it was before, then wrap as np.float64.
            features = [np.float64(float(value)) for value in split_line[:4]]
            label = split_line[4]  # Class label as a string
            data.append(features + [label])
    return data

# Test the data format using the given function
def test_dataset(data, expected_rows=150):
    """Check that *data* has the row layout the lab expects.

    A dataset is valid when it has exactly ``expected_rows`` rows and every
    row has five entries: four ``np.float64`` feature values followed by a
    ``str`` class label.

    Args:
        data: Sequence of rows to validate.
        expected_rows: Required number of rows. Defaults to 150.

    Returns:
        ``True`` if every check passes, ``False`` otherwise.
    """
    if len(data) != expected_rows:
        return False

    for row in data:
        if len(row) != 5:
            return False

        # isinstance (rather than comparing type objects) is the idiomatic
        # check; a built-in float is still rejected because it is not an
        # np.float64 instance.
        if not all(isinstance(column, np.float64) for column in row[:-1]):
            return False

        if not isinstance(row[-1], str):
            return False

    return True


# This is a data validator, not a unit test: opt out of collection by test
# runners that pick up every callable named ``test_*``.
test_dataset.__test__ = False

# Split dataset into training and testing sets (80% training, 20% test)
def split_data(data, train_fraction=0.8):
    """Split *data* per class into a training and a test set.

    For each of the three Iris classes, the rows carrying that label are
    taken in their original order; the leading ``train_fraction`` of them go
    to the training set and the rest to the test set. With 50 rows per class
    and the default fraction this is the first 40 rows for training and the
    remaining 10 for testing. Rows whose label is not one of the three
    classes are left out of both sets.

    Args:
        data: Sequence of rows whose last entry is the class label.
        train_fraction: Share of each class used for training, between 0
            and 1 inclusive. Defaults to 0.8.

    Returns:
        A ``(train_data, test_data)`` tuple of row lists.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )

    train_data = []
    test_data = []

    # Split per class
    classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    for class_label in classes:
        class_data = [row for row in data if row[-1] == class_label]
        # Size the cut from the actual class size instead of assuming 50
        # rows; the small epsilon guards against float round-off pushing an
        # exact product (e.g. 12.000000000000002 or 2.9999999999999996)
        # onto the wrong side of the integer.
        cut = int(len(class_data) * train_fraction + 1e-9)
        train_data += class_data[:cut]
        test_data += class_data[cut:]

    return train_data, test_data

# Separate features and labels
def separate_features_labels(dataset):
    """Split rows into a feature matrix and a label vector.

    Every entry of a row except the last is treated as a feature and the
    last entry as the label.

    Args:
        dataset: Sequence of rows shaped ``[feature, ..., label]``.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is a ``np.float64``
        array built from the feature entries and ``labels`` is an array
        built from the last entry of each row.
    """
    feature_rows = [record[:-1] for record in dataset]
    features = np.array(feature_rows, dtype=np.float64)

    label_column = [record[-1] for record in dataset]
    labels = np.array(label_column)

    return features, labels

# Step 2: Linear Discriminant Analysis (LDA)
def train_lda(train_features, train_labels):
    """Fit a default-configured LDA classifier and return it.

    Args:
        train_features: Training feature array; cast to ``np.float64``
            before fitting.
        train_labels: Class label for each training row.

    Returns:
        The fitted ``LinearDiscriminantAnalysis`` estimator.
    """
    features_f64 = train_features.astype(np.float64)
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return LinearDiscriminantAnalysis().fit(features_f64, train_labels)

# Step 3: Quadratic Discriminant Analysis (QDA)
def train_qda(train_features, train_labels):
    """Fit a default-configured QDA classifier and return it.

    Args:
        train_features: Training feature array; cast to ``np.float64``
            before fitting.
        train_labels: Class label for each training row.

    Returns:
        The fitted ``QuadraticDiscriminantAnalysis`` estimator.
    """
    features_f64 = train_features.astype(np.float64)
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return QuadraticDiscriminantAnalysis().fit(features_f64, train_labels)

# Function to calculate accuracy when missing one feature
def featureaccuracy(model, train_features, train_labels, test_features, test_labels,
                    feature_names=None):
    """Print accuracy with all features, then with each feature left out.

    The already-fitted *model* is scored on the full training and test
    features. Then, for each feature column in turn, a fresh copy of the
    model is fitted on the data without that column and its training and
    test accuracy are printed.

    The passed-in *model* is never refitted: each reduced fit runs on a
    clone, so the caller's model stays trained on the full feature set
    after this function returns.

    Args:
        model: Fitted classifier providing ``predict``; must be clonable
            (a scikit-learn estimator, or any deep-copyable object with
            ``fit``/``predict``).
        train_features: 2-D array of training features.
        train_labels: Class label for each training row.
        test_features: 2-D array of test features with the same columns.
        test_labels: Class label for each test row.
        feature_names: Optional display name per feature column. Defaults
            to the four Iris feature names; columns without a name are
            reported as ``feature <index>``.

    Returns:
        None. Results are printed.
    """
    from sklearn.base import clone

    if feature_names is None:
        feature_names = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]

    full_train_accuracy = accuracy_score(train_labels, model.predict(train_features))
    full_test_accuracy = accuracy_score(test_labels, model.predict(test_features))

    print("\nAccuracy with all features:")
    print(f"Training accuracy: {full_train_accuracy * 100:.2f}%")
    print(f"Test accuracy: {full_test_accuracy * 100:.2f}%")

    # Iterate over each feature and remove it
    for i in range(train_features.shape[1]):
        # Remove feature i from the dataset
        reduced_train_features = np.delete(train_features, i, axis=1)
        reduced_test_features = np.delete(test_features, i, axis=1)

        # Fit an unfitted copy with the same hyper-parameters so the
        # caller's model is left untouched. safe=False lets non-estimator
        # objects fall back to a deep copy instead of raising.
        reduced_model = clone(model, safe=False)
        reduced_model.fit(reduced_train_features, train_labels)

        # Calculate accuracy with the reduced feature set
        reduced_train_accuracy = accuracy_score(
            train_labels, reduced_model.predict(reduced_train_features)
        )
        reduced_test_accuracy = accuracy_score(
            test_labels, reduced_model.predict(reduced_test_features)
        )

        # Fall back to a generic name when the data has more columns than
        # there are names.
        removed_name = feature_names[i] if i < len(feature_names) else f"feature {i}"

        # Print the results for the feature removed
        print(f"\nAccuracy without {removed_name}:")
        print(f"Training accuracy: {reduced_train_accuracy * 100:.2f}%")
        print(f"Test accuracy: {reduced_test_accuracy * 100:.2f}%")

# Main workflow: load the Iris data, split it per class, then report LDA and
# QDA accuracy on the training and test sets.
if __name__ == "__main__":
    filename = 'iris.data.csv'  
    data = load_csv(filename)

    # Split the data into training and test sets
    train_data, test_data = split_data(data)

    # Separate features and labels for training and testing sets
    train_features, train_labels = separate_features_labels(train_data)
    test_features, test_labels = separate_features_labels(test_data)

    # Train the LDA model on all features
    lda_classifier = train_lda(train_features, train_labels)

    # Feature importance for LDA: print accuracy with all features, then with
    # each feature left out in turn
    print("\n LDA Feature Importance")
    featureaccuracy(lda_classifier, train_features, train_labels, test_features, test_labels)

    # QDA model, trained on all features
    qda_classifier = train_qda(train_features, train_labels)
    
    # Predict on both sets so training and test accuracy can be compared
    train_predictions_qda = qda_classifier.predict(train_features)
    test_predictions_qda = qda_classifier.predict(test_features)

    train_accuracy_qda = accuracy_score(train_labels, train_predictions_qda)
    test_accuracy_qda = accuracy_score(test_labels, test_predictions_qda)

    # accuracy_score returns a fraction; scale to a percentage for display
    print(f"\nQDA Training Accuracy: {train_accuracy_qda * 100:.2f}%")
    print(f"QDA Test Accuracy: {test_accuracy_qda * 100:.2f}%")




 LDA Feature Importance

Accuracy with all features:
Training accuracy: 97.50%
Test accuracy: 100.00%

Accuracy without Sepal Length:
Training accuracy: 98.33%
Test accuracy: 100.00%

Accuracy without Sepal Width:
Training accuracy: 97.50%
Test accuracy: 100.00%

Accuracy without Petal Length:
Training accuracy: 94.17%
Test accuracy: 100.00%

Accuracy without Petal Width:
Training accuracy: 95.83%
Test accuracy: 96.67%

QDA Training Accuracy: 98.33%
QDA Test Accuracy: 100.00%


***Part 4 Feature Importance Analysis***


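When the features were removed one at a time, dropping Petal Length or Petal Width lowered the LDA accuracy: training accuracy fell from 97.50% to 94.17% and 95.83%, respectively, and removing Petal Width also lowered test accuracy from 100.00% to 96.67%. Dropping Sepal Length or Sepal Width did not lower either accuracy (training accuracy stayed at 97.50% or rose slightly to 98.33%, and test accuracy stayed at 100.00%). The petal measurements are therefore the more important features for this classifier.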

