## Lab 4 Austin Nguyen

October 18, 2024: Friday 2:15 - 5:00 PM


In [2]:
import numpy as np
import time

# Load the dataset from given file
def load_csv(filename):
    data = []
    with open(filename, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                split_line = line.split(',')
                features = list(map(float, split_line[:4]))  # Convert first 4 columns to float
                label = split_line[4]  # Class label as a string
                data.append(features + [label])
    return data

# Split dataset into training and testing sets (80% training, 20% test)
def split_data(data):
    train_data = []
    test_data = []
    
    # Split per class
    classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    for class_label in classes:
        class_data = [row for row in data if row[-1] == class_label]
        train_data += class_data[:40]  # First 40 instances for training
        test_data += class_data[40:]  # Remaining 10 for testing

    return train_data, test_data

# Separate features and labels
def separate_features_labels(dataset):
    features = np.array([row[:-1] for row in dataset], dtype=np.float64)
    labels = np.array([row[-1] for row in dataset])
    return features, labels

# Mean and covariance per class
def compute_class_stats(features, labels):
    class_stats = {}
    for class_label in np.unique(labels):
        class_features = features[labels == class_label]
        mean = np.mean(class_features, axis=0)
        cov = np.cov(class_features, rowvar=False)
        class_stats[class_label] = {"mean": mean, "cov": cov}
    return class_stats

# Shared covariance matrix for LDA
def compute_shared_cov(features, labels, class_stats):
    n_samples = len(features)
    shared_cov = np.zeros((features.shape[1], features.shape[1]))
    
    for class_label in np.unique(labels):
        class_cov = class_stats[class_label]["cov"]
        n_class_samples = len(features[labels == class_label])
        shared_cov += (n_class_samples / n_samples) * class_cov  # Weighted sum
    
    return shared_cov

# Binary classification: 1 for the class of interest, 0 for the other 2
def binary_classification(train_labels, test_labels, target_class):
    binary_train_labels = np.where(train_labels == target_class, 1, 0)  # Label 1 for target, 0 for others
    binary_test_labels = np.where(test_labels == target_class, 1, 0)    # Label 1 for target, 0 for others
    return binary_train_labels, binary_test_labels

# LDA classifier implementation
class LDA:
    def __init__(self):
        self.class_stats = None
        self.priors = None
        self.shared_cov = None

    def fit(self, X, y):
        self.class_stats = compute_class_stats(X, y)
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.priors = class_counts / len(y)
        self.shared_cov = compute_shared_cov(X, y, self.class_stats)

    def predict(self, X):
        predictions = []
        cov_inv = np.linalg.inv(self.shared_cov)
        
        for x in X:
            class_probs = []
            for idx, class_label in enumerate(self.class_stats.keys()):
                mean = self.class_stats[class_label]["mean"]
                prior = self.priors[idx]

                # Compute the LDA discriminant function using the shared covariance matrix
                discriminant = np.dot(np.dot(x, cov_inv), mean) - 0.5 * np.dot(np.dot(mean.T, cov_inv), mean) + np.log(prior)
                class_probs.append(discriminant)
                
            predictions.append(list(self.class_stats.keys())[np.argmax(class_probs)])
        return np.array(predictions)

# QDA classifier implementation (same from lab 3)
class QDA:
    def __init__(self):
        self.class_stats = None
        self.priors = None

    def fit(self, X, y):
        self.class_stats = compute_class_stats(X, y)
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.priors = class_counts / len(y)

    def predict(self, X):
        predictions = []
        for x in X:
            class_probs = []
            for idx, class_label in enumerate(self.class_stats.keys()):
                mean = self.class_stats[class_label]["mean"]
                cov = self.class_stats[class_label]["cov"]
                cov_inv = np.linalg.inv(cov)
                prior = self.priors[idx]
                
                # Compute the QDA discriminant function
                discriminant = -0.5 * np.dot(np.dot((x - mean).T, cov_inv), (x - mean)) \
                               - 0.5 * np.log(np.linalg.det(cov)) + np.log(prior)
                class_probs.append(discriminant)
            predictions.append(list(self.class_stats.keys())[np.argmax(class_probs)])
        return np.array(predictions)


# Modified QDA to assume diagonal covariance matrix
class DiagonalQDA:
    def __init__(self):
        self.class_stats = None
        self.priors = None

    def fit(self, X, y):
        self.class_stats = compute_class_stats(X, y)
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.priors = class_counts / len(y)

    def predict(self, X):
        predictions = []
        for x in X:
            class_probs = []
            for idx, class_label in enumerate(self.class_stats.keys()):
                mean = self.class_stats[class_label]["mean"]
                cov = self.class_stats[class_label]["cov"]
                
                # Assume diagonal covariance matrix: Use only the diagonal elements (variances)
                diagonal_cov_inv = np.diag(1 / np.diag(cov))
                prior = self.priors[idx]

                # Compute the QDA discriminant function with diagonal covariance
                discriminant = -0.5 * np.dot(np.dot((x - mean).T, diagonal_cov_inv), (x - mean)) \
                               - 0.5 * np.sum(np.log(np.diag(cov))) + np.log(prior)
                class_probs.append(discriminant)
            predictions.append(list(self.class_stats.keys())[np.argmax(class_probs)])
        return np.array(predictions)
    
# Calculate accuracy
def accuracy(predictions, labels):
    return np.mean(predictions == labels) * 100

# Main workflow
if __name__ == "__main__":
    filename = 'iris.data.csv'
    data = load_csv(filename)

    # Split the data into training and test sets
    train_data, test_data = split_data(data)

    # Separate features and labels for training and testing sets
    train_features, train_labels = separate_features_labels(train_data)
    test_features, test_labels = separate_features_labels(test_data)

    # LDA
    lda = LDA()
    lda.fit(train_features, train_labels)
    train_predictions_lda = lda.predict(train_features)
    test_predictions_lda = lda.predict(test_features)

    print(f"LDA Training Accuracy: {accuracy(train_predictions_lda, train_labels):.2f}%")
    print(f"LDA Test Accuracy: {accuracy(test_predictions_lda, test_labels):.2f}%")

    # Original QDA
    qda = QDA()
    start_time_qda = time.time()
    qda.fit(train_features, train_labels)
    train_predictions_qda = qda.predict(train_features)
    test_predictions_qda = qda.predict(test_features)
    end_time_qda = time.time()

    print(f"QDA Training Accuracy: {accuracy(train_predictions_qda, train_labels):.2f}%")
    print(f"QDA Test Accuracy: {accuracy(test_predictions_qda, test_labels):.2f}%")
    print(f"Original QDA Training Time: {end_time_qda - start_time_qda:.4f} seconds")

    # Diagonal QDA
    diagonal_qda = DiagonalQDA()
    start_time_diag_qda = time.time()
    diagonal_qda.fit(train_features, train_labels)
    train_predictions_diag_qda = diagonal_qda.predict(train_features)
    test_predictions_diag_qda = diagonal_qda.predict(test_features)
    end_time_diag_qda = time.time()

    print(f"Diagonal QDA Training Accuracy: {accuracy(train_predictions_diag_qda, train_labels):.2f}%")
    print(f"Diagonal QDA Test Accuracy: {accuracy(test_predictions_diag_qda, test_labels):.2f}%")
    print(f"Diagonal QDA Training Time: {end_time_diag_qda - start_time_diag_qda:.4f} seconds")


    classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

    for target_class in classes:
        print(f"\nBinary Classification: {target_class} vs Rest")
        
        # Prepare binary labels for classification
        binary_train_labels, binary_test_labels = binary_classification(train_labels, test_labels, target_class)

        # Train LDA model
        lda = LDA()
        lda.fit(train_features, binary_train_labels)

        # Make predictions
        train_predictions = lda.predict(train_features)
        test_predictions = lda.predict(test_features)

        # Calculate and print accuracies
        train_accuracy = accuracy(train_predictions, binary_train_labels)
        test_accuracy = accuracy(test_predictions, binary_test_labels)

        print(f"{target_class} Training Accuracy: {train_accuracy:.2f}%")
        print(f"{target_class} Test Accuracy: {test_accuracy:.2f}%")

# # Binary classification: 1 for the class of interest, 0 for the other 2
# def binary_classification(train_labels, test_labels, target_class):
#     binary_train_labels = np.where(train_labels == target_class, 1, 0)  # Label 1 for target, 0 for others
#     binary_test_labels = np.where(test_labels == target_class, 1, 0)    # Label 1 for target, 0 for others
#     return binary_train_labels, binary_test_labels



LDA Training Accuracy: 97.50%
LDA Test Accuracy: 100.00%
QDA Training Accuracy: 98.33%
QDA Test Accuracy: 100.00%
Original QDA Training Time: 0.0065 seconds
Diagonal QDA Training Accuracy: 95.83%
Diagonal QDA Test Accuracy: 100.00%
Diagonal QDA Training Time: 0.0048 seconds

Binary Classification: Iris-setosa vs Rest
Iris-setosa Training Accuracy: 100.00%
Iris-setosa Test Accuracy: 100.00%

Binary Classification: Iris-versicolor vs Rest
Iris-versicolor Training Accuracy: 73.33%
Iris-versicolor Test Accuracy: 76.67%

Binary Classification: Iris-virginica vs Rest
Iris-virginica Training Accuracy: 90.83%
Iris-virginica Test Accuracy: 100.00%


***Part 4 Conclusions for Binary Classification:***

Iris-setosa had 100% accuracy for both the testing and training, which implies that there is a defined line that can be drawn between this class of data and the other 2. The accuracies of the other 2 classes, Versicolor and Virginica, support the claim, since their percentages are smaller, it was harder to draw defined lines between the data sets of each. 

***Part 5 Runtime with Diagonal Cov Matrix:***

The runtime of the QDA with the diagonal matrix was smaller because we are only concerned with 4/16 of the possible elements of the matrix compared to all 16 elements of the 4 x 4 matrix of the original QDA

