In [77]:
import numpy as np
import scipy
from save_csv import results_to_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [78]:
class QDA:
    def __init__(self, data):
        self.gausses = []
        for label in np.unique(data[:, -1]):
            subset = data[data[:, -1] == label]
            subset = subset[:, 0:-1]
            gauss = MLEGaussian(subset=subset, label=label, data_len=len(data))
            self.gausses.append(gauss)
        
    def predict(self, X):
        # matrix to store discriminant scores for each sample and class so we can vectorize prediction
        scores = np.zeros((X.shape[0], len(self.gausses)))
        
        for digit, gauss in enumerate(self.gausses):
            mu = gauss.mu
            pi_c = gauss.pi_c
            cov_matrix = gauss.cov_matrix

            score = scipy.stats.multivariate_normal.logpdf(X, allow_singular=True, cov=cov_matrix, mean=mu)
            + np.log(pi_c)
            scores[:, digit] = score

            
        return np.argmax(scores, axis=1)

class MLEGaussian:
    def __init__(self, subset, label, data_len):
        self.subset = subset
        self.label = label
        self.pi_c = len(subset) / data_len
        
        # find mean vector and covariance matrix
        self.mu = np.mean(self.subset, axis=0)
        self.cov_matrix = np.cov(self.subset, rowvar=False)
        self.var = np.var(self.subset)
        
        # add regularization term to covariance matrix
        regularization_term = 1e-3 * np.eye(self.cov_matrix.shape[0])
        self.cov_matrix += regularization_term


In [83]:
np.random.seed(189)
full_data = np.load("../data/spam-data-hw3.npz")
training_data = full_data["training_data"]
training_labels = full_data["training_labels"]
X_train, X_test, y_train, y_test = train_test_split(training_data, training_labels, test_size=0.2, random_state=189)

In [84]:
joined_train = np.column_stack((X_train, y_train))
spam_test = full_data["test_data"]
qda = QDA(joined_train)
predictions = qda.predict(spam_test)
# accuracy_score(y_test, predictions)

In [85]:
results_to_csv(predictions, "spampredictions.csv")

In [82]:
full_data.files

['training_data', 'training_labels', 'test_data']

In [None]:
class LDA:
    def __init__(self, data):
        self.gausses = []
        for label in np.unique():
            subset = data[data[:, -1] == label]
            subset = subset[:, 0:-1]
            gauss = MLEGaussian(subset=subset, digit=label, data_len=len(data))
            self.gausses.append(gauss)
        
        # calculate pooled covariance matrix
        pooled_cov_matrix = np.zeros((784, 784))
        for gauss in self.gausses:
            subpool = (len(gauss.subset) - 1) * gauss.cov_matrix
            pooled_cov_matrix = pooled_cov_matrix + subpool
        pooled_cov_matrix = pooled_cov_matrix / (len(data) - 10)
        
        # add regularization term so matrix is invertible
        regularization_term = 1e-5 * np.eye(pooled_cov_matrix.shape[0])
        self.pooled_cov_matrix = pooled_cov_matrix + regularization_term
        self.pooled_cov_matrix_inv = np.linalg.pinv(self.pooled_cov_matrix)
        
    def predict(self, X):
        if X.shape[1] == 785:
            X = X[:, 0:-1] 
            
        # matrix to store discriminant scores for each sample and class so we can vectorize prediction
        scores = np.zeros((X.shape[0], len(self.gausses)))
        
        for digit, gauss in enumerate(self.gausses):
            mu = gauss.mu
            pi_c = gauss.pi_c
            
            # little out of order since we flattened mu in gaussian
            score = scipy.stats.multivariate_normal.logpdf(X, allow_singular=True, cov=self.pooled_cov_matrix, mean=mu)+ np.log(pi_c)

            scores[:, digit] = score
            
        return np.argmax(scores, axis=1)

In [87]:
lda = LDA(joined_train)
predictions = lda.predict(X_test)
accuracy_score(predictions, y_test)
# predictions = lda.predict(spam_test)


TypeError: MLEGaussian.__init__() got an unexpected keyword argument 'digit'