# Task 2 : QDA with updated means

In [9]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.decomposition import PCA
from scipy.stats import chi2
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer
import torchvision.models as models
import torch.nn as nn
from torchvision import transforms
import pickle
from scipy.stats import multivariate_normal
from scipy.special import logsumexp
warnings.filterwarnings("ignore")

## Loading and preprocessing the dataset

Here's our preprocessing pipeline - same as before.

In [3]:
class PreprocessingPipeline:
    """Four-stage feature preprocessing: scale -> Yeo-Johnson -> PCA -> re-scale.

    The first three stages are delegated to ``StandardScaler``,
    ``PowerTransformer(method='yeo-johnson')`` and ``PCA``.  The last stage
    standardizes the PCA scores with the per-component mean and standard
    deviation measured on the data passed to ``fit_transform``.
    """

    def __init__(self, n_components=50):
        # Statistics of the PCA scores; populated by fit_transform().
        self.pca_mean = None
        self.pca_std = None
        self.pca = PCA(n_components=n_components)
        self.transformer = PowerTransformer(method='yeo-johnson')
        self.scaler = StandardScaler()

    def _standardize_scores(self, scores):
        """Center and scale PCA scores with the stored training statistics."""
        return (scores - self.pca_mean) / self.pca_std

    def fit_transform(self, X_train):
        """
        Fit every stage on the training data and return the transformed data.

        Parameters:
        - X_train: ndarray of shape (n_samples, n_features), training data.

        Returns:
        - Standardized PCA scores of the training data.
        """
        scaled = self.scaler.fit_transform(X_train)
        gaussianized = self.transformer.fit_transform(scaled)
        scores = self.pca.fit_transform(gaussianized)

        # Remember the score statistics so transform() can reuse them later.
        self.pca_mean = np.mean(scores, axis=0)
        self.pca_std = np.std(scores, axis=0)
        return self._standardize_scores(scores)

    def transform(self, X):
        """
        Run new data through the already-fitted stages.

        Parameters:
        - X: ndarray of shape (n_samples, n_features), data to preprocess.

        Returns:
        - PCA scores of X, standardized with the statistics stored by
          fit_transform().
        """
        scores = self.pca.transform(
            self.transformer.transform(self.scaler.transform(X))
        )
        return self._standardize_scores(scores)

We preprocess all 20 train and eval datasets.

In [None]:
# The pipeline is fitted on the first training set only; every other train and
# eval set is transformed with those already-fitted stages.
# Paths use forward slashes: they work on every OS, whereas "\X" inside a
# non-raw string is an invalid escape sequence and only resolves on Windows.
pipeline = PreprocessingPipeline()
X_train = [pipeline.fit_transform(torch.load('extracted_data/X_train_1.pth'))]
X_train += [pipeline.transform(torch.load(f'extracted_data/X_train_{i}.pth')) for i in range(2, 21)]
X_eval = [pipeline.transform(torch.load(f'extracted_data/X_eval_{i}.pth')) for i in range(1, 21)]

## The modified QDA Model

We use the same QDA model as in the previous task, but update it with the `mod_update` function instead of the regular `update` function. In `mod_update`, we first generate pseudo-labels for the new dataset (which comes from a different distribution) and then re-estimate the class parameters from it. When updating, we give a weight of 0.1 to the new parameters and 0.9 to the old parameters, instead of the number-of-examples weighted approach we used before. The value of 0.1 was chosen by manual hyperparameter tuning. We use fixed weights because otherwise the new datasets received very little weight as the number of examples already seen by the model grew.

We had also tried 2 other approaches - i) Pseudodataset generation, data augmentation, followed by expectation maximization algorithm and ii) Minimizing the KL Divergence loss between the target domain (new dataset) and source domain (pseudodataset generated by trained model on previous datasets). They are described in the Task2_Experimentation notebook.

In [10]:
class QDAClassifier:
    """Quadratic discriminant analysis with self-training style updates.

    Each class is modelled by a Gaussian (mean + full covariance) and a prior.
    ``fit`` estimates them from labelled data; ``update`` and ``mod_update``
    refine them from unlabelled data using the model's own predictions as
    pseudo-labels.
    """

    def __init__(self):
        self.class_means = {}        # class label -> mean vector
        self.class_covariances = {}  # class label -> covariance matrix
        self.class_priors = {}       # class label -> prior probability
        self.class_counts = {}       # class label -> number of samples absorbed
        self.total_samples = 0       # number of samples absorbed over all classes

    def fit(self, X, y):  # For getting the means, covariances and class priors from our initial training data
        """
        Fits the QDA model to the data, discarding any previously stored statistics.

        Parameters:
        - X: ndarray of shape (n_samples, n_features), training data
        - y: ndarray of shape (n_samples,), class labels
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from a clean slate so a refit cannot keep classes from an older fit.
        self.class_means = {}
        self.class_covariances = {}
        self.class_priors = {}
        self.class_counts = {}
        self.total_samples = X.shape[0]

        for c in np.unique(y):
            # Get data points belonging to class c
            class_data = X[y == c]
            class_count = class_data.shape[0]

            # Compute class-specific statistics
            self.class_means[c] = np.mean(class_data, axis=0)
            self.class_covariances[c] = np.cov(class_data, rowvar=False)
            self.class_priors[c] = class_count / self.total_samples
            self.class_counts[c] = class_count

    def predict(self, X):  # Predicting the class of test examples by computing posterior
        """
        Predict the class labels for a dataset X.

        The comparison is done on log P(x | y = c) + log P(y = c).  Raw
        densities underflow to 0 in high dimension, which would make every
        class tie and the first class win regardless of the data.

        Parameters:
        - X: ndarray of shape (n_samples, n_features), the input data matrix.

        Returns:
        - predictions: ndarray of shape (n_samples,), the predicted class labels for each sample.

        Raises:
        - ValueError: if X is non-empty and the model holds no classes.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            return np.zeros(0, dtype=int)

        # Get the classes from the keys of class_means
        classes = list(self.class_means.keys())
        if not classes:
            raise ValueError("QDAClassifier has no classes; call fit() first")

        log_posteriors = np.empty((n_samples, len(classes)))
        for k, c in enumerate(classes):
            # logpdf squeezes its output for a single row, hence the reshape.
            log_likelihood = np.reshape(
                multivariate_normal.logpdf(
                    X, mean=self.class_means[c], cov=self.class_covariances[c]
                ),
                -1,
            )
            with np.errstate(divide="ignore"):  # a zero prior maps to -inf
                log_prior = np.log(self.class_priors[c])
            log_posteriors[:, k] = log_likelihood + log_prior

        # Predict the class with the highest (log-)posterior
        best = np.argmax(log_posteriors, axis=1)
        return np.asarray(classes)[best].astype(int)

    def get_class_statistics(self):  # Getting the current stored mean, covariance and prior for each class
        """
        Returns the learned statistics for each class.

        Returns:
        - dict with keys 'means', 'covariances', 'priors' and 'counts', each
          mapping to the corresponding per-class dict stored on the model
          (the dicts themselves, not copies).
        """
        return {
            'means': self.class_means,
            'covariances': self.class_covariances,
            'priors': self.class_priors,
            'counts': self.class_counts,
        }

    def update(self, X):  # For updating model f_i to f_i+1 with count-weighted averaging
        """
        Update the model from unlabelled data using count-weighted averaging.

        The samples are pseudo-labelled with predict().  Each class mean and
        covariance becomes the average of the old and new estimate, weighted by
        the number of samples seen so far and the number of new samples.

        Parameters:
        - X: ndarray of shape (n_samples, n_features), unlabelled data.
        """
        X = np.asarray(X)
        n_new = X.shape[0]
        if n_new == 0:
            return

        predicted_labels = self.predict(X)

        for c in self.class_means:
            # Get new samples for class c
            new_samples = X[predicted_labels == c]
            new_count = new_samples.shape[0]
            if new_count == 0:
                continue  # No new samples for this class

            current_count = self.class_counts[c]
            merged_count = current_count + new_count

            # Update mean
            self.class_means[c] = (
                self.class_means[c] * current_count + new_samples.sum(axis=0)
            ) / merged_count

            # Update covariance.  np.cov of a single sample is NaN, so a lone
            # pseudo-labelled point only moves the mean.
            if new_count > 1:
                scatter_current = self.class_covariances[c] * current_count
                scatter_new = np.cov(new_samples, rowvar=False) * new_count
                self.class_covariances[c] = (scatter_current + scatter_new) / merged_count

            self.class_counts[c] = merged_count

        # Priors are recomputed for every class against the new total, so they
        # keep summing to 1 even for classes that received no new samples.
        self.total_samples += n_new
        for c, count in self.class_counts.items():
            self.class_priors[c] = count / self.total_samples

    def mod_update(self, X, h):  # For updating model f_i to f_i+1 with a fixed weight h on the new estimates
        """
        Update the model from unlabelled data using a fixed blending weight.

        The samples are pseudo-labelled with predict().  Every statistic
        becomes (1 - h) * old + h * new, where "new" is estimated from the
        pseudo-labelled samples alone.

        Parameters:
        - X: ndarray of shape (n_samples, n_features), unlabelled data.
        - h: float, weight given to the statistics of the new data.
        """
        X = np.asarray(X)
        n_new = X.shape[0]
        if n_new == 0:
            return

        predicted_labels = self.predict(X)

        for c in self.class_means:
            # Get new samples for class c
            new_samples = X[predicted_labels == c]
            new_count = new_samples.shape[0]

            # Blend the prior with the class fraction observed in the new data.
            # This is applied to every class (a class with no new samples
            # contributes a fraction of 0) so the priors remain a probability
            # distribution instead of a mix of counts and probabilities.
            self.class_priors[c] = (
                self.class_priors[c] * (1 - h) + h * new_count / n_new
            )

            if new_count == 0:
                continue  # No new samples for this class

            # Update mean
            self.class_means[c] = (
                self.class_means[c] * (1 - h) + h * new_samples.mean(axis=0)
            )

            # Update covariance.  np.cov of a single sample is NaN, so a lone
            # pseudo-labelled point only moves the mean.
            if new_count > 1:
                self.class_covariances[c] = (
                    self.class_covariances[c] * (1 - h)
                    + np.cov(new_samples, rowvar=False) * h
                )

            self.class_counts[c] += new_count

        self.total_samples += n_new

    def generate_samples(self, num_samples):  # Not used in this approach, explained in Experimentation
        """
        Generate synthetic samples using the learned class distributions.

        Each class receives round(prior * num_samples) samples, so the total
        can differ from num_samples by rounding.

        Parameters:
        - num_samples: int, requested number of synthetic samples.

        Returns:
        - X_generated: ndarray of shape (n_generated, n_features), the generated samples.
        - y_generated: ndarray of shape (n_generated,), the corresponding class labels.
        """
        # Initialize storage for generated samples and labels
        X_generated = []
        y_generated = []

        # Generate samples for each class based on the prior probabilities
        for c, prior in self.class_priors.items():
            # Number of samples to generate for this class
            class_samples = int(np.round(prior * num_samples))

            # Sample from the Gaussian distribution for this class
            mean = self.class_means[c]
            cov = self.class_covariances[c]
            generated = np.random.multivariate_normal(mean, cov, size=class_samples)

            # Append to the result
            X_generated.append(generated)
            y_generated.extend([c] * class_samples)

        # Concatenate and shuffle to create the final dataset
        X_generated = np.vstack(X_generated)
        y_generated = np.array(y_generated)
        indices = np.arange(len(y_generated))
        np.random.shuffle(indices)

        return X_generated[indices], y_generated[indices]

In [44]:
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
with open("final_f10.pkl", "rb") as f:
    qdm = pickle.load(f)  # Loading our saved model

# Adapt the model to training sets 11..20 (indices 10..19), saving a snapshot
# after each update.
for i in range(10, 20):
    qdm.mod_update(X_train[i], 0.1)
    with open(f"f{i+1}.pkl", "wb") as f:
        pickle.dump(qdm, f)  # Saving the new models
    print(f"done {i+1} set")

# Reload the snapshots: qdm itself was mutated in place, so each model state
# has to come from its own file.
model_list = []
for i in range(10, 20):
    with open(f'f{i+1}.pkl', 'rb') as f:
        model_list.append(pickle.load(f))

#Evaluating the models
# Row i holds model f_{i+11}; it is scored on eval sets 1..i+11, the remaining
# columns stay 0.
# NOTE(review): y_eval is not defined in the cells shown here - it must be
# loaded before this cell runs.
accuracies = np.zeros((10, 20))
for i in range(10):
    for j in range(i+11):
        prediction = model_list[i].predict(X_eval[j])
        accuracies[i][j] = np.mean(prediction == y_eval[j])*100
        print(f"done model {i+1}, set {j+1} ")

print(accuracies)
with open('accuracies.pkl', 'wb') as f:
    pickle.dump(accuracies, f)

done 11 set
done 12 set
done 13 set
done 14 set
done 15 set
done 16 set
done 17 set
done 18 set
done 19 set
done 20 set
done model 1, set 1 
done model 1, set 2 
done model 1, set 3 
done model 1, set 4 
done model 1, set 5 
done model 1, set 6 
done model 1, set 7 
done model 1, set 8 
done model 1, set 9 
done model 1, set 10 
done model 1, set 11 
done model 2, set 1 
done model 2, set 2 
done model 2, set 3 
done model 2, set 4 
done model 2, set 5 
done model 2, set 6 
done model 2, set 7 
done model 2, set 8 
done model 2, set 9 
done model 2, set 10 
done model 2, set 11 
done model 2, set 12 
done model 3, set 1 
done model 3, set 2 
done model 3, set 3 
done model 3, set 4 
done model 3, set 5 
done model 3, set 6 
done model 3, set 7 
done model 3, set 8 
done model 3, set 9 
done model 3, set 10 
done model 3, set 11 
done model 3, set 12 
done model 3, set 13 
done model 4, set 1 
done model 4, set 2 
done model 4, set 3 
done model 4, set 4 
done model 4, set 5 
done model

These are the final accuracies obtained.