# Assignment-1
## Team: Aditya Ahuja (2020275), Deeptanshu Barman Chowdhuri (2020293)

##  Imports & Dataset

In [None]:
import torch
import requests
import os
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch.utils.data import Dataset
from torch.utils.data import random_split
import lightning as L
import torchmetrics

In [None]:
# Choose the compute device once for the whole notebook.
# Preference order: CUDA GPU, then Apple-silicon MPS, then plain CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

device = torch.device(_device_name)

# device = "cpu"


In [None]:
# Download the three CSV files (0 / Low / High noise) and cache them under
# Data/Assignment1 so the following cells can read them from disk.
#
# A timeout is passed to every request: without one, requests waits forever on
# a server that accepts the connection but never answers, hanging the notebook.
REQUEST_TIMEOUT_SECONDS = 60

Noise_0_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_syn_train_0_0_.csv",
    timeout=REQUEST_TIMEOUT_SECONDS,
)
Noise_Low_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_synA_train_shuffled.csv",
    timeout=REQUEST_TIMEOUT_SECONDS,
)
Noise_High_data = requests.get(
    "http://AdityaAhuja01.pythonanywhere.com/data/df_synA_test_hard_shuffled_sample.csv",
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Only write anything when all three downloads succeeded, so the cache folder
# never ends up with a partial set of files from this run.
if all(
    resp.status_code == 200
    for resp in (Noise_0_data, Noise_Low_data, Noise_High_data)
):
    datafolder = "Data/Assignment1"

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(datafolder, exist_ok=True)

    for filename, response in (
        ("data_0_noise", Noise_0_data),
        ("data_Low_noise", Noise_Low_data),
        ("data_High_noise", Noise_High_data),
    ):
        with open(os.path.join(datafolder, filename), "wb") as f:
            f.write(response.text.encode("utf-8"))
else:
    print("Error in fetching data")

In [None]:
# Load the three cached CSV files into DataFrames (0 / Low / High noise).
Noise_0_dataframe, Noise_Low_dataframe, Noise_High_dataframe = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/Assignment1/data_0_noise",
        "Data/Assignment1/data_Low_noise",
        "Data/Assignment1/data_High_noise",
    )
)

In [None]:
# Label vocabularies: the position of a raw value inside one of these lists is
# used as its integer class id.
class_index = list(Noise_0_dataframe["era"].unique())
class_index_noise = list(Noise_Low_dataframe["era"].unique())
class_index_target = list(Noise_High_dataframe["target_10_val"].unique())


def encode(value, class_index = class_index):
    """Return the position of ``value`` in ``class_index``.

    The default vocabulary is the list of unique ``era`` values of the
    0-noise frame. ``list.index`` raises ``ValueError`` for unknown values.
    """
    position = class_index.index(value)
    return position


def encode_noise(value, class_index = class_index_noise):
    """Return the position of ``value`` in ``class_index``.

    The default vocabulary is the list of unique ``era`` values of the
    Low-noise frame. ``list.index`` raises ``ValueError`` for unknown values.
    """
    position = class_index.index(value)
    return position


def encode_target(value, class_index = class_index_target):
    """Return the position of ``value`` in ``class_index``.

    The default vocabulary is the list of unique ``target_10_val`` values of
    the High-noise frame. ``list.index`` raises ``ValueError`` for unknown
    values.
    """
    position = class_index.index(value)
    return position


# Era encoding is currently disabled:
# Noise_0_dataframe["era"] = Noise_0_dataframe["era"].apply(encode)
# Noise_Low_dataframe["era"] = Noise_Low_dataframe["era"].apply(encode_noise)

# Replace the raw target values by their integer class ids, in place.
# Both frames share the vocabulary built from the High-noise frame.
for _frame in (Noise_High_dataframe, Noise_Low_dataframe):
    _frame["target_10_val"] = _frame["target_10_val"].apply(encode_target)

## Setting up Dataset & Dataloaders

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that serves (features, label) rows from a DataFrame.

    Args:
        dataframe: Source pandas DataFrame.
        noise: Tag describing the dataset's noise level; stored unchanged and
            returned by :meth:`get_noise`.
        transform: Optional callable applied to the feature row in
            ``__getitem__``. ``None`` (default) returns the row unchanged.
        target_transform: Optional callable applied to the label in
            ``__getitem__``. ``None`` (default) returns the label unchanged.
        target: Name of the label column. Must name an existing column;
            leaving it as ``None`` makes the column lookup fail.
        drop: Optional list of column names to exclude from the features.
            When ``None``, every column (including the target) is a feature.
    """

    def __init__(self, dataframe, noise, transform=None, target_transform=None, target=None, drop=None):
        self.dataframe = dataframe
        if drop is not None:
            self.X = dataframe.drop(drop, axis=1).values
        else:
            self.X = dataframe.values
        self.y = dataframe[target].values
        self.transform = transform
        self.target_transform = target_transform
        self.noise = noise

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``.

        The configured ``transform`` / ``target_transform`` callables are
        applied here; previously they were stored but silently ignored.
        """
        item, label = self.X[idx], self.y[idx]
        if self.transform is not None:
            item = self.transform(item)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return item, label

    def get_noise(self):
        """Return the noise tag given at construction time."""
        return self.noise


In [None]:
# Build one dataset per noise level, then split each into train / test parts.

# Columns that are never used as input features.
_NON_FEATURE_COLUMNS = ["row_num","day","era","target_10_val","target_5_val"]


def _split_80_20(dataset):
    """Randomly split ``dataset`` into a (first 80%, remaining) pair of subsets."""
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    return random_split(dataset, [train_size, test_size])


Noise_0_dataset = CustomDataset(
    Noise_0_dataframe,
    "0",
    drop=list(_NON_FEATURE_COLUMNS),
    target="era",
)
Noise_Low_dataset = CustomDataset(
    Noise_Low_dataframe,
    "Low",
    drop=_NON_FEATURE_COLUMNS + ["data_type"],
    target="target_10_val",
)
Noise_High_dataset = CustomDataset(
    Noise_High_dataframe,
    "High",
    drop=_NON_FEATURE_COLUMNS + ["data_type"],
    target="target_10_val",
)

Noise_0_train, Noise_0_test = _split_80_20(Noise_0_dataset)
Noise_Low_train, Noise_Low_test = _split_80_20(Noise_Low_dataset)
Noise_High_train, Noise_High_test = _split_80_20(Noise_High_dataset)

In [None]:
# One shuffled DataLoader per split. The Low-noise splits use batches of 256,
# the 0-noise and High-noise splits use batches of 64.
def _shuffled_loader(split, batch_size):
    """Wrap ``split`` in a DataLoader that reshuffles on every pass."""
    return DataLoader(split, batch_size=batch_size, shuffle=True)


Noise_0_train_loader = _shuffled_loader(Noise_0_train, 64)
Noise_0_test_loader = _shuffled_loader(Noise_0_test, 64)
Noise_Low_train_loader = _shuffled_loader(Noise_Low_train, 256)
Noise_Low_test_loader = _shuffled_loader(Noise_Low_test, 256)
Noise_High_train_loader = _shuffled_loader(Noise_High_train, 64)
Noise_High_test_loader = _shuffled_loader(Noise_High_test, 64)

## Model

In [None]:
class MyMLP(torch.nn.Module):
    """Feed-forward network assembled from a caller-supplied list of layers.

    The layers are chained in order inside ``self.layers`` (a
    ``torch.nn.Sequential``), so state-dict keys are ``layers.<i>.*``.

    Args:
        modules: Iterable of ``torch.nn.Module`` layers, applied in order.
        in_features: Expected size of dimension 1 of the input. Defaults to
            24, the value that used to be hard coded in ``forward``.
        apply_softmax: When True (default, the historical behaviour) the
            output of the last layer is passed through a softmax over
            dimension 1. Set it to False to get the raw output of the last
            layer instead.

    Note:
        ``torch.nn.CrossEntropyLoss`` expects unnormalised logits and applies
        its own log-softmax, so feeding it the softmax output of this model
        normalises twice. Use ``apply_softmax=False`` for such a criterion.
    """

    def __init__(self, modules, *, in_features=24, apply_softmax=True):
        super().__init__()
        # Not used by forward(); kept so existing attribute access still works.
        self.relu = torch.nn.ReLU()
        # Sequential registers the sub-modules itself; no ModuleList needed.
        self.layers = torch.nn.Sequential(*modules)
        self.softmax = torch.nn.Softmax(dim=1)
        self.in_features = in_features
        self.apply_softmax = apply_softmax

    def forward(self, X: torch.Tensor):
        """Run ``X`` through the layers and return the (optionally softmaxed) output.

        Raises:
            ValueError: If ``X`` has fewer than two dimensions or its
                dimension 1 does not equal ``in_features``.
        """
        # Check the rank first so a 1-D tensor yields the documented
        # ValueError instead of an IndexError from X.shape[1].
        if X.dim() < 2 or X.shape[1] != self.in_features:
            raise ValueError(f"Input shape must be (batch_size, {self.in_features})")

        # Follow the model's own weights rather than a module-level global, so
        # the input always lands on the device the layers actually live on.
        first_param = next(self.parameters(), None)
        if first_param is not None:
            X = X.to(first_param.device)

        X = self.layers(X)
        if self.apply_softmax:
            X = self.softmax(X)

        return X

In [None]:
class EnsembleClassifier():
    """Ensemble of identically shaped MLPs whose outputs are averaged.

    Each member is trained on its own random subset of the dataset (indices
    drawn without replacement); ``forward`` returns the mean of the members'
    outputs.

    Args:
        Dataset: Indexable dataset yielding ``(features, label)`` pairs.
        num_classifiers: Number of ensemble members to create.
        sample_percentage: Fraction of the eligible index pool that each
            member is trained on.
        num_classes: Width of each member's output layer and number of
            classes for the accuracy metric. Defaults to 12, the value that
            used to be hard coded.
    """

    # Only the first 80% of dataset indices are eligible for training subsets.
    _TRAIN_POOL_FRACTION = 0.8

    def __init__(self, Dataset, num_classifiers, sample_percentage=0.8, num_classes=12):
        self.Dataset = Dataset
        self.sample_percentage = sample_percentage
        self.num_classifiers = num_classifiers
        self.num_classes = num_classes
        self.models = torch.nn.ModuleList()

        # Currently hard coded the model, change later
        for _ in range(num_classifiers):
            self.models.append(self._build_member().to(device))

    def _build_member(self):
        """Create one (untrained) ensemble member."""
        return MyMLP([
            torch.nn.Linear(24, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 256),
            torch.nn.Dropout(p=0.2),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 64),
            torch.nn.Dropout(p=0.2),
            torch.nn.ReLU(),
            torch.nn.Linear(64, self.num_classes),
        ])

    def train(self, criterion, batchsize, num_epochs):
        """Train every member on its own random subset and print per-epoch stats.

        Args:
            criterion: Loss callable invoked as ``criterion(outputs, labels)``.
            batchsize: Batch size of each member's DataLoader.
            num_epochs: Number of passes each member makes over its subset.
        """
        num_samples = int(self._TRAIN_POOL_FRACTION * len(self.Dataset))
        subset_size = int(num_samples * self.sample_percentage)
        accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=self.num_classes).to(device)

        # One fixed index subset (and loader) per member, reused every epoch.
        bootstrap_dataloaders = []
        for _ in range(self.num_classifiers):
            sampled_indices = torch.randperm(num_samples)[:subset_size].tolist()
            sampler = SubsetRandomSampler(sampled_indices)
            bootstrap_dataloaders.append(
                DataLoader(self.Dataset, batch_size=batchsize, sampler=sampler)
            )

        # Create each optimizer once. Re-creating Adam every epoch throws away
        # its running moment estimates. `torch.optim` is reached through the
        # top-level `torch` import, so no separate `optim` alias is required.
        optimizers = [
            torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.001)
            for model in self.models
        ]

        for epoch in range(num_epochs):
            print("Starting Epoch-",epoch)
            members = zip(self.models, optimizers, bootstrap_dataloaders)
            for model_num, (model, optimizer, loader) in enumerate(members):
                model.train()
                running_loss = 0.0
                running_accuracy = 0.0
                for inputs, labels in loader:
                    inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    running_loss += loss.item()
                    running_accuracy += accuracy(outputs, labels).item()

                # Report batch-averaged values; guard against an empty loader.
                num_batches = max(len(loader), 1)
                mean_loss = running_loss / num_batches
                mean_accuracy = running_accuracy / num_batches
                print(f"Classifier {model_num}: Loss = {mean_loss:.4f}, Accuracy = {mean_accuracy:.4f}")

    def forward(self, x):
        """Return the average of all members' outputs for the batch ``x``.

        Every member is switched to eval mode first. Gradients are not
        disabled here; wrap the call in ``torch.no_grad()`` for pure inference.
        """
        ensemble_predictions = torch.zeros(x.size(0), self.num_classes, device=device)
        for model in self.models:
            model.eval()
            ensemble_predictions += model(x)
        ensemble_predictions /= self.num_classifiers
        return ensemble_predictions










In [17]:
# Train the ensemble on the training split only. Passing the full
# Noise_Low_dataset would let it see rows that are also in Noise_Low_test,
# which is the split it is evaluated on afterwards (train/test leakage).
Ensemble = EnsembleClassifier(Noise_Low_train,5,0.7)
Ensemble.train(torch.nn.CrossEntropyLoss(),128,10)

Classifier 4: Loss = 1.9261, Accuracy = 0.7007
Starting Epoch- 5


KeyboardInterrupt: 

In [None]:
# Print the ensemble's accuracy on each batch of the Low-noise test split.
# - no_grad: nothing is back-propagated here, so skip building the graph.
# - divide by the actual batch size: the last batch can be smaller than 256,
#   which made the old hard-coded denominator under-report its accuracy.
# - loop names no longer shadow the builtin `input`.
with torch.no_grad():
    for batch_inputs, batch_labels in Noise_Low_test_loader:
        batch_inputs = batch_inputs.to(device).to(torch.float32)
        batch_labels = batch_labels.to(device).to(torch.long)
        outputs = Ensemble.forward(batch_inputs)
        predicted_labels = torch.argmax(outputs, dim=1)
        print((predicted_labels == batch_labels).sum() / batch_labels.size(0))


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def _run_epoch(model, criterion, loader, optimizer=None):
    """Make one pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    With an ``optimizer`` the model is put in train mode and updated after
    every batch; without one it is put in eval mode and gradients are off.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0.0
    correct = 0
    total = 0
    with torch.set_grad_enabled(is_training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.long)

            if is_training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()
            running_loss += loss.item()

            # Predicted class = index of the largest output per row.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    return running_loss / len(loader), correct / total


def _plot_history(num_epochs, train_losses, val_losses, train_accuracies, val_accuracies):
    """Show loss and accuracy curves (train vs. validation) side by side."""
    epochs = range(1, num_epochs + 1)

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label='Train')
    plt.plot(epochs, val_losses, label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label='Train')
    plt.plot(epochs, val_accuracies, label='Validation')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.legend()

    plt.show()


def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10,verbose=True):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: Module to train; it is moved to the notebook's ``device``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Number of epochs to run.
        verbose: When True (default) print one summary line per epoch.
            This flag used to be accepted but ignored.

    The loss and accuracy curves are plotted once training has finished.
    """
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []
    model.to(device)
    for epoch in range(num_epochs):
        train_loss, train_accuracy = _run_epoch(model, criterion, train_loader, optimizer)
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)

        val_loss, val_accuracy = _run_epoch(model, criterion, val_loader)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        if verbose:
            print(f'Epoch [{epoch+1}/{num_epochs}], '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')

    _plot_history(num_epochs, train_losses, val_losses, train_accuracies, val_accuracies)


In [None]:
# Single MLP for the High-noise data: 24 inputs, three hidden layers, 5 outputs.
_high_noise_layers = [
    torch.nn.Linear(24, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 128),
    torch.nn.Dropout(p=0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 64),
    torch.nn.Dropout(p=0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 5),
]
Model = MyMLP(_high_noise_layers).to(device)

# Adam with weight decay, then 10 epochs on the High-noise train/test loaders.
optimizer = optim.Adam(params=Model.parameters(), lr=0.001, weight_decay=0.001)
train_model(
    model=Model,
    criterion=torch.nn.CrossEntropyLoss(),
    optimizer=optimizer,
    train_loader=Noise_High_train_loader,
    val_loader=Noise_High_test_loader,
    num_epochs=10,
)

In [None]:
# Save only the model's state dict (its parameters and buffers) to disk.
PATH = "model.pt"
_model_state = Model.state_dict()
torch.save(_model_state, PATH)

In [None]:
# Restore the saved weights into the existing Model instance.
# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint written on a GPU machine still loads on a CPU-only
# (or MPS) machine instead of failing during deserialisation.
PATH = "model.pt"
Model.load_state_dict(torch.load(PATH, map_location=device))