# Machine Learning HW2

### FFNN Model

In [17]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
from argparse import ArgumentParser
import sys


unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class FFNN(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The network is ``Linear(input_dim, h) -> ReLU -> Linear(h, 5) ->
    LogSoftmax`` and is scored with negative log-likelihood loss.

    Args:
        input_dim: Size of each input feature vector.
        h: Number of hidden units.
    """

    def __init__(self, input_dim, h):
        super(FFNN, self).__init__()
        self.h = h
        self.W1 = nn.Linear(input_dim, h)
        self.activation = nn.ReLU()  # The rectified linear unit; one valid choice of activation function
        self.output_dim = 5
        self.W2 = nn.Linear(h, self.output_dim)

        # Normalise over the class axis explicitly. Omitting ``dim`` is
        # deprecated: it warns on every call and silently picks axis 0 for
        # 3-D inputs. ``dim=-1`` matches the implicit choice for the 1-D and
        # 2-D inputs, so existing callers see the same values.
        self.softmax = nn.LogSoftmax(dim=-1)
        # NLLLoss on log-probabilities is the cross-entropy loss.
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the NLL loss of log-probabilities against gold class indices.

        Args:
            predicted_vector: Log-probabilities, one row per example.
            gold_label: Tensor of gold class indices, one per row.
        """
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
        """Map input features to log-probabilities over the output classes.

        The last axis of ``input_vector`` must have size ``input_dim``; the
        result has the same leading axes and a last axis of size
        ``output_dim``.
        """
        # Obtain first hidden layer representation
        hidden_rep = self.activation(self.W1(input_vector))

        # Obtain output layer representation
        output_rep = self.W2(hidden_rep)

        # Obtain (log) probability distribution over classes
        predicted_vector = self.softmax(output_rep)

        return predicted_vector


# Returns: 
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    """Collect every distinct token that appears in any document of ``data``.

    Args:
        data: Iterable of ``(document, label)`` pairs, where ``document`` is
            an iterable of tokens. The label is ignored.

    Returns:
        A set containing each token exactly once.
    """
    return {token for tokens, _label in data for token in tokens}


# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    """Assign a contiguous integer index to every vocabulary word plus ``unk``.

    Words are indexed in sorted order and the unknown-word token always takes
    the final index. ``vocab`` is updated in place so that it contains
    ``unk``.

    Args:
        vocab: Set of vocabulary strings.

    Returns:
        A ``(vocab, word2index, index2word)`` tuple; the two dictionaries are
        inverse mappings over indices ``0 .. len(vocab) - 1``.
    """
    # Leave out a pre-existing unk token before appending it. Listing it
    # twice would give it an index equal to len(word2index), which is one
    # past the end of any vector sized by len(word2index).
    vocab_list = sorted(word for word in vocab if word != unk)
    vocab_list.append(unk)
    word2index = {word: index for index, word in enumerate(vocab_list)}
    index2word = dict(enumerate(vocab_list))
    vocab.add(unk)
    return vocab, word2index, index2word


# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    """Turn each document into a bag-of-words count vector.

    Args:
        data: Iterable of ``(document, y)`` pairs, where ``document`` is an
            iterable of tokens.
        word2index: Mapping from token to vector position; tokens that are
            missing from it are counted at the position of ``unk``.

    Returns:
        A list of ``(vector, y)`` pairs, where ``vector`` is a tensor of
        length ``len(word2index)`` holding token counts.
    """
    vocab_size = len(word2index)
    vectorized_data = []
    for tokens, label in data:
        counts = torch.zeros(vocab_size)
        # Resolve every token to its slot first, then tally the slots.
        slots = [word2index.get(token, word2index[unk]) for token in tokens]
        for slot in slots:
            counts[slot] += 1
        vectorized_data.append((counts, label))
    return vectorized_data



def load_data(train_data, val_data):
    """Load the training and validation examples from two JSON files.

    Each file must contain a JSON list of objects with a ``"text"`` string
    and a numeric ``"stars"`` rating.

    Args:
        train_data: Path to the training JSON file.
        val_data: Path to the validation JSON file.

    Returns:
        A ``(tra, val)`` pair of lists of ``(tokens, label)`` tuples, where
        ``tokens`` is the whitespace-split text and ``label`` is
        ``int(stars - 1)``.
    """
    def read_examples(path):
        # Read as UTF-8 explicitly rather than with the platform default
        # encoding, which can fail on non-ASCII review text.
        with open(path, encoding="utf-8") as handle:
            records = json.load(handle)
        return [(rec["text"].split(), int(rec["stars"] - 1)) for rec in records]

    return read_examples(train_data), read_examples(val_data)


if __name__ == "__main__":
    # Script entry point: parse options, build bag-of-words data, then train
    # the FFNN and evaluate it on the validation set after every epoch.
    parser = ArgumentParser()
    parser.add_argument("-hd", "--hidden_dim", type=int, required = True, help = "hidden_dim")
    parser.add_argument("-e", "--epochs", type=int, required = True, help = "num of epochs to train")
    parser.add_argument("--train_data", required = True, help = "path to training data")
    parser.add_argument("--val_data", required = True, help = "path to validation data")
    parser.add_argument("--test_data", default = "to fill", help = "path to test data")
    parser.add_argument('--do_train', action='store_true')

    # Check if running in an interactive environment, where sys.argv does not
    # carry this script's options; fall back to fixed defaults there.
    if 'ipykernel' in sys.modules or 'spyder' in sys.modules:
        args = parser.parse_args(args=["--hidden_dim", "128", "--epochs", "10", "--train_data", "training.json", "--val_data", "validation.json"])
    else:
        args = parser.parse_args()

    # fix random seeds
    random.seed(42)
    torch.manual_seed(42)

    # load data
    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
    vocab = make_vocab(train_data)
    vocab, word2index, index2word = make_indices(vocab)

    print("========== Vectorizing data ==========")
    train_data = convert_to_vector_representation(train_data, word2index)
    valid_data = convert_to_vector_representation(valid_data, word2index)

    model = FFNN(input_dim = len(vocab), h = args.hidden_dim)
    optimizer = optim.SGD(model.parameters(),lr=0.01, momentum=0.9)
    minibatch_size = 16
    print("========== Training for {} epochs ==========".format(args.epochs))
    for epoch in range(args.epochs):
        # ---- training pass ----
        model.train()
        correct = 0
        total = 0
        start_time = time.time()
        print("Training started for epoch {}".format(epoch + 1))
        random.shuffle(train_data) # Good practice to shuffle order of training data
        N = len(train_data)
        # Only full minibatches are used; a trailing partial batch is skipped.
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_vector, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                predicted_vector = model(input_vector)
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(predicted_vector.view(1,-1), torch.tensor([gold_label]))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            # Average the per-example losses so the step size does not depend
            # on the minibatch size.
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()
        # Guard against fewer than one full minibatch of training data.
        train_accuracy = correct / total if total else 0.0
        train_time = time.time() - start_time
        print("Training completed for epoch {}".format(epoch + 1))
        print("Training accuracy for epoch {}: {}".format(epoch + 1, train_accuracy))
        print("Training time for this epoch: {}".format(train_time))

        # Write results to output file
        with open("training_results.out", "a") as file:
            file.write(f"Epoch {epoch + 1} Training Accuracy: {train_accuracy}\n")
            file.write(f"Epoch {epoch + 1} Training Time: {train_time}\n")

        # ---- validation pass ----
        # Evaluate in eval mode without autograd: no gradients are needed
        # here, so no graph is built and no optimizer call is made.
        model.eval()
        correct = 0
        total = 0
        start_time = time.time()
        print("Validation started for epoch {}".format(epoch + 1))
        with torch.no_grad():
            # Score every validation example; batching is irrelevant for
            # accuracy and would drop the trailing partial batch.
            for input_vector, gold_label in tqdm(valid_data):
                predicted_label = torch.argmax(model(input_vector))
                correct += int(predicted_label == gold_label)
                total += 1
        valid_accuracy = correct / total if total else 0.0
        valid_time = time.time() - start_time
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(epoch + 1, valid_accuracy))
        print("Validation time for this epoch: {}".format(valid_time))

        # Write results to output file
        with open("validation_results.out", "a") as file:
            file.write(f"Epoch {epoch + 1} Validation Accuracy: {valid_accuracy}\n")
            file.write(f"Epoch {epoch + 1} Validation Time: {valid_time}\n")


Training started for epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:11<00:00,  3.80it/s]


Training completed for epoch 1
Training accuracy for epoch 1: 0.530125
Training time for this epoch: 131.6940095424652
Validation started for epoch 1


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 17.33it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.5425
Validation time for this epoch: 2.8951947689056396
Training started for epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:51<00:00,  4.50it/s]


Training completed for epoch 2
Training accuracy for epoch 2: 0.585875
Training time for this epoch: 111.20730185508728
Validation started for epoch 2


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.15it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.59375
Validation time for this epoch: 2.6130781173706055
Training started for epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:00<00:00,  4.15it/s]


Training completed for epoch 3
Training accuracy for epoch 3: 0.614125
Training time for this epoch: 120.4599380493164
Validation started for epoch 3


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.80it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.5875
Validation time for this epoch: 2.5382015705108643
Training started for epoch 4


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:00<00:00,  4.16it/s]


Training completed for epoch 4
Training accuracy for epoch 4: 0.63975
Training time for this epoch: 120.24436330795288
Validation started for epoch 4


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.39it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.56875
Validation time for this epoch: 2.7273967266082764
Training started for epoch 5


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:56<00:00,  4.30it/s]


Training completed for epoch 5
Training accuracy for epoch 5: 0.66275
Training time for this epoch: 116.3075122833252
Validation started for epoch 5


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 17.46it/s]


Validation completed for epoch 5
Validation accuracy for epoch 5: 0.605
Validation time for this epoch: 2.869783878326416
Training started for epoch 6


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:58<00:00,  4.21it/s]


Training completed for epoch 6
Training accuracy for epoch 6: 0.689
Training time for this epoch: 118.7164876461029
Validation started for epoch 6


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 20.38it/s]


Validation completed for epoch 6
Validation accuracy for epoch 6: 0.605
Validation time for this epoch: 2.453601360321045
Training started for epoch 7


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:56<00:00,  4.29it/s]


Training completed for epoch 7
Training accuracy for epoch 7: 0.72625
Training time for this epoch: 116.46232199668884
Validation started for epoch 7


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.44it/s]


Validation completed for epoch 7
Validation accuracy for epoch 7: 0.54
Validation time for this epoch: 2.730081081390381
Training started for epoch 8


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:59<00:00,  4.18it/s]


Training completed for epoch 8
Training accuracy for epoch 8: 0.7435
Training time for this epoch: 119.70438933372498
Validation started for epoch 8


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.02it/s]


Validation completed for epoch 8
Validation accuracy for epoch 8: 0.61
Validation time for this epoch: 2.6285922527313232
Training started for epoch 9


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:05<00:00,  3.98it/s]


Training completed for epoch 9
Training accuracy for epoch 9: 0.775
Training time for this epoch: 125.74061226844788
Validation started for epoch 9


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.15it/s]


Validation completed for epoch 9
Validation accuracy for epoch 9: 0.59375
Validation time for this epoch: 2.7623159885406494
Training started for epoch 10


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [01:59<00:00,  4.20it/s]


Training completed for epoch 10
Training accuracy for epoch 10: 0.797625
Training time for this epoch: 119.03123784065247
Validation started for epoch 10


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 18.86it/s]


Validation completed for epoch 10
Validation accuracy for epoch 10: 0.59375
Validation time for this epoch: 2.666266918182373


### RNN Model

In [33]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
import string
from argparse import ArgumentParser
import pickle

unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class RNN(nn.Module):
    """Single-layer tanh RNN classifier over a sequence of input vectors.

    Every time step's RNN output is projected to 5 class scores; the scores
    are summed over time and turned into log-probabilities.

    Args:
        input_dim: Size of each input vector.
        h: Size of the recurrent hidden state.
    """

    def __init__(self, input_dim, h):
        super().__init__()
        self.h = h
        self.numOfLayer = 1
        self.rnn = nn.RNN(
            input_size=input_dim,
            hidden_size=h,
            num_layers=self.numOfLayer,
            nonlinearity='tanh',
        )
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        """Return the NLL loss of log-probabilities against gold class indices."""
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Return per-class log-probabilities for the sequence ``inputs``.

        The caller in this file passes a tensor shaped
        ``(sequence_length, 1, input_dim)``; the result then has one row of
        5 log-probabilities.
        """
        # Per-step hidden representations; the final hidden state is unused.
        step_outputs, _ = self.rnn(inputs)

        # Project each step to class scores, then pool by summing over steps.
        pooled_scores = self.W(step_outputs).sum(dim=0)

        # Normalise the pooled scores into log-probabilities.
        return self.softmax(pooled_scores)


def load_data(train_data, val_data):
    """Load the training and validation examples from two JSON files.

    Each file must contain a JSON list of objects with a ``"text"`` string
    and a numeric ``"stars"`` rating.

    Args:
        train_data: Path to the training JSON file.
        val_data: Path to the validation JSON file.

    Returns:
        A ``(tra, val)`` pair of lists of ``(tokens, label)`` tuples, where
        ``tokens`` is the whitespace-split text and ``label`` is
        ``int(stars - 1)``.
    """
    def read_examples(path):
        # Read as UTF-8 explicitly rather than with the platform default
        # encoding, which can fail on non-ASCII review text.
        with open(path, encoding="utf-8") as handle:
            records = json.load(handle)
        return [(rec["text"].split(), int(rec["stars"] - 1)) for rec in records]

    return read_examples(train_data), read_examples(val_data)


if __name__ == "__main__":
    # Script entry point: load the reviews, embed them with pretrained word
    # vectors, then train the RNN with per-epoch validation and early stopping.
    parser = ArgumentParser()
    parser.add_argument("-hd", "--hidden_dim", type=int, required = True, help = "hidden_dim")
    parser.add_argument("-e", "--epochs", type=int, required = True, help = "num of epochs to train")
    parser.add_argument("--train_data", required = True, help = "path to training data")
    parser.add_argument("--val_data", required = True, help = "path to validation data")
    parser.add_argument("--test_data", default = "to fill", help = "path to test data")
    parser.add_argument('--do_train', action='store_true')
    # Options are fixed here rather than read from sys.argv.
    args = parser.parse_args(args=["--hidden_dim", '256', "--epochs", '10', "--train_data", "training.json",
                                   "--val_data", "validation.json"])

    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}

    # Think about the type of function that an RNN describes. To apply it, you will need to convert the text data into vector representations.
    # Further, think about where the vectors will come from. There are 3 reasonable choices:
    # 1) Randomly assign the input to vectors and learn better embeddings during training; see the PyTorch documentation for guidance
    # 2) Assign the input to vectors using pretrained word embeddings. We recommend any of {Word2Vec, GloVe, FastText}. Then, you do not train/update these embeddings.
    # 3) You do the same as 2) but you train (this is called fine-tuning) the pretrained embeddings further.
    # Option 3 will be the most time consuming, so we do not recommend starting with this

    print("========== Vectorizing data ==========")
    model = RNN(50, args.hidden_dim)  # input size 50 must match the width of the loaded embeddings
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    # NOTE(review): unpickling can execute arbitrary code; only load an
    # embedding file from a trusted source.
    with open('./word_embedding.pkl', 'rb') as embedding_file:
        word_embedding = pickle.load(embedding_file)

    # Built once: translation table that deletes ASCII punctuation.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def embed_document(words):
        """Return the embedded document as a (num_tokens, 1, dim) float tensor.

        Punctuation is stripped, tokens are lower-cased, and tokens missing
        from ``word_embedding`` use the ``'unk'`` vector.
        """
        tokens = " ".join(words).translate(punctuation_table).split()
        vectors = [
            word_embedding[token.lower()] if token.lower() in word_embedding else word_embedding['unk']
            for token in tokens
        ]
        if not vectors:
            # A document with no tokens left would make the reshape below
            # fail; represent it as a single unknown token instead.
            vectors = [word_embedding['unk']]
        # Stack into one array before building the tensor. Presumably the
        # embeddings are numpy arrays (TODO confirm), for which converting a
        # list directly is slow. float32 matches the model's parameters.
        stacked = torch.tensor(np.asarray(vectors), dtype=torch.float32)
        return stacked.view(len(vectors), 1, -1)

    stopping_condition = False
    epoch = 0
    minibatch_size = 16

    last_train_accuracy = 0
    last_validation_accuracy = 0
    best_validation_accuracy = 0

    # Stop on the early-stopping signal, but never exceed the requested
    # number of epochs so the loop is guaranteed to terminate.
    while not stopping_condition and epoch < args.epochs:
        # ---- training pass ----
        start_time = time.time()
        random.shuffle(train_data)
        model.train()
        print("Training started for epoch {}".format(epoch + 1))
        correct = 0
        total = 0
        N = len(train_data)

        loss_total = 0
        loss_count = 0
        # Only full minibatches are used; a trailing partial batch is skipped.
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_words, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                output = model(embed_document(input_words))

                # Get loss
                example_loss = model.compute_Loss(output.view(1,-1), torch.tensor([gold_label]))

                # Get predicted label
                predicted_label = torch.argmax(output)

                correct += int(predicted_label == gold_label)
                total += 1
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss

            loss = loss / minibatch_size
            # Detach so the running total does not keep the graph alive.
            loss_total += loss.detach()
            loss_count += 1
            loss.backward()
            optimizer.step()
        if loss_count:
            # Mean minibatch loss for the epoch.
            print(loss_total/loss_count)
        training_accuracy = correct / total if total else 0.0
        print("Training completed for epoch {}".format(epoch + 1))
        print("Training accuracy for epoch {}: {}".format(epoch + 1, training_accuracy))

        # Write results to output file
        with open("training_results.out", "a") as file:
            file.write(f"Epoch {epoch + 1} Training Accuracy: {training_accuracy}\n")
            file.write(f"Epoch {epoch + 1} Training Time: {time.time() - start_time}\n")

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        start_time = time.time()
        print("Validation started for epoch {}".format(epoch + 1))

        # No gradients are needed for evaluation.
        with torch.no_grad():
            for input_words, gold_label in tqdm(valid_data):
                output = model(embed_document(input_words))
                predicted_label = torch.argmax(output)
                correct += int(predicted_label == gold_label)
                total += 1
        validation_accuracy = correct / total if total else 0.0
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(epoch + 1, validation_accuracy))

        # Write results to output file
        with open("validation_results.out", "a") as file:
            file.write(f"Epoch {epoch + 1} Validation Accuracy: {validation_accuracy}\n")
            file.write(f"Epoch {epoch + 1} Validation Time: {time.time() - start_time}\n")

        best_validation_accuracy = max(best_validation_accuracy, validation_accuracy)

        # Early stopping: validation accuracy fell while training accuracy
        # rose, relative to the previous epoch.
        if validation_accuracy < last_validation_accuracy and training_accuracy > last_train_accuracy:
            stopping_condition=True
            print("Training done to avoid overfitting!")
            print("Best validation accuracy is:", best_validation_accuracy)
        else:
            last_validation_accuracy = validation_accuracy
            last_train_accuracy = training_accuracy

        epoch += 1



    # You may find it beneficial to keep track of training accuracy or training loss;

    # Think about how to update the model and what this entails. Consider ffnn.py and the PyTorch documentation for guidance


Training started for epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [04:29<00:00,  1.85it/s]


tensor(2.3442)
Training completed for epoch 1
Training accuracy for epoch 1: 0.462625
Validation started for epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:09<00:00, 84.10it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.4625
Training started for epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [04:41<00:00,  1.78it/s]


tensor(1.8192)
Training completed for epoch 2
Training accuracy for epoch 2: 0.4395
Validation started for epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:09<00:00, 84.03it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.4225
Training started for epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [04:52<00:00,  1.71it/s]


tensor(1.2023)
Training completed for epoch 3
Training accuracy for epoch 3: 0.4235
Validation started for epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:09<00:00, 85.47it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.4475
Training started for epoch 4


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [04:41<00:00,  1.77it/s]


tensor(1.2099)
Training completed for epoch 4
Training accuracy for epoch 4: 0.425125
Validation started for epoch 4


100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:08<00:00, 89.01it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.41625
Training started for epoch 5


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [04:54<00:00,  1.70it/s]


tensor(1.2413)
Training completed for epoch 5
Training accuracy for epoch 5: 0.416375
Validation started for epoch 5


100%|████████████████████████████████████████████████████████████████████████████████| 800/800 [00:10<00:00, 73.80it/s]


Validation completed for epoch 5
Validation accuracy for epoch 5: 0.41375
Training started for epoch 6


 47%|█████████████████████████████████████▎                                          | 233/500 [02:25<02:46,  1.61it/s]


KeyboardInterrupt: 