# Loading libraries

In [None]:
import pandas as pd
import torch 
import spacy
from seqeval.metrics import f1_score

import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from seqeval.scheme import IOB2

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import en_core_web_sm
from spacy.tokenizer import Tokenizer
# Load spaCy's small English pipeline; its vocab feeds the tokenizer built below.
nlp = spacy.load("en_core_web_sm")
# Bare Tokenizer constructed from the vocab alone (no extra rules are passed here).
tokenizer = Tokenizer(nlp.vocab)
# Use the GPU when PyTorch reports one is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


# Importing the data

In [None]:
# Read the labelled training set and the unlabelled test set; the first CSV
# column is used as the index and the remaining columns are renamed.
df = pd.read_csv('hw2_train.csv', index_col=0)
df.columns = ["text", "labels"]
df_test = pd.read_csv("hw2_test.csv", index_col=0)
df_test.columns = ["text"]

# Keep only rows whose sentence and tag string contain the same number of
# whitespace-separated items. The comparison is vectorised; a missing value
# yields NaN on either side, which never compares equal, so such rows are
# dropped instead of raising on `float.split`.
text_token_counts = df['text'].str.split().str.len()
label_token_counts = df['labels'].str.split().str.len()
df = df.loc[text_token_counts == label_token_counts, ['text', 'labels']]
# Renumber from 0 so the frame looks like one freshly built from the kept rows.
df = df.reset_index(drop=True)

df.shape


## Splitting dataset

In [None]:
# Hold out 20% of the filtered rows for validation; the fixed seed makes the
# shuffled split repeatable.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
for split_message in (
    f' train data shape {df_train.shape}',
    f' validation data shape {df_val.shape}',
):
    print(split_message)

In [None]:
# Parse the GloVe text file into a {token: vector} dictionary. Each line is
# split on single spaces: the first field is the token, every remaining field
# is parsed as one float coefficient.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        word, *raw_coefs = line.split(' ')
        embeddings_index[word] = np.asarray([float(val) for val in raw_coefs])

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Split the lookup table into two parallel lists (tokens and their vectors).
# Both follow the dictionary's iteration order, so position i in one list
# corresponds to position i in the other.
vocab = [*embeddings_index]
embeddings = [*embeddings_index.values()]

In [None]:
# Convert both Python lists to NumPy arrays (tokens first, then vectors).
vocab = np.asarray(vocab)
embeddings = np.asarray(embeddings)

In [None]:
# Prepend the two special tokens in one call so that '<pad>' lands at
# position 0 and '<unk>' at position 1, ahead of every GloVe token.
vocab = np.insert(vocab, 0, ['<pad>', '<unk>'])
def add_unk_pad(embeddings):
    """Return a copy of `embeddings` with two extra rows stacked on top.

    Row 0 is an all-zero vector and row 1 is the column-wise mean of the
    input rows; the input rows follow in their original order. The input
    array itself is left untouched, so callers must use the return value.
    """
    zero_row = np.zeros((1, embeddings.shape[1]))
    mean_row = np.mean(embeddings, axis=0, keepdims=True)
    return np.concatenate((zero_row, mean_row, embeddings), axis=0)

# add_unk_pad returns a new array rather than modifying its argument, so the
# result has to be bound back to `embeddings`. Without the assignment the
# matrix keeps its original rows while `vocab` has gained '<pad>' and '<unk>'
# at positions 0 and 1, leaving every token index two rows off.
embeddings = add_unk_pad(embeddings)

In [None]:
# Report the current shape of the embedding matrix.
print(f'Embeddings Shape {embeddings.shape}')


### Creating the word-to-index and index-to-word mappings

In [None]:
# Forward lookup: token -> position in `vocab` (a repeated token keeps its
# last position).
word2idx = dict(zip(vocab, range(len(vocab))))
# Pin the two special tokens to their reserved slots, whatever the pass above
# assigned to them.
word2idx.update({'<pad>': 0, '<unk>': 1})
# Reverse lookup: position in `vocab` -> token.
idx2word = dict(enumerate(vocab))

### Tag dictionary and assigning indices

In [None]:
# Split every tag string into its individual tags and collect the distinct ones.
labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)

# Iterating a set of strings follows hash order, which Python randomises per
# interpreter process unless PYTHONHASHSEED is fixed. Sorting gives every run
# the same tag <-> id assignment, so a checkpoint saved in one session still
# decodes to the right tags in another.
# NOTE(review): label sequences are padded with 0 in my_collate_fn, which is
# also the id of the first tag here — confirm the loss/metric code accounts
# for that.
sorted_labels = sorted(unique_labels)
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
ids_to_labels = dict(enumerate(sorted_labels))

labs_count = len(labels_to_ids)
labs_count

## Dataset Class

In [None]:
class TaggingDataset(Dataset):
    """Dataset that turns whitespace-separated sentences (and tags) into id lists.

    The frame must have a 'text' column. When it also has a 'labels' column,
    each item is a ``(token_ids, label_ids)`` pair; otherwise each item is
    just ``token_ids``.
    """

    def __init__(self,
                 data: pd.DataFrame):
        self.data = data
        self.text = self.data['text']
        # Decide once whether this split carries labels instead of
        # re-checking the columns on every item access.
        self.has_labels = 'labels' in self.data.columns
        self.labels = self.data['labels'] if self.has_labels else None

    def __len__(self):
        return len(self.data)

    def __getitem__(self,
                    idx: int):
        encoded_text = self.text_encoder(self.text.iloc[idx])
        if not self.has_labels:
            return encoded_text
        encoded_label = self.labels_encoder(self.labels.iloc[idx])
        return encoded_text, encoded_label

    def get_tokens(self,
                   text: str):
        """Split `text` on runs of whitespace.

        This is the same splitting the data-loading step uses when it keeps
        only rows whose text and labels have equal token counts, so encoded
        text and encoded labels are guaranteed to have matching lengths.
        (spaCy's tokenizer is non-destructive and can emit extra whitespace
        tokens for irregular spacing, which would break that alignment.)
        """
        return text.split()

    def text_encoder(self,
                     text):
        """Map each token to its `word2idx` id, falling back to the '<unk>' id."""
        unk_index = word2idx['<unk>']
        return [word2idx.get(word, unk_index) for word in self.get_tokens(text)]

    def labels_encoder(self,
                       label):
        """Map each tag to its `labels_to_ids` id.

        Raises:
            KeyError: if a tag is not present in `labels_to_ids`.
        """
        return [labels_to_ids[tag] for tag in self.get_tokens(label)]

    
# Wrap each split in the dataset class, in the order train / validation / test.
training, validation, testing = (
    TaggingDataset(frame) for frame in (df_train, df_val, df_test)
)


In [None]:
# Using pad_sequence from torch to create a collate function
def my_collate_fn(batch):
    """Collate variable-length id lists into zero-padded batch tensors.

    Args:
        batch: either a list of token-id lists (unlabelled data) or a list of
            ``(token_ids, label_ids)`` pairs (labelled data). The first
            element decides which of the two layouts is assumed.

    Returns:
        ``(padded_texts, lengths)`` for unlabelled batches, or
        ``(padded_texts, padded_labels, lengths)`` for labelled ones.
        Padding uses the value 0 and `lengths` holds each text's true length.
    """
    def _pad(sequences):
        # One tensor per sequence, right-padded with 0 to the longest one.
        as_tensors = [torch.tensor(seq) for seq in sequences]
        return pad_sequence(as_tensors, batch_first=True, padding_value=0)

    if isinstance(batch[0], list):
        texts, labels = batch, None
    else:
        texts, labels = zip(*batch)

    pad_texts = _pad(texts)
    lengths = torch.tensor([len(text) for text in texts])

    if labels is None:
        return pad_texts, lengths
    return pad_texts, _pad(labels), lengths


### Dataloader

In [None]:

# Training and validation batches hold 50 sentences and are reshuffled on
# every pass; test sentences are served one at a time in their original order.
train_loader = DataLoader(
    training,
    batch_size=50,
    shuffle=True,
    collate_fn=my_collate_fn,
)
validation_loader = DataLoader(
    validation,
    batch_size=50,
    shuffle=True,
    collate_fn=my_collate_fn,
)
test_loader = DataLoader(
    testing,
    batch_size=1,
    shuffle=False,
    collate_fn=my_collate_fn,
)

# Sanity check: each loader exposes exactly as many items as its source frame has rows.
assert df_train.shape[0] == len(train_loader.dataset)
assert df_val.shape[0] == len(validation_loader.dataset)
assert df_test.shape[0] == len(test_loader.dataset)

## Creating the model

In [None]:
class LSTM(nn.Module):
    """Sequence tagger: pretrained embeddings -> (bi)LSTM -> two linear layers.

    The embedding table is built from the module-level `embeddings` array via
    `nn.Embedding.from_pretrained` with its default arguments. One dropout
    module is shared between the embedding output and the first linear layer.
    """

    def __init__(self,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        super().__init__()
        pretrained_weights = torch.from_numpy(embeddings).float()
        self.embedding = nn.Embedding.from_pretrained(pretrained_weights)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=bidirectional,
        )
        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        lstm_feature_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc1 = nn.Linear(lstm_feature_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self,
                x,
                x_lengths):
        """Return per-token scores for a padded batch of token ids.

        Args:
            x: batch-first tensor of token ids.
            x_lengths: true length of each sequence in `x`; the batch does not
                need to be sorted by length.

        Returns:
            The output of the final linear layer applied to every time step
            of the re-padded LSTM output.
        """
        token_vectors = self.dropout(self.embedding(x))
        # Pack so the LSTM skips the padded time steps.
        packed_input = nn.utils.rnn.pack_padded_sequence(
            token_vectors,
            x_lengths,
            batch_first=True,
            enforce_sorted=False,
        )
        packed_states, _final_state = self.lstm(packed_input)
        states, _state_lengths = nn.utils.rnn.pad_packed_sequence(
            packed_states,
            batch_first=True,
        )
        hidden = self.dropout(self.fc1(states))
        return self.fc2(hidden)

## Hyper-parameters

In [None]:
# Seed PyTorch's RNG before the model is constructed.
torch.manual_seed(25)

# Model sizes: the input width comes from the embedding matrix and the output
# width from the number of distinct tags.
EMBEDDING_DIM = embeddings.shape[1]
HIDDEN_DIM = 20
OUTPUT_DIM = labs_count
NUM_LAYERS = 3
BIDIRECTION = True
DROPOUT = 0.2

model = LSTM(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=NUM_LAYERS,
    bidirectional=BIDIRECTION,
    dropout=DROPOUT,
)
model = model.to(device)

print('LSTM Model: ', model)

In [None]:
# Display the id -> tag mapping for inspection.
ids_to_labels

In [None]:
def list_squeez(my_list):
    """Return a new list with `int()` applied to every element of `my_list`."""
    return list(map(int, my_list))

def slicing(my_list, slice_increment):
    """Cut `my_list` into consecutive chunks of `slice_increment` items.

    The final chunk is shorter when the length is not an exact multiple of
    `slice_increment`; an empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(my_list), slice_increment):
        stop = start + slice_increment
        chunks.append(my_list[start:stop])
    return chunks

def idx_to_tags_conversion(lol, isTensor):
    """Translate sequences of label ids into sequences of tag strings.

    Args:
        lol: iterable of id sequences (one per sentence).
        isTensor: when true, each sequence is a tensor and is first converted
            to a plain list of Python ints.

    Returns:
        A list with one list of tag strings per input sequence, looked up in
        the module-level `ids_to_labels` mapping.

    Raises:
        KeyError: if an id is not a key of `ids_to_labels`.
    """
    iob_list = []
    for list_element in lol:
        if isTensor:
            # .tolist() also works for tensors living on a GPU, whereas
            # .numpy() requires a CPU tensor.
            list_element = list_element.tolist()
        iob_list.append([ids_to_labels[index] for index in list_element])
    return iob_list

In [None]:

def train(loader,
          model,
          optimizer,
          loss_fn):
    """Run one training pass over `loader` and return the mean batch loss.

    Args:
        loader: iterable yielding ``(token_ids, label_ids, lengths)`` batches,
            with both id tensors padded to the batch's longest sequence.
        model: module called as ``model(token_ids, lengths)`` that returns
            per-token class scores.
        optimizer: optimizer updating the model's parameters.
        loss_fn: callable taking ``(scores, targets)``.

    Returns:
        The average of the per-batch losses, rounded to 4 decimals.
    """
    model.train()
    # Run on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    losses = []
    pbar = tqdm(loader)
    for x, y, lengths in pbar:
        x, y = x.to(target_device), y.to(target_device)
        optimizer.zero_grad()

        # `lengths` is handed to the model unchanged.
        y_pred = model(x, lengths)

        # Padded label positions hold 0, which is also a real tag id, so only
        # positions inside each sequence's true length contribute to the loss.
        seq_lengths = torch.as_tensor(lengths, device=target_device)
        positions = torch.arange(y.shape[1], device=target_device)
        valid = positions.unsqueeze(0) < seq_lengths.unsqueeze(1)

        loss = loss_fn(y_pred[valid], y[valid])
        pbar.set_postfix({'Loss': loss.item()})
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
    return round((sum(losses) / len(losses)), 4)


def evaluate(loader,
             model,
             loss_fn,
             score_fn):
    """Score `model` on every batch of `loader`.

    Args:
        loader: iterable yielding ``(token_ids, label_ids, lengths)`` batches,
            with both id tensors padded to the batch's longest sequence.
        model: module called as ``model(token_ids, lengths)`` that returns
            per-token class scores.
        loss_fn: callable taking ``(scores, targets)``.
        score_fn: callable taking ``(true_tags, predicted_tags, scheme=...)``.

    Returns:
        ``(true_tags, predicted_tags, mean_loss, score)`` where the two tag
        collections are lists of per-sentence tag-string lists covering the
        whole loader (padding removed), and both numbers are rounded to
        4 decimals.
    """
    model.eval()
    # Run on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    losses = []
    # Accumulated across all batches so the score reflects the full loader,
    # not only the final batch.
    tags_iob = []
    predictions_iob = []

    with torch.no_grad():  # no gradients are needed for evaluation
        for x, y, lengths in tqdm(loader):
            x, y = x.to(target_device), y.to(target_device)
            y_pred = model(x, lengths)

            # Padded label positions hold 0, which is also a real tag id, so
            # restrict the loss to positions inside each true length.
            seq_lengths = torch.as_tensor(lengths, device=target_device)
            positions = torch.arange(y.shape[1], device=target_device)
            valid = positions.unsqueeze(0) < seq_lengths.unsqueeze(1)

            loss = loss_fn(y_pred[valid], y[valid])
            losses.append(loss.item())

            # Trim every sentence back to its true length before decoding so
            # padding is not scored as if it were tokens.
            true_lengths = seq_lengths.tolist()
            gold_ids = [row[:n] for row, n in zip(y.tolist(), true_lengths)]
            pred_ids = [row[:n] for row, n in zip(y_pred.argmax(dim=-1).tolist(), true_lengths)]

            tags_iob.extend(idx_to_tags_conversion(gold_ids, False))
            predictions_iob.extend(idx_to_tags_conversion(pred_ids, False))

    score = score_fn(tags_iob, predictions_iob, scheme = IOB2)
    return tags_iob, predictions_iob, round((sum(losses) / len(losses)), 4), round(score, 4)

### Optimizing

In [None]:

optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 0.01)
loss_fn = nn.CrossEntropyLoss().to(device)
score_fn = f1_score

# Per-epoch history of the training loss, validation loss and validation score.
train_loss_list = []
val_loss_list = []
f1_score_list = []
n_epochs = 1
best_acc = 0
PATH = 'best-model.pt'

for epoch in range(n_epochs):
    train_loss = train(train_loader,
                       model,
                       optimizer,
                       loss_fn)
    train_loss_list.append(train_loss)
    print('Train Loss: ', train_loss)

    # `accuracy` holds whatever score_fn returns (f1_score here).
    tags, predictions, val_loss, accuracy = evaluate(validation_loader,
                                                     model,
                                                     loss_fn,
                                                     score_fn)
    val_loss_list.append(val_loss)
    f1_score_list.append(accuracy)
    print('Val Accuracy: ', accuracy)
    print('Val Loss: ', val_loss)

    # Checkpoint only when the score beats both the best seen so far and the
    # 0.70 floor. The running best must be updated here, otherwise a later,
    # worse epoch that still clears 0.70 would overwrite the better weights.
    if accuracy > best_acc and accuracy > 0.70:
        best_acc = accuracy
        torch.save(model.state_dict(), PATH)

In [None]:
# Rebuild the architecture with the same hyper-parameters, then load the
# checkpointed weights into it.
best_model = LSTM(EMBEDDING_DIM,
                  HIDDEN_DIM,
                  OUTPUT_DIM,
                  NUM_LAYERS,
                  BIDIRECTION,
                  DROPOUT).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
best_model.load_state_dict(torch.load(PATH, map_location=device))
best_model.eval()

In [None]:
# Model Predict Function
def predict(loader,
            model):
    """Tag every batch of an unlabelled loader.

    Args:
        loader: iterable yielding ``(token_ids, lengths)`` batches.
        model: module called as ``model(token_ids, lengths)`` that returns
            per-token class scores.

    Returns:
        A list with one entry per batch; each entry is a list of per-sentence
        tag-string lists, trimmed to each sentence's true length.
    """
    # Use the model that was passed in (not a module-level global) and make
    # sure dropout is disabled.
    model.eval()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():
        for x, lengths in tqdm(loader):
            y_pred = model(x.to(target_device), lengths)

            # Most likely class per token, then drop the padded tail of each row.
            best_ids = y_pred.argmax(dim=-1).tolist()
            true_lengths = torch.as_tensor(lengths).tolist()
            pred_ids = [row[:n] for row, n in zip(best_ids, true_lengths)]

            predictions.append(idx_to_tags_conversion(pred_ids, False))

    return predictions

# predict() returns one list of sentences per batch; flatten that into a
# single list of per-sentence tag lists. Plain list flattening is used
# because sentences differ in length: building a NumPy array from such ragged
# nesting is rejected by recent NumPy versions, and squeeze() would also
# collapse the sentence axis when there is only one sentence.
predicted_tags = [
    sentence_tags
    for batch_tags in predict(test_loader, best_model)
    for sentence_tags in batch_tags
]

## Produce prediction and submission

In [None]:

# One submission row per test sentence: its position as the ID and its
# predicted tags joined by single spaces.
list_of_dict = [
    {"ID": sentence_id, "IOB Slot tags": ' '.join(sentence_tags)}
    for sentence_id, sentence_tags in enumerate(predicted_tags)
]

tags_df = pd.DataFrame.from_dict(list_of_dict)
tags_df.to_csv("submission.csv", index=False)