### Todo List

1. Build the training and testing process (by Wei Ren & Yafei Zhang) **Done!**
2. Input Embedding
    - Syntactic Textual Feature Embedding: Pos Tagging (by Yafei Zhang) **Done!**
    - Semantic Textual Feature Embedding: Pretrained Glove W2V embedding & BERT Embedding (by Wei Ren) **Done!**
    - Domain Feature Embedding: W2V embedding from Dota2 dataset (by Chandan Sithiraju)
3. Adjust the Bi-LSTM or other RNN models (by Wei Ren & Yafei Zhang)
    1. Stacked Seq2Seq model **Done!**
    2. Attention
    3. CRF Attachment **Done!**
4. Adjust the Transformer based model  (by Chandan Sithiraju)
    1. Stacked Seq2Seq model
    2. Attention
    3. CRF Attachment
5. Write the Evaluation Functions (by Wei Ren) **Done!**
6. Write the report (Each person writes the part that corresponds to their code)

In [None]:
!pip install -U 'scikit-learn<0.24'
!pip install urllib3==1.25.11
!pip install pytorch-pretrained-bert
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Load Data (Preprocessing)

In [None]:
# read IOB tagged NER dataset as dataframe
import csv

def read_data(file_name):
    """Read a CSV of space-separated sentences (and, optionally, labels).

    The file must have a ``sents`` column. When a ``labels`` column is also
    present, a row is kept only if it has exactly one label per token;
    mismatching rows are printed and dropped. Without a ``labels`` column
    (unlabelled test data) every sentence is kept and paired with the
    placeholder ``' '``.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        (input_data, target_data): a list of lower-cased token lists and a
        parallel list of label lists (or ``' '`` placeholders).
    """
    input_data = []
    target_data = []

    # The with-block guarantees the handle is closed; newline='' is the mode
    # the csv module asks for when a file object is handed to a reader.
    with open(file_name, newline='') as csvfile:
        documents = csv.DictReader(csvfile)
        # fieldnames is None for an empty file; look the header up only once.
        has_labels = 'labels' in (documents.fieldnames or [])

        for row in documents:
            sent = [word.lower() for word in row['sents'].split(' ')]
            if not has_labels:  # for testing data
                input_data.append(sent)
                target_data.append(' ')
                continue

            labels = row['labels'].split(' ')
            if len(sent) == len(labels):
                input_data.append(sent)
                target_data.append(labels)
            else:
                # Token/label count mismatch: report the row and skip it.
                print(sent, labels)

    return input_data, target_data

train_data, target_y_train = read_data("/content/train.csv")
test_data, target_y_test = read_data("/content/test_without_labels.csv")
validation_data, target_y_validation = read_data("/content/val.csv")


In [None]:
# Sanity check of the loaded splits: sizes plus one example sentence/label pair.
print(len(train_data))
print(train_data[20])
print(type(train_data[20]))
print(target_y_train[20])
print(len(validation_data))
print(len(test_data))
# Show only the first few label sequences; printing every validation
# sentence buries the rest of the notebook under thousands of lists.
print(target_y_validation[:10])

26078
['pls', 'report', 'sb', 'thanks']
<class 'list'>
['O', 'S', 'C', 'O']
8705
500
[['S'], ['S'], ['S', 'SEPA', 'O', 'O', 'O'], ['T'], ['O'], ['O', 'P', 'O'], ['O', 'P', 'O', 'T'], ['O'], ['T', 'C', 'SEPA', 'O'], ['O', 'P', 'SEPA', 'S', 'C', 'O'], ['O'], ['P', 'O', 'O', 'P', 'O', 'O'], ['P', 'O', 'S', 'O', 'P', 'P', 'O', 'O', 'O'], ['C', 'T', 'O'], ['S'], ['O', 'O'], ['O'], ['O', 'O', 'P', 'P', 'O'], ['O'], ['S'], ['P', 'O'], ['S', 'C', 'O', 'S', 'SEPA', 'O', 'O', 'O'], ['O', 'O', 'SEPA', 'O', 'O', 'O', 'P', 'O', 'O', 'O', 'P'], ['O', 'SEPA', 'S'], ['O', 'S'], ['O', 'O'], ['O', 'O', 'O', 'O', 'O'], ['S', 'O'], ['O'], ['O', 'D'], ['P', 'O', 'O', 'O', 'C', 'O'], ['O', 'S', 'C'], ['D', 'O'], ['P', 'O', 'P'], ['P', 'O', 'O', 'O', 'O', 'O'], ['T', 'O', 'P', 'O', 'O', 'P', 'SEPA', 'O', 'O', 'P', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O'], ['O', 'SEPA', 'O', 'SEPA', 'P', 'S'], ['O', 'SEPA', 'O'], ['S'], ['P', 'O', 'O', 'O'], ['O', 'O'], ['S'], ['O'], ['P', 'O', 'O', 'O', 'O'], ['O', 'SEPA', 'O'

### Generate word_to_ix, word_list and tag_to_ix

In [None]:
# Word vocabulary: ids are handed out in first-seen order while walking the
# train, validation and test sentences.
word_to_ix = {}
for sentence in train_data+validation_data+test_data:
  for w in sentence:
    # setdefault only inserts when w is new, so existing ids are untouched
    word_to_ix.setdefault(w, len(word_to_ix))
word_list = list(word_to_ix)

vocab_size = len(word_list)

# Tag inventory: the two CRF boundary tags take ids 0 and 1, then every label
# seen in the train/validation targets gets the next free id.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}

for tags in target_y_train+target_y_validation:
  for t in tags:
    tag_to_ix.setdefault(t, len(tag_to_ix))


In [None]:
print("size of word_list:", len(word_list))
# Preview the first entries only; the full vocabulary has thousands of items
# and dumping it makes the notebook output unreadable.
print(dict(list(word_to_ix.items())[:20]))
print(word_list[:20])
print(len(tag_to_ix))
print(tag_to_ix)

size of word_list: 11243
{'wow': 0, 'wtf': 1, 'wpe': 2, 'hahaha': 3, 'i': 4, 'cant': 5, '[sepa]': 6, 'play': 7, 'with': 8, '4': 9, 'trash': 10, 'bg': 11, '#error!': 12, 'gg': 13, 'report': 14, 'my': 15, 'team': 16, 'rat': 17, 'please': 18, 'ez': 19, 'mid': 20, 'hahah': 21, 'arrows': 22, 'always': 23, 'decent': 24, 'fuck': 25, 'u': 26, 'gh': 27, 'engage': 28, 'at': 29, 'bot': 30, 'lc': 31, 'takle': 32, 'then': 33, 'top': 34, 'cmon': 35, 'the': 36, 'comeback': 37, 'is': 38, 'real': 39, ':)': 40, 'him': 41, 'vs': 42, 'me': 43, 'just': 44, 'end': 45, 'wan': 46, 'nex': 47, 'game': 48, 'g': 49, 'he': 50, 'not': 51, 'losing': 52, 'pls': 53, 'sb': 54, 'thanks': 55, 'omg': 56, 'ggwp': 57, 'wp': 58, 'cap': 59, 'lo': 60, 'lol': 61, 'fuyckjerfe': 62, 'noob': 63, 'invoker': 64, 'mean': 65, 'everyone': 66, 'on': 67, 'ur': 68, 'dumb': 69, 'enough': 70, 'to': 71, 'rot': 72, 'down': 73, '2': 74, 'hp': 75, '8': 76, 'it': 77, 'random': 78, ':/': 79, 'dead': 80, 'keeps': 81, 'charhing': 82, 'since': 83, '

## Input Embedding

- Syntactic Textual Feature Embedding: Pos Tagging (by Yafei Zhang) **Done!**
- Semantic Textual Feature Embedding: Pretrained Glove W2V embedding & BERT Embedding (by Wei Ren)
- Domain Feature Embedding: W2V embedding from Dota2 dataset (by Chandan Sithiraju)

### Pos Tagging Features

In [None]:
import nltk
from nltk import pos_tag
# Fetch the tagger model used by pos_tag and the tag-set documentation tables.
# NOTE(review): in the recorded run below both downloads failed with HTTP 503
# and the cell ended in a LookupError, so everything that uses `tagdict`
# depends on these resources actually being available.
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
# Table of the Penn Treebank tag set; its keys are used as the PoS tag inventory.
tagdict = nltk.load('help/tagsets/upenn_tagset.pickle')

tagdict.keys()

[nltk_data] Error loading averaged_perceptron_tagger: HTTP Error 503:
[nltk_data]     Backend.max_conn reached
[nltk_data] Error loading tagsets: HTTP Error 503: Backend.max_conn
[nltk_data]     reached


LookupError: ignored

In [None]:
# PoS tag inventory, taken from the keys of the loaded tag-set table
postag_list = list(tagdict.keys())
pos_size = len(postag_list)

# PoS tag --> column index
pos_to_ix = dict(zip(postag_list, range(pos_size)))

# One-hot PoS matrix: one row per vocabulary word, with a single 1 in the
# column of the tag that pos_tag assigns to the word when tagged on its own.
pos_embedding_matrix = np.zeros((vocab_size, pos_size))
for word_idx, word in enumerate(word_list):
    tagged = pos_tag([word])
    pos_embedding_matrix[word_idx, pos_to_ix[tagged[0][1]]] = 1

In [None]:
print(pos_size)
print(postag_list)
print(len(pos_to_ix))
print(pos_to_ix)
print(pos_embedding_matrix[11111])
print(pos_tag(["wtf"]))
print(pos_tag(["wtf"])[0][1])
print(pos_to_ix[pos_tag(["wtf"])[0][1]])

### Domain word embedding

In [None]:
# Read file en_chat_words which is a dict of word_indexes

import os
import pickle

#os.chdir(parent_dir)

# NOTE(review): pickle.load executes whatever the file tells it to, so this
# must only ever be pointed at a pickle produced by a trusted source.
# The with-block closes the handle once the object is loaded.
with open("/content/en_chat_words.pkl", "rb") as file_to_read:
    en_chat_words = pickle.load(file_to_read)

print(len(en_chat_words))

In [None]:
# Word_to_index is ready

domain_words = list(en_chat_words.keys())
print(len(domain_words), domain_words[0:20])

In [None]:
# domain model
from gensim.models import Word2Vec

# Skip-gram (sg=1) Word2Vec with 100-dimensional vectors over the domain words.
# NOTE(review): `sentences` contains a single "sentence" made of all keys of
# en_chat_words, so the context windows follow the key order of that dict
# rather than real chat lines -- confirm this is the intended training corpus.
domain_model = Word2Vec(sentences = [domain_words], size=100, window=5, min_count=1, workers=2, sg=1)
print(domain_model)

In [None]:
# Words in Domain and not in train data
#print( "fuck" in word_list)
set_difference = set(domain_words) - set(word_list)
list_difference = list(set_difference)

print(len(list_difference), list_difference)

In [None]:

!pwd

/content/drive/MyDrive/Colab Notebooks/COMP5046/Assignment2


### Pretrained Glove W2V embedding 

In [None]:
#use  glove-twitter-50
import gensim.downloader as api
import numpy as np
word_emb_model = api.load("glove-twitter-50")

# Each row is [GloVe (50) | one-hot PoS | domain Word2Vec].
# Alternatives tried earlier: 50 (GloVe only), pos_size (PoS only),
# domain_model.vector_size (domain only).
EMBEDDING_DIM = 50+pos_size+domain_model.vector_size

# Start from all-zero rows so that words unknown to either word-vector model
# simply keep a zero vector.
embedding_matrix = np.zeros((len(word_list), EMBEDDING_DIM))
for i, word in enumerate(word_list):
    try:
        embedding_matrix[i] = np.concatenate(
            (word_emb_model.wv[word], pos_embedding_matrix[i], domain_model.wv[word]), 0)
    except KeyError:
        # Only an out-of-vocabulary lookup is tolerated here. Any other error
        # (wrong API, mismatching dimensions, ...) must surface instead of
        # silently turning the whole matrix into zeros.
        continue
embedding_matrix.shape

In [None]:
print(embedding_matrix[1])



Convert the datasets (sentences and tag sequences) into lists of indices.

In [None]:
def to_index(data, to_ix):
    """Map every item of every sequence in `data` through the lookup `to_ix`.

    Args:
        data: iterable of sequences (e.g. tokenised sentences or tag lists).
        to_ix: mapping from item to integer index; a missing item raises KeyError.

    Returns:
        A list with one list of indices per input sequence, in the same order.
    """
    return [[to_ix[item] for item in sequence] for sequence in data]

train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)
# test_output_index = to_index(target_y_test,tag_to_ix)

In [None]:
print(len(train_input_index))
print(len(train_output_index))
print(train_data[1000:1010])
print(target_y_train[1000:1010])
print(train_input_index[1000:1010])
print(train_output_index[1000:1010])
print(tag_to_ix)

### BERT Embedding

In [None]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
# Load pretrained model/tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Pretrained_BERT = BertModel.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 2537247.93B/s]
100%|██████████| 407873900/407873900 [00:19<00:00, 20922311.21B/s]


In [None]:
import torch
from tqdm.notebook import tqdm  # tqdm.tqdm_notebook is deprecated (see the warning it prints)

# Put BERT in evaluation mode so dropout is disabled; otherwise the extracted
# vectors are perturbed at random and differ from run to run.
Pretrained_BERT.eval()
bert_hidden_size = Pretrained_BERT.config.hidden_size

bert_embedding_matrix = []
for word in tqdm(word_list):
    tokenized_word = tokenizer.tokenize(word)
    if not tokenized_word:
        # Nothing to feed to BERT (e.g. an empty string): keep a zero vector
        # so the matrix rows stay aligned with word_list.
        bert_embedding_matrix.append([0.0] * bert_hidden_size)
        continue

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_word)
    tokens_tensor = torch.tensor([indexed_tokens])
    with torch.no_grad():
        # First element of the model output: one hidden-state tensor per
        # encoder layer, indexed as [layer][batch][token].
        encoded_layers = Pretrained_BERT(tokens_tensor)[0]

    # Word embedding = sum over the word-piece tokens of the sum of the last
    # four encoder layers (layers 8-11 of the 12-layer base model).
    last_four = torch.stack(list(encoded_layers[-4:]))      # (4, 1, n_tokens, hidden)
    token_embedding = last_four.sum(dim=0)[0].sum(dim=0)    # (hidden,)
    bert_embedding_matrix.append(token_embedding.numpy().tolist())
bert_embedding_matrix = np.array(bert_embedding_matrix) # convert to an numpy array form

        # concatenated_last_4_layers = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] # concat last four layers [number_of_tokens, 3072]	
        # summed_last_4_layers = [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] # sum last four layers [number_of_tokens, 768]
            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/11243 [00:00<?, ?it/s]

In [None]:
np.savetxt("/content/bert_embedding_matrix.txt", bert_embedding_matrix, delimiter = ',')
bert_embedding_matrix = np.loadtxt("/content/bert_embedding_matrix.txt", delimiter = ',', skiprows = 0, encoding = "UTF-8")

print(type(bert_embedding_matrix))
print(bert_embedding_matrix.shape)
# print(type(bert2))
# print(bert2.shape)
# print(np.sum(bert2 == bert_embedding_matrix))

<class 'numpy.ndarray'>
(11243, 768)


### Concatenate Embeddings

In [None]:
# embedding_matrix rows are laid out as [GloVe | one-hot PoS | domain W2V]
# (see the cell that builds it), so the individual parts can be sliced back out.
# The previous version referenced `p_embedding_matrix`, which is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
# NOTE(review): the composition below follows the variable names and the
# 918-column shape recorded for domain_bert_glove -- confirm it is what was intended.
glove_dim = word_emb_model.vector_size
glove_part = embedding_matrix[:, :glove_dim]
domain_part = embedding_matrix[:, glove_dim + pos_size:]

Domain_bert = np.concatenate([domain_part, bert_embedding_matrix], axis=1) # (vocab, domain + 768)
domain_bert_glove = np.concatenate([domain_part, bert_embedding_matrix, glove_part], axis=1) # (vocab, domain + 768 + glove)
pos_bert = np.concatenate([pos_embedding_matrix, bert_embedding_matrix], axis=1) # (vocab, pos_size + 768)

In [None]:
print(domain_bert_glove.shape)

(11243, 918)


## Slot Filling Models
> You MUST NOT use an existing state-of-the-art seq2seq architecture as it is; you need to modify the seq2seq architecture and propose your own model.

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

### Helper Functions


In [None]:
def argmax(vec):
    """Return, as a plain Python int, the column of the largest entry of a 1 x N tensor."""
    best_column = vec.max(dim=1)[1]
    return best_column.item()


def prepare_sequence(seq, to_ix):
    """Look up every element of `seq` in `to_ix` and return the ids as a 1-D long tensor."""
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    """log(sum(exp(vec))) for a 1 x N tensor, with the maximum factored out first.

    Subtracting the largest entry before exponentiating keeps exp() from
    overflowing; the maximum is added back outside the logarithm.
    """
    peak = vec[0, argmax(vec)]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.exp(shifted).sum())


def cal_acc(model, input_index, output_index):
    """Run `model` over every sentence and compute token-level accuracy.

    Args:
        model: callable taking a 1-D long tensor of word ids and returning
            ``(score, predicted_tag_ids)``.
        input_index: list of sentences, each a list of word ids.
        output_index: list of gold tag-id lists, parallel to `input_index`.

    Returns:
        (predicted, ground_truth, accuracy): the flattened predicted tag ids,
        the flattened gold tag ids, and the fraction of positions where they
        agree (0.0 when there is nothing to score).

    Tensors are moved to the module-level `device`.
    """
    ground_truth = []
    predicted = []
    # Pure inference: no autograd graph is needed, which saves time and memory.
    with torch.no_grad():
        for i, idxs in enumerate(input_index):
            tags_index = output_index[i]
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            _, p = model(sentence_in)
            ground_truth.extend(tags_index)
            predicted.extend(p)

    if not predicted:
        # Empty evaluation set: avoid dividing by zero.
        return predicted, ground_truth, 0.0

    acc_num = sum(1 for guess, gold in zip(predicted, ground_truth) if guess == gold)
    accuracy = acc_num / len(predicted)

    return predicted, ground_truth, accuracy

### Baseline Model (Bi-LSTM and CRF)

In [None]:
class BiLSTM_CRF(nn.Module):
    """Baseline tagger: embedding -> one-layer bidirectional LSTM -> linear -> CRF.

    Sentences are processed one at a time (batch size 1). `forward` returns the
    Viterbi score and best tag path; `neg_log_likelihood` is the training loss.
    The class reads the module-level names `embedding_matrix`, `START_TAG`,
    `STOP_TAG` and `device`.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the layers and the CRF transition matrix.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping tag -> index; must contain START_TAG and STOP_TAG.
            embedding_dim: width of the embedding table / LSTM input.
            hidden_dim: width of the BiLSTM output (hidden_dim // 2 per direction).
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        # The global embedding_matrix is copied into the (vocab_size, embedding_dim)
        # weight, so its shape has to be compatible with that.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        #self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair, each of shape (2, 1, hidden_dim // 2), on `device`."""
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim// 2).to(device))
    
    def _forward_alg(self, feats):
        """Return the log partition function over all tag sequences for `feats`.

        `feats` holds one row of per-tag emission scores for each token.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return the emission scores, shape (len(sentence), tagset_size), for a tensor of word ids."""
        # The LSTM state is re-drawn at random for every sentence.
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats



    def _score_sentence(self, feats, tags):
        """Return the CRF score (transitions + emissions) of the given tag sequence.

        `tags` is a long tensor of gold tag ids, one per row of `feats`.
        """
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        # Prepend START_TAG so tags[i] is the predecessor of tags[i + 1].
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return (path_score, best_path): the highest-scoring tag-id sequence for `feats` and its score."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the score of the gold tag sequence."""
        #import pdb
        #pdb.set_trace()
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Tag one sentence (tensor of word ids); returns (viterbi_score, list_of_tag_ids)."""
        # Get the emission scores from the BiLSTM
        
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

### BiLSTM with attention and CRF

In [None]:
class BiLSTM_Atten_CRF(nn.Module):
    """BiLSTM-CRF tagger with a dot-product attention helper over the LSTM outputs.

    Pipeline: embedding -> one-layer bidirectional LSTM -> linear -> CRF, one
    sentence at a time (batch size 1). `forward` returns the Viterbi score and
    best tag path; `neg_log_likelihood` is the training loss. The class reads
    the module-level names `pos_embedding_matrix`, `START_TAG`, `STOP_TAG`
    and `device`.

    NOTE(review): the attention context is computed in `_get_lstm_features`
    but is not yet fed into the emission scores, so the tagging output is
    currently driven by the plain BiLSTM features. Wiring the context in is a
    modelling decision that still has to be made.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the layers and the CRF transition matrix.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping tag -> index; must contain START_TAG and STOP_TAG.
            embedding_dim: width of the embedding table / LSTM input.
            hidden_dim: width of the BiLSTM output (hidden_dim // 2 per direction).
        """
        # super() must name this class; it used to name BiLSTM_CRF, which
        # raises TypeError because this class does not derive from it.
        super(BiLSTM_Atten_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the one-hot PoS matrix as the initial weights of nn.Embedding.
        # NOTE(review): the baseline model initialises from `embedding_matrix`
        # instead; with `pos_embedding_matrix` the copy only succeeds when
        # embedding_dim matches its width (pos_size) -- confirm this is intended.
        self.word_embeds.weight.data.copy_(torch.from_numpy(pos_embedding_matrix))

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair, each of shape (2, 1, hidden_dim // 2), on `device`."""
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        """Return the log partition function over all tag sequences for `feats`.

        `feats` holds one row of per-tag emission scores for each token.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def attention_net(self, lstm_output, final_state):
        """Dot-product attention of the final LSTM hidden state over all time steps.

        Args:
            lstm_output: [batch_size, n_step, n_hidden * num_directions(=2)]
            final_state: [num_layers(=1) * num_directions(=2), batch_size, n_hidden]

        Returns:
            (context, soft_attn_weights): the attention-weighted sum of the
            LSTM outputs, [batch_size, n_hidden * 2], and the softmax weights,
            [batch_size, n_step].
        """
        # Concatenate the forward and backward final states into one query.
        # hidden : [batch_size, n_hidden * num_directions(=2), 1]
        hidden = torch.cat((final_state[0], final_state[1]), dim=1).unsqueeze(2)
        # attn_weights : [batch_size, n_step]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        # torch.softmax avoids depending on an `F` alias that this notebook
        # never imports.
        soft_attn_weights = torch.softmax(attn_weights, dim=1)

        # context : [batch_size, n_hidden * num_directions(=2)]
        context = torch.bmm(lstm_output.transpose(1, 2),
                            soft_attn_weights.unsqueeze(2)).squeeze(2)

        return context, soft_attn_weights

    def _get_lstm_features(self, sentence):
        """Return the emission scores, shape (len(sentence), tagset_size), for a tensor of word ids."""
        # The LSTM state is re-drawn at random for every sentence.
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        # lstm_out : [batch_size(=1), seq_len, n_hidden * num_directions(=2)]
        lstm_out = lstm_out.transpose(0, 1)

        # NOTE(review): attn_output / attention are computed but not used in
        # the emission scores below (see the class docstring).
        attn_output, attention = self.attention_net(lstm_out, self.hidden[0])

        lstm_out = lstm_out.reshape(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the CRF score (transitions + emissions) of the given tag sequence.

        `tags` is a long tensor of gold tag ids, one per row of `feats`.
        """
        score = torch.zeros(1).to(device)
        # Prepend START_TAG so tags[i] is the predecessor of tags[i + 1].
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return (path_score, best_path): the highest-scoring tag-id sequence for `feats` and its score."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the score of the gold tag sequence."""
        # (A leftover pdb.set_trace() used to stop here on every call.)
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Tag one sentence (tensor of word ids); returns (viterbi_score, list_of_tag_ids)."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

### Stacked BiLSTM with CRF

In [None]:
class STACKED_BiLSTM_CRF(nn.Module):
    """Two stacked bidirectional LSTM layers whose output feeds a CRF layer.

    The class reads these module-level names, which must exist before it is
    instantiated: `embedding_matrix`, `device`, `START_TAG`, `STOP_TAG`,
    `log_sum_exp` and `argmax`.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim_1, hidden_dim_2):
        """Create the embedding, both LSTMs, the tag projection and the transitions.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag string to tag index; it is indexed
                with START_TAG and STOP_TAG.
            embedding_dim: width of one word embedding.
            hidden_dim_1: output width of the first BiLSTM (both directions).
            hidden_dim_2: output width of the second BiLSTM (both directions).
        """
        # super() has to name *this* class. Naming another class here raises
        # TypeError, because the instance is not a subtype of that class.
        super(STACKED_BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim_1 = hidden_dim_1
        self.hidden_dim_2 = hidden_dim_2
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the module-level embedding matrix as the initial weights of
        # nn.Embedding (the weights stay trainable in this class).
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))

        # Each direction gets half of the requested width, so the
        # concatenated bidirectional output is hidden_dim_1 / hidden_dim_2 wide.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim_1 // 2,
                            num_layers=1, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim_1, hidden_dim_2 // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the second LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim_2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
        self.hidden_2 = self.init_hidden_2()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for the first BiLSTM."""
        return (torch.randn(2, 1, self.hidden_dim_1 // 2).to(device),
                torch.randn(2, 1, self.hidden_dim_1// 2).to(device))

    def init_hidden_2(self):
        """Return a fresh random (h0, c0) pair for the second BiLSTM."""
        return (torch.randn(2, 1, self.hidden_dim_2 // 2).to(device),
            torch.randn(2, 1, self.hidden_dim_2// 2).to(device))

    def _forward_alg(self, feats):
        """Run the forward algorithm over `feats` and return the log partition score."""
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """Return per-token tag scores of shape (len(sentence), tagset_size)."""
        # Both hidden states are re-drawn for every sentence.
        self.hidden = self.init_hidden()
        self.hidden_2 = self.init_hidden_2()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out_1, self.hidden = self.lstm1(embeds, self.hidden)
        lstm_out_2, self.hidden_2 = self.lstm2(lstm_out_1, self.hidden_2)
        lstm_out_2 = lstm_out_2.view(len(sentence), self.hidden_dim_2)
        lstm_feats = self.hidden2tag(lstm_out_2)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of the provided tag sequence `tags` under `feats`."""
        score = torch.zeros(1).to(device)
        # Prepend START_TAG so tags[i] is the previous tag of token i.
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return `(path_score, best_path)` for the highest-scoring tag path."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the training loss: forward-algorithm score minus gold-path score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Return `(score, tag_seq)` from Viterbi decoding of `sentence`."""
        # Get the emission scores from the stacked BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

NameError: ignored

### Bi-LSTM without CRF

In [None]:
class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM tagger trained with cross-entropy (no CRF).

    Reads the module-level names `embedding_matrix` and `device`.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Build the frozen embedding, the BiLSTM and the tag projection.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag string to index; only its size is used.
            embedding_dim: width of one word embedding.
            hidden_dim: output width of the BiLSTM (both directions together).
        """
        super(BiLSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.tagset_size = len(tag_to_ix)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the module-level embedding matrix as the initial weights of
        # nn.Embedding and keep them fixed during training.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.word_embeds.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.ce = nn.CrossEntropyLoss()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for the BiLSTM."""
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim// 2).to(device))

    def _get_lstm_features(self, sentence):
        """Return per-token tag logits of shape (len(sentence), tagset_size)."""
        # The initial hidden state is re-drawn at random for every sentence,
        # so two calls on the same input differ unless the RNG is seeded.
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, _ = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def neg_log_likelihood(self, sentence, tags):
        """Return the cross-entropy between the tag logits and the gold `tags`.

        The name mirrors the CRF models so the same training loop can call it.
        """
        feats = self._get_lstm_features(sentence)
        return self.ce(feats, tags)

    # Use for inference
    def forward(self, sentence):
        """Return `(score, tag_seq)`.

        `tag_seq` is the per-token argmax over the logits; `score` is the
        cross-entropy of the logits against that same argmax.
        """
        # Get the tag logits from the BiLSTM
        feats = self._get_lstm_features(sentence)
        tag_seq = torch.argmax(feats, dim=1)
        score = self.ce(feats, tag_seq)

        return score, tag_seq


### Stacked Bi-LSTM without CRF

In [None]:
class STUCKED_BiLSTM(nn.Module):
    """Two stacked bidirectional LSTM layers trained with cross-entropy (no CRF).

    Reads the module-level names `embedding_matrix` and `device`.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim_1, hidden_dim_2):
        """Build the frozen embedding, both BiLSTMs and the tag projection.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag string to index; only its size is used.
            embedding_dim: width of one word embedding.
            hidden_dim_1: output width of the first BiLSTM (both directions).
            hidden_dim_2: output width of the second BiLSTM (both directions).
        """
        super(STUCKED_BiLSTM, self).__init__()

        self.hidden_dim_1 = hidden_dim_1
        self.hidden_dim_2 = hidden_dim_2
        self.tagset_size = len(tag_to_ix)
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the module-level embedding matrix as the initial weights of
        # nn.Embedding and keep them fixed during training.
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.word_embeds.weight.requires_grad = False
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim_1 // 2,
                            num_layers=1, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim_1, hidden_dim_2 // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the second LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim_2, self.tagset_size)
        self.ce = nn.CrossEntropyLoss()

    def init_hidden(self):
        """Return a fresh random (h0, c0) pair for the first BiLSTM."""
        return (torch.randn(2, 1, self.hidden_dim_1 // 2).to(device),
                torch.randn(2, 1, self.hidden_dim_1// 2).to(device))

    def init_hidden_2(self):
        """Return a fresh random (h0, c0) pair for the second BiLSTM."""
        return (torch.randn(2, 1, self.hidden_dim_2 // 2).to(device),
            torch.randn(2, 1, self.hidden_dim_2// 2).to(device))

    def _get_lstm_features(self, sentence):
        """Return per-token tag logits of shape (len(sentence), tagset_size)."""
        # Both initial hidden states are re-drawn at random for every sentence.
        self.hidden = self.init_hidden()
        self.hidden_2 = self.init_hidden_2()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out_1, _ = self.lstm1(embeds, self.hidden)
        lstm_out_2, _ = self.lstm2(lstm_out_1, self.hidden_2)
        lstm_out_2 = lstm_out_2.view(len(sentence), self.hidden_dim_2)
        lstm_feats = self.hidden2tag(lstm_out_2)
        return lstm_feats

    def neg_log_likelihood(self, sentence, tags):
        """Return the cross-entropy between the tag logits and the gold `tags`.

        The name mirrors the CRF models so the same training loop can call it.
        """
        feats = self._get_lstm_features(sentence)
        return self.ce(feats, tags)

    # Use for inference
    def forward(self, sentence):
        """Return `(score, tag_seq)`.

        `tag_seq` is the per-token argmax over the logits; `score` is the
        cross-entropy of the logits against that same argmax.
        """
        # Get the tag logits from the stacked BiLSTM
        feats = self._get_lstm_features(sentence)
        tag_seq = torch.argmax(feats, dim=1)
        score = self.ce(feats, tag_seq)

        return score, tag_seq

## Training

### Build Model

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM is passed as hidden_dim (single-layer models) or hidden_dim_1
# (stacked models); HIDDEN_DIM_2 is passed as hidden_dim_2 of the stacked models.
HIDDEN_DIM = 200
HIDDEN_DIM_2 = 100
# NOTE(review): presumably has to equal the width of `embedding_matrix`,
# since the models copy that matrix into their nn.Embedding - confirm.
EMBEDDING_DIM = 195
# Number of epochs to train for.
Epoch = 2

# Keep exactly one `model = ...` line active; the others are alternatives.

# BiLSTM with CRF
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)

# BiLSTM without CRF
#model = BiLSTM(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)

# Stacked BiLSTM with CRF (the class is named STACKED_BiLSTM_CRF)
#model = STACKED_BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM_2).to(device)

# Stacked BiLSTM without CRF (the class is named STUCKED_BiLSTM)
#model = STUCKED_BiLSTM(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM_2).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

### Training and Validating

In [None]:
"""Each epoch will take about 1-2 minutes"""

import datetime
# tqdm.auto selects the notebook or console progress bar by itself and
# replaces the deprecated `tqdm_notebook` entry point.
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

for epoch in tqdm(range(Epoch)):
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(tqdm(train_input_index)):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    # Call the cal_acc functions you implemented as required
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    # The validation loss is only reported, never back-propagated, so it is
    # computed without building an autograd graph.
    val_loss = 0
    with torch.no_grad():
        for i, idxs in enumerate(val_input_index):
            tags_index = val_output_index[i]
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss+=loss.item()
    time2 = datetime.datetime.now()

    # Both losses are sums over all sentences of the split, not averages.
    print("Enpoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

## Transformer model


### Build the model

In [None]:
# Taken from University of Sydney COMP5046_Lab11_solution.ipynb
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout."""

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """Precompute the encoding table.

        Args:
            d_model: width of each position vector (even or odd).
            dropout: dropout probability applied after the addition.
            max_len: number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even columns
        # An odd d_model has one fewer odd column than even columns, so the
        # cosine half only uses the first d_model // 2 frequencies. For an
        # even d_model this slice is the whole of div_term.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over a batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return dropout(x + encodings for the first x.size(0) positions).

        `x` is expected to be (seq_len, batch, d_model). A 2-D
        (seq_len, d_model) input does not fail: it broadcasts against the
        (seq_len, 1, d_model) buffer and yields (seq_len, seq_len, d_model).
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
# Taken from University of Sydney COMP5046_Lab11_solution.ipynb
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> Transformer encoder -> linear layer.

    NOTE(review): the final linear layer is `sent_max_len` wide, not the size
    of a tag vocabulary - confirm that this is the intended output width.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, sent_max_len,  dropout=0.5):
        """Create the sub-modules and initialise the embedding/decoder weights.

        Args:
            ntoken: number of rows of the embedding table.
            ninp: embedding width, also the encoder's model width.
            nhead: number of attention heads per encoder layer.
            nhid: feed-forward width inside each encoder layer.
            nlayers: number of encoder layers.
            sent_max_len: output width of the final linear layer.
            dropout: dropout used by the positional encoding and the encoder layers.
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.src_mask = None  # built lazily in forward() and cached by length
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        single_layer = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, sent_max_len)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: -inf strictly above the diagonal, 0 elsewhere."""
        above_diagonal = torch.triu(torch.ones(sz, sz), diagonal=1)
        return above_diagonal.masked_fill(above_diagonal == 1, float('-inf'))

    def init_weights(self):
        """Draw embedding and decoder weights from U(-0.1, 0.1); zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, src):
        """Return the decoder output for the token-index tensor `src`."""
        seq_len = len(src)
        # Rebuild the cached mask only when the sequence length changes.
        if self.src_mask is None or self.src_mask.size(0) != seq_len:
            self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)

        scaled_embeddings = self.encoder(src) * math.sqrt(self.ninp)
        positioned = self.pos_encoder(scaled_embeddings)
        encoded = self.transformer_encoder(positioned, self.src_mask)
        return self.decoder(encoded)

In [None]:
# Taken from University of Sydney COMP5046_Lab11_solution.ipynb
# Module-level loss object; train() and evaluate() read it as a global.
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate

import time

# Upper bound on the chunk length returned by get_batch().
bptt = 35
def get_batch(source, i):
    """Return `(data, target)` slices of `source` starting at offset `i`.

    `data` holds at most `bptt` items; `target` is the same window shifted
    one position to the right and flattened with `.view(-1)`.
    """
    remaining = len(source) - 1 - i
    window = bptt if bptt < remaining else remaining
    stop = i + window
    return source[i:stop], source[i + 1:stop + 1].view(-1)

def get_padded_sent(input, output, max_len):
    """Return right-padded copies of a token-index list and its tag-index list.

    Both lists are extended by `max_len - len(input)` pad entries
    (`word_to_ix[PAD_TAG]` for tokens, `tag_to_ix[PAD_TAG]` for tags). The
    arguments themselves are left untouched, so the dataset lists passed in
    keep their original lengths. Sequences already longer than `max_len` are
    returned unpadded and untruncated.

    Args:
        input: list of token indices.
        output: list of tag indices.
        max_len: target length after padding.

    Returns:
        `(padded_input, padded_output)` as new lists.
    """
    pad_count = max(max_len - len(input), 0)
    padded_input = list(input) + [word_to_ix[PAD_TAG]] * pad_count
    padded_output = list(output) + [tag_to_ix[PAD_TAG]] * pad_count

    return padded_input, padded_output



def train(model, optimizer,scheduler, word_to_ix, train_input_index, train_output_index, log_interval=5000):
    """Run one training pass over the data, one sentence per optimizer step.

    Reads the module-level names `sent_max_len`, `device`, `criterion`,
    `get_padded_sent` and (for the progress line only) `epoch`.

    Args:
        model: network called as `model(sentence_tensor)`.
        optimizer: optimizer over `model`'s parameters.
        scheduler: LR scheduler, used only to report the current LR.
        word_to_ix: unused; kept so existing callers keep working.
        train_input_index: list of token-index lists.
        train_output_index: list of tag-index lists, parallel to the inputs.
        log_interval: print a progress line every this many sentences.
    """
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()

    for batch, idxs in enumerate(train_input_index):
        tags_index = train_output_index[batch]
        # Pad copies: the dataset lists must keep their true lengths for
        # every other consumer in the notebook.
        ins, outs = get_padded_sent(list(idxs), list(tags_index), sent_max_len)
        sentence_in = torch.tensor(ins, dtype=torch.long).to(device)
        targets = torch.tensor(outs, dtype=torch.long).to(device)

        optimizer.zero_grad()
        output = model(sentence_in)

        # NOTE(review): view(-1, sent_max_len).T puts the model's last
        # dimension first, so the loss sees `sent_max_len` rows - confirm this
        # matches the intended (positions, classes) layout.
        loss = criterion(output.view(-1, sent_max_len).T, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            try:
                cur_ppl = math.exp(cur_loss)
            except OverflowError:
                cur_ppl = float('inf')
            # get_last_lr() is the supported way to read the LR in use;
            # get_lr() called outside scheduler.step() can report a value
            # the optimizer is not actually using.
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_input_index), scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, cur_ppl))
            total_loss = 0
            start_time = time.time()


def evaluate(eval_model, train_input_index, train_output_index):
    """Return the mean per-sentence loss of `eval_model` over a data split.

    Each sentence is padded to `sent_max_len` and contributes one
    `criterion` value; the result is the plain average of those values, the
    same quantity `train` reports in its progress lines. An empty split
    yields 0.0.

    Reads the module-level names `sent_max_len`, `device`, `criterion` and
    `get_padded_sent`.

    Args:
        eval_model: network called as `eval_model(sentence_tensor)`.
        train_input_index: list of token-index lists (any split, despite the name).
        train_output_index: list of tag-index lists, parallel to the inputs.
    """
    eval_model.eval() # Turn on the evaluation mode
    sentence_count = len(train_input_index)
    if sentence_count == 0:
        return 0.0

    total_loss = 0.
    with torch.no_grad():
        for batch, idxs in enumerate(train_input_index):
            tags_index = train_output_index[batch]
            if batch % 5000 == 0:
                print("Evaluating",batch, "of", sentence_count)

            # Pad copies so the split keeps its original sentence lengths.
            ins, outs = get_padded_sent(list(idxs), list(tags_index), sent_max_len)
            sentence_in = torch.tensor(ins, dtype=torch.long).to(device)
            targets = torch.tensor(outs, dtype=torch.long).to(device)

            output = eval_model(sentence_in)
            # Same reshaping as in train(), so both losses are comparable.
            output_flat = output.view(-1, sent_max_len).T
            # criterion already averages within the sentence; weighting it by
            # the sentence length again would inflate the reported loss.
            total_loss += criterion(output_flat, targets).item()
    return total_loss / sentence_count

### Training Multiple Configurations

In [None]:

# Longest sentence, in tokens, of each split.
train_sent_max_len = np.max(list(map(len, train_input_index)))
val_sent_max_len = np.max(list(map(len, val_input_index)))
test_sent_max_len = np.max(list(map(len, test_input_index)))
# print(train_sent_max_len, val_sent_max_len, test_sent_max_len)

100
57 72 25


In [None]:
import copy

sent_max_len = 100
ntokens = len(word_to_ix)#len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
dropout = 0.2 # the dropout value
lr = 5.0 # learning rate
epochs = 2 # The number of epochs

# One run per attention-head count. "losses" / "ppls" collect the validation
# metrics of every epoch of that run.
configs = [
    {
        "nlayers": 2, # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
        "nhead": head_count, # the number of heads in the multiheadattention models
        "losses":[],
        "ppls":[]
    }
    for head_count in (1, 2, 4, 8)
]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

for config in configs:

    # Keep the module-level names in sync with the run that is in progress.
    nlayers = config["nlayers"]
    nhead = config["nhead"]

    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, sent_max_len, dropout).to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    best_model = None

    print("="*100)
    print("Training the model with following configs: number of heads={0} & number of encoders={1}".format(config["nhead"], config["nlayers"]))
    print("="*100)

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(model, optimizer, scheduler, word_to_ix, train_input_index, train_output_index)
        val_loss = evaluate(model, val_input_index, val_output_index)
        print('-' * 89)
        try:
            val_ppl = math.exp(val_loss)
        except OverflowError:
            val_ppl = float('inf')
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        val_loss, val_ppl))
        print('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights: `best_model = model` would only alias the
            # live model, which keeps changing in later epochs.
            best_model = copy.deepcopy(model)

        scheduler.step()

        config["losses"].append(val_loss)
        config["ppls"].append(val_ppl)

    # The figure stored as "test" is measured on the validation split, using
    # the best snapshot when one exists and the last model otherwise.
    final_model = best_model if best_model is not None else model
    test_loss = evaluate(final_model, val_input_index, val_output_index)

    print('=' * 89)
    try:
        test_ppl = math.exp(test_loss)
    except OverflowError:
        test_ppl = float('inf')
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, test_ppl))
    print('=' * 89)
    config["test_loss"] = test_loss
    config["test_ppl"] = test_ppl



Training the model with following configs: number of heads=1 & number of encoders=2




| epoch   1 |  5000/26078 batches | lr 5.00 | ms/batch 20.01 | loss  6.79 | ppl   890.07
| epoch   1 | 10000/26078 batches | lr 5.00 | ms/batch 20.27 | loss  6.54 | ppl   689.31
| epoch   1 | 15000/26078 batches | lr 5.00 | ms/batch 20.23 | loss  6.35 | ppl   570.46
| epoch   1 | 20000/26078 batches | lr 5.00 | ms/batch 20.24 | loss  6.28 | ppl   534.24
| epoch   1 | 25000/26078 batches | lr 5.00 | ms/batch 20.23 | loss  6.25 | ppl   519.64
Evaluating 0
Evaluating 5000
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 585.32s | valid loss 864.01 | valid ppl      inf
-----------------------------------------------------------------------------------------
| epoch   2 |  5000/26078 batches | lr 4.51 | ms/batch 20.24 | loss  6.25 | ppl   517.94
| epoch   2 | 10000/26078 batches | lr 4.51 | ms/batch 20.23 | loss  6.15 | ppl   470.04
| epoch   2 | 15000/26078 batches | lr 4.51 | ms/batch 20.24 | loss  6.17 | ppl   480.58
| e

In [None]:
# evaluate() takes the inputs and the tags as two separate arguments. The
# validation split is used here, as in the configuration loop.
test_loss = evaluate(best_model, val_input_index, val_output_index)
try:
    test_ppl = math.exp(test_loss)
except OverflowError:
    # Very large losses overflow exp(); report an infinite perplexity instead.
    test_ppl = float('inf')
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, test_ppl))
print('=' * 89)
config["test_loss"] = test_loss
config["test_ppl"] = test_ppl


In [None]:
# Save the best model of the last trained configuration. Fall back to the
# live model when no best snapshot was recorded.
# NOTE(review): the path is Colab-specific and the whole module is pickled;
# confirm whether saving a state_dict to a configurable path is preferred.
saved_model_path="/content/bestTransformerModel.pkl"
torch.save(best_model if best_model is not None else model, saved_model_path)

# !pwd

## Testing and Evaluating

### Test on validation set

With CRF

In [None]:
# Call the cal_acc functions you implemented as required
# cal_acc returns three values; the third one is discarded here.
y_pred, y_true, _ = cal_acc(model,val_input_index, val_output_index)

def decode_output(output_list):
    """Translate a sequence of tag indices into tag strings via `tag_to_ix`."""
    ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))
    decoded = []
    for tag_index in output_list:
        decoded.append(ix_to_tag[tag_index])
    return decoded

# Turn both index sequences into tag strings for the classification report.
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

Without CRF

In [None]:
# Call the cal_acc functions you implemented as required
y_pred, y_true, _ = cal_acc(model,val_input_index, val_output_index)
# For the models without CRF: each prediction exposes .item(), so convert it
# to a plain int before it is used as a dictionary key in decode_output.
y_pred_list = [int(t.item()) for t in y_pred]

def decode_output(output_list):
    """Translate a sequence of tag indices into tag strings via `tag_to_ix`."""
    ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))
    decoded = []
    for tag_index in output_list:
        decoded.append(ix_to_tag[tag_index])
    return decoded

# Turn both index sequences into tag strings for the classification report.
y_true_decode = decode_output(y_true)
# The plain-int list is decoded instead of y_pred itself.
#y_pred_decode = decode_output(y_pred)
y_pred_decode = decode_output(y_pred_list)

In [None]:
# Show a short preview only: printing every prediction floods the notebook.
PREVIEW_COUNT = 20
print(len(y_pred), y_pred[:PREVIEW_COUNT])
print(len(y_true_decode), y_true_decode[:PREVIEW_COUNT])

[6, 6, 6, 5, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 3, 2, 3, 8, 5, 2, 2, 4, 5, 6, 8, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 6, 2, 4, 4, 2, 2, 2, 2, 3, 2, 6, 2, 2, 2, 2, 2, 4, 4, 2, 2, 6, 4, 2, 6, 8, 2, 6, 5, 2, 2, 2, 2, 2, 5, 2, 2, 2, 4, 2, 2, 2, 4, 2, 5, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 4, 2, 2, 2, 8, 2, 2, 6, 8, 7, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 5, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 5, 4, 2, 2, 5, 2, 6, 4, 2, 2, 2, 2, 2, 6, 2, 4, 2, 2, 2, 2, 2, 5, 2, 4, 5, 2, 2, 5, 6, 2, 5, 2, 2, 5, 2, 5, 2, 2, 2, 7, 5, 2, 2, 2, 3, 2, 6, 6, 5, 2, 5, 2, 2, 5, 2, 5, 2, 5, 2, 5, 2, 4, 2, 5, 2, 2, 2, 5, 2, 4, 4, 2, 2, 5, 3, 2, 5, 2, 5, 2, 2, 4, 2, 2, 8, 5, 2, 2, 2, 2, 4, 3, 2, 4, 2, 2, 6, 2, 2, 2, 3, 2, 4, 2, 4, 5, 2, 4, 2, 4, 2, 2, 5, 2, 6, 4, 2, 4, 6, 2, 2, 2, 2, 2, 2, 5, 4, 2, 2, 4, 2, 2, 6, 2, 2, 4, 2, 4, 2, 2, 6, 2, 2, 2, 6, 8, 5, 2, 2, 5, 4, 6, 5, 6, 2, 4, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 4, 2, 2, 2, 2, 4, 2, 8, 2, 5, 4, 2, 2, 5, 2, 4, 2, 2, 2, 3, 2, 5, 8, 2, 2, 2, 2, 5, 2, 4, 2, 

In [None]:
from sklearn.metrics import classification_report
# Per-tag precision / recall / F1 on the decoded validation tags.
report_text = classification_report(y_true_decode, y_pred_decode, digits=4)
print(report_text)
# The log below is the sample output for this section
# Please make sure you keep your own running log for submission

### Test on testing set (submit to kaggle)

With CRF

In [None]:
#
def gen_csv(model, input_index, out_path='result.csv'):
    """Write one predicted tag per token to a CSV file and print the token count.

    Reads the module-level names `device` and `decode_output`.

    Args:
        model: callable returning `(score, predictions)` for a token tensor;
            the predictions may be plain ints or single-element tensors.
        input_index: list of token-index lists, one per sentence.
        out_path: file to write; defaults to 'result.csv'.
    """
    # Inference only, so no autograd graph is needed.
    with open(out_path, 'w') as f, torch.no_grad():
        f.write("ID,Predicted\n")
        words = 0  # running token id across all sentences
        for idxs in input_index:
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            _, p = model(sentence_in)
            labels = decode_output([int(t) for t in p])
            for label in labels:
                # NOTE(review): a space follows the comma in every data row -
                # confirm the submission parser tolerates it.
                f.write("{}, {}\n".format(words, label))
                words += 1

        print(words)
    return

# Write the test-set predictions of the current `model` to the CSV file.
gen_csv(model, test_input_index)


Without CRF

In [None]:

def gen_csv(model, input_index, out_path='result.csv'):
    """Write one predicted tag per token to a CSV file and print the token count.

    Reads the module-level names `device` and `decode_output`.

    Args:
        model: callable returning `(score, predictions)` for a token tensor;
            the predictions may be single-element tensors or plain ints.
        input_index: list of token-index lists, one per sentence.
        out_path: file to write; defaults to 'result.csv'.
    """
    # Inference only, so no autograd graph is needed.
    with open(out_path, 'w') as f, torch.no_grad():
        f.write("ID,Predicted\n")
        words = 0  # running token id across all sentences
        for idxs in input_index:
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            _, p = model(sentence_in)
            # The models without CRF return tensors; int() turns each
            # element into a plain index and leaves plain ints unchanged.
            p = [int(t) for t in p]
            labels = decode_output(p)
            for label in labels:
                # NOTE(review): a space follows the comma in every data row -
                # confirm the submission parser tolerates it.
                f.write("{}, {}\n".format(words, label))
                words += 1

        print(words)
    return

# Write the test-set predictions of the current `model` to the CSV file.
gen_csv(model, test_input_index)

2326
