## Importing required libraries

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the data paths used by
# later cells are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import time
import numpy as np
import csv
import torch.utils.data as data_utils
import pickle
import json

from scipy.special import softmax

from nltk import pos_tag
import nltk
nltk.download('universal_tagset')
import spacy
sp = spacy.load('en_core_web_sm')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


## Data Importing and Data Preparation

In [None]:
# Root folder (on the mounted Drive) that later cells join with
# '/embeddings/...' and '/input_data/...' to locate their input files.
DATA_PATH = '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos'

#### **Reading in the data**

In [None]:
def read_csv_data(path):
  '''
  Read a tab-separated file and return its data rows without the header.

  Despite the function name, the file is parsed with a tab delimiter.

  Args:
    path: location of the tab-separated file.

  Returns:
    A list of rows (each a list of strings) with the first row (the header)
    removed. An empty or header-only file yields an empty list.
  '''
  # newline='' is what the csv module asks for: it then handles line endings
  # itself, so newlines embedded in quoted fields are kept intact.
  with open(path, 'r', newline='') as f:
    rows = list(csv.reader(f, delimiter="\t"))
  # drop the header row
  return rows[1:]


def readbin(f_in):
  '''
  Load and return the object pickled in the file at ``f_in``.

  Args:
    f_in: path of a binary file written with pickle.

  Returns:
    Whatever object the pickle stream contains.
  '''
  # NOTE(review): pickle.load can execute arbitrary code while unpickling;
  # only call this on files from a trusted source.
  # The context manager closes the handle even when unpickling raises.
  with open(f_in, "rb") as inp:
    return pickle.load(inp)

In [None]:
# Word-embedding matrix prepared for the model, unpickled from disk.
word_emb_matrix = DATA_PATH + '/embeddings/word_emb_matrix.bin'
word_emb = readbin(word_emb_matrix)


# Character-embedding matrix prepared for the model, unpickled from disk.
char_emb_matrix = DATA_PATH + '/embeddings/char_emb_matrix.bin'
char_emb = readbin(char_emb_matrix)


# Train and dev rows read from tab-separated files (read_csv_data drops the header row).
train_path = DATA_PATH + '/input_data/train_data.tsv'
train_data = read_csv_data(train_path)


dev_path = DATA_PATH + '/input_data/dev_data.tsv'
dev_data = read_csv_data(dev_path)


# Indices of the words within their sentence, stored as .npy arrays.
# Hint: helps you know where each sentence ends.
train_iis_path = DATA_PATH + '/input_data/word_iis_trn.npy'
dev_iis_path = DATA_PATH + '/input_data/word_iis_dev.npy'

word_iis_trn = np.load(train_iis_path)
word_iis_dev = np.load(dev_iis_path)


# Padded sentences (per the original note: at most 40 words per sentence). Two words in the
# same sentence share the same word_pad row. The values are the indices of the words in the
# word-embedding dictionary.
train_wordpad_path = DATA_PATH + '/input_data/word_pad_trn.npy'
dev_wordpad_path = DATA_PATH + '/input_data/word_pad_dev.npy'

word_pad_trn = np.load(train_wordpad_path)
word_pad_dev = np.load(dev_wordpad_path)

# Padded character-index arrays (looked up in the character-embedding matrix by Char_Encoder).
train_charpad_path = DATA_PATH + '/input_data/char_pad_trn.npy'
dev_charpad_path = DATA_PATH + '/input_data/char_pad_dev.npy'

char_pad_trn = np.load(train_charpad_path)
char_pad_dev = np.load(dev_charpad_path)

#### **Getting the training labels**

In [None]:
def get_soft(string_format):
    """
    Turn a comma-separated annotation string into class counts and a soft label.

    Empty entries (e.g. from consecutive commas) are ignored; every other entry
    is mapped to its class id through the module-level ``labels_dict``.

    Args:
        string_format: comma-separated tag names, one per annotator.

    Returns:
        (distr, soft): ``distr`` is a list with the number of votes for each
        class id in ``range(len(labels_dict))``; ``soft`` is the softmax of
        those counts as a plain Python list.
    """
    votes = []
    for tag in string_format.split(','):
        if tag == "":
            continue
        votes.append(labels_dict[tag])

    distr = []
    for class_id in range(len(labels_dict)):
        distr.append(votes.count(class_id))

    soft = softmax(distr).tolist()
    return distr, soft

# Maps each POS tag name to its integer class id (ids 0..11). Note that 'PART' (8) and
# 'PRON' (7) are listed out of numeric order; get_soft reads this dict as a global.
labels_dict = {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'CCONJ': 3, 'DET': 4,'NOUN': 5, 'NUM': 6, 'PART': 8,'PRON': 7,'PUNCT': 9,'VERB': 10, 'X': 11}


# For every non-empty training row, the last column is passed to get_soft, which
# returns the per-class vote counts (distr) and their softmax (soft).
train_softs = []
train_distr = []
for line in train_data:
    if line:
        distr, soft = get_soft(line[-1])
        train_softs.append(soft)
        train_distr.append(distr)

#print((train_softs[5:], train_distr[5:]))

####  **Training Parameters** 

In [None]:
# Hidden size of each LSTM direction (the encoders are bidirectional) and
# width of the attention projection layer.
lstm_size = 200
attn_size = 600

# NOTE: num_epochs is assigned again (to the same value) in the training cell.
num_epochs = 20

# Batch size for the DataLoaders, and the factor by which RNN_all shrinks each
# successive fully-connected layer.
batsize = 1000
sizeout_rate = 0.8

# Input sizes handed to the word / character LSTMs.
# presumably these equal the last dimension of word_emb / char_emb — TODO confirm
word_emb_size = 300
char_emb_size = 64

# Padded lengths, taken from the second dimension of the padded index arrays.
word_padsize = word_pad_trn.shape[1]
char_padsize = char_pad_trn.shape[1]

## **Training Functions**

In [None]:
def lookup_embeddings(embedding_lookup, index_matrix):
    """
    Gather rows of ``embedding_lookup`` for every index in ``index_matrix``.

    Args:
        embedding_lookup: tensor whose first dimension is indexed (one row
            per vocabulary entry).
        index_matrix: integer tensor of row indices, of any rank.

    Returns:
        Tensor of shape ``index_matrix.shape + (embedding_lookup.shape[-1],)``.
        For a rank-2 index tensor this is the same (rows, cols, emb_dim)
        result as before; other ranks were previously unsupported.
    """
    flattened_indices = torch.flatten(index_matrix)
    selected = torch.index_select(embedding_lookup, 0, flattened_indices)
    # Restore the index tensor's own shape instead of assuming exactly two dims.
    return selected.reshape(*index_matrix.shape, embedding_lookup.shape[-1])


def to_numpy(torch_tensor):
    """Return an independent NumPy copy of ``torch_tensor`` (moved to CPU, cut from autograd)."""
    host_copy = torch_tensor.cpu().clone()
    detached = host_copy.detach()
    return detached.numpy()

def to_cuda(x):
    """ Move ``x`` to the GPU when CUDA is available; otherwise return it unchanged """
    return x.cuda() if torch.cuda.is_available() else x

def create_dataset(word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, has_labels=False, y_hot_trn_bat=None, y_soft_trn_bat=None):
    """
    Convert the NumPy inputs into tensors placed on the module-level ``device``.

    Args:
        word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat: NumPy index
            arrays; each becomes an int64 tensor.
        has_labels: when True, the two label arrays are converted to float32
            tensors as well; when False they are returned exactly as passed in.
        y_hot_trn_bat, y_soft_trn_bat: NumPy label arrays (needed only when
            ``has_labels`` is True).

    Returns:
        Tuple ``(word_pad, word_iis, char_pad, y_hot, y_soft)``.
    """
    def as_index_tensor(array):
        return torch.from_numpy(array).long().to(device)

    def as_float_tensor(array):
        return torch.from_numpy(array).float().to(device)

    word_pad_tensor = as_index_tensor(word_pad_trn_bat)
    word_iis_tensor = as_index_tensor(word_iis_trn_bat)
    char_pad_tensor = as_index_tensor(char_pad_trn_bat)

    hot_out, soft_out = y_hot_trn_bat, y_soft_trn_bat
    if has_labels:
        hot_out = as_float_tensor(y_hot_trn_bat)
        soft_out = as_float_tensor(y_soft_trn_bat)

    return word_pad_tensor, word_iis_tensor, char_pad_tensor, hot_out, soft_out


def backprop_hot(optimizer, loss):
    """
    Run one optimisation step for ``loss``.

    Clears the gradients held by ``optimizer``, back-propagates ``loss`` and
    applies the parameter update. Returns None.
    """
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def get_predictions(model, eval_loader):
    """
    Run ``model`` over ``eval_loader`` and collect its predictions.

    The model is switched to eval mode (and left there; the caller is
    responsible for calling ``model.train()`` again before training).

    Args:
        model: callable module invoked as
            ``model(word_pad, word_iis, char_pad, None, None)`` and returning
            a pair whose first element holds per-class scores.
        eval_loader: iterable yielding ``(word_pad, word_iis, char_pad)`` batches.

    Returns:
        (hard_preds, soft_preds): two lists with one entry per evaluated item.
        ``hard_preds`` holds the argmax class index, ``soft_preds`` the
        per-class score vector as a NumPy array.
    """
    hard_preds = []
    soft_preds = []
    model.eval()
    # Inference only: disabling autograd avoids building a graph for every
    # batch, which otherwise costs memory and time for no benefit.
    with torch.no_grad():
        for wwordpad_bat, wwordiis_bat, ccharpad_bat in eval_loader:
            one_hot_pred, _ = model(wwordpad_bat, wwordiis_bat, ccharpad_bat, None, None)
            one_hot_pred = one_hot_pred.detach().cpu().numpy()
            hard_preds.extend(np.argmax(one_hot_pred, 1))
            soft_preds.extend(one_hot_pred)
    return hard_preds, soft_preds

def epoch_time(start_time, end_time):
    """
    Split the interval between two timestamps (in seconds) into minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated towards zero.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


## **Evaluating Functions**

In [None]:

def cross_entropy_metric(targets, predictions, epsilon=1e-12):
    """
    Average cross entropy of ``predictions`` measured against ``targets``.

    Input: targets (N, k) ndarray
           predictions (N, k) ndarray
           epsilon: predictions are clipped to [epsilon, 1 - epsilon] before
                    the logarithm so that log(0) never occurs
    Returns: scalar, the summed cross entropy divided by N
    https://stackoverflow.com/questions/47377222/what-is-the-problem-with-my-implementation-of-the-cross-entropy-function
    """
    clipped = np.clip(predictions, epsilon, 1. - epsilon)
    n_items = clipped.shape[0]
    weighted_log_probs = targets * np.log(clipped)
    return -np.sum(weighted_log_probs) / n_items


def f1_metric(solution, prediction, num_classes=2):
    """
    Support-weighted F1, recall and precision over ``num_classes`` classes.

    Args:
        solution: iterable of gold class ids (each in ``range(num_classes)``).
        prediction: iterable of predicted class ids, aligned with ``solution``.
        num_classes: number of classes to score.

    Returns:
        (average_f1, average_recall, average_precision), each the per-class
        score averaged with the gold class counts as weights. A class with
        no correct prediction scores 0 on all three measures.
    """
    class_ids = range(num_classes)
    matches = dict.fromkeys(class_ids, 0)
    gold = dict.fromkeys(class_ids, 0)
    system = dict.fromkeys(class_ids, 0)

    for true_label, guessed_label in zip(solution, prediction):
        if guessed_label == true_label:
            matches[guessed_label] += 1
        gold[true_label] += 1
        system[guessed_label] += 1

    recall, precision, f1 = {}, {}, {}
    for cls in class_ids:
        hits = matches[cls]
        recall[cls] = 1.0 * hits / gold[cls] if hits != 0 else 0
        precision[cls] = 1.0 * hits / system[cls] if hits != 0 else 0
        pr_sum = precision[cls] + recall[cls]
        f1[cls] = (2 * (precision[cls] * recall[cls]) / pr_sum) if pr_sum > 0 else 0

    # Gold counts per class serve as the averaging weights.
    support = np.array([gold[cls] for cls in class_ids])

    def weighted(per_class):
        return np.average([per_class[cls] for cls in class_ids], weights=support)

    average_recall = weighted(recall)
    average_precision = weighted(precision)
    average_f1 = weighted(f1)
    return average_f1, average_recall, average_precision


def load_dictionary(filepath):
    """Parse the JSON document stored at ``filepath`` and return the resulting object."""
    with open(filepath, 'r') as handle:
        return json.load(handle)


def get_hard_score(reference_path, submission_path, num_classes):
    """
    Score the hard ('gold') labels of a submission against a reference file.

    Both files are JSON dictionaries of the form
    ``{document: {item_id: {'gold': ..., ...}}}``. Items are visited in the
    reference's order and matched in the submission by document and item id.

    Returns:
        (f1, recall, precision) as computed by ``f1_metric``.
    """
    reference = load_dictionary(reference_path)
    submission = load_dictionary(submission_path)

    gold_labels, predicted_labels = [], []
    for doc_name, ref_items in reference.items():
        submitted_items = submission[doc_name]
        for item_key, ref_entry in ref_items.items():
            gold_labels.append(ref_entry['gold'])
            predicted_labels.append(submitted_items[item_key]['gold'])

    f1, recall, precision = f1_metric(np.array(gold_labels), np.array(predicted_labels), num_classes)
    return f1, recall, precision


def get_soft_score(reference_path, submission_path):
    """
    Score the soft labels of a submission against a reference file.

    Both files are JSON dictionaries of the form
    ``{document: {item_id: {'soft': [...], ...}}}``. Items are visited in the
    reference's order and matched in the submission by document and item id.

    Returns:
        The value of ``cross_entropy_metric(reference_softs, submitted_softs)``.
    """
    reference = load_dictionary(reference_path)
    submission = load_dictionary(submission_path)

    reference_softs, submitted_softs = [], []
    for doc_name, ref_items in reference.items():
        submitted_items = submission[doc_name]
        for item_key, ref_entry in ref_items.items():
            reference_softs.append(ref_entry['soft'])
            submitted_softs.append(submitted_items[item_key]['soft'])

    # cross entropy between the reference and the submitted distributions
    return cross_entropy_metric(np.array(reference_softs), np.array(submitted_softs))

## **Getting, Tensorizing and Batching the Data**

In [None]:
# Soft labels as a 2-D array (one row per training item).
train_softs = np.array(train_softs)

# NOTE(review): a bare expression in the middle of a cell displays nothing.
train_softs.shape

# Hard training label = index of the largest soft-label entry in each row.
train_mv = np.argmax(train_softs, 1)

# Dev set: no labels are passed, so hot_dev_tens and soft_dev_tens come back as None.
word_pad_dev_tens, word_iis_dev_tens, char_pad_dev_tens, hot_dev_tens, soft_dev_tens = create_dataset(word_pad_dev, word_iis_dev, char_pad_dev)
dev = data_utils.TensorDataset(word_pad_dev_tens, word_iis_dev_tens, char_pad_dev_tens)
dev_loader = data_utils.DataLoader(dev, batch_size=batsize, shuffle=False)

# Training set: inputs plus hard (train_mv) and soft (train_softs) labels; batches are shuffled.
word_pad_trn_tens, word_iis_trn_tens, char_pad_trn_ten, hot_trn_tens, soft_trn_tens = create_dataset(word_pad_trn, word_iis_trn, char_pad_trn, True, train_mv, train_softs)
train = data_utils.TensorDataset(word_pad_trn_tens, word_iis_trn_tens, char_pad_trn_ten, hot_trn_tens, soft_trn_tens)
train_loader = data_utils.DataLoader(train, batch_size=batsize, shuffle=True)


## **Model**

In [None]:
class Word_Encoder(torch.nn.Module):
    """
    Bidirectional LSTM over the padded word indices of a sentence.

    The word vectors are looked up in the module-level
    ``word_embedding_lookup`` tensor, which must exist before ``forward``
    is called.
    """

    def __init__(self, lstm_size, embedding_size):
        """
        Args:
            lstm_size: hidden size of each LSTM direction.
            embedding_size: size of the word vectors fed to the LSTM.
        """
        super().__init__()

        self.bilstm = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=lstm_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, wword_pad, col_indices):
        """
        Args:
            wword_pad: integer tensor of padded word indices, one row per item.
            col_indices: per item, a 1-D tensor of time steps to select from
                that item's LSTM output.

        Returns:
            (rnn_sequence, rnn_context): the selected LSTM states after
            dropout, and the full LSTM output for every time step.
        """
        sentence_vectors = lookup_embeddings(word_embedding_lookup, wword_pad)
        rnn_context, _ = self.bilstm(sentence_vectors)

        picked_states = []
        for item_states, item_positions in zip(rnn_context, col_indices):
            picked_states.append(torch.index_select(item_states, 0, item_positions))

        rnn_sequence = self.dropout(torch.stack(picked_states, 0))
        return rnn_sequence, rnn_context


class Char_Encoder(torch.nn.Module):
    """
    Bidirectional LSTM over the padded character indices of an item.

    The character vectors are looked up in the module-level
    ``char_embedding_lookup`` tensor, which must exist before ``forward``
    is called.
    """

    def __init__(self, lstm_size, embedding_size):
        """
        Args:
            lstm_size: hidden size of each LSTM direction.
            embedding_size: size of the character vectors fed to the LSTM.
        """
        super().__init__()

        self.bilstm = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=lstm_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, cchar_pad):
        """
        Args:
            cchar_pad: integer tensor of padded character indices, one row per item.

        Returns:
            Tensor of shape (items, 1, 2 * lstm_size) holding, after dropout,
            the LSTM output at time step 1 of every item.
        """
        char_vectors = lookup_embeddings(char_embedding_lookup, cchar_pad)
        all_states, _ = self.bilstm(char_vectors)
        # NOTE(review): only the output at index 1 (the second character
        # position) is kept; confirm this is intended rather than the last step.
        step_state = self.dropout(all_states[:, 1])
        return step_state.reshape(-1, 1, step_state.shape[1])


class Attention(torch.nn.Module):
    """
    Additive attention pooling over the sequence axis.

    Each time step is projected through a Linear+ReLU layer and scored against
    the learned context vector ``u_omega``. The scores are normalised with a
    softmax over the time steps of each item, and the input is averaged with
    those weights.
    """

    def __init__(self, attn_emb_dim, attn_size):
        """
        Args:
            attn_emb_dim: size of the last dimension of the attention input.
            attn_size: width of the projection layer and of ``u_omega``.
        """
        super().__init__()

        self.attn_nn = torch.nn.Sequential(
                    torch.nn.Linear(attn_emb_dim, attn_size),
                    torch.nn.ReLU())

        self.u_omega = torch.nn.Parameter(torch.randn([attn_size]))

    def forward(self, attn_in, s):
        """
        Args:
            attn_in: tensor of shape (batch, seq, attn_emb_dim).
            s: label passed by the caller ('word' / 'char'); not used.

        Returns:
            Tensor of shape (batch, attn_emb_dim): the attention-weighted sum
            of ``attn_in`` over its sequence axis.
        """
        v = self.attn_nn(attn_in)                  # (batch, seq, attn_size)
        vu = torch.matmul(v, self.u_omega)         # (batch, seq)
        # Normalise over the time steps of each item. The previous code
        # squeezed the sequence axis and applied the softmax over dim 0 (the
        # batch), which mixed unrelated items; for seq == 1 it also broadcast
        # the product to (batch, batch, dim) before summing. For seq == 1 the
        # result is unchanged: the single weight is 1.
        alphas = torch.nn.functional.softmax(vu, dim=-1)
        final = torch.sum(attn_in * alphas.unsqueeze(-1), 1)
        return final



class RNN_all(torch.nn.Module):
    """
    Full tagger: word encoder + character encoder, one attention layer each,
    two fully-connected layers and a final classification layer.

    The constructor reads the module-level names lstm_size, word_emb_size,
    char_emb_size, attn_size, sizeout_rate and hotsize, so all of them must be
    defined before the class is instantiated.
    """

    def __init__(self):
        super().__init__()

        self.word_encoder = Word_Encoder(lstm_size, word_emb_size)
        self.char_encoder = Char_Encoder(lstm_size, char_emb_size)

        # Both encoders are bidirectional, hence inputs of size 2 * lstm_size.
        self.char_attention = Attention(lstm_size*2, attn_size)
        self.word_attention = Attention(lstm_size*2, attn_size)

        # Word and character attention outputs are concatenated (2 + 2 = 4 * lstm_size);
        # each following layer is shrunk by sizeout_rate.
        concat_size = lstm_size*4
        hidden1 = int(lstm_size*4*sizeout_rate)
        out_final = int(hidden1*sizeout_rate)
        # NOTE(review): the two Linear layers have no activation between them.
        self.fulcon = torch.nn.Sequential(
                    torch.nn.Linear(concat_size, hidden1),
                    torch.nn.Linear(hidden1, out_final))

        self.output_hot = torch.nn.Linear(out_final, hotsize)
    
    def forward(self, wword_pad, wword_iis, cchar_pad, one_hot_labels, soft_labels, eval=True):
        """
        Args:
            wword_pad, wword_iis, cchar_pad: index tensors for the encoders.
            one_hot_labels: accepted but not used in this method.
            soft_labels: target distributions; only read when eval is False.
            eval: when True (default) only the class probabilities are returned.
                  (The parameter name shadows the builtin inside this method.)

        Returns:
            (softmax_scores, None) when eval is True, otherwise
            (softmax_scores, loss) where loss is the cross entropy against
            soft_labels summed over all items and classes (not averaged).
        """
        word_sequence, word_context = self.word_encoder(wword_pad, wword_iis)
        word_attn= self.word_attention(word_sequence, 'word')

        char_sequence = self.char_encoder(cchar_pad)
        char_attn = self.char_attention(char_sequence, 'char')

        concat_attn = torch.cat([word_attn, char_attn], 1)
        ful = self.fulcon(concat_attn)

        pred_hot = self.output_hot(ful)  
        # NOTE(review): 1e-43 is far below float32 precision, so this offset
        # presumably only matters for entries that are exactly 0 — confirm intent.
        softmax_scores = torch.nn.functional.softmax(pred_hot, 1) + 1e-43
        if eval:
            return softmax_scores, None
        else:
            soft_labels = soft_labels + 1e-43
            # The training branch recomputes the probabilities with a larger
            # offset (1e-4) than the eval branch uses, so the returned scores
            # differ slightly between the two modes.
            softmax_scores = torch.nn.functional.softmax(pred_hot, 1) + 1e-4
            cross_entropy = torch.mul(soft_labels, softmax_scores.log())
            loss  = -torch.sum(cross_entropy)
            return softmax_scores, loss


#### Model Training

In [None]:
# NOTE(review): this imported name is not referenced anywhere in this cell.
from torch.nn.functional import cross_entropy
"""**Training using the soft labels**"""

# Number of output classes; read as a global by RNN_all.__init__.
hotsize = 12
assert len(train_softs[0]) == hotsize

# Embedding tables as float tensors on `device`; the encoders read these globals.
word_embedding_lookup = torch.from_numpy(word_emb).float().to(device)
char_embedding_lookup = torch.from_numpy(char_emb).float().to(device)

print('Beginning the Training')
NUM_EXPERIMENTS = 1
num_epochs = 20

# Result accumulators. Only `accs` and `train_losses` are appended to in this cell.
accs = []
prfs = []
ct_prfs = []
jsds = []
kls = []
similarity_ents = []
ents_correlation = []
ce_results = []

dev_accs = []
dev_prfs = []

train_losses=[]

for exp in range(NUM_EXPERIMENTS):
    print('\nExperiment %d #######################'%exp)
    best_val_f, best_val_acc = 0, 0
    best_val_r, best_val_p = 0, 0

    last_batch = 0

    # Fresh model and optimizer for every experiment.
    model = RNN_all()
    model = to_cuda(model)
    optimizer = torch.optim.Adam(params=[p for p in model.parameters()],lr=0.001)

    for epoch in range(num_epochs):
        running_loss = 0
        acc, dev_acc= 0 , 0
        nepoch = epoch + 1
        start_time = time.time()
        # get_predictions leaves the model in eval mode, so switch back each epoch.
        model.train()

        for word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, y_hot_trn_bat, y_soft_trn_bat in train_loader:
            # eval=False: the model also returns the soft-label cross-entropy loss.
            hard_predictions, hard_loss = model(word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, y_hot_trn_bat, y_soft_trn_bat, False)
            backprop_hot(optimizer, hard_loss)
            running_loss += hard_loss.item()            

            # NOTE(review): RNN_all.forward returns softmax probabilities (plus a small
            # offset), not log-probabilities. torch.exp is monotonic, so the top-1
            # class picked below is the same with or without it.
            ps = torch.exp(hard_predictions)
            top_p , top_class = ps.topk(1,dim=1)
            equals = top_class == y_hot_trn_bat.view(*top_class.shape)
            # Convert the boolean matches to float and add this batch's mean accuracy
            acc+= torch.mean(equals.type(torch.FloatTensor))
            
                    
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

        # Per-epoch averages over the number of batches.
        train_losses.append(running_loss / len(train_loader))
        accs.append(acc/len(train_loader))
        print(f'Training Loss: {train_losses[epoch]} | Training Acc: {accs[epoch].item()}')             

        # Predict on the dev set after each epoch. The predictions are not scored
        # here; the values from the final epoch are used by the later export cell.
        dev_hard_preds, dev_soft_preds = get_predictions(model, dev_loader)
    
        print('------------------------------------------------------------')
    


Beginning the Training

Experiment 0 #######################
Epoch: 01 | Epoch Time: 1m 40s
Training Loss: 1997.7433471679688 | Training Acc: 0.337150514125824
------------------------------------------------------------
Epoch: 02 | Epoch Time: 1m 32s
Training Loss: 1511.5850931803386 | Training Acc: 0.5724623203277588
------------------------------------------------------------
Epoch: 03 | Epoch Time: 1m 33s
Training Loss: 1315.7501152886284 | Training Acc: 0.7008172273635864
------------------------------------------------------------
Epoch: 04 | Epoch Time: 1m 36s
Training Loss: 1197.2987331814236 | Training Acc: 0.757544755935669
------------------------------------------------------------
Epoch: 05 | Epoch Time: 1m 35s
Training Loss: 1115.631544325087 | Training Acc: 0.8003333210945129
------------------------------------------------------------
Epoch: 06 | Epoch Time: 1m 36s
Training Loss: 1061.4736633300781 | Training Acc: 0.8332903385162354
-------------------------------------

#### Model Summary

In [None]:
# Print the module tree of the model left over from the training cell.
print(model)

RNN_all(
  (word_encoder): Word_Encoder(
    (bilstm): LSTM(300, 200, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (char_encoder): Char_Encoder(
    (bilstm): LSTM(64, 200, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (char_attention): Attention(
    (attn_nn): Sequential(
      (0): Linear(in_features=400, out_features=600, bias=True)
      (1): ReLU()
    )
  )
  (word_attention): Attention(
    (attn_nn): Sequential(
      (0): Linear(in_features=400, out_features=600, bias=True)
      (1): ReLU()
    )
  )
  (fulcon): Sequential(
    (0): Linear(in_features=800, out_features=640, bias=True)
    (1): Linear(in_features=640, out_features=512, bias=True)
  )
  (output_hot): Linear(in_features=512, out_features=12, bias=True)
)


## **Evaluating model on Dev data**

In [None]:
"""**Putting the data in the codalab answer format**"""
#Gimpel-POS_answers.jsonlines- dev file

# One entry per dev item, keyed by its position (as a string): the hard
# prediction under "gold" and the per-class scores under "soft".
codalab_dict = {str(i): {"gold": int(dev_hard_preds[i]), "soft": pred.tolist()} for i, pred in enumerate(dev_soft_preds)}
# Wrap everything under a single placeholder document name.
codalab_dict = {"dummy_name": codalab_dict} 

# NOTE(review): json.dump writes one JSON object, despite the .jsonlines extension.
with open(DATA_PATH +'/Gimpel-POS_answers-dev.jsonlines', 'w') as f:
    json.dump(codalab_dict, f)

In [None]:
# Path of dev reference file
# Path of dev reference file
dev_ref_path ='/content/drive/MyDrive/MscThesis/dev_reference_labels/Gimpel-POS_answers.jsonlines'
# Path of dev result file (the same location the export cell writes to via DATA_PATH)
dev_res_path= '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos/Gimpel-POS_answers-dev.jsonlines'

# Hard scores (support-weighted F1 / recall / precision) and soft score (cross entropy).
num_classes = 12
f1, recall, precision = get_hard_score(dev_ref_path, dev_res_path, num_classes)
soft_score_dev = get_soft_score(dev_ref_path, dev_res_path)
print('Dev Baseline Model')
print(f'Precision:{precision*100 : .2f}% | Recall: {recall*100: .2f}% ')
print(f'F1 scores: {f1} | Cross-Entropy: {soft_score_dev}')

Dev Baseline Model
Precision: 78.87% | Recall:  78.56% 
F1 scores: 0.7733012051352617 | Cross-Entropy: 1.0884745740067088


## **Evaluating model on Test data**

In [None]:
"""Importing Test Data"""
# Test rows from a tab-separated file (read_csv_data drops the header row).
test_path = '/content/drive/MyDrive/MscThesis/public_data_evaluation/gimpel_pos/test_data.tsv'
test_data = read_csv_data(test_path)

# Indices of the words within their sentence, stored as a .npy array.
# Hint: helps you know where each sentence ends.
test_iis_path = '/content/drive/MyDrive/MscThesis/public_data_evaluation/gimpel_pos/word_iis_tst.npy'
word_iis_test = np.load(test_iis_path)

# Padded sentences (per the original note: at most 40 words per sentence). Two words in the
# same sentence share the same word_pad row. The values are the indices of the words in the
# word-embedding dictionary.
test_wordpad_path = '/content/drive/MyDrive/MscThesis/public_data_evaluation/gimpel_pos/word_pad_tst.npy'
word_pad_test = np.load(test_wordpad_path)

# Padded character-index array.
test_charpad_path = '/content/drive/MyDrive/MscThesis/public_data_evaluation/gimpel_pos/char_pad_tst.npy'
char_pad_test = np.load(test_charpad_path)

# Tensorize and batch the test data. No labels are passed, so hot_tst_tens and
# soft_tst_tens are None; order is preserved (shuffle=False).
word_pad_tst_tens, word_iis_tst_tens, char_pad_tst_tens, hot_tst_tens, soft_tst_tens = create_dataset(word_pad_test, word_iis_test, char_pad_test)
test = data_utils.TensorDataset(word_pad_tst_tens, word_iis_tst_tens, char_pad_tst_tens)
test_loader = data_utils.DataLoader(test, batch_size=batsize, shuffle=False)


In [None]:
# Hard (argmax) and soft (per-class score) predictions of the trained model on the test set.
test_hard_preds, test_soft_preds = get_predictions(model, test_loader)

In [None]:
"""**Putting the data in the codalab answer format**"""
#Gimpel-POS_answers.jsonlines- test file

# One entry per test item, keyed by its position (as a string): the hard
# prediction under "gold" and the per-class scores under "soft".
codalab_dict = {str(i): {"gold": int(test_hard_preds[i]), "soft": pred.tolist()} for i, pred in enumerate(test_soft_preds)}
# Wrap everything under a single placeholder document name.
codalab_dict = {"dummy_name": codalab_dict} 
# NOTE(review): json.dump writes one JSON object, despite the .jsonlines extension.
with open(DATA_PATH +'/Gimpel-POS_answers-test.jsonlines', 'w') as f:
    json.dump(codalab_dict, f)

In [None]:
# Path of test reference file
# Path of test reference file
test_ref_path= '/content/drive/MyDrive/MscThesis/test_reference_data/Gimpel-POS_answers.jsonlines'
# Path of test result file
test_res_path= '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos/Gimpel-POS_answers-test.jsonlines'

# Hard scores (support-weighted F1 / recall / precision) and soft score (cross entropy).
num_classes = 12
f1, recall, precision  = get_hard_score(test_ref_path, test_res_path, num_classes)
soft_score = get_soft_score(test_ref_path, test_res_path)

print('Test Baseline Model')
print(f'Precision:{precision*100 : .2f}% | Recall: {recall*100: .2f}% ')
# Report the test-set cross entropy (soft_score). This line previously printed
# soft_score_dev, so the dev value was shown under the test heading.
print(f'F1 scores: {f1} | Cross-Entropy: {soft_score}')

Test Baseline Model
Precision: 77.69% | Recall:  77.37% 
F1 scores: 0.7635734636950491 | Cross-Entropy: 1.0884745740067088


## **References**

*   Pytorch.org. 2022. PyTorch documentation — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/index.html> [Accessed 21 August 2022].

*  Numpy.org. 2022. NumPy documentation — NumPy v1.23 Manual. [online] Available at: <https://numpy.org/doc/stable/> [Accessed 21 August 2022].

*   Spacy.io. 2022. spaCy- Documentation. [online] Available at: <https://spacy.io/api> [Accessed 21 August 2022].

*   Pytorch.org. 2022. GRU — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/generated/torch.nn.GRU.html> [Accessed 21 August 2022].

*  Pytorch.org. 2022. RNN — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/generated/torch.nn.RNN.html> [Accessed 21 August 2022].

*  Stack Overflow. 2022. Stack Overflow - Where Developers Learn, Share, & Build Careers. [online] Available at: <https://stackoverflow.com/> [Accessed 21 August 2022].
