## Importing required libraries

In [None]:
import pandas as pd
import torch
import time
import numpy as np
import csv
import torch.utils.data as data_utils
import pickle
import json

from scipy.special import softmax

from nltk import pos_tag
import nltk
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
import spacy
sp = spacy.load('en_core_web_sm')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


## Data Importing and Data Preparation

In [None]:
# Root folder of the training/dev data; every train/dev input path below is built from it.
# NOTE(review): absolute path (presumably a mounted Google Drive in Colab) — adjust when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos'

#### **Reading in the data**

In [None]:
def read_csv_data(path):
  '''
  Read a tab-separated file and return its data rows.

  Despite the function name, the file is parsed with a tab delimiter.

  Args:
    path: location of the tab-separated file.

  Returns:
    A list of rows (each row is a list of strings) with the first (header)
    row removed. An empty or header-only file yields an empty list.
  '''
  # newline='' lets the csv module do its own line-ending handling, which the
  # csv documentation requires for quoted fields containing newlines.
  with open(path, 'r', newline='') as f:
    tsvreader = csv.reader(f, delimiter="\t")
    # Drop the header; the default keeps an empty file from raising StopIteration.
    next(tsvreader, None)
    return list(tsvreader)


def readbin(f_in):
  '''
  Load and return the object stored in a pickled .bin file.

  Args:
    f_in: path of the pickle file.

  Returns:
    Whatever object was pickled into the file.

  NOTE: pickle can execute arbitrary code while loading; only use this on
  files from a trusted source.
  '''
  # The context manager closes the file even when unpickling raises.
  with open(f_in, "rb") as inp:
    return pickle.load(inp)

In [None]:
# Pickled embedding matrices prepared for the model (word-level and character-level).
word_emb_matrix = f'{DATA_PATH}/embeddings/word_emb_matrix.bin'
char_emb_matrix = f'{DATA_PATH}/embeddings/char_emb_matrix.bin'
word_emb = readbin(word_emb_matrix)
char_emb = readbin(char_emb_matrix)

# Train and dev rows from the tab-separated files (read_csv_data drops the header row).
train_path = f'{DATA_PATH}/input_data/train_data.tsv'
dev_path = f'{DATA_PATH}/input_data/dev_data.tsv'
train_data = read_csv_data(train_path)
dev_data = read_csv_data(dev_path)

# Index of each word within its sentence, saved as arrays.
# Hint: helps you know where each sentence ends.
train_iis_path = f'{DATA_PATH}/input_data/word_iis_trn.npy'
dev_iis_path = f'{DATA_PATH}/input_data/word_iis_dev.npy'
word_iis_trn = np.load(train_iis_path)
word_iis_dev = np.load(dev_iis_path)

# Padded sentences (per the original note: at most 40 words per sentence). Two words of the
# same sentence share the same word_pad row; the numbers are word indices into the word
# embedding dictionary.
train_wordpad_path = f'{DATA_PATH}/input_data/word_pad_trn.npy'
dev_wordpad_path = f'{DATA_PATH}/input_data/word_pad_dev.npy'
word_pad_trn = np.load(train_wordpad_path)
word_pad_dev = np.load(dev_wordpad_path)

# Padded character index arrays.
train_charpad_path = f'{DATA_PATH}/input_data/char_pad_trn.npy'
dev_charpad_path = f'{DATA_PATH}/input_data/char_pad_dev.npy'
char_pad_trn = np.load(train_charpad_path)
char_pad_dev = np.load(dev_charpad_path)

#### **Getting the training labels**

In [None]:
def get_soft(string_format):
    """Turn a comma-separated annotation string into label counts and a soft label.

    Args:
        string_format: label names separated by commas; empty entries are ignored.
            Every non-empty entry must be a key of the module-level ``labels_dict``.

    Returns:
        A pair ``(counts, soft)``: ``counts[k]`` is how many annotations carry class
        index ``k``; ``soft`` is the softmax of those raw counts as a plain list.
    """
    annotations = [labels_dict[name] for name in string_format.split(',') if name != ""]
    counts = [annotations.count(class_idx) for class_idx in range(len(labels_dict))]
    soft = softmax(counts).tolist()
    return counts, soft

# Label name -> class index. The keys are written in index order so that the dict's
# insertion order agrees with the class indices; positional lookups such as
# list(labels_dict.keys())[idx] then return the right tag (previously 'PART' and
# 'PRON' were listed in swapped order, so indices 7 and 8 resolved to the wrong name).
labels_dict = {'ADJ': 0, 'ADP': 1, 'ADV': 2, 'CCONJ': 3, 'DET': 4, 'NOUN': 5, 'NUM': 6, 'PRON': 7, 'PART': 8, 'PUNCT': 9, 'VERB': 10, 'X': 11}
assert list(labels_dict.values()) == list(range(len(labels_dict))), "labels_dict keys must be listed in index order"


# Per-token annotation counts (train_distr) and softmax soft labels (train_softs).
train_softs = []
train_distr = []
for line in train_data:
    if line:  # skip blank rows returned by the csv reader
        # The last column holds the comma-separated annotator labels.
        distr, soft = get_soft(line[-1])
        train_softs.append(soft)
        train_distr.append(distr)

#print((train_softs[5:], train_distr[5:]))

####  **Training Parameters** 

In [None]:
# Hidden size of each LSTM direction in the word and character encoders.
lstm_size = 128
# Output width of the attention projection layer.
attn_size = 512

# Number of passes over the training data.
num_epochs = 20

# Mini-batch size handed to the DataLoaders.
batsize = 1000
# Shrink factor applied to successive fully-connected layer widths in RNN_all.
sizeout_rate = 0.8

# Input sizes of the word / character LSTMs.
# NOTE(review): presumably these equal the embedding widths of word_emb / char_emb — confirm.
word_emb_size = 300
char_emb_size = 64

# Second dimension of the padded index arrays.
# NOTE(review): neither value is referenced again in the visible notebook.
word_padsize = word_pad_trn.shape[1]
char_padsize = char_pad_trn.shape[1]

## **Training Functions**

In [None]:
# Embedding lookup used for word and character embeddings
def lookup_embeddings(embedding_lookup, index_matrix):
    """Gather embedding rows for every index in ``index_matrix``.

    Args:
        embedding_lookup: tensor whose first dimension enumerates the entries.
        index_matrix: integer tensor of indices of any rank.

    Returns:
        Tensor of shape ``index_matrix.shape + embedding_lookup.shape[1:]``. For the
        usual 2-D index matrix and 2-D table this is ``(rows, cols, embedding_dim)``.
    """
    flattened_indices = torch.flatten(index_matrix)
    selected = torch.index_select(embedding_lookup, 0, flattened_indices)
    # Rebuild the index layout instead of assuming exactly two index dimensions.
    return selected.reshape(*index_matrix.shape, *embedding_lookup.shape[1:])

# Convert a tensor into a NumPy array that does not share memory with it
def to_numpy(torch_tensor):
    """Return the tensor's values as a NumPy array backed by a fresh CPU copy."""
    host_copy = torch_tensor.cpu().clone()
    return host_copy.detach().numpy()

# Move an object to the GPU when one is available
def to_cuda(x):
    """Return ``x.cuda()`` if CUDA is available, otherwise ``x`` unchanged."""
    return x.cuda() if torch.cuda.is_available() else x

# Convert the NumPy inputs into tensors for the DataLoader
def create_dataset(word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, has_labels=False, y_hot_trn_bat=None, y_soft_trn_bat=None):
    """Convert NumPy inputs (and optionally labels) into tensors on ``device``.

    Args:
        word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat: NumPy arrays,
            converted to long tensors.
        has_labels: when true, the two label arrays are converted to float tensors.
        y_hot_trn_bat, y_soft_trn_bat: NumPy label arrays; returned untouched
            (``None`` by default) when ``has_labels`` is false.

    Returns:
        ``(word_pad, word_iis, char_pad, y_hot, y_soft)``.
    """
    def _as_long(array):
        return torch.from_numpy(array).long().to(device)

    def _as_float(array):
        return torch.from_numpy(array).float().to(device)

    word_pad_trn_bat = _as_long(word_pad_trn_bat)
    word_iis_trn_bat = _as_long(word_iis_trn_bat)
    char_pad_trn_bat = _as_long(char_pad_trn_bat)
    if has_labels:
        y_hot_trn_bat = _as_float(y_hot_trn_bat)
        y_soft_trn_bat = _as_float(y_soft_trn_bat)
    return word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, y_hot_trn_bat, y_soft_trn_bat

# Run one optimisation step for the given loss
def backprop_hot(optimizer, loss):
    """Clear old gradients, back-propagate ``loss`` and update the parameters.

    Returns ``None``; the optimizer's parameters are updated in place.
    """
    optimizer.zero_grad()  # gradients accumulate otherwise
    loss.backward()
    optimizer.step()

# Get predictions from the model
def get_predictions(model, eval_loader):
    """Run the model over ``eval_loader`` and collect its predictions.

    Args:
        model: callable as ``model(word_pad, word_iis, char_pad, None, None)`` that
            returns ``(scores, _)``, with ``scores`` a 2-D tensor of per-class scores.
        eval_loader: iterable of ``(word_pad, word_iis, char_pad)`` batches.

    Returns:
        ``(hard_preds, soft_preds)``: per item, the arg-max class index and the
        score vector as a NumPy array.

    Side effects:
        Leaves the model in eval mode.
    """
    hard_preds = []
    soft_preds = []
    model.eval()
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for wwordpad_bat, wwordiis_bat, ccharpad_bat in eval_loader:
            one_hot_pred, _ = model(wwordpad_bat, wwordiis_bat, ccharpad_bat, None, None)
            one_hot_pred = one_hot_pred.detach().cpu().numpy()
            hard_preds.extend(np.argmax(one_hot_pred, 1))
            soft_preds.extend(one_hot_pred)
    return hard_preds, soft_preds

# Split an elapsed interval into whole minutes and seconds
def epoch_time(start_time, end_time):
    """Return ``(minutes, seconds)`` elapsed between two timestamps given in seconds.

    Both values are truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


## **Evaluation Functions**

In [None]:
# Cross entropy used for the soft evaluation
def cross_entropy_metric(targets, predictions, epsilon=1e-12):
    """
    Average cross entropy between target and predicted distributions.

    Input: predictions (N, k) ndarray
           targets (N, k) ndarray
           epsilon: predictions are clipped to [epsilon, 1 - epsilon] so log() stays finite
    Returns: scalar, the summed cross entropy divided by N
    https://stackoverflow.com/questions/47377222/what-is-the-problem-with-my-implementation-of-the-cross-entropy-function
    """
    clipped = np.clip(predictions, epsilon, 1. - epsilon)
    total = np.sum(targets * np.log(clipped))
    return -total / clipped.shape[0]

# Support-weighted F1 / recall / precision for the hard evaluation
def f1_metric(solution, prediction, num_classes=2):
    """Compute support-weighted average F1, recall and precision.

    Args:
        solution: iterable of gold class indices in ``range(num_classes)``.
        prediction: iterable of predicted class indices, aligned with ``solution``.
        num_classes: number of classes; labels outside the range raise ``KeyError``.

    Returns:
        ``(average_f1, average_recall, average_precision)``, each averaged over the
        classes with the gold counts as weights.
    """
    class_ids = range(num_classes)
    matches = dict.fromkeys(class_ids, 0)
    gold = dict.fromkeys(class_ids, 0)
    system = dict.fromkeys(class_ids, 0)

    for g, p in zip(solution, prediction):
        if p == g:
            matches[p] += 1
        gold[g] += 1
        system[p] += 1

    recalls, precisions, f1_scores = [], [], []
    for class_id in class_ids:
        hits = matches[class_id]
        # A class with no hits scores 0; this also avoids dividing by a zero count.
        class_recall = 1.0 * hits / gold[class_id] if hits != 0 else 0
        class_precision = 1.0 * hits / system[class_id] if hits != 0 else 0
        denominator = class_precision + class_recall
        class_f1 = (2 * (class_precision * class_recall)/denominator) if denominator > 0 else 0
        recalls.append(class_recall)
        precisions.append(class_precision)
        f1_scores.append(class_f1)

    support = np.array([gold[class_id] for class_id in class_ids])

    average_recall = np.average(recalls, weights=support)
    average_precision = np.average(precisions, weights=support)
    average_f1 = np.average(f1_scores, weights=support)
    return average_f1, average_recall, average_precision

# Load an answer file (parsed as one JSON document)
def load_dictionary(filepath):
    """Parse the file at ``filepath`` as a single JSON document and return the result."""
    with open(filepath, 'r') as handle:
        return json.load(handle)


def get_hard_score(reference_path, submission_path, num_classes):
    """Score the submitted hard labels against the reference file.

    Both files are dictionaries of the form ``{document: {item_id: {'gold': ...}}}``.
    Every document and item of the reference must also exist in the submission.

    Returns:
        ``(f1, recall, precision)`` as computed by ``f1_metric``.
    """
    reference = load_dictionary(reference_path)
    submission = load_dictionary(submission_path)

    # Walk the reference so gold labels and predictions stay aligned item by item.
    golds, predictions = [], []
    for document, doc_contents in reference.items():
        submitted_doc = submission[document]
        for item_id in doc_contents:
            golds.append(doc_contents[item_id]['gold'])
            predictions.append(submitted_doc[item_id]['gold'])

    return f1_metric(np.array(golds), np.array(predictions), num_classes)


def get_soft_score(reference_path, submission_path):
    """Score the submitted soft labels against the reference file.

    Both files are dictionaries of the form ``{document: {item_id: {'soft': ...}}}``.
    Every document and item of the reference must also exist in the submission.

    Returns:
        The cross entropy between reference and submitted soft labels, as computed
        by ``cross_entropy_metric``.
    """
    reference = load_dictionary(reference_path)
    submission = load_dictionary(submission_path)

    # Walk the reference so targets and predictions stay aligned item by item.
    softs, predictions = [], []
    for document, doc_contents in reference.items():
        submitted_doc = submission[document]
        for item_id in doc_contents:
            softs.append(doc_contents[item_id]['soft'])
            predictions.append(submitted_doc[item_id]['soft'])

    return cross_entropy_metric(np.array(softs), np.array(predictions))

## **Getting, Tensorizing and Batching the Data**

In [None]:
# Hard training label per token: index of the largest soft-label entry.
train_softs = np.array(train_softs)
train_mv = train_softs.argmax(axis=1)

# Dev DataLoader (no labels; the two label slots come back as None).
word_pad_dev_tens, word_iis_dev_tens, char_pad_dev_tens, hot_dev_tens, soft_dev_tens = create_dataset(
    word_pad_dev, word_iis_dev, char_pad_dev)
dev = data_utils.TensorDataset(word_pad_dev_tens, word_iis_dev_tens, char_pad_dev_tens)
dev_loader = data_utils.DataLoader(dev, batch_size=batsize, shuffle=False)

# Train DataLoader (inputs plus hard and soft labels), reshuffled every epoch.
word_pad_trn_tens, word_iis_trn_tens, char_pad_trn_ten, hot_trn_tens, soft_trn_tens = create_dataset(
    word_pad_trn, word_iis_trn, char_pad_trn,
    has_labels=True, y_hot_trn_bat=train_mv, y_soft_trn_bat=train_softs)
train = data_utils.TensorDataset(word_pad_trn_tens, word_iis_trn_tens, char_pad_trn_ten, hot_trn_tens, soft_trn_tens)
train_loader = data_utils.DataLoader(train, batch_size=batsize, shuffle=True)


## **Model**

In [None]:
# Word-level encoder
class Word_Encoder(torch.nn.Module):
    """Bidirectional LSTM over the padded sentence, read out at selected positions.

    Embeddings are looked up in the module-level ``word_embedding_lookup`` table.
    """

    def __init__(self, lstm_size, embedding_size):
        super().__init__()

        self.bilstm = torch.nn.LSTM(input_size=embedding_size, hidden_size=lstm_size,
                                    batch_first=True, bidirectional=True)
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, wword_pad, col_indices):
        """Encode the sentences and pick the LSTM outputs at ``col_indices``.

        Returns:
            ``(rnn_sequence, rnn_context)``: the selected outputs after dropout, and
            the full LSTM output for every position.
        """
        embedded_words = lookup_embeddings(word_embedding_lookup, wword_pad)
        rnn_context, _ = self.bilstm(embedded_words)
        # For each batch item, keep only the time steps named by its index entry.
        picked = []
        for seq, position in zip(rnn_context, col_indices):
            picked.append(torch.index_select(seq, 0, position))
        rnn_sequence = self.dropout(torch.stack(picked, 0))
        return rnn_sequence, rnn_context

# Character-level encoder
class Char_Encoder(torch.nn.Module):
    """Bidirectional LSTM over the padded character sequence of a token.

    Embeddings are looked up in the module-level ``char_embedding_lookup`` table.
    """

    def __init__(self, lstm_size, embedding_size):
        super().__init__()

        self.bilstm = torch.nn.LSTM(input_size=embedding_size, hidden_size=lstm_size,
                                    batch_first=True, bidirectional=True)
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, cchar_pad):
        """Return the LSTM output at time step 1, after dropout, as ``(batch, 1, features)``."""
        embedded_chars = lookup_embeddings(char_embedding_lookup, cchar_pad)
        rnn_outputs, _ = self.bilstm(embedded_chars)
        # NOTE(review): only the output at time step index 1 is kept; confirm that this
        # (rather than e.g. the last step) is the intended character summary.
        step_features = self.dropout(rnn_outputs[:, 1])
        return torch.reshape(step_features, [-1, 1, step_features.shape[1]])

# Attention pooling over the sequence dimension
class Attention(torch.nn.Module):
    """Attention pooling: a weighted sum of the inputs along the sequence axis.

    Each position is scored by ``tanh(W x + b) . u_omega``; the scores are
    normalised with a softmax over the sequence positions of the same batch item.

    Args:
        attn_emb_dim: feature size of the inputs.
        attn_size: width of the hidden projection used for scoring.
    """

    def __init__(self, attn_emb_dim, attn_size):
        super().__init__()

        self.attn_nn = torch.nn.Sequential(
                    torch.nn.Linear(attn_emb_dim, attn_size),
                    torch.nn.Tanh()
        )

        self.u_omega = torch.nn.Parameter(torch.randn([attn_size]))

    def forward(self, attn_in, s):
        """Pool ``attn_in`` of shape (batch, seq_len, attn_emb_dim) to (batch, attn_emb_dim).

        ``s`` is an unused tag kept for interface compatibility.

        The weights are normalised over the sequence axis (dim 1). The earlier code
        normalised over dim 0, mixing the items of a batch and broadcasting to a
        (batch, batch, dim) intermediate tensor. For ``seq_len == 1`` the single
        weight is 1, so the output is the input itself.
        """
        v = self.attn_nn(attn_in)                        # (batch, seq_len, attn_size)
        vu = torch.matmul(v, self.u_omega)               # (batch, seq_len)
        alphas = torch.nn.functional.softmax(vu, dim=1)  # weights sum to 1 per item
        final = torch.sum(attn_in * alphas.unsqueeze(-1), 1)
        return final

# Final architecture: word + character encoders, attention, fully-connected head
class RNN_all(torch.nn.Module):
    """Tagger combining a word-level and a character-level BiLSTM encoder.

    Layer sizes come from the module-level settings ``lstm_size``, ``word_emb_size``,
    ``char_emb_size``, ``attn_size``, ``sizeout_rate`` and ``hotsize``, which must be
    defined before the model is instantiated.
    """

    def __init__(self):
        super().__init__()

        self.word_encoder = Word_Encoder(lstm_size, word_emb_size)
        self.char_encoder = Char_Encoder(lstm_size, char_emb_size)

        self.char_attention = Attention(lstm_size*2, attn_size)
        self.word_attention = Attention(lstm_size*2, attn_size)

        # Both attention outputs (2 * lstm_size each) are concatenated.
        concat_size = lstm_size*4
        hidden1 = int(concat_size*sizeout_rate)
        out_final = int(hidden1*sizeout_rate)
        # NOTE(review): the two Linear layers have no non-linearity between them.
        self.fulcon = torch.nn.Sequential(
                    torch.nn.Linear(concat_size, hidden1),
                    torch.nn.Linear(hidden1, out_final))

        self.output_hot = torch.nn.Linear(out_final, hotsize)

    def forward(self, wword_pad, wword_iis, cchar_pad, one_hot_labels, soft_labels, eval=True):
        """Predict class probabilities and, when training, the soft-label loss.

        Args:
            wword_pad, wword_iis, cchar_pad: index tensors fed to the encoders.
            one_hot_labels: unused.
            soft_labels: target distributions; only read when ``eval`` is false.
            eval: when true return ``(probabilities + 1e-43, None)``; otherwise return
                ``(probabilities + 1e-4, loss)``, where the loss is the cross entropy
                against ``soft_labels`` summed over the whole batch.
        """
        word_sequence, word_context = self.word_encoder(wword_pad, wword_iis)
        word_attn = self.word_attention(word_sequence, 'word')
        char_attn = self.char_attention(self.char_encoder(cchar_pad), 'char')

        features = self.fulcon(torch.cat([word_attn, char_attn], 1))
        probabilities = torch.nn.functional.softmax(self.output_hot(features), 1)
        if eval:
            return probabilities + 1e-43, None

        # Training: offset the probabilities before log() and sum the loss over the batch.
        smoothed_targets = soft_labels + 1e-43
        smoothed_probs = probabilities + 1e-4
        loss = -torch.sum(torch.mul(smoothed_targets, smoothed_probs.log()))
        return smoothed_probs, loss


#### Model Training

In [None]:
from torch.nn.functional import cross_entropy
# Training using the soft labels

# Number of output classes; must match the length of each soft-label vector.
hotsize = 12
assert len(train_softs[0]) == hotsize

# Embedding tables that the encoders read as module-level globals.
word_embedding_lookup = torch.from_numpy(word_emb).float().to(device)
char_embedding_lookup = torch.from_numpy(char_emb).float().to(device)

print('Beginning the Training')
NUM_EXPERIMENTS = 1

accs = [] #for training accuracy
train_losses=[] # for training loss

dev_accs = []
dev_prfs = []

for exp in range(NUM_EXPERIMENTS):
    print('\nExperiment %d #######################'%exp)

    # Fresh model and optimizer for every experiment.
    model = RNN_all()
    model = to_cuda(model)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        running_loss = 0
        acc = 0
        start_time = time.time()
        # get_predictions() switches to eval mode, so re-enable training mode each epoch.
        model.train()

        for word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, y_hot_trn_bat, y_soft_trn_bat in train_loader:
            hard_predictions, hard_loss = model(word_pad_trn_bat, word_iis_trn_bat, char_pad_trn_bat, y_hot_trn_bat, y_soft_trn_bat, False)
            backprop_hot(optimizer, hard_loss)
            running_loss += hard_loss.item()

            # The model returns probabilities (not log-probabilities), so the
            # predicted class is the arg-max; no exponential is needed.
            top_class = hard_predictions.argmax(dim=1)
            equals = top_class == y_hot_trn_bat.long()
            # Mean accuracy of this batch, accumulated on the CPU.
            acc += equals.float().mean().cpu()

        # Monitoring the epoch time
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

        # Training loss and accuracy, averaged over the batches of this epoch.
        train_losses.append(running_loss / len(train_loader))
        accs.append(acc/len(train_loader))
        # Index with [-1]: the lists keep growing across experiments, so [epoch]
        # would report an earlier experiment's numbers when NUM_EXPERIMENTS > 1.
        print(f'Training Loss: {train_losses[-1]} | Training Acc: {accs[-1].item()}')

        # Evaluate on the dev set after each epoch.
        # NOTE(review): no dev accuracy is computed here; the dev predictions are
        # scored later against the reference file.
        dev_hard_preds, dev_soft_preds = get_predictions(model, dev_loader)
        print('------------------------------------------------------------')
    


Beginning the Training

Experiment 0 #######################
Epoch: 01 | Epoch Time: 0m 2s
Training Loss: 2075.7429809570312 | Training Acc: 0.29721149802207947
------------------------------------------------------------
Epoch: 02 | Epoch Time: 0m 2s
Training Loss: 1611.9846327039932 | Training Acc: 0.5129712820053101
------------------------------------------------------------
Epoch: 03 | Epoch Time: 0m 1s
Training Loss: 1416.1907721625435 | Training Acc: 0.6350609660148621
------------------------------------------------------------
Epoch: 04 | Epoch Time: 0m 1s
Training Loss: 1294.696017795139 | Training Acc: 0.7100394368171692
------------------------------------------------------------
Epoch: 05 | Epoch Time: 0m 1s
Training Loss: 1203.9942186143662 | Training Acc: 0.7553836107254028
------------------------------------------------------------
Epoch: 06 | Epoch Time: 0m 1s
Training Loss: 1138.1594136555989 | Training Acc: 0.788641631603241
-----------------------------------------

#### Model Summary

In [None]:
print(model)

RNN_all(
  (word_encoder): Word_Encoder(
    (bilstm): LSTM(300, 128, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (char_encoder): Char_Encoder(
    (bilstm): LSTM(64, 128, batch_first=True, bidirectional=True)
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (char_attention): Attention(
    (attn_nn): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): Tanh()
    )
  )
  (word_attention): Attention(
    (attn_nn): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): Tanh()
    )
  )
  (fulcon): Sequential(
    (0): Linear(in_features=512, out_features=409, bias=True)
    (1): Linear(in_features=409, out_features=327, bias=True)
  )
  (output_hot): Linear(in_features=327, out_features=12, bias=True)
)


## **Evaluating model on Dev data**

In [None]:
# Putting the dev predictions in the codalab answer format
# Gimpel-POS_answers.jsonlines - dev file
# Layout: {"dummy_name": {"<item index>": {"gold": <hard label>, "soft": [<scores>]}}}
codalab_dict = {
    "dummy_name": {
        str(idx): {"gold": int(hard), "soft": soft.tolist()}
        for idx, (hard, soft) in enumerate(zip(dev_hard_preds, dev_soft_preds))
    }
}

with open(f'{DATA_PATH}/Gimpel-POS_answers-dev.jsonlines', 'w') as f:
    json.dump(codalab_dict, f)

In [None]:
# Reference (gold) labels for the dev split
dev_ref_path = '/content/drive/MyDrive/MscThesis/dev_reference_labels/Gimpel-POS_answers.jsonlines'
# Dev predictions written in the codalab answer format
dev_res_path = f'{DATA_PATH}/Gimpel-POS_answers-dev.jsonlines'

num_classes = 12
# Hard evaluation (weighted F1 / recall / precision) and soft evaluation (cross entropy).
f1, recall, precision = get_hard_score(
    reference_path=dev_ref_path, submission_path=dev_res_path, num_classes=num_classes)
soft_score_dev = get_soft_score(reference_path=dev_ref_path, submission_path=dev_res_path)

print('Dev Baseline Model')
print(f'Precision:{precision*100 : .2f}% | Recall: {recall*100: .2f}% ')
print(f'F1 scores: {f1} | Cross-Entropy: {soft_score_dev}')

Dev Baseline Model
Precision: 78.65% | Recall:  78.49% 
F1 scores: 0.7713658644362331 | Cross-Entropy: 1.0867203501313014


## **Evaluating model on Test data**

In [None]:
# Root folder of the test-split inputs (test_data.tsv and the *_tst.npy arrays).
# NOTE(review): absolute path (presumably a mounted Google Drive in Colab) — adjust when running elsewhere.
Data_path_test = '/content/drive/MyDrive/MscThesis/public_data_evaluation/gimpel_pos'

In [None]:
# Importing Test Data
# Locations of the test rows and the matching index arrays.
test_path = f'{Data_path_test}/test_data.tsv'
test_iis_path = f'{Data_path_test}/word_iis_tst.npy'
test_wordpad_path = f'{Data_path_test}/word_pad_tst.npy'
test_charpad_path = f'{Data_path_test}/char_pad_tst.npy'

# Test rows from the tab-separated file (read_csv_data drops the header row).
test_data = read_csv_data(test_path)

# Index of each word within its sentence. Hint: helps you know where each sentence ends.
word_iis_test = np.load(test_iis_path)

# Padded sentences (per the original note: at most 40 words per sentence). Two words of the
# same sentence share the same word_pad row; the numbers are word indices into the word
# embedding dictionary.
word_pad_test = np.load(test_wordpad_path)

# Padded character index arrays.
char_pad_test = np.load(test_charpad_path)

# Getting, tensorizing and batching the data (no labels; the two label slots come back as None).
word_pad_tst_tens, word_iis_tst_tens, char_pad_tst_tens, hot_tst_tens, soft_tst_tens = create_dataset(
    word_pad_test, word_iis_test, char_pad_test)
test = data_utils.TensorDataset(word_pad_tst_tens, word_iis_tst_tens, char_pad_tst_tens)
test_loader = data_utils.DataLoader(test, batch_size=batsize, shuffle=False)


In [None]:
# Predict on the test split with the current model: hard (arg-max) labels and per-class scores.
test_hard_preds, test_soft_preds = get_predictions(model, test_loader)

In [None]:
# Putting the test predictions in the codalab answer format
# Gimpel-POS_answers.jsonlines - test file
# Layout: {"dummy_name": {"<item index>": {"gold": <hard label>, "soft": [<scores>]}}}
codalab_dict = {
    "dummy_name": {
        str(idx): {"gold": int(hard), "soft": soft.tolist()}
        for idx, (hard, soft) in enumerate(zip(test_hard_preds, test_soft_preds))
    }
}
with open(f'{DATA_PATH}/Gimpel-POS_answers-test.jsonlines', 'w') as f:
    json.dump(codalab_dict, f)

In [None]:
# Path of test reference file
test_ref_path= '/content/drive/MyDrive/MscThesis/test_reference_data/Gimpel-POS_answers.jsonlines'
# Path of test result file
test_res_path= '/content/drive/MyDrive/MscThesis/public_data_PP/gimpel_pos/Gimpel-POS_answers-test.jsonlines'

num_classes = 12
# Hard evaluation (weighted F1 / recall / precision) and soft evaluation (cross entropy).
f1, recall, precision  = get_hard_score(test_ref_path, test_res_path, num_classes)
soft_score = get_soft_score(test_ref_path, test_res_path)

print('Test Baseline Model')
print(f'Precision:{precision*100 : .2f}% | Recall: {recall*100: .2f}% ')
# Report the TEST cross entropy; the dev value (soft_score_dev) was printed here before.
print(f'F1 scores: {f1} | Cross-Entropy: {soft_score}')

Test Baseline Model
Precision: 78.10% | Recall:  77.92% 
F1 scores: 0.7688897758924997 | Cross-Entropy: 1.0867203501313014


## POS Tagging: NLTK and Spacy

**Creating dev dataframe:**

In [None]:
# Dev tokens plus the model outputs; the text/tag columns start empty and are filled later.
dev_df = pd.read_csv(f'{DATA_PATH}/input_data/dev_data.tsv', sep='\t')
new_columns = {
    'Hard_label': dev_hard_preds,   # predicted class index per token
    'Soft_label': dev_soft_preds,   # per-class scores per token
    'Hard_label_txt': '',           # tag name of the predicted class
    'Sec_max_soft': '',             # tag name of the second most probable class
    'Nltk_lable': '',               # for nltk tags
    'Spacy_lable': '',              # for spacy tags
}
for column_name, column_values in new_columns.items():
    dev_df[column_name] = column_values

In [None]:
dev_df.shape

(3027, 9)

In [None]:
# Converting numerical labels to text labels.
# Invert the name -> index mapping explicitly. Looking tags up by position in
# list(labels_dict.keys()) is only right when the keys are listed in index order.
idx_to_label = {idx: name for name, idx in labels_dict.items()}

if len(dev_df):
    hard_idx = dev_df['Hard_label'].astype(int).to_numpy()
    # Assign whole columns at once; chained `df[col].iloc[i] = ...` assignment
    # triggers SettingWithCopyWarning and may write to a temporary copy.
    dev_df['Hard_label_txt'] = [idx_to_label[int(idx)] for idx in hard_idx]

    # Second most probable tag: mask out the predicted class in a copy of the scores
    # and take the arg-max of the rest, so a tie can never return the top class again.
    soft_matrix = np.vstack(dev_df['Soft_label'].to_numpy()).astype(float)
    soft_matrix[np.arange(len(soft_matrix)), hard_idx] = -np.inf
    dev_df['Sec_max_soft'] = [idx_to_label[int(idx)] for idx in soft_matrix.argmax(axis=1)]
  

In [None]:
dev_df.head(6)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Hard_label,Soft_label,Hard_label_txt,Sec_max_soft,Nltk_lable,Spacy_lable
0,0,0,If,1,"[0.016566958, 0.24222298, 0.057194225, 0.21483...",ADP,CCONJ,,
1,0,1,you,7,"[0.0023907463, 0.0096095195, 0.007940512, 0.00...",PART,VERB,,
2,0,2,can,10,"[0.0037132069, 0.008821263, 0.011603291, 0.008...",VERB,NOUN,,
3,0,3,see,10,"[0.01735294, 0.01964078, 0.032485157, 0.015387...",VERB,ADV,,
4,0,4,only,2,"[0.14321233, 0.08693356, 0.34051514, 0.0649644...",ADV,ADJ,,
5,0,5,one,6,"[0.06372617, 0.05645435, 0.05653922, 0.0629671...",NUM,NOUN,,


In [None]:
dev_df['Hard_label_txt'].value_counts()

NOUN     907
VERB     459
PUNCT    435
PART     335
ADP      240
ADJ      172
X        125
DET      103
PRON     100
ADV       89
CCONJ     48
NUM       14
Name: Hard_label_txt, dtype: int64

**Data Cleaning before tag generation**

In [None]:
# Manual normalisation table: maps specific raw tokens (contractions, repeated
# punctuation, emoticons, tokens with stray symbols) to a simplified replacement.
# NOTE(review): "It's" maps to 'is' while "it's" maps to 'it' — possibly a typo; confirm.
dic = {"it's":'it', "they’re":'they', 'fill-ups': 'fillup', "I'm": 'Iam', "y'all":'all',"Y'all":'all',
       "isn't": 'is', 'X-mas':'xmas', 'How-To':'how', "Don't":'do', '!!!!!!':'!', "what's":'what',
       "that's":'that', "he's":'he', "that'd":'that', "you'll":'you', "can't":'can', "We'd":'we', 
       "He's":'he', "you're":'you', '&hearts':'hearts','Wont': 'will', "It's":'is', 'im':'I', 'gonna':'going', 'cannot':'can', 'aint': 'are',
       "Ray-Ray":'ray', "Im": 'Iam',"That's":'that',"line-up":'lineup', "re-share":'reshare','!?':'!','!!':'!','>>':'>', 'of_____________':'of',
       "id":'ID', '<<-----':'<', '<==':'<','!!!':'!','!!!!!':'!', '-&':'-',"Ricochet's":'Ricochet', '??':'?', '...':'.','.....':'.','........':'.',
       ']:':']', "!'":'!',"=[[":'=',"♥♡♥":'♥',"didn't":'did', "Isn't":'is',"⌣́_⌣̀)":'_',"[*":'*',
       "gotta":'go', "whats":'what', 't…':'.', 'love…':'love','*]':'*', '!!!!':'!', '!!!!!!!!':'!',
        ")RT":'RT', "(@":'(',"Tracy's":'Tracy','????':'?',"I've": 'I', "she's":'she', "don't":'do', "won't":'will',"right-FREE": 'FREE',
       "there's":'there', "I&":'I','S/O':'so','<<':'<',"KoryBaker262626…":'KoryBaker26262', "D:":':', "Kalamazoo)":'Kalamazoo',
       "((":'(', '*))':'*', "wasn't":'was', "We're":'We',"=>":'=',"!.":'.',

       }
# Replace every Token value that equals one of the keys above; the Token column of
# dev_df is overwritten, so the raw tokens are no longer available afterwards.
dev_df['Token'] = dev_df['Token'].replace(dic)


#### Generating NLTK Tags 

In [None]:
# NLTK tagger: tag the whole token list in one call, using the universal tagset.
nltk_tags = pos_tag(list(dev_df['Token']), tagset='universal')

# Saving tags in the dataframe. pos_tag returns one (token, tag) pair per input
# token, so the whole column can be assigned at once. This avoids the chained
# `df[col].iloc[i] = ...` assignment that triggers SettingWithCopyWarning and may
# write to a temporary copy.
dev_df['Nltk_lable'] = [tag for _, tag in nltk_tags]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
dev_df.head(5)

Unnamed: 0,Twitt_ID,Token_Id_in_Dataset,Token,Hard_label,Soft_label,Hard_label_txt,Sec_max_soft,Nltk_lable,Spacy_lable
0,0,0,If,1,"[0.016566958, 0.24222298, 0.057194225, 0.21483...",ADP,CCONJ,ADP,
1,0,1,you,7,"[0.0023907463, 0.0096095195, 0.007940512, 0.00...",PART,VERB,PRON,
2,0,2,can,10,"[0.0037132069, 0.008821263, 0.011603291, 0.008...",VERB,NOUN,VERB,
3,0,3,see,10,"[0.01735294, 0.01964078, 0.032485157, 0.015387...",VERB,ADV,VERB,
4,0,4,only,2,"[0.14321233, 0.08693356, 0.34051514, 0.0649644...",ADV,ADJ,ADV,


In [None]:
#NLTK Tag Count
dev_df['Nltk_lable'].value_counts()

NOUN    1106
VERB     452
.        384
PRON     243
ADJ      206
ADP      201
DET      159
ADV      134
CONJ      58
PRT       57
NUM       20
X          7
Name: Nltk_lable, dtype: int64

#### Generating Spacy Tags

In [None]:
import re
# Remove '#' from every token. The column is rebuilt and assigned once instead of
# writing element by element through chained indexing (`df[col][i] = ...`), which
# triggers SettingWithCopyWarning and may write to a temporary copy.
dev_df['Token'] = dev_df['Token'].map(lambda token: re.sub('#', '', token))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
# Creating the text corpus: all dev tokens joined by single spaces.
txt = ' '.join(list(dev_df['Token']))

# Run spaCy on the joined text and keep (token, coarse POS tag) pairs.
text = sp(txt)
text_token = [(token, token.pos_) for token in text]

# Saving tags in the dataframe.
# NOTE: tokens are aligned purely by position, so this is only reliable while spaCy's
# tokenisation of the joined text matches the dev tokens one-to-one. Mismatches are
# printed and keep their previous label.
spacy_labels = list(dev_df['Spacy_lable'])
for i, dev_token in enumerate(dev_df['Token']):
    if i >= len(text_token):
        # spaCy produced fewer tokens than there are rows: report instead of raising IndexError.
        print(dev_token, '<no spaCy token>')
        continue
    spacy_token, spacy_pos = text_token[i]
    if dev_token == str(spacy_token):
        spacy_labels[i] = spacy_pos
    else:
        print(dev_token, str(spacy_token))
# One column assignment instead of chained `df[col][i] = ...`, which triggers
# SettingWithCopyWarning and may write to a temporary copy.
dev_df['Spacy_lable'] = spacy_labels

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


    
    


In [None]:
#spaCy Tag count
dev_df['Spacy_lable'].value_counts()

NOUN     602
PUNCT    430
PROPN    421
VERB     341
PRON     310
ADP      198
ADJ      138
DET      137
ADV      125
AUX      112
CCONJ     54
SCONJ     41
INTJ      40
PART      32
NUM       23
X         18
SPACE      2
           2
SYM        1
Name: Spacy_lable, dtype: int64

#### Saving dataframe for Tag Comparison:

In [None]:
# Save the dev dataframe (tokens, model labels, NLTK and spaCy tags) to an Excel file.
# NOTE(review): absolute path — adjust when running outside the original environment.
dev_df.to_excel('/content/drive/MyDrive/MscThesis/dev.xlsx')

## Model State 

In [None]:
# Show every top-level entry of the optimizer's state_dict
print("Optimizer's state_dict:")
optimizer_state = optimizer.state_dict()
for var_name, entry in optimizer_state.items():
    print(var_name, "\t", entry)

In [None]:
# Show the name and size of every tensor in the model's state_dict
print("Model's state_dict:")
for param_tensor, weights in model.state_dict().items():
    print(param_tensor, "\t", weights.size())

Model's state_dict:
word_encoder.bilstm.weight_ih_l0 	 torch.Size([512, 300])
word_encoder.bilstm.weight_hh_l0 	 torch.Size([512, 128])
word_encoder.bilstm.bias_ih_l0 	 torch.Size([512])
word_encoder.bilstm.bias_hh_l0 	 torch.Size([512])
word_encoder.bilstm.weight_ih_l0_reverse 	 torch.Size([512, 300])
word_encoder.bilstm.weight_hh_l0_reverse 	 torch.Size([512, 128])
word_encoder.bilstm.bias_ih_l0_reverse 	 torch.Size([512])
word_encoder.bilstm.bias_hh_l0_reverse 	 torch.Size([512])
char_encoder.bilstm.weight_ih_l0 	 torch.Size([512, 64])
char_encoder.bilstm.weight_hh_l0 	 torch.Size([512, 128])
char_encoder.bilstm.bias_ih_l0 	 torch.Size([512])
char_encoder.bilstm.bias_hh_l0 	 torch.Size([512])
char_encoder.bilstm.weight_ih_l0_reverse 	 torch.Size([512, 64])
char_encoder.bilstm.weight_hh_l0_reverse 	 torch.Size([512, 128])
char_encoder.bilstm.bias_ih_l0_reverse 	 torch.Size([512])
char_encoder.bilstm.bias_hh_l0_reverse 	 torch.Size([512])
char_attention.u_omega 	 torch.Size([512])
cha

In [None]:
# Saving the model: only the state_dict is written, not the model object itself,
# so loading requires the RNN_all class to rebuild the network first.
PATH='/content/drive/MyDrive/MscThesis/model.pth'
torch.save(model.state_dict(), PATH)

In [None]:
# Loading a saved model - the model class (RNN_all) must be defined.
# PATH holds a state_dict, so torch.load returns a dict of tensors rather than a model;
# calling .eval() on that dict would raise AttributeError. Rebuild the network first,
# then restore its parameters.
model = RNN_all()
# map_location lets a checkpoint saved on the GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load(PATH, map_location=device))
model = to_cuda(model)
model.eval()

## **References**

*   Pytorch.org. 2022. PyTorch documentation — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/index.html> [Accessed 21 August 2022].

*  Numpy.org. 2022. NumPy documentation — NumPy v1.23 Manual. [online] Available at: <https://numpy.org/doc/stable/> [Accessed 21 August 2022].

*   Spacy.io. 2022. spaCy- Documentation. [online] Available at: <https://spacy.io/api> [Accessed 21 August 2022].

*   Pytorch.org. 2022. GRU — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/generated/torch.nn.GRU.html> [Accessed 21 August 2022].

*  Pytorch.org. 2022. RNN — PyTorch 1.12 documentation. [online] Available at: <https://pytorch.org/docs/stable/generated/torch.nn.RNN.html> [Accessed 21 August 2022].

*  Stack Overflow. 2022. Stack Overflow - Where Developers Learn, Share, & Build Careers. [online] Available at: <https://stackoverflow.com/> [Accessed 21 August 2022].
