In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [None]:
# %cd "/content/drive/MyDrive/Colab Notebooks/SemEval"


In [1]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import numpy as np
import torch.nn as nn
import string
from torch.utils.data import DataLoader, random_split
import copy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import torch.nn.functional as F
from sklearn.metrics import classification_report

In [2]:
from allennlp.modules import ConditionalRandomField
from allennlp.modules.conditional_random_field import allowed_transitions

In [3]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# device = torch.device("cpu")
VECTOR_PATH = './vectors/cc.bn.300.vec'
EMB_DIMENSION = 300
MAX_SEQ_LENGTH = 64 

### Dataset Class

In [4]:
class EntityDataset(Dataset):

    def __init__(self, data_dir, split, lang='bn'):
        """Initialize the attributes of the object of the class."""
        
        # data directory
        self.data_dir = data_dir
        
        self.lang = lang
        # load text dataset  
        self.sentences, self.labels = self._read_data(data_dir, split) 
        
        # load the glove embedding
        self.vector_path = VECTOR_PATH
        
        # set the embedding dimension 50/100/300
        self.emb_dimension = EMB_DIMENSION
        
        
        # set the maximum sequence length or max tweet length
        self.max_seq_len = MAX_SEQ_LENGTH
        
        # create the vocabulary from the dataset
        self.vocab = sorted(self._create_vocabulary())
        
        
        # map word or tokens to index 
        self.word_to_index = {word: idx+1 for idx, word in enumerate(sorted(self.vocab))}
        
        # set pad token index to 0 and unk token index last of vocab
        self.word_to_index['[PAD]'] = 0
        self.word_to_index['[UNK]'] = len(self.vocab)+1
        
        # define the entitly labels to index values
        self.tags = sorted(self._get_tags_list())
        self.label_to_index = {tag: idx for idx, tag in enumerate(sorted(self.tags))}
        self.label_to_index['[PAD]'] = -1
        
        # create the embedding vector
        self.word_embeddings = self._create_embedding()
        
       
        

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.sentences)

    def __getitem__(self, index):
        """Return a data sample for a given index, along with the lable of the corresponding tweet"""
        
        
        # - get the data sample corresponding to 'index' (use the list 'self.image_path_list')
        data_sample = self.sentences[index]
        label = self.labels[index]
        
        # tokenize the sentence and label
        tokens = self._tokenize_text(data_sample)
        labels = self._tokenize_text(label)

        # use the word_to_index mapping to transform the tokens into indices and save them into an IntTensor
        x = torch.IntTensor([self.word_to_index[word] 
                             if word in self.word_to_index 
                             else self.word_to_index["[UNK]"] 
                             for word in tokens])
        
        # transform the variable to cuda or cpu
        x = x.to(device)
        
        
        
        
        # get the index-th label and store it into a FloatTensor
        y = [self._label_map(l) for l in labels]
        y = torch.IntTensor(torch.stack(y))
        # transform the variable to cuda or cpu
        y = y.to(device)
        # stores the text indices and the label into a dictionary
        features = {'token_ids': x, 'labels': y}
        
        
        return features

    
    def _create_embedding(self):
        
        """create a matrix containing word vectors"""

        # load the glove embedding to a dict. token is the key and value is the vector
        embeddings_index = {}
#         vector_paths = os.listdir(self.vector_path)
#         for path in vector_paths:
#             path = os.join(self.vector_path,path)
        with open(self.vector_path,'r') as file:
            embeddings_index = {line.split()[0]: np.asarray(line.split()[1], dtype='float32') for line in file}

        # create the embedding matrix. keep the words that only present in the dataset. 
        # each row represent one vector
        # row index is the word map index
        embedding_matrix = np.zeros((len(self.word_to_index) + 2, self.emb_dimension))
        for word, i in self.word_to_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        
        embedding_matrix[len(self.vocab)+1] = torch.randn(self.emb_dimension)
                
        return torch.tensor(embedding_matrix, device=device)
        
        
    def _create_vocabulary(self):
        """Create a vocabulary of unique words from the given text files."""
        
        path = 'vocab.txt'
        with open(path, 'r') as file:
            vocab = [line.strip() for line in file]

        return list(vocab)
    
    def _get_tags_list(self):
        """Create a vocabulary of unique words from the given text files."""
        
        path = 'tags.txt'
        with open(path, 'r') as file:
            labels = [line.strip() for line in file]
            

        return list(labels)

    def _tokenize_text(self, line):
        """
        Remove non-characters from the text and pads the text to max_seq_len.
        *!* Padding is necessary for ensuring that all text_files have the same size
        *!* This is required since DataLoader cannot handle tensors of variable length

        Return a list of all tokens in the text
        """

        tokens = line.split()
        for i in range(self.max_seq_len - len(tokens)):
            tokens.append('[PAD]')
        return tokens
    
    def _label_map(self,label):
        
        """ convert to labels to one hot vectors"""
        
        class_num = len(self.tags)
        
        one_hot = torch.zeros(class_num, dtype=torch.int32)
        idx = self.label_to_index[label]
        if idx!=-1:
            one_hot[idx] = 1
        
        return one_hot
            
            
    
    def _read_data(self, path, split):
        
        """ read txt file and return as list of strings"""
        
        path = f'{path}/{split}.csv'
        
        df = pd.read_csv(path)
        df = df[df['lang']==self.lang]
        
        sents =  df['sent'].to_list()
        labels = df['labels'].to_list()
        
        return sents, labels
         

### Create Dataloaders

In [5]:
dataset_train = EntityDataset('Dataset','train')
dataset_test = EntityDataset('Dataset','dev')

In [6]:
VOCAB = dataset_train.vocab
indexMap = {v:k for k,v in dataset_train.label_to_index.items() if k!='[PAD]'}
indexMap

{0: 'B-AerospaceManufacturer',
 1: 'B-AnatomicalStructure',
 2: 'B-ArtWork',
 3: 'B-Artist',
 4: 'B-Athlete',
 5: 'B-CarManufacturer',
 6: 'B-Cleric',
 7: 'B-Clothing',
 8: 'B-Disease',
 9: 'B-Drink',
 10: 'B-Facility',
 11: 'B-Food',
 12: 'B-HumanSettlement',
 13: 'B-MedicalProcedure',
 14: 'B-Medication/Vaccine',
 15: 'B-MusicalGRP',
 16: 'B-MusicalWork',
 17: 'B-ORG',
 18: 'B-OtherLOC',
 19: 'B-OtherPER',
 20: 'B-OtherPROD',
 21: 'B-Politician',
 22: 'B-PrivateCorp',
 23: 'B-PublicCorp',
 24: 'B-Scientist',
 25: 'B-Software',
 26: 'B-SportsGRP',
 27: 'B-SportsManager',
 28: 'B-Station',
 29: 'B-Symptom',
 30: 'B-Vehicle',
 31: 'B-VisualWork',
 32: 'B-WrittenWork',
 33: 'I-AerospaceManufacturer',
 34: 'I-AnatomicalStructure',
 35: 'I-ArtWork',
 36: 'I-Artist',
 37: 'I-Athlete',
 38: 'I-CarManufacturer',
 39: 'I-Cleric',
 40: 'I-Clothing',
 41: 'I-Disease',
 42: 'I-Drink',
 43: 'I-Facility',
 44: 'I-Food',
 45: 'I-HumanSettlement',
 46: 'I-MedicalProcedure',
 47: 'I-Medication/Vaccine

In [7]:
batch_size = 8
train_dataloader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset_test, batch_size=batch_size, shuffle=True)


In [8]:
# First we define the validation set by splitting the training data into 2 subsets (90% training and 10% validation)
n_train_examples = int(len(dataset_train)*0.9)
n_valid_examples = len(dataset_train) - n_train_examples
train_data, valid_data = random_split(dataset_train, [n_train_examples, n_valid_examples])

# We also define the corresponding dataloaders
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size)

In [36]:
# print an example batch

batch_example = next(iter(train_dataloader))
tweet_batch_example = batch_example['token_ids']
labels_batch_example = batch_example['labels']

print(tweet_batch_example.shape)
print(labels_batch_example.shape)

len(batch_example)

torch.Size([8, 64])
torch.Size([8, 64, 68])


2

### Create RNN model class

In [10]:
class RNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, word_embeddings,
                 max_sequence_length, num_layers, hidden_size, bidirectional, output_size, act_fn):
        super(RNN, self).__init__()
        
        # embedding layer: converts tokens ids with respectve word vec
        self.input_layer = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.input_layer.weight.data = word_embeddings
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size = hidden_size, 
                           num_layers = num_layers, 
                           bidirectional=bidirectional, batch_first=True)
        
        if bidirectional:
            self.direction = 2
        else:
            self.direction = 1
            
        self.layers = num_layers
        
        
            
        self.hidden_size = hidden_size

        # output layer
        self.output_layer = nn.Sequential(
            nn.Linear(self.direction*hidden_size, output_size),act_fn)
        

        
        
    
    def forward(self, x):
        
        # get embedding 
        emb = self.input_layer(x)
        
        batch = x.shape[0]
        # initialize a hidden state and cell state
        h0,c0 = self.init_hidden(batch)
        
        # get output from lstm layers
        l,_ = self.lstm(emb.float(),(h0,c0))
        
#         print(l.shape)
        
        # flatten the output
        l = l.reshape(-1,l.shape[2])
    
        # get final class probabilities
        out = self.output_layer(l)
    
        
        
        return out
    
    def init_hidden(self, batch_size):
                
        torch.manual_seed(0)
        h0 = torch.randn(self.direction*self.layers, batch_size, self.hidden_size, device=device) 
        c0 = torch.randn(self.direction*self.layers, batch_size, self.hidden_size, device=device)

        return h0,c0

### Training and Evaluation Functions

In [11]:
def index_to_tag(labels):
    
    """convert a batch of label indices to list of tags"""
    
    #define index to tag mapping
#     indexMap = {0:'B', 1:'I', 2:'O'}
    
    #reshape labels to batch_size*MAX_SEQ_LENGTH
    labels = labels.reshape((-1,MAX_SEQ_LENGTH))
    
    batchTags = []
    
    #convert label index to tags
    for batch in labels:
    
        tags = [indexMap[idx.item()] for idx in batch]
        
        batchTags.append(tags)
    
    return batchTags

def index_to_token(token_ids):
    
    """convert a batch of token indices to list of strings"""
    
    batchSent = []
    
    for item in token_ids:
    
        sent = [VOCAB[idx-1] if idx < len(VOCAB) else 'UNK' for idx in item if idx!=0]
        
        batchSent.append(sent)
    
    return batchSent


def print_predictions(tokens, pred_tags, true_tags):
    
    
    batch_tokens = index_to_token(tokens)
      
    batch_pred_tags = index_to_tag(pred_tags)
    
    batch_true_tags = index_to_tag(true_tags)
    
    
    

    
    from colorama import Fore, Style, Back
    
    outputs = []
    
    preds = []
    
    true = []
    
    for tokens,true_tags,pred_tags in zip(batch_tokens,batch_pred_tags,batch_true_tags):
        
        true_tags = true_tags[:len(tokens)]
        pred_tags = pred_tags[:len(tokens)]
        
        output = []
    
        for t,tl,pl in zip(tokens,true_tags,pred_tags):

            assert len(tokens) == len(pred_tags) == len(true_tags)

            if tl == pl:
                o = f"{t} {Back.GREEN}[{tl}][{pl}]{Style.RESET_ALL}"

            else:
                o = f"{t} {Back.GREEN}[{tl}]{Style.RESET_ALL}{Back.RED}[{pl}]{Style.RESET_ALL}"


            output.append(o)
            
        outputs.append(" ".join(output))
        preds.extend(pred_tags)
        true.extend(true_tags)
    
    return outputs, preds, true



def eval_lstm(model, eval_dataloader, return_predictions = False):
    
    model = copy.deepcopy(model)
    # Set the model in 'evaluation' mode (this disables some layers (batch norm, dropout...) which are not needed when testing)
    model.eval() 
    
    predictions = []

    # In evaluation phase, we don't need to compute gradients (for memory efficiency)
    with torch.no_grad():
        # initialize the total and correct number of labels to compute the accuracy
        correct_labels = 0
        total_labels = 0
        
        # Iterate over the dataset using the dataloader
        for batch in eval_dataloader:

            #get sentences and labels
            sent = batch['token_ids']
            labels = batch['labels']
            
            
            #get number of class or tags
            num_class = labels.shape[-1]
    
            #find the padded tokens
            padx = (sent > 0).float()
            
            #reshape it to make it as the same shape with labels
            padx = padx.reshape(-1)
            
            batch_size = sent.shape[0]
            
            
            #count non-pad tokens
            num_tokens = padx.sum().item()
        
            #count padded tokens
            num_pad_tokens = padx.shape[0] - num_tokens
            
            #reshape it to make it as the same shape with model output
            labels = labels.reshape(-1,num_class)
            
            # Get the predicted labels
            y_predicted = model(sent)
            
            # To get the predicted labels, we need to get the max over all possible classes
            # multiply with padx to ignore padded token predictions 
            label_predicted = torch.argmax(y_predicted.data, 1)*padx
            labels = torch.argmax(labels, 1)*padx
            

            # Compute accuracy: count the total number of samples,
            #and the correct labels (compare the true and predicted labels)
            
            total_labels += num_tokens #only added the non-padded tokens in count
            
            # subtract the padded tokens to ignore padded token predictions in final count
            correct_labels += ((label_predicted == labels).sum().item() - num_pad_tokens)
            
            # get output
            if return_predictions:
                predictions.append(print_predictions(sent,label_predicted,labels))
    
    accuracy = 100 * correct_labels / total_labels
    
    if return_predictions:
        return accuracy, predictions
    
    return accuracy

In [30]:
# !pip install colorama
def extract_spans(tags):
    cur_tag = None
    cur_start = None
    gold_spans = {}

    def _save_span(_cur_tag, _cur_start, _cur_id, _gold_spans):
        if _cur_start is None:
            return _gold_spans
        _gold_spans[(_cur_start, _cur_id - 1)] = _cur_tag  # inclusive start & end, accord with conll-coref settings
        return _gold_spans

    # iterate over the tags
    for _id, nt in enumerate(tags):
        indicator = nt[0]
        if indicator == 'B':
            gold_spans = _save_span(cur_tag, cur_start, _id, gold_spans)
            cur_start = _id
            cur_tag = nt[2:]
            pass
        elif indicator == 'I':
            # do nothing
            pass
        elif indicator == 'O':
            gold_spans = _save_span(cur_tag, cur_start, _id, gold_spans)
            cur_tag = 'O'
            cur_start = _id
            pass
    _save_span(cur_tag, cur_start, _id + 1, gold_spans)
    return gold_spans

In [40]:
def loss_fn(outputs, labels, mask, id_to_tag):
    
#     num_class = labels.shape[-1]
# #     labels = labels.reshape(-1,num_class) 
#     print(labels.size())
    batch = 8

    num_class = 68
    
    crf_layer = ConditionalRandomField(num_tags=num_class, 
                            constraints=allowed_transitions(constraint_type="BIO", labels=id_to_tag))
    
    crf_layer.to('cuda')
    
    loss = -crf_layer(outputs, labels, mask) / float(batch)
    best_path = crf_layer.viterbi_tags(outputs, mask)

    pred_results, pred_tags = [], []
    for i in range(batch_size):
        tag_seq, _ = best_path[i]
        pred_tags.append([id_to_tag[x] for x in tag_seq])
#         pred_results.append(extract_spans([id_to_tag[x] for x in tag_seq if x in id_to_tag]))

#     self.span_f1(pred_results, metadata)
    output = {"loss": loss, "results": pred_tags, "path": best_path}
    
    return output
    
#     #define cross entropy loss 
#     criterion = nn.CrossEntropyLoss(reduction='none')
    
#     #reshape labels to give a flat vector of length batch_size*seq_len
#     num_class = labels.shape[-1]
    
#     # reshape label to make it similar to model output
#     labels = labels.reshape(-1,num_class) 

#     #get loss
#     loss = criterion(outputs, labels.float())
    
#     #get non-pad index
#     non_pad_index=[i for i in range(labels.shape[0]) if labels[i].sum()!=0]
    
#     #get final loss
#     loss = loss[non_pad_index].mean()
    
#     return loss
    

    

def training_lstm(model, train_dataloader, valid_dataloader, num_epochs, learning_rate, verbose=True):

    # Make a copy of the model (avoid changing the model outside this function)
    model_tr = copy.deepcopy(model)
    
    
    # Set the model in 'training' mode (ensures all parameters' gradients are computed - it's like setting 'requires_grad=True' for all parameters)
    model_tr.train()
    
    # Define the optimizer
    optimizer = torch.optim.Adam(model_tr.parameters(), lr=learning_rate)
    
    # Initialize lists to record the training loss over epochs
    loss_all_epochs = []
    val_loss_all_epochs = []
    
    best_accuracy = 0.0
    
    
    accuracy = []
    
    
    # Training loop
    for epoch in range(num_epochs):
        # Initialize the training loss for the current epoch
        loss_current_epoch = 0
        val_loss_epoch = 0
        
        # Iterate over batches using the dataloader
        for batch_index, batch in enumerate(train_dataloader):
            
            label = batch['labels']

            optimizer.zero_grad()
            
            out = model_tr.forward(batch['token_ids'])
            l = loss_fn(out,label)
            l.backward()
            optimizer.step()
            loss_current_epoch += (l.item())
            
            val_loss_epoch += loss_fn(out,label).item()
            
            # - use the 'backward' method to compute the gradients
            # - apply the gradient descent algorithm
            # Also think of updating the loss at the current epoch


        # At the end of each epoch, record and display the loss over all batches in train and val set
        loss_current_epoch = loss_current_epoch/len(train_dataloader)
        val_loss_epoch = val_loss_epoch/len(train_dataloader)
        
        loss_all_epochs.append(loss_current_epoch)
        val_loss_all_epochs.append(val_loss_epoch)
        
        # 
        acc = eval_lstm(model_tr, valid_dataloader)
        
        accuracy.append(acc)
        if acc > best_accuracy:
            best_accuracy = acc
            torch.save(model_tr.state_dict(), 'model_opt.pt')
            
        
        
        if verbose:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss_current_epoch))
        
    return model_tr, loss_all_epochs ,accuracy


### Training

In [13]:
#the vocab size that is built from train set
vocab_size = len(dataset_train.vocab)
# the embedding dimenstion 50/100/300
emb_dim = EMB_DIMENSION
# get the embedding matrix
word_embeddings = dataset_train.word_embeddings
# max sequence length
max_sequence_length = MAX_SEQ_LENGTH

#define lstm layers
num_layers = 5
#define hidden size
hidden_size = 32
#set if LSTM should be bidirectional 
bidirectional = False
# output size i.e class size 
output_size = len(dataset_train.tags)
# activation function
act_fn = nn.LogSoftmax(dim=-1)

# create a RNN  model instance. REMARK: remove .cuda() at the end if gpu is not available
rnn = RNN(vocab_size, emb_dim, word_embeddings, max_sequence_length, 
          num_layers,hidden_size, bidirectional, output_size, act_fn).cuda()


### Trial & Error

In [41]:

o = rnn.forward(tweet_batch_example)
# o.shape
# o.shape
o = o.reshape(8,max_sequence_length,68)
# o.size()torch.argmax
labels = torch.argmax(labels_batch_example, dim=-1)
# labels.
flat_labels = labels_batch_example.reshape(-1,68)
pad_index=[1 if flat_labels[i].sum()!=0 else 0 for i in range(flat_labels.shape[0])]
mask = torch.FloatTensor(pad_index)
mask = mask.to('cuda')

mask = mask.reshape(8,max_sequence_length)
# mask
# o
out = loss_fn(o,labels, mask, indexMap)
out
# labels
# l =labels_batch_example.reshape(-1,3)
# non_pad_index=[i for i in range(l.shape[0]) if l[i].sum()!=0 ]
# l = l[non_pad_index]
# l.shape


# l = labels_batch_example.reshape(-1,3)
# l= torch.argmax(l,1)
# out = print_predictions(tweet_batch_example,l,l)
# print(out[0])

{'loss': tensor(49.5805, device='cuda:0', grad_fn=<DivBackward0>),
 'results': [['B-ORG',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-MusicalWork'],
  ['B-ORG',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-MusicalWork'],
  ['B-ORG',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-AnatomicalStructure',
   'B-MusicalWork'],
  ['B-ORG',

In [78]:
t = torch.ones((4,5,6))
print(t.shape)
t = t.reshape(4,-1)
t.shape

torch.Size([4, 5, 6])


torch.Size([4, 30])

In [None]:

# number of epochs
num_epochs = 10
# learning rate
learning_rate = 0.01

# train model
model_tr, loss_all_epochs, accuracy = training_lstm(rnn, train_dataloader, valid_dataloader, num_epochs, learning_rate)


# Visualization

In [None]:
plt.figure()
epochs = [i for i in range(num_epochs)]
plt.plot(epochs, loss_all_epochs, 'r', label='Loss')
plt.xlabel('epochs'), plt.ylabel('loss')
plt.legend()
plt.show()

### Eval Test set 

In [None]:
acc, preds = eval_lstm(model_tr,test_dataloader,True)
outputs=[]
pred_labels=[]
true_labels = []
for o,p,t in preds:
    outputs.extend(o)
    pred_labels.extend(p)
    true_labels.extend(t)


In [None]:
print(classification_report(pred_labels,true_labels))

In [None]:
for i, out in enumerate(outputs[:3]):
    print(out)
    print('\n')