In [103]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
from gensim.models import KeyedVectors
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [104]:
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

In [105]:
tagged_sentences[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [106]:
X = [] # store input sequence
Y = [] # store output sequence

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:         
        X_sentence.append(entity[0])  # entity[0] contains the word
        Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
        
    X.append(X_sentence)
    Y.append(Y_sentence)

In [107]:
X[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [108]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print(num_words)
print(num_tags)

59448
12


In [109]:
unique_tags = list(set([word.lower() for sentence in Y for word in sentence]))
unique_tags_dict = {}
index = 1
for tag in unique_tags:
    unique_tags_dict[tag] = index 
    index += 1
print(unique_tags_dict)

{'adj': 1, '.': 2, 'num': 3, 'x': 4, 'adv': 5, 'pron': 6, 'prt': 7, 'det': 8, 'conj': 9, 'noun': 10, 'verb': 11, 'adp': 12}


In [110]:
unique_words = list(set([word.lower() for sentence in X for word in sentence]))
unique_words_dict = {}
index = 1
for word in unique_words:
    unique_words_dict[word] = index 
    index += 1
print(len(unique_words_dict))

59448


In [111]:
def prepare_sequence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    return idxs

In [112]:
from keras_preprocessing.sequence import pad_sequences
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
def split(list_a, batch_size):

  for i in range(0, len(list_a), batch_size):
    yield list_a[i:i + batch_size]

cuda


In [113]:
batch_size = 8
X_batches = list(split(X, batch_size))
Y_batches = list(split(Y,batch_size))

In [114]:
X_batches_padded = []
Y_batches_padded = []
max_length_list = []

for b_s,b_t in zip(X_batches,Y_batches):
    max_seq_length = 0
    for sentence in b_s:
        if len(sentence) > max_seq_length:
            max_seq_length = len(sentence)
    
    sen_encoded = []
    tag_encoded = []
    for sentence,tags in zip(b_s,b_t):
        sen_encoded.append(prepare_sequence(sentence, unique_words_dict))
        tag_encoded.append(prepare_sequence(tags, unique_tags_dict))
    
    X_batches_padded.append(pad_sequences(sen_encoded, maxlen=max_seq_length, padding="pre", truncating="post"))
    Y_batches_padded.append(pad_sequences(tag_encoded, maxlen=max_seq_length, padding="pre", truncating="post"))
    max_length_list.append(max_seq_length)

In [115]:
print(len(Y_batches_padded))
print(len(X_batches_padded))
print(len(max_length_list))

9026
9026
9026


In [116]:
print(len(Y_batches_padded[0]))
len(X_batches_padded[0])

8


8

In [117]:
print(Y_batches_padded[0])
print(X_batches_padded[0])
print(max_length_list[0])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10
  10  2  3 10  1  2 11 11  8 10 12  8  1 10 10  3  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0 10 10 11 10 12 10 10  2  8 10 11 10  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 10 10  2  3 10  1  9  1 10 12
  10 10 10 10  2 11 11  4  8  1 10 12  8  1  1 10  2]
 [ 8 10 12 10  5 11  4  4  7 11 10 10 10 11 11  8  1 10 12 10 10 12  8 10
  12 10 11  4  7  6  5 12  3 10 12  2 10 11  4  4  2]
 [ 0  0  0  0  0  0  8 10 10  2 10  2 11  5  1 12  6 11  8 10  2 12  5  1
  10  7  6 11 10  8  4 11  7 10  1  2 10 11  4  4  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 10 10  2  8 10 12  1  1 10 10
   8  4 11 10 10  2 11 11 10 12  6 10 10 10 12  3  2]
 [ 0  0  0  0 12  1 10 11 11  4  5 12  8 10 12  2  8  1 10 11 12 10  7 10
  10 10 12 10  2  8 10  1  4  7 11  1 10  7  8 10  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  8 10 10 11  

In [118]:
X_final = []
Y_final = []

for index in range(len(X_batches_padded)):
    X_batch_tensor = torch.zeros((batch_size,max_length_list[index]),dtype = int).to(device= device)
    Y_batch_tensor = torch.zeros((batch_size,max_length_list[index]), dtype = int).to(device = device)

    count = 0
    for x, y in zip(X_batches_padded[index],Y_batches_padded[index]):
        X_batch_tensor[count] = torch.tensor(x).to(device = device)
        Y_batch_tensor[count] = torch.tensor(y).to(device =device)
        count += 1
    
    X_final.append(X_batch_tensor)
    Y_final.append(Y_batch_tensor)

In [119]:
print(len(X_final))
print(len(Y_final))
print(X_final[0].shape)
print(Y_final[0].shape)
print(X_final[0][0].shape)
print(Y_final[0][0].shape)
print(X_final[1].shape)
print(Y_final[1].shape)
print(X_final[0])
print(Y_final[0])

9026
9026
torch.Size([8, 41])
torch.Size([8, 41])
torch.Size([41])
torch.Size([41])
torch.Size([8, 38])
torch.Size([8, 38])
tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,  6123, 18217, 51070, 21817, 57400, 56718, 51070,
         58036, 33418, 27769, 18305, 40453, 25376,  5030, 12877,  5910,  8138,
         14849],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0, 21892, 18217,
         53864, 32416, 34367, 47216, 42055, 51070, 27769, 32353,     9,  7463,
         14849],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0, 29464, 49536, 51070,  3062, 57400, 56718,
         10020, 53154, 32416, 34367, 58225, 26761, 1

In [120]:
EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = num_words + 1

with open('./embedding_weights.pickle', 'rb') as file:
    embedding_weights = pickle.load(file)

print(embedding_weights.shape)

torch.Size([59449, 300])


In [121]:
def create_emb_layer(weights_matrix, non_trainable=False):

    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [122]:
class RNNTagger_encoder(nn.Module):
    def __init__(self, hidden_dim, target_size,batch_size):
        super(RNNTagger_encoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.batch_size = batch_size
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first = True)
        #self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        #print("ENTERING ENCODER")

        #Input shape: [len(sentence)]
        embeds = self.word_embeddings(sentence)  
        #embeds shape: [len(sentence), embdeddin_dim]
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        rnn_out, hidden_state_out = self.rnn(embeds)  
        #rnn_out shape: [len(sentence),1,hidden_dim] 
        #hiddsen_state_out shape: [1,1,hidden_shape]  The hidden state corresponding to last time step

        #print("LEAVING ENCODER")
        
        return rnn_out,hidden_state_out

In [123]:
class RNNTagger_decoder(nn.Module):
    def __init__(self, hidden_dim, target_size,batch_size):
        super(RNNTagger_decoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        self.batch_size = batch_size
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first = True)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence,hidden):
        #print("ENTERING DECODER")


        #Input shape: [len(sentence)]  --torch.Size([8, 1])
        #Input shape: [batch_size,max_length in that batch]
        embeds = self.word_embeddings(sentence)  
        #print("after embedding:", embeds.shape) #--torch.Size([8, 1, 300])
        #embeds shape: [len(sentence), embdeddin_dim] -- torch.Size([batch_Size, max_length_in_that_batch, embedding_dim])
   
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        #input deocder shape: torch.Size([1, 1, 64]
        rnn_out, hidden_state_out = self.rnn(embeds,hidden) 
        #print("after rnn:", rnn_out.shape , hidden_state_out.shape) #torch.Size([8, 1, 64]),torch.Size([1, 8, 64])


        #input shape: [len(sentence),hidden_dim]  -- torch.Size([8, 1, 64])
        tag_space = self.hidden2tag(rnn_out)
        #tag_shape : (len(sentence),target_size) --torch.Size([8, 1, 13])
        
        tag_scores = F.log_softmax(tag_space, dim=2) 
        #print("DONE SOFTMAX:", tag_scores.shape) #--torch.Size([8, 1, 13])

        #print("LEAVING DECODER")
        return tag_scores,hidden_state_out

In [124]:
class RNNTagger_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, target_size,batch_size,device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.target_size =  target_size
        self.batch_size = batch_size
        
        
    def forward(self, sentence, gt_pos_tags):
        

        outputs = torch.zeros(self.batch_size,sentence.shape[1],self.target_size).to(self.device)
        outputs = outputs.transpose(1,2)
        #print("before encoder:", outputs.shape) #--torch.Size([8, 13, 45])
        
        #print("for encoder input:" , sentence.shape)  --torch.Size([8, 45])
        encoder_out, hidden = self.encoder(sentence)
        #print("DONE ENCODER:", encoder_out.shape , hidden.shape) --torch.Size([8, 45, 64]) and torch.Size([1, 8, 64])
        

        index = 0

        sentence_col_time = sentence.transpose(0,1)
        #print("before for loop:", sentence_col_time.shape) --torch.Size([45, 8])

        for token in sentence_col_time:
            #print("entering for loop", token.shape) --torch.Size([8])
            
            token = token.unsqueeze(1)
            #print("after unsqueeze:", token.shape) --torch.Size([8, 1])
            
            token = token.to(device = self.device)

 
            output, hidden = self.decoder(token, hidden)
            #print("output shape:", output.shape) --torch.Size([1, target_size])  # 8 1 13
            #print("hidden shape:", hidden.shape) --torch.Size([1, 1, hidden_dim])
            
    
            #print("output shape before:" , output.shape) --torch.Size([8, 1, 13])
            output = output.squeeze(1)
            #print("output shape after:", output.shape) --torch.Size([8, 13])
            outputs[:,:,index] = output
            index += 1

        
        return outputs

In [125]:
def train_loop(model,loss_function,optimizer,device,X,Y):
    train_length = len(X)
    epoch_train_loss = 0 
   
    model.train()
    for i in tqdm(range(train_length)):
        sentence_batch = X[i]
        tags_batch = Y[i]

        model.zero_grad()

        tag_scores = model(sentence_batch,tags_batch)
        #--torch.Size([8, 13, 45])

        loss = loss_function(tag_scores, tags_batch)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    return model, epoch_train_loss/train_length

In [126]:
def validation_loop(model,loss_function,device,X,Y):
    val_length = len(X)
    epoch_val_loss = 0 

    for i in tqdm(range(val_length)):
        sentence_batch = X[i]
        tags_batch = Y[i]

        model.eval()

        tag_scores = model(sentence_batch,tags_batch)   

        loss = loss_function(tag_scores, tags_batch)
        epoch_val_loss += loss.item()
     
    
    return epoch_val_loss/val_length

In [127]:
HIDDEN_DIM = 64
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
enc = RNNTagger_encoder(HIDDEN_DIM, len(unique_tags_dict.keys())+1,batch_size)
dec = RNNTagger_decoder(HIDDEN_DIM,len(unique_tags_dict.keys())+1,batch_size)
model = RNNTagger_seq2seq(enc, dec, len(unique_tags_dict.keys())+1,batch_size,device).to(device=device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

cuda


In [128]:
TEST_SIZE = 0.1
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=TEST_SIZE, random_state=4)

VALID_SIZE = 0.15
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=VALID_SIZE, random_state=4)

In [129]:
epochs = 2
for epoch in range(epochs):
    model , train_loss = train_loop(model,loss_function,optimizer,device,X_train,Y_train)
    val_loss = validation_loop(model,loss_function,device,X_validation,Y_validation)
    print("For epoch {}, training loss: {}, validation loss: {}".format(epoch, train_loss, val_loss))

100%|██████████| 6904/6904 [03:11<00:00, 36.06it/s]
100%|██████████| 1219/1219 [00:14<00:00, 82.34it/s]


For epoch 0, training loss: 0.4572828239378188, validation loss: 0.3427211894753822


100%|██████████| 6904/6904 [03:14<00:00, 35.52it/s]
100%|██████████| 1219/1219 [00:14<00:00, 84.72it/s]

For epoch 1, training loss: 0.30026187022191797, validation loss: 0.2809552652081375



