In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
from gensim.models import KeyedVectors
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

In [3]:
tagged_sentences[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [4]:
X = [] # store input sequence
Y = [] # store output sequence

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:         
        X_sentence.append(entity[0])  # entity[0] contains the word
        Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
        
    X.append(X_sentence)
    Y.append(Y_sentence)

In [5]:
X[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [6]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print(num_words)
print(num_tags)

59448
12


In [7]:
unique_tags = list(set([word.lower() for sentence in Y for word in sentence]))
unique_tags_dict = {}
index = 0
for tag in unique_tags:
    unique_tags_dict[tag] = index 
    index += 1
print(unique_tags_dict)

{'.': 0, 'adp': 1, 'verb': 2, 'num': 3, 'noun': 4, 'det': 5, 'x': 6, 'conj': 7, 'pron': 8, 'prt': 9, 'adv': 10, 'adj': 11}


In [8]:
unique_words = list(set([word.lower() for sentence in X for word in sentence]))
unique_words_dict = {}
index = 0
for word in unique_words:
    unique_words_dict[word] = index 
    index += 1
print(len(unique_words_dict))

59448


In [9]:
def prepare_sequence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [10]:
EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = num_words

with open('./embedding_weights.pickle', 'rb') as file:
    embedding_weights = pickle.load(file)

print(embedding_weights.shape)

torch.Size([59448, 300])


In [11]:
def create_emb_layer(weights_matrix, non_trainable=False):

    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [90]:
class RNNTagger_encoder(nn.Module):
    def __init__(self, hidden_dim, target_size):
        super(RNNTagger_encoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        #self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        #print("ENTERING ENCODER")

        #Input shape: [len(sentence)]
        embeds = self.word_embeddings(sentence)  
        #embeds shape: [len(sentence), embdeddin_dim]
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        rnn_out, hidden_state_out = self.rnn(embeds.view(len(sentence), 1, -1))  
        #rnn_out shape: [len(sentence),1,hidden_dim] 
        #hiddsen_state_out shape: [1,1,hidden_shape]  The hidden state corresponding to last time step

        #print("LEAVING ENCODER")
        
        return rnn_out,hidden_state_out

In [91]:
class RNNTagger_decoder(nn.Module):
    def __init__(self, hidden_dim, target_size):
        super(RNNTagger_decoder, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.target_size = target_size
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence,hidden):
        #print("ENTERING DECODER")


        #Input shape: [len(sentence)]  -----HERE len(sentence) = 1
        embeds = self.word_embeddings(sentence)  
        #embeds shape: [len(sentence), embdeddin_dim]
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        #input deocder shape: torch.Size([1, 1, 64]
        rnn_out, hidden_state_out = self.rnn(embeds.view(len(sentence), 1, -1),hidden) 
        #rnn_out shape: [len(sentence),1,hidden_dim] 
        #hiddsen_state_out shape: [1,1,hidden_shape]

        #input shape: [len(sentence),hidden_dim]
        tag_space = self.hidden2tag(rnn_out.view(len(sentence), -1))
        #tag_shape : (len(sentence),target_size)
        
        tag_scores = F.log_softmax(tag_space, dim=1)

        #print("LEAVING DECODER")
        return tag_scores,hidden_state_out

In [92]:
class RNNTagger_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, target_size,device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.target_size =  target_size
        
        
    def forward(self, sentence, gt_pos_tags):
        

        outputs = torch.zeros(len(sentence), self.target_size).to(self.device)
        
        #print("for encoder input:" , sentence.shape)  --- torch.Size([len(sentence)])
        encoder_out, hidden = self.encoder(sentence)
        #print("encoder_out shape:", encoder_out.shape)  --- torch.Size([len(sentence), 1, hidden_dim])
        #print("hidden shape:", hidden.shape)  -- torch.Size([1, 1, hidden_dim])
        

        index = 0
        for token in sentence:
            
            token = token.unsqueeze(0)
            token = token.to(device = self.device)


            #print("input to decoder:" ,token.shape)  -- torch.Size([1])
            #print("hiddden input shape:", hidden.shape) --torch.Size([1, 1, hidden_dim])
            output, hidden = self.decoder(token, hidden)
            #print("output shape:", output.shape) --torch.Size([1, target_size])
            #print("hidden shape:", hidden.shape) --torch.Size([1, 1, hidden_dim])
            
            #place predictions in a tensor holding predictions for each token
            outputs[index] = output
            index += 1

        
        return outputs

In [93]:
def train_loop(model,loss_function,optimizer,device,X,Y):
    train_length = len(X)
    epoch_train_loss = 0 
   
    model.train()
    for i in tqdm(range(train_length)):
        sentence = X[i]
        tags = Y[i]

        model.zero_grad()

        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)
        #print(sentence_in)
        #print(targets)

        tag_scores = model(sentence_in,targets)

        loss = loss_function(tag_scores, targets)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    return model, epoch_train_loss/train_length

In [94]:
def validation_loop(model,loss_function,device,X,Y):
    val_length = len(X)
    epoch_val_loss = 0 

    for i in tqdm(range(val_length)):
        sentence = X[i]
        tags = Y[i]

        model.eval()

        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)

        tag_scores = model(sentence_in,targets)
        #print(tag_scores)
        #print(targets)
        #print(tag_scores.shape)
        #print(targets.shape)
        #tag_scores shape : torch.Size([len(sentence), 12])
        #targets shape: torch.Size([len(sentence)])
        #CALL A FUNCTION WITH tag_scores and targets, GET PRECISION RECALL FScores        

        loss = loss_function(tag_scores, targets)
        epoch_val_loss += loss.item()
          
     
    
    return epoch_val_loss/val_length

In [95]:
HIDDEN_DIM = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
enc = RNNTagger_encoder(HIDDEN_DIM, len(unique_tags_dict.keys()))
dec = RNNTagger_decoder(HIDDEN_DIM,len(unique_tags_dict.keys()))
model = RNNTagger_seq2seq(enc, dec, len(unique_tags_dict.keys()),device).to(device=device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

cuda


In [96]:
TEST_SIZE = 0.1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=4)

VALID_SIZE = 0.15
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=VALID_SIZE, random_state=4)

In [97]:
epochs = 1
for epoch in range(epochs):
    model , train_loss = train_loop(model,loss_function,optimizer,device,X_train,Y_train)
    val_loss = validation_loop(model,loss_function,device,X_validation,Y_validation)
    print("For epoch {}, training loss: {}, validation loss: {}".format(epoch, train_loss, val_loss))

  2%|▏         | 831/55233 [00:14<15:56, 56.87it/s]


KeyboardInterrupt: 

In [51]:
#model = Seq2Seq(enc, dec, device).to(device)

# seq1 = "everybody read the book and ate the food".split()
# inputs = prepare_sequence(seq1, unique_words_dict)
# pos_tags = "adv adv adv"
# tag_scores = model(inputs,pos_tags)
# print(tag_scores)

# with torch.no_grad():
#     for seq in [seq1, seq2]:
#         print(seq)
#         inputs = prepare_sequence(seq, unique_words_dict)
#         #print("PRINTING INPUTS:", inputs)
#         tag_scores = model(inputs)
#         #print("TAG SCORES:", tag_scores)
#         #print("TAG SHAPE:", tag_scores.shape) #Shape is (len(senetence),target_size)


#         _, indices = torch.max(tag_scores, 1) 
#         print(indices) #Returns index of maximum
#         ret = []
#         for i in range(len(indices)):
#             for key, value in unique_tags_dict.items():
#                 if indices[i] == value:
#                     ret.append((seq[i], key))
#         print(ret)

tensor([[-2.3845, -2.3303, -2.8299, -2.4387, -2.6026, -2.7334, -2.6120, -2.3255,
         -2.3966, -2.4169, -2.4293, -2.4559],
        [-2.7175, -2.6580, -2.5621, -2.3863, -2.5782, -2.3435, -2.6712, -2.1735,
         -2.4823, -2.4160, -2.3058, -2.6970],
        [-2.5445, -2.6496, -2.4775, -2.3125, -2.3808, -2.1432, -2.7440, -2.4927,
         -2.4781, -2.6070, -2.5448, -2.5911],
        [-2.6723, -2.5064, -2.5053, -2.3066, -2.6649, -2.6963, -2.4045, -2.5247,
         -2.4774, -2.6542, -2.1041, -2.4725],
        [-2.6179, -2.5713, -2.3824, -2.4313, -2.5005, -2.4331, -2.5099, -2.5523,
         -2.4298, -2.3233, -2.3330, -2.8411],
        [-2.6331, -2.5447, -2.5990, -2.3327, -2.5631, -2.4084, -2.4967, -2.2393,
         -2.5961, -2.4588, -2.4920, -2.5318],
        [-2.5359, -2.6855, -2.8276, -2.2516, -2.3059, -2.0337, -2.7347, -2.5788,
         -2.4444, -2.6241, -2.5422, -2.5387],
        [-2.6981, -2.4284, -2.4977, -2.6246, -2.5981, -2.4926, -2.7112, -2.2066,
         -2.3961, -2.4557, -2.