In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
from gensim.models import KeyedVectors
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [19]:
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

In [20]:
tagged_sentences[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [21]:
X = [] # store input sequence
Y = [] # store output sequence

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:         
        X_sentence.append(entity[0])  # entity[0] contains the word
        Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
        
    X.append(X_sentence)
    Y.append(Y_sentence)

In [22]:
X[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [23]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print(num_words)
print(num_tags)

59448
12


In [24]:
unique_tags = list(set([word.lower() for sentence in Y for word in sentence]))
unique_tags_dict = {}
index = 0
for tag in unique_tags:
    unique_tags_dict[tag] = index 
    index += 1
print(unique_tags_dict)

{'adp': 0, 'prt': 1, 'conj': 2, 'pron': 3, 'noun': 4, 'adj': 5, '.': 6, 'adv': 7, 'num': 8, 'x': 9, 'det': 10, 'verb': 11}


In [25]:
unique_words = list(set([word.lower() for sentence in X for word in sentence]))
unique_words_dict = {}
index = 0
for word in unique_words:
    unique_words_dict[word] = index 
    index += 1
print(len(unique_words_dict))

59448


In [26]:
def prepare_sequence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [27]:
EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = num_words

with open('./embedding_weights.pickle', 'rb') as file:
    embedding_weights = pickle.load(file)

print(embedding_weights.shape)

torch.Size([59448, 300])


In [28]:
def create_emb_layer(weights_matrix, non_trainable=False):

    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [36]:
class RNNTagger(nn.Module):
    def __init__(self, hidden_dim, target_size):
        super(RNNTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        #Input shape: [len(sentence)]
        embeds = self.word_embeddings(sentence)  
        #embeds shape: [len(sentence), embdeddin_dim]
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        rnn_out, hidden_state_out = self.rnn(embeds.view(len(sentence), 1, -1)) 
        #rnn_out shape: [len(sentence),1,hidden_dim] 
        #hiddsen_state_out shape: [1,1,hidden_shape]

        #input shape: [len(sentence),hidden_dim]
        tag_space = self.hidden2tag(rnn_out.view(len(sentence), -1))
        #tag_shape : (len(sentence),target_size)
        
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [37]:
def train_loop(model,loss_function,optimizer,device,X,Y):
    train_length = len(X)
    epoch_train_loss = 0 

    for i in tqdm(range(train_length)):
        sentence = X[i]
        tags = Y[i]

        model.zero_grad()

        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)

        tag_scores = model(sentence_in)

        loss = loss_function(tag_scores, targets)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    return model, epoch_train_loss/train_length

In [38]:
def validation_loop(model,loss_function,device,X,Y):
    val_length = len(X)
    epoch_val_loss = 0 

    for i in tqdm(range(val_length)):
        sentence = X[i]
        tags = Y[i]

        model.eval()

        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)

        tag_scores = model(sentence_in)
        #print(tag_scores)
        #print(targets)
        #print(tag_scores.shape)
        #print(targets.shape)
        #tag_scores shape : torch.Size([len(sentence), 12])
        #targets shape: torch.Size([len(sentence)])
        #CALL A FUNCTION WITH tag_scores and targets, GET PRECISION RECALL FScores        

        loss = loss_function(tag_scores, targets)
        epoch_val_loss += loss.item()
        break   
     
    
    return epoch_val_loss/val_length

In [39]:
HIDDEN_DIM = 64
model =RNNTagger(HIDDEN_DIM, len(unique_tags_dict.keys()))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device=device)

cuda


In [40]:
TEST_SIZE = 0.1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_SIZE, random_state=4)

VALID_SIZE = 0.15
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=VALID_SIZE, random_state=4)

In [41]:
epochs = 1
for epoch in range(epochs):
    model , train_loss = train_loop(model,loss_function,optimizer,device,X_train,Y_train)
    val_loss = validation_loop(model,loss_function,device,X_validation,Y_validation)
    print("For epoch {}, training loss: {}, validation loss: {}".format(epoch, train_loss, val_loss))

100%|██████████| 55233/55233 [01:22<00:00, 673.02it/s]
  0%|          | 0/9748 [00:00<?, ?it/s]

torch.Size([15, 12])
torch.Size([15])
For epoch 0, training loss: 0.5197178640512295, validation loss: 1.4906054363419005e-05





In [25]:
epochs = 5
train_length = len(X_train)
val_length = len(X_validation)

for epoch in range(epochs):

    epoch_train_loss = 0
    epoch_val_loss = 0

    for i in tqdm(range(train_length)):

        sentence = X_train[i]
        tags = Y_train[i]
        model.zero_grad()
    
        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)
        
        
        tag_scores = model(sentence_in)
        #print(tag_scores)
        
        loss = loss_function(tag_scores, targets)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    for i in tqdm(range(val_length)):

        sentence = X_validation[i]
        tags = Y_validation[i]
        model.zero_grad()
    
        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)
        
        
        tag_scores = model(sentence_in)
        #print(tag_scores)
        
        loss = loss_function(tag_scores, targets)
        epoch_val_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    print("FOR epoch {}: Training loss: {}, Vaidation Loss: {}".format(epoch,epoch_train_loss,epoch_val_loss))


100%|██████████| 55233/55233 [01:18<00:00, 700.49it/s]
100%|██████████| 9748/9748 [00:14<00:00, 672.04it/s]


FOR epoch 0: Training loss: 20653.113293953644, Vaidation Loss: 3475.480302310054


100%|██████████| 55233/55233 [01:24<00:00, 654.86it/s]
100%|██████████| 9748/9748 [00:13<00:00, 705.13it/s]


FOR epoch 1: Training loss: 18235.388299366103, Vaidation Loss: 3175.7392742898664


100%|██████████| 55233/55233 [01:21<00:00, 677.97it/s]
100%|██████████| 9748/9748 [00:13<00:00, 702.67it/s]


FOR epoch 2: Training loss: 17022.702471989578, Vaidation Loss: 2994.8544709344906


100%|██████████| 55233/55233 [01:19<00:00, 696.74it/s]
100%|██████████| 9748/9748 [00:13<00:00, 699.33it/s]


FOR epoch 3: Training loss: 16313.052920114547, Vaidation Loss: 2879.8101532939904


100%|██████████| 55233/55233 [01:20<00:00, 687.19it/s]
100%|██████████| 9748/9748 [00:13<00:00, 700.36it/s]

FOR epoch 4: Training loss: 15777.23173381162, Vaidation Loss: 2788.054566035223



