In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
from gensim.models import KeyedVectors

In [4]:
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

In [5]:
X = [] # store input sequence
Y = [] # store output sequence

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:         
        X_sentence.append(entity[0])  # entity[0] contains the word
        Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
        
    X.append(X_sentence)
    Y.append(Y_sentence)

In [6]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print(num_words)
print(num_tags)

59448
12


In [7]:
unique_tags = list(set([word.lower() for sentence in Y for word in sentence]))
unique_tags_dict = {}
index = 0
for tag in unique_tags:
    unique_tags_dict[tag] = index 
    index += 1
print(unique_tags_dict)

{'x': 0, 'verb': 1, 'conj': 2, 'adj': 3, 'num': 4, 'prt': 5, 'adp': 6, 'det': 7, 'adv': 8, '.': 9, 'pron': 10, 'noun': 11}


In [8]:
unique_words = list(set([word.lower() for sentence in X for word in sentence]))
unique_words_dict = {}
index = 0
for word in unique_words:
    unique_words_dict[word] = index 
    index += 1
print(len(unique_words_dict))

59448


In [9]:
class RNNTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super(RNNTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        #Input shape: [len(sentence)]
        embeds = self.word_embeddings(sentence)  
        #embeds shape: [len(sentence), embdeddin_dim]
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
        rnn_out, hidden_state_out = self.rnn(embeds.view(len(sentence), 1, -1)) 
        #rnn_out shape: [len(sentence),1,hidden_dim] 
        #hiddsen_state_out shape: [1,1,hidden_shape]

        #input shape: [len(sentence),hidden_dim]
        tag_space = self.hidden2tag(rnn_out.view(len(sentence), -1))
        #tag_shape : (len(sentence),target_size)
        
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [10]:
def prepare_sequence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [11]:
path = './GoogleNews-vectors-negative300.bin'
# load word2vec using the following function present in the gensim library
word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

In [17]:
# assign word vectors from word2vec model

EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = num_words


embedding_weights = torch.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

for word, index in unique_words_dict.items():
    #print(word)
    #print(index)
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        print("wow")
        pass
    break

TypeError: can't assign a numpy.ndarray to a torch.FloatTensor

In [16]:
print("Embeddings shape: {}".format(embedding_weights.shape))

Embeddings shape: (59448, 300)


In [44]:
from tqdm import tqdm

In [46]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 64
model =RNNTagger(EMBEDDING_DIM, HIDDEN_DIM, len(unique_words_dict.keys()), len(unique_tags_dict.keys()))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [48]:
device = torch.device("cuda")
device

device(type='cuda')

In [49]:
model = model.to(device=device)

In [51]:
print("Training Started")
total_length = len(X)
for epoch in range(1):
    for i in tqdm(range(total_length)):
        sentence = X[i]
        tags = Y[i]
        model.zero_grad()
        #print(sentence)
        sentence_in = prepare_sequence(sentence, unique_words_dict)
        targets = prepare_sequence(tags, unique_tags_dict)
        sentence_in = sentence_in.to(device=device)
        targets = targets.to(device = device)
        #print(targets)
        
        tag_scores = model(sentence_in)
        #print(tag_scores)
        
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        

Training Started


 17%|█▋        | 12299/72202 [00:38<03:08, 316.96it/s]


KeyboardInterrupt: 

In [43]:
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()
# input is of size N x C = 3 x 5
input = torch.randn(3, 5, requires_grad=True)
#each element in target has to have 0 <= value < C
target = torch.tensor([1, 0, 4])
print(target)
print(m(input))
output = loss(m(input), target)


tensor([1, 0, 4])
tensor([[-2.1229, -1.7472, -3.3242, -0.9893, -1.2099],
        [-1.1042, -2.3910, -3.6942, -1.6905, -1.0005],
        [-1.2733, -2.6861, -1.9800, -2.6682, -0.8108]],
       grad_fn=<LogSoftmaxBackward0>)
