In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
from gensim.models import KeyedVectors
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.corpus import conll2000
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
treebank_corpus = treebank.tagged_sents(tagset='universal')
brown_corpus = brown.tagged_sents(tagset='universal')
conll_corpus = conll2000.tagged_sents(tagset='universal')
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

In [4]:
tagged_sentences[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [5]:
X = [] # store input sequence
Y = [] # store output sequence

for sentence in tagged_sentences:
    X_sentence = []
    Y_sentence = []
    for entity in sentence:         
        X_sentence.append(entity[0])  # entity[0] contains the word
        Y_sentence.append(entity[1])  # entity[1] contains corresponding tag
        
    X.append(X_sentence)
    Y.append(Y_sentence)

In [6]:
X[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [7]:
Y[0]

['NOUN',
 'NOUN',
 '.',
 'NUM',
 'NOUN',
 'ADJ',
 '.',
 'VERB',
 'VERB',
 'DET',
 'NOUN',
 'ADP',
 'DET',
 'ADJ',
 'NOUN',
 'NOUN',
 'NUM',
 '.']

In [8]:
num_words = len(set([word.lower() for sentence in X for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y for word in sentence]))
print(num_words)
print(num_tags)

59448
12


In [9]:
unique_tags = list(set([word.lower() for sentence in Y for word in sentence]))
unique_tags_dict = {}
index = 1
for tag in unique_tags:
    unique_tags_dict[tag] = index 
    index += 1
print(unique_tags_dict)

{'adj': 1, 'conj': 2, 'adv': 3, 'pron': 4, '.': 5, 'num': 6, 'prt': 7, 'adp': 8, 'x': 9, 'noun': 10, 'verb': 11, 'det': 12}


In [10]:
unique_words = list(set([word.lower() for sentence in X for word in sentence]))
unique_words_dict = {}
index = 1
for word in unique_words:
    unique_words_dict[word] = index 
    index += 1
print(len(unique_words_dict))

59448


In [11]:
values = []
for i in unique_words_dict.keys():
    values.append(unique_words_dict[i])

print(max(values))
max_value_dict = max(values)

59448


In [12]:
def prepare_sequence_sentence(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    random_index = random.randint(0,len(idxs)-1)
    idxs[random_index] = max_value_dict + 1
    return idxs

In [13]:
def prepare_sequence_tags(seq, to_ix):
    """Input: takes in a list of words, and a dictionary containing the index of the words
    Output: a tensor containing the indexes of the word"""
    idxs = [to_ix[w.lower()] for w in seq]
    return idxs

In [14]:
from keras_preprocessing.sequence import pad_sequences
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
def split(list_a, batch_size):

  for i in range(0, len(list_a), batch_size):
    yield list_a[i:i + batch_size]

cuda


In [15]:
batch_size = 8
X_batches = list(split(X, batch_size))
Y_batches = list(split(Y,batch_size))

In [16]:
X_batches_padded = []
Y_batches_padded = []
max_length_list = []

for b_s,b_t in zip(X_batches,Y_batches):
    max_seq_length = 0
    for sentence in b_s:
        if len(sentence) > max_seq_length:
            max_seq_length = len(sentence)
    
    sen_encoded = []
    tag_encoded = []
    for sentence,tags in zip(b_s,b_t):
        sen_encoded.append(prepare_sequence_sentence(sentence, unique_words_dict))
        tag_encoded.append(prepare_sequence_tags(tags, unique_tags_dict))
    
    X_batches_padded.append(pad_sequences(sen_encoded, maxlen=max_seq_length, padding="pre", truncating="post"))
    Y_batches_padded.append(pad_sequences(tag_encoded, maxlen=max_seq_length, padding="pre", truncating="post"))
    max_length_list.append(max_seq_length)

    

In [17]:
print(len(Y_batches_padded))
print(len(X_batches_padded))
print(len(max_length_list))


9026
9026
9026


In [18]:
print(len(Y_batches_padded[0]))
len(X_batches_padded[0])

8


8

In [19]:
print(Y_batches_padded[0])
print(X_batches_padded[0])
print(max_length_list[0])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 10
  10  5  6 10  1  5 11 11 12 10  8 12  1 10 10  6  5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0 10 10 11 10  8 10 10  5 12 10 11 10  5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 10 10  5  6 10  1  2  1 10  8
  10 10 10 10  5 11 11  9 12  1 10  8 12  1  1 10  5]
 [12 10  8 10  3 11  9  9  7 11 10 10 10 11 11 12  1 10  8 10 10  8 12 10
   8 10 11  9  7  4  3  8  6 10  8  5 10 11  9  9  5]
 [ 0  0  0  0  0  0 12 10 10  5 10  5 11  3  1  8  4 11 12 10  5  8  3  1
  10  7  4 11 10 12  9 11  7 10  1  5 10 11  9  9  5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 10 10  5 12 10  8  1  1 10 10
  12  9 11 10 10  5 11 11 10  8  4 10 10 10  8  6  5]
 [ 0  0  0  0  8  1 10 11 11  9  3  8 12 10  8  5 12  1 10 11  8 10  7 10
  10 10  8 10  5 12 10  1  9  7 11  1 10  7 12 10  5]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0 12 10 10 11  

In [20]:
X_final = []
Y_final = []

for index in range(len(X_batches_padded)):
    X_batch_tensor = torch.zeros((batch_size,max_length_list[index]),dtype = int).to(device= device)
    Y_batch_tensor = torch.zeros((batch_size,max_length_list[index]), dtype = int).to(device = device)

    count = 0
    for x, y in zip(X_batches_padded[index],Y_batches_padded[index]):
        X_batch_tensor[count] = torch.tensor(x).to(device = device)
        Y_batch_tensor[count] = torch.tensor(y).to(device =device)
        count += 1
    
    X_final.append(X_batch_tensor)
    Y_final.append(Y_batch_tensor)


In [21]:
print(len(X_final))
print(len(Y_final))
print(X_final[0].shape)
print(Y_final[0].shape)
print(X_final[0][0].shape)
print(Y_final[0][0].shape)
print(X_final[1].shape)
print(Y_final[1].shape)
print(X_final[0])
print(Y_final[0])

9026
9026
torch.Size([8, 41])
torch.Size([8, 41])
torch.Size([41])
torch.Size([41])
torch.Size([8, 38])
torch.Size([8, 38])
tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 42399, 46560,  9529, 30096, 18958, 11126,  9529,
         34297, 58179, 37456,   100, 29634,  7786, 59449, 46726, 42324, 26382,
         36207],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0, 26279, 46560,
         44323, 49559, 21652, 39255,   820,  9529, 59449, 27632,  7584, 51603,
         36207],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0, 31842, 12907,  9529,  6106, 18958, 11126,
         50302, 58189, 49559, 21652,   352, 29618, 5

In [22]:
# X_batches = []
# Y_batches = []
# batch_size = 8


# for sentence in X:
#     X_encoded.append(prepare_sequence(sentence, unique_words_dict))
# for tags in Y:
#     Y_encoded.append(prepare_sequence(tags, unique_tags_dict))

# MAX_SEQ_LENGTH = 100  # sequences greater than 100 in length will be truncated

# X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
# Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

In [23]:
# print(len(X_padded))
# X_padded[0]

In [24]:
# print(len(Y_encoded))
# print(Y_encoded)

In [25]:
EMBEDDING_SIZE  = 300  # each word in word2vec model is represented using a 300 dimensional vector
VOCABULARY_SIZE = num_words + 2

with open('./embedding_weights.pickle', 'rb') as file:
    embedding_weights = pickle.load(file)

print(embedding_weights.shape)

torch.Size([59450, 300])


In [26]:
def create_emb_layer(weights_matrix, non_trainable=False):

    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [27]:
class RNNTagger(nn.Module):
    def __init__(self, hidden_dim, target_size, batch_size):
        super(RNNTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        
        self.word_embeddings, vocab_size, embedding_dim = create_emb_layer(embedding_weights, True)
        #self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first = True)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        
    def forward(self, sentence):
        #print("REACHONG FORWARD")

        #Input shape: [batch_size,max_length in that batch]
        embeds = self.word_embeddings(sentence)
        #print("DONE embeds:", embeds.shape)  -- torch.Size([batch_Size, max_length_in_that_batch, embedding_dim])
  
        
        #input shape: [len(sentence),1,embedding_dim] (L,N,Hin​) when batch_first=False)
    
        rnn_out, hidden_state_out = self.rnn(embeds) 
        #print(rnn_out.shape)  -- torch.Size([8, 45, 64])
        #print(hidden_state_out.shape) --torch.Size([1, 8, 64])


        #input shape: -- torch.Size([8, 45, 64])
        tag_space = self.hidden2tag(rnn_out)
        #print("DONE LINEAR LAYER: ", tag_space.shape) --torch.Size([8, 45, 13])
        
        tag_scores = F.log_softmax(tag_space, dim=2)
        #print("DONE SOFTMAX:", tag_scores.shape) --torch.Size([8, 45, 13])
        
        return tag_scores

In [28]:
def train_loop(model,loss_function,optimizer,device,X,Y):
    train_length = len(X)
    epoch_train_loss = 0 
   
    model.train()
    for i in tqdm(range(train_length)):
        sentence_batch = X[i]
        tags_batch = Y[i]

        model.zero_grad()

        #print("Input shape:",  sentence_batch.shape) --torch.Size([batch_size, max_length_in_batch]) 
        tag_scores = model(sentence_batch)

       
        #print("model output shape: ", tag_scores.shape) -- torch.Size([8, 45, 13]) 
        #print("tags batch shape: ", tags_batch.shape) --torch.Size([8, 45])
        new1 = tag_scores.transpose(1,2)
        #print("new input shape:" , new1.shape) --torch.Size([8, 13, 45])

        #print("STARTING LOSS FUNCTION")
        loss = loss_function(new1 , tags_batch)
        #print("DONE WITH LOSS FUNCTION")
        #print(loss)
        epoch_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    return model, epoch_train_loss/train_length

In [29]:
def validation_loop(model,loss_function,device,X,Y):
    val_length = len(X)
    epoch_val_loss = 0 

    for i in tqdm(range(val_length)):
        sentence_batch = X[i]
        tags_batch = Y[i]

        model.eval()

        tag_scores = model(sentence_batch)

        #CALL A FUNCTION WITH tag_scores and targets, GET PRECISION RECALL FScores        

        new1 = tag_scores.transpose(1,2) 
        loss = loss_function(new1, tags_batch)
        epoch_val_loss += loss.item()
          
     
    
    return epoch_val_loss/val_length

In [30]:
HIDDEN_DIM = 64
batch_size = 4
model =RNNTagger(HIDDEN_DIM, len(unique_tags_dict.keys())+1, batch_size)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device=device)

cuda


In [31]:
TEST_SIZE = 0.1
X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y_final, test_size=TEST_SIZE, random_state=4)

VALID_SIZE = 0.15
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=VALID_SIZE, random_state=4)

In [32]:
Y_train[1]

tensor([[ 0,  0,  0,  0,  0,  0, 12, 10, 10, 11, 12,  1, 10,  8, 10,  8, 10,  2,
         10,  7,  1, 10, 10, 11,  8, 12, 10, 11, 11,  5],
        [ 0,  0,  0,  0,  0,  0,  5,  8, 12,  1, 10,  5,  4, 11,  3, 11, 10,  5,
          7, 11, 10,  7, 12, 10,  5, 11, 12, 10, 10,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  4, 11,  1,  7,
         10, 11,  8,  4, 11,  3, 11,  4, 10,  8,  5,  5],
        [ 5,  4, 11,  8,  4, 11,  4,  8, 12, 10,  8, 12, 10,  2, 11,  4, 12, 11,
          6, 10,  7, 11,  5,  5, 11,  6,  1, 10, 10,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  3,  4, 11,  5,  5,  8,
         12, 10,  5,  6,  8, 12, 10, 11, 11,  5,  5,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0, 12, 10, 10,  3, 11,  7, 10,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0, 10, 10, 11, 12,  1, 10,  5],
        [ 0,  0,  0,  0,  0

In [33]:
X_train[1]

tensor([[    0,     0,     0,     0,     0,     0, 37456, 58484, 37753, 55439,
          7786, 23037, 55329, 38802, 59449, 38802, 52997, 12909, 35681,  4565,
         56062, 37916, 42107, 21637, 29634, 37456, 30087, 49699, 59442, 36207],
        [    0,     0,     0,     0,     0,     0, 50973,  1176, 37456, 59449,
         18816,  9529,  6756, 16150, 56334, 13895, 12820, 45705, 42266,  1523,
         28622, 42266, 37456, 37229,  9529, 31269,  7786, 40107,  6742, 36207],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 50973, 32789,  4491, 50899, 42266, 51686,   583,
         13338, 52933, 16150, 59449, 29520, 26858, 52740, 14806, 36207, 45705],
        [50973, 32789,  4491, 22151, 52933, 21723,  6756, 38802,  7786, 59449,
          1176,  7786, 37869, 50302, 21677,  6756, 37427, 49699, 20056, 57044,
         42266, 14403,  9529, 45705,  2602, 30257, 49529, 42107,  2951, 36207],
        [    0,     0,     0,     0,     0,     

In [34]:
print(len(X_train))
print(len(Y_train))

6904
6904


In [35]:
epochs = 5
for epoch in range(epochs):
    model , train_loss = train_loop(model,loss_function,optimizer,device,X_train,Y_train)
    val_loss = validation_loop(model,loss_function,device,X_validation,Y_validation)
    print("For epoch {}, training loss: {}, validation loss: {}".format(epoch, train_loss, val_loss))

100%|██████████| 6904/6904 [00:15<00:00, 457.59it/s]
100%|██████████| 1219/1219 [00:00<00:00, 1463.12it/s]


For epoch 0, training loss: 0.47562323025884246, validation loss: 0.38053657368766763


100%|██████████| 6904/6904 [00:10<00:00, 682.70it/s]
100%|██████████| 1219/1219 [00:00<00:00, 1733.40it/s]


For epoch 1, training loss: 0.34589468784574184, validation loss: 0.32674455427556276


100%|██████████| 6904/6904 [00:10<00:00, 680.64it/s]
100%|██████████| 1219/1219 [00:00<00:00, 1702.52it/s]


For epoch 2, training loss: 0.3056178611799263, validation loss: 0.2995219011901831


100%|██████████| 6904/6904 [00:10<00:00, 651.32it/s]
100%|██████████| 1219/1219 [00:00<00:00, 1271.78it/s]


For epoch 3, training loss: 0.2825986559708902, validation loss: 0.28270342392912645


100%|██████████| 6904/6904 [00:11<00:00, 618.22it/s]
100%|██████████| 1219/1219 [00:00<00:00, 1461.27it/s]

For epoch 4, training loss: 0.26707962754496556, validation loss: 0.27118278883391467





DEMO CODE

In [36]:
import nltk
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


False

In [37]:
def prepare_TESTsequence_sentence(seq, to_ix):
    values = []
    for i in to_ix.keys():
        values.append(to_ix[i])
    
    max_value_dict = max(values)
    
    idxs = []

    for w in seq:
        if w.lower() in to_ix.keys():
            idxs.append(to_ix[w.lower()])
        else:
            idxs.append(max_value_dict+1)
    return idxs

In [38]:
index_to_tags = {}
for tag in unique_tags_dict:
    index_to_tags[unique_tags_dict[tag]] = tag 
index_to_tags[0] = '0'
index_to_tags

{1: 'adj',
 2: 'conj',
 3: 'adv',
 4: 'pron',
 5: '.',
 6: 'num',
 7: 'prt',
 8: 'adp',
 9: 'x',
 10: 'noun',
 11: 'verb',
 12: 'det',
 0: '0'}

In [39]:
from nltk.tokenize import word_tokenize
text = "bottle is 5 years old, non executive director of CSK"
tokenized_seq = word_tokenize(text)
print(tokenized_seq)

tokens= prepare_TESTsequence_sentence(tokenized_seq,unique_words_dict)
print(tokens) 

tokens = torch.tensor(tokens).to(device = device)
tokens = tokens.unsqueeze(0)
output = model(tokens)
output = output.squeeze(0)
tag_index = torch.argmax(output,dim=1)
#print(tag_index.shape)
#print(output.shape)
for predicted_tag,word in zip(tag_index,tokenized_seq):
    predicted_tag = predicted_tag.item()
    predicted_tag = index_to_tags[predicted_tag]
    print(word, predicted_tag)

    

['bottle', 'is', '5', 'years', 'old', ',', 'non', 'executive', 'director', 'of', 'CSK']
[54310, 44323, 6796, 18958, 11126, 9529, 49443, 8565, 46726, 21652, 59449]
bottle noun
is verb
5 num
years noun
old adj
, .
non noun
executive noun
director noun
of adp
CSK noun


--------------------

In [None]:
#weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
embedding = nn.Embedding.from_pretrained(embedding_weights)
# Get embeddings for index 1
list1 = [5,6,8],
list1 = torch.tensor(list1)
list2 = [1,2,3]
list2 = torch.tensor(list2)
list3 = torch.zeros((2,3), dtype = int)
print(list1)
print(list2)
list3[0] = list1 
list3[1] = list2
print(list3)
a = embedding(list3)
a.shape -- batch_size,L_embedding_dim
#for i in list3:
#    print(i)
#    a = embedding(i)
#    print(a.shape)
#print(list1)
#input = torch.LongTensor([1])
#a = embedding(list3)
#a.shape

tensor([[5, 6, 8]])
tensor([1, 2, 3])
tensor([[5, 6, 8],
        [1, 2, 3]])


torch.Size([2, 3, 300])