In [2]:
import os 
import numpy as np
import pandas as pd
from utils import get_params, get_vocab
import random as rnd
import numpy as np
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F


In [4]:
# display original kaggle data
data = pd.read_csv("ner_dataset.csv", encoding = "ISO-8859-1") 
train_sents = open('data/small/train/sentences.txt', 'r').readline()
train_labels = open('data/small/train/labels.txt', 'r').readline()
print('SENTENCE:', train_sents)
print('SENTENCE LABEL:', train_labels)
print('ORIGINAL DATA:\n', data.head(5))
del(data, train_sents, train_labels)
del(data, train_sents, train_labels)

SENTENCE: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

SENTENCE LABEL: O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O

ORIGINAL DATA:
     Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [5]:
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')

In [6]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["the"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 9
padded token: 35180


In [7]:
# Exploring information about the data
print('The number of outputs is tag_map', len(tag_map))
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


In [48]:
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION: data_generator
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    '''
      Input: 
        batch_size - integer describing the batch size
        x - list containing sentences where words are represented as integers
        y - list containing tags associated with the sentences
        shuffle - Shuffle the data order
        pad - an integer representing a pad character
        verbose - Print information during runtime
      Output:
        a tuple containing 2 elements:
        X - np.ndarray of dim (batch_size, max_len) of padded sentences
        Y - np.ndarray of dim (batch_size, max_len) of tags associated with the sentences in X
    '''
    
    # count the number of lines in data_lines
    num_lines = len(x)
    
    # create an array with the indexes of data_lines that can be shuffled
    lines_index = [*range(num_lines)]
    
    # shuffle the indexes if shuffle is set to True
    if shuffle:
        rnd.shuffle(lines_index)
    
    index = 0 # tracks current location in x, y
    while True:
        buffer_x = [0] * batch_size # Temporal array to store the raw x data for this batch
        buffer_y = [0] * batch_size # Temporal array to store the raw y data for this batch
                
  ### START CODE HERE (Replace instances of 'None' with your code) ###
        
        # Copy into the temporal buffers the sentences in x[index : index + batch_size] 
        # along with their corresponding labels y[index : index + batch_size]
        # Find maximum length of sentences in x[index : index + batch_size] for this batch. 
        # Reset the index if we reach the end of the data set, and shuffle the indexes if needed.
        max_len = 0
        for i in range(batch_size):
             # if the index is greater than or equal to the number of lines in x
            if index >= num_lines:
                # then reset the index to 0
                index = 0
                # re-shuffle the indexes if shuffle is set to True
                if shuffle:
                    rnd.shuffle(index)
            
            # The current position is obtained using `lines_index[index]`
            # Store the x value at the current position into the buffer_x
            buffer_x[i] = x[lines_index[index]]
            
            # Store the y value at the current position into the buffer_y
            buffer_y[i] =  y[lines_index[index]]
            
            lenx = len(x[lines_index[index]])    #length of current x[]
            if lenx > max_len:
                max_len = lenx                   #max_len tracks longest x[]
            
            # increment index by one
            index += 1


        # create X,Y, NumPy arrays of size (batch_size, max_len) 'full' of pad value
        X = np.full((batch_size, max_len), pad)
        Y = np.full((batch_size, max_len), pad)

        # copy values from lists to NumPy arrays. Use the buffered values
        for i in range(batch_size):
            # get the example (sentence as a tensor)
            # in `buffer_x` at the `i` index
            x_i = buffer_x[i]
            
            # similarly, get the example's labels
            # in `buffer_y` at the `i` index
            y_i = buffer_y[i]
            
            # Walk through each word in x_i
            for j in range(len(x_i)):
                # store the word in x_i at position j into X
                X[i, j] = x_i[j]
                
                # store the label in y_i at position j into Y
                Y[i, j] = y_i[j]

    ### END CODE HERE ###
        if verbose: print("index=", index)
        yield((X,Y))

In [49]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
dg = data_generator(batch_size, mini_sentences, mini_labels, vocab["<PAD>"], shuffle=False, verbose=True)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1, "\n", Y1)

index= 5
index= 2
(5, 30) (5, 30) (5, 30) (5, 30)
[[    0     1     2     3     4     5     6     7     8     9    10    11
     12    13    14     9    15     1    16    17    18    19    20    21
  35180 35180 35180 35180 35180 35180]
 [   22     1    23    24    11     9    25    26     9    27    28    29
     30    31    32    33    34    35    36    37    38    39    35    13
     35    40     9    41    21    35]
 [   42     4    18     9    43     1    44     7    45    46    11    47
     48    21 35180 35180 35180 35180 35180 35180 35180 35180 35180 35180
  35180 35180 35180 35180 35180 35180]
 [   49    50     9    51     1    52    53    54    55    56    57    58
     59    60    21 35180 35180 35180 35180 35180 35180 35180 35180 35180
  35180 35180 35180 35180 35180 35180]
 [   61     8    62    63     9    64     1     9    65    66     1    67
     68    69    70    71    11     9    72    73    74    75     1    76
     21 35180 35180 35180 35180 35180]] 
 [[    0     

In [77]:
class Net(nn.Module):

    def __init__(self,vocab_size=35181):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=50)
        self.lstm = nn.LSTM(50, 128, 1, batch_first=True)
        self.fc = nn.Linear(128, 17)
 

    def forward(self, x,hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = out.contiguous().view(-1, 128)
        out = F.log_softmax(self.fc(out))
        return out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        hidden = (weight.new(1, batch_size, 128).zero_(),
                      weight.new(1, batch_size, 128).zero_())
        
        return hidden


In [78]:
net = Net()

In [79]:
net

Net(
  (embedding): Embedding(35181, 50)
  (lstm): LSTM(50, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=17, bias=True)
)

In [80]:
batch_size = 64
train_generator = data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], False)
    
    
    

In [81]:
opt = torch.optim.Adam(net.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [82]:
epochs = 10
counter = 0


In [101]:
for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in  data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], False):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            targets = targets.view(batch_size*y.shape[1]).long()
            h = tuple([each.data for each in h])
            net.zero_grad()
            output, h = net(inputs, h)
            break
            loss = criterion(output,targets )
            loss.backward()
            opt.step()
            
            if  counter % 10 == 0:
             
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()))
    

torch.Size([2560, 17])
tensor([    0,     0,     0,  ..., 35180, 35180, 35180])


  


IndexError: Target 35180 is out of bounds.