In [20]:
import torch
import numpy as np
from string import punctuation
from collections import Counter
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from string import punctuation
from collections import Counter
import config
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np
import config

In [33]:
SEQ_LENGTH = 50
SPLIT = 0.8
BATCH_SIZE = 100
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EMBEDDING_DIM = 150
HIDDEN_DIM = 256
N_LAYERS = 2
OUTPUT_SIZE = 1
EPOCHS = 4
PRINT_EVERY = 100
CLIP=5
MODEL_ARCH='LSTM'

In [22]:
class NERData():
    def __init__(self):
        self.data = None
        self.words = None
        self.tags = None
        self.sentences = None
        self.vocab_to_int = None
        self.tag_to_int = None

    def load_data(self):
        self.data = pd.read_csv('D:/Projects/Sessions/NER/input/ner_dataset.csv',encoding = 'latin1')
        self.data = self.data.fillna(method='ffill')
        print("Unique Words ", self.data['Word'].nunique())
        print("Unique Tags ", self.data['Tag'].nunique())
        self.words = list(set(self.data['Word'].values))
        self.tags = list(set(self.data['Tag'].values))
        self.words.append('PAD')


    def sentence_getter(self):
        agg_func = lambda s:[(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                          s["POS"].values.tolist(),
                                                          s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

        return self.sentences

    def vocab_dict(self):
        self.vocab_to_int = {w:i+1 for i,w in enumerate(self.words)}
        self.int_to_vocab = {i: w for w, i in self.vocab_to_int.items()}
        self.tag_to_int = {w:i for i,w in enumerate(self.tags)}
        self.int_to_tag = {i: w for w, i in self.tag_to_int.items()}
        return self.vocab_to_int, self.tag_to_int

    def encode_text(self):
        self.encoded_sentence = []
        self.encoded_tag = []
        for sentence in self.sentences:
            self.encoded_sentence.append([self.vocab_to_int[w[0]] for w in sentence])
            self.encoded_tag.append([self.tag_to_int[w[2]] for w in sentence])

    def pad_features(self):

        self.padded_sentence = np.zeros((len(self.sentences), config.SEQ_LENGTH),dtype=int)
        self.padded_tag = np.zeros((len(self.sentences), config.SEQ_LENGTH),dtype=int)

        print("Padding Sentence")
        for i, row in enumerate(self.encoded_sentence):
            self.padded_sentence[i, -len(row):] = np.array(row)[:config.SEQ_LENGTH]
        print("Padding Tag")
        for i, row in enumerate(self.encoded_tag):
            self.padded_tag[i, -len(row):] = np.array(row)[:config.SEQ_LENGTH]

    def process_text(self, text):
        encoded_text = []
        for word in text.split():
            code = self.vocab_to_int.get(word)
            if code != None:
                encoded_text.append(code)

        padded_text = np.zeros((1, config.SEQ_LENGTH),dtype=int)
        padded_text[0,-len(encoded_text):] = encoded_text

        return padded_text

In [23]:
def data_split(encoded_features, encoded_labels):
    split_idx = int(len(encoded_features) * config.SPLIT)
    train_x, remaining_x = encoded_features[:split_idx], encoded_features[split_idx:]
    train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

    test_idx = int(len(remaining_x) * 0.5)
    val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
    val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

    ## print out the shapes of your resultant feature data
    print("\t\t\tFeature Shapes:")
    print("Train set: \t\t{}".format(train_x.shape),
          "\nValidation set: \t{}".format(val_x.shape),
          "\nTest set: \t\t{}".format(test_x.shape))

    return train_x, train_y, val_x, val_y, test_x, test_y

In [24]:
ner_data = NERData()
ner_data.load_data()
sentences = ner_data.sentence_getter()
vocab_to_int, tag_to_int = ner_data.vocab_dict()
ner_data.encode_text()
ner_data.pad_features()
encoded_features= ner_data.padded_sentence
encoded_labels = ner_data.padded_tag

Unique Words  35178
Unique Tags  17
Padding Sentence
Padding Tag


In [25]:
encoded_features[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0, 22217,
        7116, 22965, 22326, 18829, 18981, 33587, 34436,  8261, 19864,
       10226,  9767,  2677, 26962, 16623, 19864, 24037,  7116, 32800,
       15706, 18443, 30545, 33333, 18752])

In [26]:
train_x, train_y, val_x, val_y, test_x, test_y = data_split(encoded_features, encoded_labels)
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=config.BATCH_SIZE,drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=config.BATCH_SIZE,drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=config.BATCH_SIZE,drop_last=True)


			Feature Shapes:
Train set: 		(38367, 50) 
Validation set: 	(4796, 50) 
Test set: 		(4796, 50)


In [28]:


vocab_size = len(ner_data.vocab_to_int)+1
output_size = len(ner_data.tag_to_int)+1


In [34]:
params = {}
params['vocab_size'] = vocab_size
params['output_size'] = output_size
params['embedding_dim'] = EMBEDDING_DIM
params['lstm_hidden_dim']= HIDDEN_DIM
params['n_layers'] = 1

In [35]:
class Net(nn.Module):
    def __init__(self, params):
        self.output_size = params['output_size']
        self.n_layers = params['n_layers']
        self.hidden_dim = params['lstm_hidden_dim']
        super(Net, self).__init__()
        #maps each token to an embedding_dim vector
        self.embedding = nn.Embedding(params['vocab_size'], params['embedding_dim'])

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(params['embedding_dim'], params['lstm_hidden_dim'], batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(params['lstm_hidden_dim'], params['output_size'])

    def forward(self, x):

        # apply the embedding layer that maps each token to its embedding
        s = self.embedding(x)  # dim: batch_size x batch_max_len x embedding_dim

        # run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)  # dim: batch_size x batch_max_len x lstm_hidden_dim

        # reshape the Variable so that each row contains one token
        # print("before ",s.shape)
        # s = s.view(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim
        s = s.reshape(-1, s.shape[2])
        # print("after ",s.shape)

        # apply the fully connected layer and obtain the output for each token
        s = self.fc(s)  # dim: batch_size*batch_max_len x num_tags

        # s = s.reshape(x.size(0),50, s.shape[2])

        return F.log_softmax(s, dim=1)  # dim: batch_size*batch_max_len x num_tags

In [36]:
net = Net(params)

In [37]:
net

Net(
  (embedding): Embedding(35179, 150)
  (lstm): LSTM(150, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
)

In [38]:
lr=0.001
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
net.to(device=config.DEVICE)
net.train()

Net(
  (embedding): Embedding(35179, 150)
  (lstm): LSTM(150, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=18, bias=True)
)

In [42]:
def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)

    #mask out 'PAD' tokens
    mask = (labels > 0).float()

    #the number of tokens is the sum of elements in mask
    num_tokens = int(torch.sum(mask).item())
    #pick the values corresponding to labels and multiply by mask

    outputs = outputs[range(outputs.shape[0]), labels]*mask

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/num_tokens

In [43]:
def train_fn(data_loader, valid_loader, model, optimizer, device):

    for e in range(config.EPOCHS):
        counter = 0
        for inputs, labels in data_loader:
            model.train()
            counter += 1
            inputs, labels = inputs.to(device), labels.to(device)
            model.zero_grad()
            inputs = inputs.long()
            labels = labels.long()
            output= model(inputs)

            loss = loss_fn(output, labels)
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(model.parameters(), config.CLIP)
            optimizer.step()

            # loss stats
            if counter % config.PRINT_EVERY == 0:
                # Get validation loss
                val_losses = []
                model.eval()
                for val_inputs, val_labels in valid_loader:
                    val_inputs, val_labels = val_inputs.to(config.DEVICE), val_labels.to(config.DEVICE)
                    val_inputs = val_inputs.long()
                    val_labels = val_labels.long()
                    output = model(val_inputs)
                    output = output.to(config.DEVICE)
                    # output = model(inputs)
                    val_loss = loss_fn(output, val_labels)
                    val_losses.append(val_loss.item())

                model.train()
                print("Epoch: {}/{}...".format(e+1, config.EPOCHS),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
    return model

In [44]:
net = train_fn(train_loader, valid_loader, net, optimizer, config.DEVICE)

Epoch: 1/4... Step: 100... Loss: 0.995073... Val Loss: 1.068169
Epoch: 1/4... Step: 200... Loss: 0.832875... Val Loss: 0.838384
Epoch: 1/4... Step: 300... Loss: 0.836846... Val Loss: 0.738358
Epoch: 2/4... Step: 100... Loss: 0.576773... Val Loss: 0.633245
Epoch: 2/4... Step: 200... Loss: 0.423081... Val Loss: 0.605756
Epoch: 2/4... Step: 300... Loss: 0.564061... Val Loss: 0.576240
Epoch: 3/4... Step: 100... Loss: 0.309191... Val Loss: 0.554033
Epoch: 3/4... Step: 200... Loss: 0.350453... Val Loss: 0.547461
Epoch: 3/4... Step: 300... Loss: 0.448983... Val Loss: 0.540254
Epoch: 4/4... Step: 100... Loss: 0.304602... Val Loss: 0.534080
Epoch: 4/4... Step: 200... Loss: 0.297349... Val Loss: 0.528012
Epoch: 4/4... Step: 300... Loss: 0.362707... Val Loss: 0.530787


In [45]:
def predict(model, data_object, input):

    padded_input = data_object.process_text(input)
    padded_input = torch.from_numpy(padded_input)

    # padded_input = data_object.padded_sentence[0]
    # padded_input = torch.from_numpy(padded_input)
    padded_input = padded_input.reshape(-1, 50)
    padded_input = padded_input.long()
    padded_input = padded_input.to(config.DEVICE)
    output = model(padded_input)
    ind = torch.max(output, dim=1).indices.detach().cpu().numpy()
    tags = " ".join(data_object.int_to_tag[x] for x in ind)

    output_sentence = []
    for w, i in zip(padded_input.cpu().detach().numpy()[0], ind):
        if w != 0:
            output_sentence.append((data_object.int_to_vocab[w] + '(' + data_object.int_to_tag[i]) + ')')

    sent = " ".join(x for x in output_sentence)

    return sent

In [50]:
ner_data.int_to_tag

{0: 'O',
 1: 'B-geo',
 2: 'B-eve',
 3: 'I-art',
 4: 'B-gpe',
 5: 'B-art',
 6: 'I-gpe',
 7: 'I-tim',
 8: 'I-org',
 9: 'I-per',
 10: 'B-org',
 11: 'B-tim',
 12: 'I-nat',
 13: 'I-eve',
 14: 'B-per',
 15: 'B-nat',
 16: 'I-geo'}

In [47]:
input = " Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country."
predict(net, ner_data, input)

'Thousands(B-per) of(I-org) demonstrators(B-per) have(B-geo) marched(B-per) through(B-tim) London(B-geo) to(B-tim) protest(B-geo) the(B-geo) war(B-org) in(B-tim) Iraq(B-geo) and(I-geo) demand(B-geo) the(B-geo) withdrawal(B-org) of(B-tim) British(B-gpe) troops(I-org) from(B-tim) that(B-geo)'

In [48]:
input = " Iraq demanded withdrawal of British troops. China also extended their support to Iran and Iraq"
predict(net, ner_data, input)

'The(B-org) Boeing(B-org) Company(I-org) is(B-geo) a(B-geo) great(B-tim) organization(I-tim) and(I-tim) David(B-per) is(I-per) its(B-tim) CEO(B-gpe)'

In [49]:
input = " The Boeing Company is a great organization and David Calhoun is its CEO"
predict(net, ner_data, input)

'The(B-org) Boeing(B-org) Company(I-org) is(B-geo) a(B-geo) great(B-tim) organization(I-tim) and(I-tim) David(B-per) is(I-per) its(B-tim) CEO(B-gpe)'