In [187]:
# Required imports
import torch
import numpy as np
import pandas as pd
import pickle
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax, Softmax
from torch.optim import SGD
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [27]:
# Load the POS-tagged corpus (Penn Treebank, judging by the file name). Each row
# holds one sentence as two parallel lists: `text` (tokens) and `label` (tags).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
data = pd.read_pickle('../data/penn_treebank_pos.pkl')
data.head()

Unnamed: 0,text,label
0,"[pierre, vinken, ,, 61, years, old, ,, will, j...","[NOUN, NOUN, ., NUM, NOUN, ADJ, ., VERB, VERB,..."
1,"[mr., vinken, is, chairman, of, elsevier, n.v....","[NOUN, NOUN, VERB, NOUN, ADP, NOUN, NOUN, ., D..."
2,"[rudolph, agnew, ,, 55, years, old, and, forme...","[NOUN, NOUN, ., NUM, NOUN, ADJ, CONJ, ADJ, NOU..."
3,"[a, form, of, asbestos, once, used, *, *, to, ...","[DET, NOUN, ADP, NOUN, ADV, VERB, X, X, PRT, V..."
4,"[the, asbestos, fiber, ,, crocidolite, ,, is, ...","[DET, NOUN, NOUN, ., NOUN, ., VERB, ADV, ADJ, ..."


In [52]:
# Collect the distinct tokens and tags that occur anywhere in the corpus.
all_words = set(itertools.chain.from_iterable(data['text']))
all_labels = set(itertools.chain.from_iterable(data['label']))

# Enumerate in sorted order rather than raw set order: string hashing is
# randomised per interpreter process, so iterating a set directly would give
# every kernel restart a different word/label -> index assignment (and make any
# saved embedding or model weights meaningless in the next session).
word2idx = {word: idx for idx, word in enumerate(sorted(all_words))}
idx2word = {idx: word for word, idx in word2idx.items()}

label2idx = {label: idx for idx, label in enumerate(sorted(all_labels))}
idx2label = {idx: label for label, idx in label2idx.items()}

# Sizes used for the embedding table and the output layer respectively.
vocab_size = len(all_words)
label_size = len(all_labels)

In [58]:
# Encode every sentence as a list of word indices and every tag sequence as a
# list of label indices.
features = [[word2idx[word] for word in sentence] for sentence in data['text']]
labels = [[label2idx[tag] for tag in tags] for tags in data['label']]

# Each token needs exactly one tag; fail here rather than deep inside the loss.
assert all(len(f) == len(l) for f, l in zip(features, labels)), \
    "token/tag length mismatch"

# Fixed random_state so the train/test partition is identical on every run.
train_data, test_data = train_test_split(list(zip(features, labels)),
                                         random_state=42)

In [94]:
class pos_tagger(torch.nn.Module):
    """LSTM sequence tagger: embedding -> LSTM -> linear -> log-softmax.

    The recurrent state lives on ``self.hidden`` and is both consumed and
    overwritten by every ``forward`` call. Callers that score independent
    sequences must therefore reset it first with
    ``model.hidden = model.init_hidden()``.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the LSTM hidden/cell state.
        output_dim: number of tags scored per token.
        batch_size: number of sequences processed per ``forward`` call.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, batch_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embedding = Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim,
                        hidden_size=hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)
        # dim=2 is the tag axis of the (seq_len, batch, output_dim) scores.
        self.softmax = LogSoftmax(dim=2)
        self.hidden = self.init_hidden()

    def forward(self, x):
        """Return per-token log-probabilities of shape (len(x), batch_size, output_dim).

        ``x`` holds word indices; its embeddings are reshaped to
        (len(x), batch_size, embedding_dim) before entering the LSTM. The LSTM
        starts from ``self.hidden`` and its final state is stored back there.
        """
        e = self.embedding(x)
        e = e.view(len(x), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        output = self.linear(out)
        return self.softmax(output)

    def init_hidden(self):
        """Return a fresh all-zero ``(h0, c0)`` pair, each (1, batch_size, hidden_dim).

        The tensors take their dtype and device from the model parameters, so
        the state stays compatible after ``model.double()`` or
        ``model.to(device)``.
        """
        ref = self.linear.weight
        shape = (1, self.batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        c0 = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        return (h0, c0)

In [95]:
# Seed torch's global RNG so the randomly initialised embedding, LSTM and
# linear weights (and therefore the training curve) are reproducible.
torch.manual_seed(42)

model = pos_tagger(vocab_size=vocab_size,
                   embedding_dim=100,
                   hidden_dim=50,
                   output_dim=label_size,
                   batch_size=1)

In [99]:
# Smoke-test the model on the first sentence of the corpus.
X = torch.LongTensor(features[0])
y = torch.LongTensor(labels[0])

# Start from a zero LSTM state so re-running this cell gives the same output
# (forward() otherwise continues from whatever state the last call left behind).
model.hidden = model.init_hidden()
# Call the module rather than .forward() so registered hooks are honoured;
# squeeze(1) drops the batch axis -> (seq_len, label_size).
output = model(X).squeeze(1)

In [102]:
# NLLLoss consumes log-probabilities, which is what the model's final
# LogSoftmax layer emits.
criterion = NLLLoss()

# Loss on the sentence scored in the previous cell (`output` vs. its gold tags `y`).
criterion(output, y)

tensor(2.4618, grad_fn=<NllLossBackward>)

In [139]:
# Train with plain SGD, one sentence per step, and report the mean training
# loss plus token-level accuracy on both splits after every epoch.
optim = SGD(params=model.parameters(), lr=0.01)
criterion = NLLLoss()

for epoch in range(10):
    total_loss = 0
    model.train()
    y_true_train = []
    y_pred_train = []
    for f, t in train_data:
        X = torch.LongTensor(f)
        y = torch.LongTensor(t)

        # Sentences are independent: start each one from a zero LSTM state.
        model.hidden = model.init_hidden()
        optim.zero_grad()
        output = model(X).squeeze(1)
        loss = criterion(output, y)
        loss.backward()
        optim.step()

        # .item() / .tolist() pull plain Python numbers out of the tensors.
        total_loss += loss.item()
        prediction = torch.argmax(output, dim=1)
        y_true_train.extend(y.tolist())
        y_pred_train.extend(prediction.tolist())

    model.eval()
    y_pred = []
    y_true = []
    # No gradients are needed for evaluation; no_grad avoids building an
    # autograd graph for every held-out sentence.
    with torch.no_grad():
        for f, t in test_data:
            X = torch.LongTensor(f)
            y = torch.LongTensor(t)

            model.hidden = model.init_hidden()
            output = model(X).squeeze(1)
            prediction = torch.argmax(output, dim=1)

            y_true.extend(y.tolist())
            y_pred.extend(prediction.tolist())

    a = accuracy_score(y_true, y_pred)
    a_train = accuracy_score(y_true_train, y_pred_train)
    # Average over the number of training sentences (guarded against an empty
    # split) instead of relying on a loop variable leaking out of the loop.
    total_loss /= max(len(train_data), 1)

    print("Loss: {:.2f}, Training Accuracy: {:.2f}, Validation Accuracy: {:.2f}".format(total_loss, a_train, a))

Loss: 1.17, Training Accuracy: 0.66, Validation Accuracy: 0.70
Loss: 0.95, Training Accuracy: 0.72, Validation Accuracy: 0.74
Loss: 0.81, Training Accuracy: 0.76, Validation Accuracy: 0.77
Loss: 0.72, Training Accuracy: 0.78, Validation Accuracy: 0.79
Loss: 0.66, Training Accuracy: 0.80, Validation Accuracy: 0.81
Loss: 0.60, Training Accuracy: 0.82, Validation Accuracy: 0.83
Loss: 0.56, Training Accuracy: 0.83, Validation Accuracy: 0.84
Loss: 0.52, Training Accuracy: 0.84, Validation Accuracy: 0.84
Loss: 0.49, Training Accuracy: 0.85, Validation Accuracy: 0.85
Loss: 0.46, Training Accuracy: 0.86, Validation Accuracy: 0.85


In [185]:
sentence = "we run home ."
words = sentence.lower().split()
# Raises KeyError for a word that is not in word2idx (the vocabulary has no
# unknown-word entry).
sample = [word2idx[i] for i in words]

model.eval()
# Reset the LSTM state so the tags do not depend on whichever sequence the
# model processed last, and skip autograd bookkeeping for inference.
model.hidden = model.init_hidden()
with torch.no_grad():
    scores = model(torch.LongTensor(sample))

preds = [idx2label[i] for i in torch.argmax(scores, dim=2).reshape(-1).tolist()]
for word, pred in zip(words, preds):
    print(word, pred)

we PRON
run VERB
home NOUN
. .


In [186]:
sentence = "I went for a run today"
words = sentence.lower().split()
# Raises KeyError for a word that is not in word2idx (the vocabulary has no
# unknown-word entry).
sample = [word2idx[i] for i in words]

model.eval()
# Reset the LSTM state so the tags do not depend on whichever sequence the
# model processed last, and skip autograd bookkeeping for inference.
model.hidden = model.init_hidden()
with torch.no_grad():
    scores = model(torch.LongTensor(sample))

preds = [idx2label[i] for i in torch.argmax(scores, dim=2).reshape(-1).tolist()]
for word, pred in zip(words, preds):
    print(word, pred)

i PRON
went VERB
for ADP
a DET
run NOUN
today NOUN


In [None]:
# NOTE(review): this cell repeats an earlier example sentence and was never
# executed; consider removing it or replacing it with a new example.
sentence = "we run home ."
words = sentence.lower().split()
# Raises KeyError for a word that is not in word2idx (the vocabulary has no
# unknown-word entry).
sample = [word2idx[i] for i in words]

model.eval()
# Reset the LSTM state so the tags do not depend on whichever sequence the
# model processed last, and skip autograd bookkeeping for inference.
model.hidden = model.init_hidden()
with torch.no_grad():
    scores = model(torch.LongTensor(sample))

preds = [idx2label[i] for i in torch.argmax(scores, dim=2).reshape(-1).tolist()]
for word, pred in zip(words, preds):
    print(word, pred)