# Preparing corpus

## Load data

In [26]:
import os
import sys
import torch
import pandas as pd

drive_path = 'drive/MyDrive/Colab Notebooks/NLU Final Project/'
sys.path.insert(0, os.path.abspath(drive_path))

import conll

train_set = [sent for sent in conll.read_corpus_conll(drive_path + 'data/conll/train.txt') if '-DOCSTART-' not in sent[0][0]]
test_set = [sent for sent in conll.read_corpus_conll(drive_path + 'data/conll/test.txt') if '-DOCSTART-' not in sent[0][0]]
val_set = [sent for sent in conll.read_corpus_conll(drive_path + 'data/conll/dev.txt') if '-DOCSTART-' not in sent[0][0]]

# reshape dataset
train_set = [[(token[0].split()[0], token[0].split()[-1]) for token in sent] for sent in train_set]
test_set = [[(token[0].split()[0], token[0].split()[-1]) for token in sent] for sent in test_set]
val_set = [[(token[0].split()[0], token[0].split()[-1]) for token in sent] for sent in val_set]

print(train_set[0][:4])
print(test_set[0][:4])
print(val_set[0][:4])

# link for comparisons
# https://spacy.io/usage/facts-figures

[('EU', 'B-ORG'), ('rejects', 'O'), ('German', 'B-MISC'), ('call', 'O')]
[('SOCCER', 'O'), ('-', 'O'), ('JAPAN', 'B-LOC'), ('GET', 'O')]
[('CRICKET', 'O'), ('-', 'O'), ('LEICESTERSHIRE', 'B-ORG'), ('TAKE', 'O')]


## Preparing data

### Define function to get vocabulary of words and tags (labels)

In [27]:
# idx specifies the index of the colunm to get (0: words, -1: tags)
def get_vocabulary(data, idx=0):
    vocab = set()
    for sent in data:
        for token in sent:
            vocab.add(token[idx])
    return sorted(list(vocab))

### Create numerical mappings for words and tags

In [28]:
def create_mapping(vocabulary, initial=None):
    idx = {} if initial is None else initial

    idx.update(
        {w: i + len(idx) for i, w in enumerate(vocabulary)}
        )
    return idx

In [29]:
# get vocab
words = get_vocabulary(train_set)
labels = get_vocabulary(train_set, -1)

# word mappings

# word to index
# word index
word2idx = create_mapping(words, initial={"<PAD>":0, "<UNK>":1})

# index to word, the inverse of the previous map
# allowed since indexes are unique
idx2word = {v: k for k, v in word2idx.items()}

# labels mappings

# label to index
label2idx = create_mapping(labels)

# index to label, the inverse of the previous map
# allowed since indexes are unique
idx2label = {v: k for k, v in label2idx.items()}

### Pad and truncate

In [30]:
def pad_and_truncate(sequences, max_len, value):

  for i, seq in enumerate(sequences):
    l = len(seq)
    if l > max_len:
      # truncate
      sequences[i] = seq[:max_len]
    else:
      # pad
      delta = max_len - l
      pad = [value for i in range(delta)]
      seq.extend(pad)
      sequences[i] = seq
  
  return sequences

Prepare in advance a function to unpad e change the format for CoNLL evaluation

In [31]:
def unpad(preds, truths):
  refs = truths.copy()
  if len(preds) != len(refs):
    raise ValueError
  
  pp = 0
  tt = 0

  hyps = []
  
  for i, sent in enumerate(refs):
    if len(sent) < len(preds[i]):
      # sentence has been padded
      hyps.append(preds[i][:len(sent)])
    
    elif len(sent) > len(preds[i]):
      # sentence has been truncated
      # in this case its necessary to truncate
      # also the ground truth so that they can be
      # compared
      refs[i] = refs[i][:len(preds[i])]
      hyps.append(preds[i])
    
    elif len(sent) == len(preds[i]):
      hyps.append(preds[i])

  return hyps, refs

In [32]:
def format_predictions_for_conll(preds, idx2label):
  return [[('_', idx2label.get(i)) for i in s] for s in preds]

## Define functions to prepare the dataset and the train step for later

### Train set

In [33]:
import torch.nn.functional as F

def prepare_train(train_set, labels, word2idx, label2idx, max_len=None):
  # vectorize data
  x_train_int = [[word2idx[w] for w, t in s] for s in train_set]
  # print("Train textual: {}".format(list(map(lambda x: x[0], train_set[0]))))
  # print("Train encoded: {}".format(x_train_int[0]))

  # padding and truncating

  # get max length
  if max_len is None:
    max_len = max(map(len, x_train_int))

  # pad the sentences to max length
  x_train_pad = pad_and_truncate(x_train_int, max_len, word2idx['<PAD>'])

  # vectorize labels
  y_train_int = [[label2idx[t] for w, t in s] for s in train_set]
  y_train_pad = pad_and_truncate(y_train_int, max_len, label2idx['O'])

  x_train_pad = torch.tensor(x_train_pad)
  y_train_pad = torch.tensor(y_train_pad)

  # 8 is the id of 'O'
  # print("Textual: {}".format(list(map(lambda x: x[1], train_set[0]))))
  # print("Encoded & Padded: {}".format(y_train_pad[0]))

  # one-hot encoding for labels
  y_train_ohv = F.one_hot(y_train_pad, num_classes=len(labels))

  return x_train_pad, y_train_ohv.float(), max_len

### Test Set

In [34]:
def prepare_test(test_set, labels, word2idx, label2idx, max_len):
  # replace words not in training with <UNK>
  # insert in X_test_int the index of the word if it is in the vocabulary
  # otherwise inserts the index for <UNK>
  x_test_int = [[word2idx.get(w, word2idx.get('<UNK>')) for w, t in s] for s in test_set]
  x_test_pad = pad_and_truncate(x_test_int, max_len, word2idx['<PAD>'])

  # replace tags not in training with 'O'
  # same way as before
  y_test_int = [[label2idx.get(t, label2idx.get('O')) for w, t in s] for s in test_set]
  y_test_pad = pad_and_truncate(y_test_int, max_len, label2idx['O'])

  x_test_pad = torch.tensor(x_test_pad)
  y_test_pad = torch.tensor(y_test_pad)

  # to one-hot encoding
  y_test_ohv = F.one_hot(y_test_pad, num_classes=len(labels))

  return x_test_pad, y_test_ohv.float()

## Putting all in a Dataset object

In [35]:
from torch.utils.data import TensorDataset, DataLoader

# mean_len = int(sum(map(len, train_set))/len(train_set))
# print(mean_len)

X_train, Y_train, max_len = prepare_train(train_set, labels, word2idx, label2idx)
X_test, Y_test = prepare_test(test_set, labels, word2idx, label2idx, max_len)
X_val, Y_val = prepare_test(val_set, labels, word2idx, label2idx, max_len)

train_dataset = TensorDataset(X_train, Y_train)
# test_dataset = TensorDataset(X_test, Y_test)
# val_dataset = TensorDataset(X_val, Y_val)

# Define train step

In [36]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [37]:
def train_step(X, Y, model, loss_f, optimizer):
  model.zero_grad()

  # Get model predictions
  pred = model(X)

  # Computes loss
  loss = loss_f(pred, Y)

  # Computes gradients
  loss.backward()

  # Updates parameters and zeroes gradients
  optimizer.step()

  # Returns the loss
  return loss.item()

# Create and train a BiLSTM

## Define the model

In [38]:
import torch.nn as nn

class BiLSTM(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, pad_value):
    super(BiLSTM, self).__init__()

    self.hidden_dim = hidden_dim

    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_value)

    self.dropout = nn.Dropout(0.5)

    # The LSTM takes word embeddings as inputs, and outputs hidden states
    # with dimensionality hidden_dim.
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

    # The linear layer that maps from hidden state space to tag space
    self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    self.softmax = nn.Softmax(dim=2)

  def forward(self, sentences):
    # INPUT: batch size (= number of sentences) x len (= number of tokens)

    # EMBEDS: # batch size x len x embedding_dim
    # B sentences composed of L tokens, one embedding for token
    embeds = self.word_embeddings(sentences)
    embeds = self.dropout(embeds)

    # LSTM_OUT: batch x len x hidden_dim * 2 (because it is bidirectional)
    lstm_out, _ = self.lstm(embeds)

    # TAG_SPACE: batch x len x tagset_size
    # for each token in each sentence Linear gives a vector of scores, one for each class/label
    tag_space = self.hidden2tag(lstm_out)

    # convert to probabilities
    tag_scores = self.softmax(tag_space)
    
    return tag_scores

In [39]:
model = BiLSTM(
    embedding_dim=64, # embedding output
    hidden_dim=64, # lstm output
    vocab_size=len(words)+2,
    tagset_size=len(labels),
    pad_value=word2idx['<PAD>']
).to(device)

model_saved = False

if os.path.isfile(drive_path + "models/bilstm"):
  model.load_state_dict(torch.load(drive_path + "models/bilstm", map_location=torch.device(device)))
  model_saved = True

model

BiLSTM(
  (word_embeddings): Embedding(23625, 64, padding_idx=0)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(64, 64, batch_first=True, bidirectional=True)
  (hidden2tag): Linear(in_features=128, out_features=9, bias=True)
  (softmax): Softmax(dim=2)
)

## Training the model

In [40]:
def train_model(model, train_dataset, X_validation, Y_validation, bs, epochs, print_every, device):
  train_dataloader = DataLoader(
    train_dataset,
    batch_size=bs,
    shuffle=True
    )
  
  loss_f = nn.BCELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
  loss = 0
  losses = []

  tot_batches = len(train_dataloader)
  
 
  for e in range(epochs):
    processed = 0

    for X_batch, Y_batch in train_dataloader:
      model.train()

      loss = train_step(X_batch.to(device), Y_batch.to(device), model, loss_f, optimizer)
      losses.append(loss)

      processed += 1


      if processed % print_every == 0:
        # validation
        acc, f1, _ = evaluate_model(model, X_validation, Y_validation, device)

        print(f"\rEpoch:{e+1}/{epochs}, Batches: {processed}/{tot_batches}, Loss: {loss}, Val: acc: {acc}, f1: {f1}")
  
  return losses

In [41]:
import numpy as np
import sklearn.metrics as mt

def evaluate_model(model, X, Y, device):
  model.eval()

  predictions = None
  truths = None

  with torch.no_grad():

    # recover labels from one-hot encoding
    truths = Y.argmax(dim=2)

    # recover labels from probabilities
    preds = model(X.to(device))
    predictions = preds.argmax(dim=2)

    # flatten labels
    truths = truths.reshape(-1).cpu().numpy()

    predictions = predictions.reshape(-1).cpu().numpy()

    return mt.accuracy_score(truths, predictions), mt.f1_score(truths, predictions, average='macro'), preds

In [42]:
import matplotlib.pyplot as plt

if not model_saved:
  losses = train_model(model, train_dataset, X_val, Y_val, 60, 10, 100, device)

  plt.grid()
  plt.plot(range(len(losses)), losses)

In [43]:
# saving the model
if not model_saved:
  print("Saving the model for later use")
  torch.save(model.state_dict(), drive_path + "models/bilstm")

## Evaluating the model

In [44]:
acc, f1, preds_prob = evaluate_model(model, X_test, Y_test, device)
print(f'Accuracy: {acc*100}, f1: {f1*100}')

Accuracy: 98.87054735013032, f1: 72.35326577136543


## CoNLL evaluation

In [45]:
preds = preds_prob.argmax(dim=2).cpu().numpy()
preds_unpadded, truncated_truths = unpad(preds, test_set)

results = conll.evaluate(truncated_truths, format_predictions_for_conll(preds_unpadded, idx2label))
pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
MISC,0.567,0.637,0.6,702
LOC,0.866,0.761,0.81,1667
ORG,0.753,0.613,0.676,1661
PER,0.355,0.847,0.5,1616
total,0.55,0.727,0.626,5646


# CRF

## Features

In [46]:
def word2features(sent, i):
    word = sent[i][0]
    return {'bias': 1.0, 'word.lower()': word.lower()}

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

{'bias': 1.0, 'word.lower()': 'eu'}

In [47]:
train_crf_feats = [sent2features(s) for s in train_set]
train_crf_labels = [sent2labels(s) for s in train_set]

CPU times: user 150 ms, sys: 28.4 ms, total: 179 ms
Wall time: 188 ms


In [59]:
train_crf_feats[0][0]

{'bias': 1.0, 'word.lower()': 'eu'}

## Training

In [48]:
!pip install -U 'scikit-learn<0.24'
!pip install python_crfsuite sklearn_crfsuite



In [49]:
from sklearn_crfsuite import CRF

crf = CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

repr(crf)



"CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,\n    keep_tempfiles=None, max_iterations=100)"

In [50]:
%%time
crf.fit(train_crf_feats, train_crf_labels)

CPU times: user 14.7 s, sys: 49.7 ms, total: 14.8 s
Wall time: 14.8 s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Evaluation

In [54]:
test_feats = [sent2features(s) for s in test_set]
pred_crf = crf.predict(test_feats)

# convert to tuples for conll evaluation
pred_crf = [[(test_feats[i][j], t) for j, t in enumerate(tokens)] for i, tokens in enumerate(pred_crf)]

pred_crf[0]

[({'bias': 1.0, 'word.lower()': 'soccer'}, 'O'),
 ({'bias': 1.0, 'word.lower()': '-'}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'japan'}, 'B-LOC'),
 ({'bias': 1.0, 'word.lower()': 'get'}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'lucky'}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'win'}, 'O'),
 ({'bias': 1.0, 'word.lower()': ','}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'china'}, 'B-LOC'),
 ({'bias': 1.0, 'word.lower()': 'in'}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'surprise'}, 'O'),
 ({'bias': 1.0, 'word.lower()': 'defeat'}, 'O'),
 ({'bias': 1.0, 'word.lower()': '.'}, 'O')]

In [57]:
results = conll.evaluate(test_set, pred_crf)

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)

Unnamed: 0,p,r,f,s
MISC,0.78,0.61,0.684,702
LOC,0.717,0.657,0.686,1668
ORG,0.791,0.494,0.608,1661
PER,0.763,0.398,0.523,1617
total,0.755,0.529,0.622,5648


# BiLSTM-CRF


