In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 3.9MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6


In [3]:
!pip install pytorch_transformers

Collecting pytorch_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |█▉                              | 10kB 26.1MB/s eta 0:00:01[K     |███▊                            | 20kB 1.7MB/s eta 0:00:01[K     |█████▋                          | 30kB 2.6MB/s eta 0:00:01[K     |███████▍                        | 40kB 3.4MB/s eta 0:00:01[K     |█████████▎                      | 51kB 2.1MB/s eta 0:00:01[K     |███████████▏                    | 61kB 2.5MB/s eta 0:00:01[K     |█████████████                   | 71kB 2.9MB/s eta 0:00:01[K     |██████████████▉                 | 81kB 3.3MB/s eta 0:00:01[K     |████████████████▊               | 92kB 3.7MB/s eta 0:00:01[K     |██████████████████▋             | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████▍           | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████▎     

In [0]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from pytorch_transformers import AdamW, WarmupLinearSchedule

In [5]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [0]:
import torch
import pandas as pd
import numpy as np
from torchcrf import CRF

In [0]:
device = torch.device('cuda')

## CRF-suite

In [8]:
!pip install pyprind

Collecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2


In [0]:
import math
import warnings
import pyprind

In [10]:
ner_data = pd.read_csv("eng.train.txt", sep=" ", header=None, skip_blank_lines=False, encoding="utf-8")
ner_data.columns = ["token", "pos", "chunk", "ne"]

# Explore the distribution of NE tags in the dataset
tag_distribution = ner_data.groupby("ne").size().reset_index(name='counts')
print(tag_distribution)

       ne  counts
0   B-LOC      11
1  B-MISC      37
2   B-ORG      24
3   I-LOC    8286
4  I-MISC    4556
5   I-ORG   10001
6   I-PER   11128
7       O  168346


In [11]:
ner_data.head()

Unnamed: 0,token,pos,chunk,ne
0,-DOCSTART-,-X-,O,O
1,,,,
2,EU,NNP,I-NP,I-ORG
3,rejects,VBZ,I-VP,O
4,German,JJ,I-NP,I-MISC


In [12]:
classes = list(filter(lambda x: x not in ["O", np.nan], list(ner_data["ne"].unique())))

print(classes)

['I-ORG', 'I-MISC', 'I-PER', 'I-LOC', 'B-LOC', 'B-MISC', 'B-ORG']


In [13]:
sentences, sentence = [], []
pbar = pyprind.ProgBar(len(ner_data))

for index, row in ner_data.iterrows():
    # If the row is empty (no string in the token column)
    if type(row["token"]) != str:
        # If the current sentence is not empty, append it to the sentences and create a new sentence
        if len(sentence) > 0:
            sentences.append(sentence)
            sentence = []
    else:
        # If the row does not indicate the start of a document, add the token to the current sentence
        if type(row["token"]) != float and type(row["pos"]) != float and type(row["ne"]) != float:
            if not row["token"].startswith("-DOCSTART-"):
                sentence.append([row["token"], row["pos"], row["chunk"], row["ne"]])
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:32


In [0]:
def word_features(sentence, i, use_chunks=False):
    # Get the current word and POS
    word = sentence[i][0]
    pos = sentence[i][1]
    
    features = { "bias": 1.0,
                 "word.lower()": word.lower(),
                 "word[-3:]": word[-3:],
                 "word[-2:]": word[-2:],
                 "word.isupper()": word.isupper(),
                 "word.istitle()": word.istitle(),
                 "word.isdigit()": word.isdigit(),
                 "pos": pos,
                 "pos[:2]": pos[:2],
               }
    # If chunks are being used, add the current chunk to the feature dictionary
    if use_chunks:
        chunk = sentence[i][2]
        features.update({ "chunk": chunk })
    # If this is not the first word in the sentence...
    if i > 0:
        # Get the sentence's previous word and POS
        prev_word = sentence[i-1][0]
        prev_pos = sentence[i-1][1]
        # Add characteristics of the sentence's previous word and POS to the feature dictionary
        features.update({ "-1:word.lower()": prev_word.lower(),
                          "-1:word.istitle()": prev_word.istitle(),
                          "-1:word.isupper()": prev_word.isupper(),
                          "-1:pos": prev_pos,
                          "-1:pos[:2]": prev_pos[:2],
                        })
        # If chunks are being used, add the previous chunk to the feature dictionary
        if use_chunks:
            prev_chunk = sentence[i-1][2]
            features.update({ "-1:chunk": prev_chunk })
    # Otherwise, add 'BOS' (beginning of sentence) to the feature dictionary
    else:
        features["BOS"] = True
    # If this is not the last word in the sentence...
    if i < len(sentence)-1:
        # Get the sentence's next word and POS
        next_word = sentence[i+1][0]
        next_pos = sentence[i+1][1]
        # Add characteristics of the sentence's previous next and POS to the feature dictionary
        features.update({ "+1:word.lower()": next_word.lower(),
                          "+1:word.istitle()": next_word.istitle(),
                          "+1:word.isupper()": next_word.isupper(),
                          "+1:pos": next_pos,
                          "+1:pos[:2]": next_pos[:2],
                        })
        # If chunks are being used, add the next chunk to the feature dictionary
        if use_chunks:
            next_chunk = sentence[i+1][2]
            features.update({ "+1:chunk": next_chunk })
    # Otherwise, add 'EOS' (end of sentence) to the feature dictionary
    else:
        features["EOS"] = True
    # Return the feature dictionary
    return features

In [0]:
def sentence_features(sentence, use_chunks=False):
    return [word_features(sentence, i, use_chunks) for i in range(len(sentence))]

# Return the label (NER tag) for each word in a given sentence
def sentence_labels(sentence):
    return [label for token, pos, chunk, label in sentence]

In [16]:
from sklearn.model_selection import train_test_split
X = [sentence_features(sentence) for sentence in sentences]
y = [sentence_labels(sentence) for sentence in sentences]

# Split X and y into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("First token features:\n{}\n{}".format("-"*21, X_train[0][0]))
print("\nFirst token label:\n{}\n{}".format("-"*18, y_train[0][0]))

First token features:
---------------------
{'bias': 1.0, 'word.lower()': 'the', 'word[-3:]': 'The', 'word[-2:]': 'he', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'pos': 'DT', 'pos[:2]': 'DT', 'BOS': True, '+1:word.lower()': 'entire', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:pos': 'JJ', '+1:pos[:2]': 'JJ'}

First token label:
------------------
O


In [17]:
crf = sklearn_crfsuite.CRF(algorithm="lbfgs",
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=True)

# Train the CRF model on the supplied training data
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [0]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=classes))

             precision    recall  f1-score   support

      I-ORG       0.90      0.88      0.89      2082
     I-MISC       0.92      0.84      0.88       932
      I-PER       0.93      0.94      0.94      2220
      I-LOC       0.91      0.92      0.92      1614
      B-LOC       1.00      1.00      1.00         2
     B-MISC       0.00      0.00      0.00         5
      B-ORG       1.00      1.00      1.00         8

avg / total       0.92      0.90      0.91      6863



  'precision', 'predicted', average, warn_for)


## CRF-pytorch

In [0]:
from sklearn.feature_extraction import DictVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import DataLoader
import numpy as np
from torch.optim import Adam
from pytorch_transformers import WarmupLinearSchedule
from tqdm import trange

In [0]:
for i in range(len(y_train)):
    y_train[i] = np.where(y_train[i] == 'B-LOC', 'I-LOC', y_train[i])
    y_train[i] = np.where(y_train[i] == 'B-LOC', 'I-LOC', y_train[i])
    y_train[i] = np.where(y_train[i] == 'B-MISC', 'I-MISC', y_train[i])
    y_train[i] = np.where(y_train[i] == 'B-ORG', 'I-ORG', y_train[i])

In [0]:
tags_vals = ['I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
tag2idx = {t : i for i, t in enumerate(tags_vals)}

In [21]:
tag2idx

{'I-LOC': 0, 'I-MISC': 1, 'I-ORG': 2, 'I-PER': 3, 'O': 4}

In [22]:
concat_train = np.concatenate(X_train)
vectorizer = DictVectorizer()                               
vectorizer.fit(concat_train) 

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
               sparse=True)

In [23]:
vectorizer.transform(concat_train[:100]).toarray().shape

(100, 61544)

In [24]:
X_lens = [len(X_train[i]) for i in range(len(X_train))]
y_lens = [len(y_train[i]) for i in range(len(y_train))]
X_lens == y_lens

True

In [0]:
def vectorize_feats_tags(features, tag_seq, bs=32):
              
    seq_len = np.max([len(features[i]) for i in range(len(features))])
    vectorized_features = [vectorizer.transform(f).toarray() for f in features]
    tag_seq = [[tag2idx.get(t) for t in one_seq] for one_seq in tag_seq] 
    
    for i in range(len(tag_seq)):
        if seq_len - len(tag_seq[i]) > 0:
            padding = [-1] * (seq_len - len(tag_seq[i]))
            tag_seq[i] += padding 
    
    vectorized_features_ = vectorized_features.copy()
    for i in range(len(vectorized_features_)):
        shape_0 = len(vectorized_features_[i])
        if seq_len - shape_0 > 0:
            vectorized_features_[i] = np.vstack((vectorized_features_[i], np.zeros(shape=(seq_len - shape_0, vectorized_features_[0].shape[1]))))
            
    
    tag_seq_tens = torch.tensor(tag_seq, dtype=torch.long)
    lab_mask = tag_seq_tens >= 0
    out_features = torch.tensor(vectorized_features_)

    tag_seq_tens = tag_seq_tens.to(device)
    lab_mask = lab_mask.to(device)
    out_features = out_features.to(device)
    
    return out_features, tag_seq_tens, lab_mask

In [0]:
class LogReg(nn.Module):
    def __init__(self, num_labels=5, bs=32, feature_dim=61544):
        super().__init__()
        self.linear = nn.Linear(feature_dim, num_labels)

    def forward(self, inp_vec):
        return F.log_softmax(self.linear(inp_vec), dim=1)

In [0]:
class CRF_baseline(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        self.lr_layer = LogReg()
        self.crf_layer = CRF(num_classes, batch_first=True)
    
    def forward(self, features, tags, mask):
        logits = self.lr_layer(features.float())
        return -self.crf_layer(logits, tags, mask=mask)

    def decode(self, features, tags, mask):
        emissions = self.lr_layer(features.float())
        return self.crf_layer.decode(emissions, mask=mask)

In [0]:
def batch_accuracy(preds, *true_lab):
    accs_batch = []
    for i in range(len(preds)):
        preds_tens = torch.tensor(preds[i])
        preds_tens = preds_tens.to(device)
        accs_batch.append(sum(true_lab[1][i][true_lab[2][i]] == preds_tens).item() / len(preds[i]))
    return np.mean(accs_batch)

In [0]:
model = CRF_baseline()
model = model.to(device)
MAX_N_EPOCHS = 7
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0.1, t_total=(len(X_train)//BATCH_SIZE) * MAX_N_EPOCHS)

def make_tensors(dataset_row):
    tokens, labels = tuple(zip(*dataset_row))
    return vectorize_feats_tags(tokens, labels)

def train(model, optimizer, lr_scheduler, train_dataset, y):
    train_dataloader = DataLoader(tuple(zip(train_dataset, y)), batch_size=BATCH_SIZE, shuffle=True, collate_fn=make_tensors)
    train_loss = dev_loss = []
    train_acc = dev_acc = []
    
    for epoch in trange(MAX_N_EPOCHS, desc='Epoch'):
        model.train()
        cur_loss = 0
        cur_acc = 0 
        for batch_tensors in tqdm(train_dataloader):
            model.train()
            model.zero_grad()
            loss = model(*batch_tensors)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            cur_loss += loss.item()

            model.eval()
            pred = model.decode(*batch_tensors)
            cur_acc += batch_accuracy(pred, *batch_tensors)

        train_loss.append(cur_loss / (len(train_dataset)/BATCH_SIZE))
        train_acc.append(cur_acc / (len(train_dataset)/BATCH_SIZE))

        print('Loss: %.4f' % np.mean(np.array(train_loss)))
        print('Accuracy: %.4f' % np.mean(np.array(train_acc)))
    return train_loss, train_acc

In [0]:
train_loss_accs = train(model, optimizer, lr_scheduler, X_train, y_train)

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

Epoch:  14%|█▍        | 1/7 [47:33<4:45:23, 2853.87s/it]


Loss: 414.7999
Accuracy: 0.7866


HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

Epoch:  29%|██▊       | 2/7 [1:35:56<3:59:03, 2868.64s/it]


Loss: 296.3996
Accuracy: 0.8293


HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

Epoch:  43%|████▎     | 3/7 [2:24:21<3:11:57, 2879.27s/it]


Loss: 238.6805
Accuracy: 0.8571


HBox(children=(IntProgress(value=0, max=352), HTML(value='')))

In [0]:
train_loss_accs

[209.02602129806917,
 71.94053528734001,
 49.260082841545625,
 41.81062358800847,
 38.63594850648663]