# Structured perceptron

### Load libraries and data

In [1]:
from pathlib import Path
import scipy
import numpy as np
import os,sys
import pandas as pd
import sklearn
from sklearn import *
import os
import pickle
import skseq
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary

# Remember the directory the notebook is running from.
currentdir = Path.cwd()
# Put its parent directory at the front of the module search path.
# NOTE(review): the skseq imports above run before this line, so this insert
# cannot be what makes them resolvable — confirm it is still needed.
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 


In [46]:
# Echo the stored working directory to confirm where the notebook is running.
currentdir

WindowsPath('c:/Users/Car/Documents/UB_Master/NLP/nlp_name_entity_recognition')

In [2]:
# Home directory of the current user.
# NOTE(review): not referenced anywhere in the visible part of this notebook;
# the CSV files are read through relative paths instead.
path_data =  os.path.expanduser('~') 

In [3]:
# Read the three NER splits (train, test, tiny test) in that order from the
# local data folder, one DataFrame per CSV file.
train_df, test_df, tiny_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/train_data_ner.csv",
        "./data/test_data_ner.csv",
        "./data/tiny_test.csv",
    )
)

### Generate feature and label vectors

We need one sequence per sentence: `X` holds the list of words of each sentence and `y` holds the matching list of tags.

In [4]:
# Collapse the token-level training rows into one list per sentence_id:
# first the words, then the tags.
X_tr, y_tr = (
    train_df.groupby('sentence_id')[column].apply(list).values
    for column in ('words', 'tags')
)

In [5]:
# Same per-sentence grouping for the test split ...
X_test, y_test = (
    test_df.groupby('sentence_id')[column].apply(list).values
    for column in ('words', 'tags')
)
# ... and for the tiny test split.
X_tiny_test, y_tiny_test = (
    tiny_test_df.groupby('sentence_id')[column].apply(list).values
    for column in ('words', 'tags')
)

In [6]:
# Inspect one grouped example: the words and the tags at index 1 of the tiny
# test split.
print(X_tiny_test[1])
print(y_tiny_test[1])

['The', 'programmers', 'from', 'Barchelona', 'cannot', 'write', 'a', 'sentence', 'without', 'a', 'spell', 'checker', '.']
['O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


To create a sequence list, first we need a dictionary for the words and another for the tags

In [7]:
def dictionary(sentences, tags):
    """Index every distinct word and every distinct tag.

    Ids are consecutive integers starting at 0, handed out in order of first
    appearance while walking the input from start to end.

    Args:
        sentences: iterable of sentences, each an iterable of words.
        tags: iterable of tag lists, each an iterable of tags.

    Returns:
        A ``(word_dict, tag_dict)`` tuple of plain dicts mapping each word
        (respectively tag) to its integer id.
    """

    def first_seen_index(groups):
        # A new item receives the current size of the mapping as its id;
        # an item that is already present keeps the id it got earlier.
        index = {}
        for group in groups:
            for item in group:
                index.setdefault(item, len(index))
        return index

    # Words are indexed first, then tags (there are far fewer distinct tags).
    return first_seen_index(sentences), first_seen_index(tags)


In [26]:
# Build both vocabularies from the training split only; words that occur only
# in the test splits receive no id here.
word_dict, tag_dict = dictionary(X_tr, y_tr)

In [27]:
# Dump both mappings (word -> id, then tag -> id) for a visual check.
print(word_dict)
print(tag_dict)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-tim': 3, 'B-org': 4, 'I-geo': 5, 'B-per': 6, 'I-per': 7, 'I-org': 8, 'B-art': 9, 'I-art': 10, 'I-tim': 11, 'I-gpe': 12, 'B-nat': 13, 'I-nat': 14, 'B-eve': 15, 'I-eve': 16}


We use the `SequenceList` class from the `skseq` package provided in class. The package also includes a dictionary class, `LabelDictionary`, which defines some helper functions needed to build the sequences, so we first convert our plain dictionaries into `LabelDictionary` objects.

In [29]:
# Wrap the plain dicts exactly once. Creating the LabelDictionary objects
# inside the loop rebuilt both vocabularies for every single sentence.
# NOTE(review): assumes add_sequence only reads from the dictionaries it is
# given, so sharing one instance across calls is safe — confirm in skseq.
word_labels = LabelDictionary(word_dict)
tag_labels = LabelDictionary(tag_dict)
seq = SequenceList(word_labels, tag_labels)

# Register every training sentence together with its tag list.
for sentence_words, sentence_tags in zip(X_tr, y_tr):
    seq.add_sequence(sentence_words, sentence_tags, word_labels, tag_labels)

In [30]:
# Vocabulary size: number of distinct words seen in the training split.
len(word_dict)

31979

In [31]:
# Print the whole sequence list. Judging by the output, each token is shown
# as `word_id/tag_id`.
print(seq)

[0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 , 22/0 23/0 24/3 25/0 26/0 27/0 11/0 9/0 28/1 29/0 30/0 31/0 32/0 33/0 34/4 35/0 36/0 37/0 7/0 3/0 38/0 7/0 39/0 40/0 41/0 42/0 43/0 11/0 44/0 45/1 46/5 21/0 , 47/0 48/0 49/0 50/0 51/0 52/0 53/0 54/0 55/0 56/0 21/0 , 57/1 58/0 59/0 60/6 61/7 62/0 63/3 31/0 64/1 31/0 65/2 13/0 66/2 42/0 67/0 36/0 68/0 69/0 70/0 13/0 71/0 7/0 72/0 73/0 1/0 74/0 75/1 76/0 19/0 77/0 78/0 79/0 80/0 81/0 21/0 , 82/6 61/7 62/0 9/0 83/0 84/0 85/0 86/0 87/0 88/0 36/0 11/0 89/0 1/0 70/0 90/0 91/0 54/0 9/0 89/0 92/0 11/0 93/3 31/0 94/6 95/2 31/0 9/0 96/1 13/0 97/1 21/0 , 98/0 62/0 99/0 100/0 101/0 102/0 13/0 9/0 103/0 104/0 105/0 19/0 106/0 107/0 108/0 109/0 110/0 11/0 111/1 13/0 112/1 21/0 , 113/0 114/0 88/0 36/0 115/0 116/0 21/0 , 117/2 118/0 36/0 119/0 120/3 101/0 121/0 122/0 11/0 50/0 123/0 124/0 1/0 125/1 126/0 40/0 127/0 7/0 128/0 129/0 130/0 11/0 9/0 131/0 132/0 20/0 21/0 , 133/0 125/1 31/0 50/0 134

In [32]:
# Show the first sequence twice: as raw ids, then converted back to
# word/tag names through the sequence list.
print(seq[0])
print(seq[0].to_words(sequence_list=seq))

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 
Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O 


First we need to initialize the feature mapper with the sequence list we just created:

In [33]:
# Check the type of the container and of one of its elements.
type(seq), type(seq[0])

(skseq.sequences.sequence_list.SequenceList, skseq.sequences.sequence.Sequence)

In [34]:
# Load the submodule explicitly. `import skseq` alone does not guarantee that
# `skseq.sequences.id_feature` is available as an attribute; without this line
# the lookup only works if some other import happened to load it.
import skseq.sequences.id_feature

# Feature mapper bound to the training sequence list.
feature_mapper = skseq.sequences.id_feature.IDFeatures(seq)

In [35]:
# Presumably populates the mapper's `feature_list` / `feature_dict`, which the
# following cells read — confirm against skseq.
feature_mapper.build_features()

In [36]:
# Print the four feature groups stored at index 1 of the feature list
# (presumably the entry for training sequence 1 — confirm against skseq).
# The group order 0..3 follows the labels used here.
print ("Initial features:",     feature_mapper.feature_list[1][0])
print ("Transition features:",  feature_mapper.feature_list[1][1])
print ("Final features:",       feature_mapper.feature_list[1][2])
print ("Emission features:",    feature_mapper.feature_list[1][3])

Initial features: [[0]]
Transition features: [[3], [32], [34], [3], [3], [3], [3], [9], [11], [3], [3], [3], [3], [44], [46], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [9], [58], [59]]
Final features: [[28]]
Emission features: [[29], [30], [31], [33], [35], [36], [15], [13], [37], [38], [39], [40], [41], [42], [43], [45], [47], [48], [10], [5], [49], [10], [50], [51], [52], [53], [54], [15], [55], [56], [57], [27]]


In [37]:
# Reverse lookup for the feature dictionary: every (key, value) pair of
# feature_dict becomes a value -> key entry.
inv_feature_dict = {
    value: key
    for key, value in feature_mapper.feature_dict.items()
}

In [38]:
# Perceptron implementation, imported under a short alias.
import skseq.sequences.structured_perceptron as spc

# Create the model from the plain vocabularies and the feature mapper.
sp = spc.StructuredPerceptron(word_dict, tag_dict, feature_mapper)
# NOTE(review): the later `sp.fit(feature_mapper.dataset, 10)` call passes its
# own count and its log shows epochs 0-9, so this value of 5 does not appear
# to control that run — confirm whether this assignment is still needed.
sp.num_epochs = 5

In [39]:
def evaluate_corpus(sequences, sequences_predictions):
    """Return the token-level accuracy of predictions against gold labels.

    Every predicted label ``sequences_predictions[i].y[j]`` is compared with
    the gold label ``sequences[i].y[j]``.

    Args:
        sequences: iterable of gold sequences, each exposing a ``y`` sequence
            of labels.
        sequences_predictions: indexable collection of predicted sequences,
            aligned by position with ``sequences`` and exposing ``y`` too.

    Returns:
        The fraction of predicted labels equal to the gold label, as a float.
        Returns 0.0 when there is no label to score (empty corpus or only
        empty predictions) instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: if there are fewer predictions than gold sequences, or a
            prediction is longer than its gold sequence.
    """
    total = 0
    correct = 0
    for i, sequence in enumerate(sequences):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            if sequence.y[j] == y_hat:
                correct += 1
            total += 1
    # Nothing was scored: accuracy is undefined, report 0.0 rather than crash.
    if total == 0:
        return 0.0
    return correct / total

In [40]:
# Decode the training corpus with the model as it currently stands. `sp.fit`
# has not been called yet at this point, so this is the pre-training baseline.
pred_train = sp.viterbi_decode_corpus(seq)
# Token-level accuracy of those predictions against the gold sequences.
eval_train = evaluate_corpus(seq.seq_list, pred_train)
print("SP -  Accuracy Train: %.3f "%(eval_train))

SP -  Accuracy Train: 0.847 


In [41]:
# Train on the feature mapper's dataset. The second argument is presumably the
# number of epochs: the log of this call lists ten epochs (0-9).
sp.fit(feature_mapper.dataset, 10)

Epoch: 0 Accuracy: 0.893815
Epoch: 1 Accuracy: 0.931674
Epoch: 2 Accuracy: 0.940913
Epoch: 3 Accuracy: 0.946175
Epoch: 4 Accuracy: 0.950018
Epoch: 5 Accuracy: 0.952577
Epoch: 6 Accuracy: 0.954425
Epoch: 7 Accuracy: 0.956033
Epoch: 8 Accuracy: 0.957185
Epoch: 9 Accuracy: 0.958481


In [50]:
# Save relative to the notebook's working directory instead of a hard-coded,
# user-specific absolute path, so the cell also runs on other machines.
# "sp_base_" stays the trailing component, exactly as in the original call.
# NOTE(review): assumes save_model accepts any string path prefix — confirm.
sp.save_model(os.path.join(currentdir, "sp_base_"))