# Sequence Tagging with TensorFlow

In [2]:
from collections import Counter

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

import ner_data_utils
import ner_model_utils
import SequenceTagger
import sklearn

# GermEval 2014 Dataset
## Preprocessing

### Reading in data and getting it in the right format

In [3]:
# Locations of the training and test TSV files.
# NOTE(review): these are absolute paths on one specific machine; adjust them before running elsewhere.
train_file_path = '/Users/thomas/Jupyter_Notebooks/SBWL - Project/data/wikipedia/NER-de-train.tsv'
test_file_path = '/Users/thomas/Jupyter_Notebooks/SBWL - Project/data/wikipedia/NER-de-test.tsv'

# Each call returns three values: the sentences as a nested list, the words, and the unique characters.
# NOTE(review): the meaning of the positional flags `False, True` is defined in
# ner_data_utils.preprocess_data and is not visible in this notebook — confirm there.
text_as_list, words, unique_chars = ner_data_utils.preprocess_data(train_file_path, False, True)
test_text_as_list, test_words, test_unique_chars = ner_data_utils.preprocess_data(test_file_path, False, True)

File loaded: /Users/thomas/Jupyter_Notebooks/SBWL - Project/data/wikipedia/NER-de-train.tsv !
File loaded: /Users/thomas/Jupyter_Notebooks/SBWL - Project/data/wikipedia/NER-de-test.tsv !


In [4]:
# The 'O' tag dominates the label distribution, which makes training harder.
# Collect the tag (last element) of every token across all training sentences.
ents = []
for sentence in text_as_list:
    ents.extend(token[-1] for token in sentence)
Counter(ents).most_common()

[('O', 408728),
 ('B-LOC', 11585),
 ('B-PER', 7906),
 ('B-ORG', 6047),
 ('I-PER', 4425),
 ('I-ORG', 3753),
 ('I-OTH', 3493),
 ('B-OTH', 3413),
 ('I-LOC', 1181)]

In [5]:
# Number of sentences in the training data.
len(text_as_list)

23999

In [6]:
# First training sentence: each token is ((list of characters, word), tag).
text_as_list[0]

[((['S', 'c', 'h', 'a', 'r', 't', 'a', 'u'], 'Schartau'), 'B-PER'),
 ((['s', 'a', 'g', 't', 'e'], 'sagte'), 'O'),
 ((['d', 'e', 'm'], 'dem'), 'O'),
 ((["'", "'"], "''"), 'O'),
 ((['T', 'a', 'g', 'e', 's', 's', 'p', 'i', 'e', 'g', 'e', 'l'],
   'Tagesspiegel'),
  'B-ORG'),
 ((["'", "'"], "''"), 'O'),
 ((['v', 'o', 'm'], 'vom'), 'O'),
 ((['F', 'r', 'e', 'i', 't', 'a', 'g'], 'Freitag'), 'O'),
 (([','], ','), 'O'),
 ((['F', 'i', 's', 'c', 'h', 'e', 'r'], 'Fischer'), 'B-PER'),
 ((['s', 'e', 'i'], 'sei'), 'O'),
 ((["'", "'"], "''"), 'O'),
 ((['i', 'n'], 'in'), 'O'),
 ((['e', 'i', 'n', 'e', 'r'], 'einer'), 'O'),
 ((['W', 'e', 'i', 's', 'e'], 'Weise'), 'O'),
 ((['a', 'u', 'f', 'g', 'e', 't', 'r', 'e', 't', 'e', 'n'], 'aufgetreten'),
  'O'),
 (([','], ','), 'O'),
 ((['d', 'i', 'e'], 'die'), 'O'),
 ((['a', 'l', 'l', 'e', 's'], 'alles'), 'O'),
 ((['a', 'n', 'd', 'e', 'r', 'e'], 'andere'), 'O'),
 ((['a', 'l', 's'], 'als'), 'O'),
 ((['ü', 'b', 'e', 'r', 'z', 'e', 'u', 'g', 'e', 'n', 'd'], 'überzeug

In [7]:
# First test sentence, in the same ((list of characters, word), tag) token format.
test_text_as_list[0]

[((['1', '9', '5', '1'], '1951'), 'O'),
 ((['b', 'i', 's'], 'bis'), 'O'),
 ((['1', '9', '5', '3'], '1953'), 'O'),
 ((['w', 'u', 'r', 'd', 'e'], 'wurde'), 'O'),
 ((['d', 'e', 'r'], 'der'), 'O'),
 ((['n', 'ö', 'r', 'd', 'l', 'i', 'c', 'h', 'e'], 'nördliche'), 'O'),
 ((['T', 'e', 'i', 'l'], 'Teil'), 'O'),
 ((['a', 'l', 's'], 'als'), 'O'),
 ((['J', 'u', 'g', 'e', 'n', 'd', 'b', 'u', 'r', 'g'], 'Jugendburg'), 'O'),
 ((['d', 'e', 's'], 'des'), 'O'),
 ((['K', 'o', 'l', 'p', 'i', 'n', 'g', 'w', 'e', 'r', 'k', 'e', 's'],
   'Kolpingwerkes'),
  'B-OTH'),
 ((['g', 'e', 'b', 'a', 'u', 't'], 'gebaut'), 'O'),
 ((['.'], '.'), 'O')]

In [8]:
# Concatenate the training and test words and rank them by frequency.
# The result is bound to a new name so that `words` is not overwritten: re-running this
# cell then yields the same counts instead of appending `test_words` a second time.
all_words = np.concatenate((words, test_words), axis=0)
most_common = Counter(all_words).most_common()

### Create lookup dicts

In [9]:
# Tag labels plus the padding and unknown tokens. The printed mapping further down shows
# that they occupy the indices 0-10 of the word vocabulary.
specials = [
    'B-OTH', 'I-OTH',
    'B-LOC', 'I-LOC',
    'B-ORG', 'I-ORG',
    'B-PER', 'I-PER',
    'O',
    '<PAD>', '<UNK>',
]

# Word <-> index lookups built from the frequency-ranked word list.
word2ind, ind2word, vocab_size = ner_data_utils.create_lookup_dicts(most_common, specials=specials)

# Character <-> index lookups.
# NOTE(review): only the characters of the training file are used here; `test_unique_chars` is not.
char2ind, ind2char, char_vocab_size = ner_data_utils.create_char_lookup_dicts(sorted(unique_chars))



In [10]:
# Word vocabulary sizes noted for different preprocessing variants:
# 71382 --> only training sentences
# 81720 --> training and test sentences
# 84835 --> training and test sentences, not lower-cased
# Display the current word and character vocabulary sizes.
vocab_size , char_vocab_size

(84827, 295)

In [11]:
# Sanity check: print the first 15 entries of both lookup dicts side by side.
# Each line should show the same word/index pair twice.
paired_items = zip(word2ind.items(), ind2word.items())
for shown, ((word, index), (rev_index, rev_word)) in enumerate(paired_items):
    if shown >= 15:
        break
    print(word, index, end='\t\t')
    print(rev_word, rev_index)



B-OTH 0		B-OTH 0
I-OTH 1		I-OTH 1
B-LOC 2		B-LOC 2
I-LOC 3		I-LOC 3
B-ORG 4		B-ORG 4
I-ORG 5		I-ORG 5
B-PER 6		B-PER 6
I-PER 7		I-PER 7
O 8		O 8
<PAD> 9		<PAD> 9
<UNK> 10		<UNK> 10
. 11		. 11
, 12		, 12
der 13		der 13
die 14		die 14


### Convert words, characters and tags to indices

In [12]:
# Replace words, characters and tags by their integer ids.
# NOTE(review): judging by the later cells, `converted_inputs` holds the converted training
# sentences and `converted_targets` the converted test sentences — confirm against
# ner_data_utils.convert_inputs_and_targets.
converted_inputs, converted_targets, unk_words, n_unks = ner_data_utils.convert_inputs_and_targets(
    text_as_list,
    word2ind,
    test_text_as_list,
    char2ind=char2ind,
    chars=True,
)


In [13]:
# First converted sentence: each token is ((character ids, word id), tag id).
converted_inputs[0]

[(([48, 60, 65, 58, 75, 77, 58, 78], 25830), 6),
 (([76, 58, 64, 77, 62], 116), 8),
 (([61, 62, 70], 30), 8),
 (([6, 6], 19), 8),
 (([49, 58, 64, 62, 76, 76, 73, 66, 62, 64, 62, 69], 6643), 4),
 (([6, 6], 19), 8),
 (([79, 72, 70], 81), 8),
 (([35, 75, 62, 66, 77, 58, 64], 581), 8),
 (([11], 12), 8),
 (([35, 66, 76, 60, 65, 62, 75], 1515), 6),
 (([76, 62, 66], 114), 8),
 (([6, 6], 19), 8),
 (([66, 71], 16), 8),
 (([62, 66, 71, 62, 75], 55), 8),
 (([52, 62, 66, 76, 62], 806), 8),
 (([58, 78, 63, 64, 62, 77, 75, 62, 77, 62, 71], 7702), 8),
 (([11], 12), 8),
 (([61, 66, 62], 14), 8),
 (([58, 69, 69, 62, 76], 290), 8),
 (([58, 71, 61, 62, 75, 62], 189), 8),
 (([58, 69, 76], 35), 8),
 (([143, 59, 62, 75, 83, 62, 78, 64, 62, 71, 61], 11547), 8),
 (([80, 58, 75], 51), 8),
 (([6, 6], 19), 8),
 (([13], 11), 8)]

In [14]:
# Round-trip check: map the ids of the first converted sentence back to
# characters, word and tag. The output should match the original sentence.
for (char_ids, word_id), tag_id in converted_inputs[0]:
    decoded_chars = [ind2char[char_id] for char_id in char_ids]
    print(decoded_chars, ind2word[word_id], ind2word[tag_id])

['S', 'c', 'h', 'a', 'r', 't', 'a', 'u'] Schartau B-PER
['s', 'a', 'g', 't', 'e'] sagte O
['d', 'e', 'm'] dem O
["'", "'"] '' O
['T', 'a', 'g', 'e', 's', 's', 'p', 'i', 'e', 'g', 'e', 'l'] Tagesspiegel B-ORG
["'", "'"] '' O
['v', 'o', 'm'] vom O
['F', 'r', 'e', 'i', 't', 'a', 'g'] Freitag O
[','] , O
['F', 'i', 's', 'c', 'h', 'e', 'r'] Fischer B-PER
['s', 'e', 'i'] sei O
["'", "'"] '' O
['i', 'n'] in O
['e', 'i', 'n', 'e', 'r'] einer O
['W', 'e', 'i', 's', 'e'] Weise O
['a', 'u', 'f', 'g', 'e', 't', 'r', 'e', 't', 'e', 'n'] aufgetreten O
[','] , O
['d', 'i', 'e'] die O
['a', 'l', 'l', 'e', 's'] alles O
['a', 'n', 'd', 'e', 'r', 'e'] andere O
['a', 'l', 's'] als O
['ü', 'b', 'e', 'r', 'z', 'e', 'u', 'g', 'e', 'n', 'd'] überzeugend O
['w', 'a', 'r'] war O
["'", "'"] '' O
['.'] . O


## Pretrained embeddings

In [15]:
# Embed every vocabulary entry with the pretrained German NNLM module from TF-Hub.
# The rows of the result follow the iteration order of `word2ind`.
embed = hub.Module("https://tfhub.dev/google/nnlm-de-dim128-with-normalization/1")
emb = embed(list(word2ind.keys()))

# Variables and lookup tables of the module must be initialised before it can be evaluated.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    embedding = sess.run(emb)

INFO:tensorflow:Using /var/folders/lg/w88j2v8x1x33jfgh34ltmljw0000gn/T/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/nnlm-de-dim128-with-normalization/1'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/nnlm-de-dim128-with-normalization/1'.
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/var/folders/lg/w88j2v8x1x33jfgh34ltmljw0000gn/T/tfhub_modules/92b9fb774490e712dd3427a83d7dd17b11786803/variables/variables' with embeddings


In [16]:
# Shape of the embedding matrix computed for the vocabulary.
embedding.shape

(84827, 128)

In [17]:
# Persist the embedding matrix; the training and testing cells pass this same path
# as `np_embedding_matrix_path`.
# NOTE(review): the ./embeddings directory has to exist before this cell runs.
np.save('./embeddings/my_embedding_tfhub.npy', embedding)

# The Model


We can now use our class. First we have to create an instance (tagger) and give it the lookup dicts and the save and restore paths.
Then we can call .build_graph() and train the model by calling .train(), giving it the training and testing data. 
In every epoch we print out the model's loss on the training data. 
In every fifth epoch of the training process we print out the accuracies and classification reports on the testing data.
The trained model will then be saved to save_path. 

From there we can easily restore the trained model and run evaluations without retraining the model.


In [18]:
# Build a reduced training set that keeps only sentences containing at least one
# tag other than 'O'. This might force the model to learn the distinction better.
# NOTE(review): the training cell further down passes `converted_inputs`, not this filtered list.
ents = ['B-OTH', 'I-OTH', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']
ents_inds = [word2ind[ent] for ent in ents]

# A token is ((character ids, word id), tag id), so token[1] is its tag id.
converted_inputs_new = [
    sentence
    for sentence in converted_inputs
    if any(token[1] in ents_inds for token in sentence)
]

# BUT this leaves about 10000 fewer sentences than before.
# It seems to enhance performance.
len(converted_inputs_new)



14034

### Training the model

In [69]:
# Hyperparameters for training.
# The lookup dicts (word2ind, ind2word, char2ind) come from the preprocessing cells above;
# the former self-assignments (`word2ind = word2ind`, ...) were no-ops and have been removed.

# Paths
# NOTE(review): the testing section restores from './models/sequence_tagger/my_model',
# which differs from this save path — confirm that this is intended.
save_path = './models/sequence_tagger_1/my_model'
np_embedding_matrix_path = './embeddings/my_embedding_tfhub.npy'
summary_dir = './models/tensorboard/sequence_tagger_1'

# Model size
num_layers = 1
n_tags = 10  # presumably the 9 tag labels plus <PAD> (ids 0-9) — TODO confirm
batch_size = 128

# NOTE(review): the saved pretrained matrix has 128 columns (see `embedding.shape`);
# confirm how SequenceTagger treats embedding_dim=300 when a matrix is loaded.
embedding_dim = 300
char_embedding_dim = 50
rnn_size = 150
char_rnn_size = 40
clip = 5

# Learning-rate settings
learning_rate = 0.0001
learning_rate_decay_steps = 400
max_lr = 0.005

# Feature switches
use_chars = True
use_crf = True
use_cyclic = True

In [70]:
# Reload imported modules automatically before code is executed, so that edits to the
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Start from a clean TensorFlow graph, then create the tagger with the hyperparameters defined above.
ner_model_utils.reset_default_graph()
tagger = SequenceTagger.SequenceTagger(
    word2ind,
    ind2word,
    save_path=save_path,
    np_embedding_matrix_path=np_embedding_matrix_path,
    summary_dir=summary_dir,
    char2ind=char2ind,
    use_chars=use_chars,
    rnn_size=rnn_size,
    char_rnn_size=char_rnn_size,
    embedding_dim=embedding_dim,
    char_embedding_dim=char_embedding_dim,
    n_tags=n_tags,
    batch_size=batch_size,
    use_crf=use_crf,
    num_layers=num_layers,
    clip=clip,
    learning_rate=learning_rate,
    learning_rate_decay_steps=learning_rate_decay_steps,
    max_lr=max_lr,
    use_cyclic=use_cyclic,
)

# Build the graph and train.
# NOTE(review): `converted_inputs` / `converted_targets` appear to be the converted training
# and test sentences respectively — confirm against SequenceTagger.train.
tagger.build_graph()
tagger.train(converted_inputs, converted_targets)

### Testing the model

In [19]:
# Hyperparameters for restoring and evaluating a trained model.
# The lookup dicts (word2ind, ind2word, char2ind) come from the preprocessing cells above;
# the former self-assignments (`word2ind = word2ind`, ...) were no-ops and have been removed.

# Paths
np_embedding_matrix_path = './embeddings/my_embedding_tfhub.npy'
restore_path = './models/sequence_tagger/my_model'

# Model size — presumably has to match the configuration the restored model was trained with.
num_layers = 1
n_tags = 10
batch_size = 128

embedding_dim = 300
char_embedding_dim = 50
rnn_size = 150
char_rnn_size = 40
clip = 5

# Feature switches
use_chars = True
use_crf = True
use_cyclic = True  # NOTE(review): not passed to the SequenceTagger constructor in the evaluation cell
train_embeddings = False

In [21]:
# Start from a clean TensorFlow graph and create the tagger that will be restored from `restore_path`.
# All keep probabilities are set to 1.0 (presumably this disables dropout for evaluation).
ner_model_utils.reset_default_graph()
tagger = SequenceTagger.SequenceTagger(
    word2ind,
    ind2word,
    char2ind=char2ind,
    np_embedding_matrix_path=np_embedding_matrix_path,
    restore_path=restore_path,
    use_chars=use_chars,
    rnn_size=rnn_size,
    char_rnn_size=char_rnn_size,
    embedding_dim=embedding_dim,
    char_embedding_dim=char_embedding_dim,
    n_tags=n_tags,
    batch_size=batch_size,
    keep_probability_i=1.0,
    keep_probability_o=1.0,
    keep_probability_h=1.0,
    keep_probability_d=1.0,
    keep_probability_e=1.0,
    use_crf=use_crf,
    num_layers=num_layers,
    train_embeddings=train_embeddings,
)

tagger.build_graph()

# Evaluate the restored model.
# NOTE(review): `converted_targets` appears to hold the converted test sentences — confirm.
(actuals, preds, accuracy, accuracy_without, n_zeros, n_other_ents,
 classif_report, classif_report_without) = tagger.run_evaluate(converted_targets, restore_sess=True)

# Share of 'O' tags among all evaluated tags, in percent.
# Guarded so that an empty evaluation set does not divide by zero.
n_labels_total = n_zeros + n_other_ents
o_share = n_zeros / n_labels_total * 100 if n_labels_total else 0.0

# Fixes compared to the previous report text: the stray leading 'n' in front of
# "Number of other ..." is gone and the misspelled labels are corrected.
print('Accuracy: {:.5f}\nAccuracy without "O"s:{}\nNumber of "O"s in data: {}\n'
      'Number of other entities in data: {}\n--> {:.3f}\n'
      'Classification Report:\n {}\n\nClassification Report without zeros: \n{}'.format(
          accuracy,
          accuracy_without,
          n_zeros,
          n_other_ents,
          o_share,
          classif_report,
          classif_report_without))

Loaded embeddings from: ./embeddings/my_embedding_tfhub.npy


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Graph built.
INFO:tensorflow:Restoring parameters from ./models/sequence_tagger/my_model
Restored from ./models/sequence_tagger/my_model
Accuracy: 0.94897
Accuracy without "O"s:0.7645742419156316
Number of "O"s in data: 87452
nNumber of other entitites in data: 8937
--> 90.728
Classifcation Report:
              precision    recall  f1-score   support

          0       0.41      0.58      0.48       773
          1       0.49      0.47      0.48       858
          2       0.69      0.87      0.77      2385
          3       0.62      0.63      0.62       307
          4       0.62      0.74      0.68      1324
          5       0.70      0.59      0.64       694
          6       0.61      0.88      0.72      1693
          7       0.91      0.92      0.92       903
          8       0.99      0.97      0.98     87452

avg / total       0.96      0.95      0.95     96389


Classification Report without zeros: 
             precision    recall  f1-score   support

          0       0.

  'recall', 'true', average, warn_for)


In [24]:
# Confusion matrix: rows are the actual tag ids, columns the predicted tag ids.
# `import sklearn` alone does not guarantee that the `metrics` submodule is loaded,
# so it is imported explicitly instead of relying on another module having done it.
import sklearn.metrics

print(sklearn.metrics.confusion_matrix(actuals, preds))

[[  445    20    51     1    73     5    36     1   141]
 [   28   403    19     0     5    31    28    16   328]
 [   21     5  2073    28    37    29    69     2   121]
 [    1     8    23   192     0    15     4    12    52]
 [   47     6   104     1   979    10    53     0   124]
 [    3    31    29    26    15   411    10    19   150]
 [   29     8    51     4    26     4  1495    12    64]
 [    1    10     5     3     1     4    17   835    27]
 [  514   337   649    53   440    80   722    20 84637]]


In [40]:
# Print words, actual tags and predicted tags for the first 20 evaluated sentences.
# NOTE(review): the meaning of the trailing `True` flag is defined in ner_data_utils.print_examples.
ner_data_utils.print_examples(converted_targets[:20], preds, ind2word, True)




Sentence n.0:
['1951', 'bis', '1953', 'wurde', 'der', 'nördliche', 'Teil', 'als', 'Jugendburg', 'des', 'Kolpingwerkes', 'gebaut', '.']
Actual entities:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OTH', 'O', 'O']
Predicted entites:
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O']



Sentence n.1:
['Da', 'Muck', 'das', 'Kriegsschreiben', 'nicht', 'überbracht', 'hat', ',', 'wird', 'er', 'als', 'Retter', 'des', 'Landes', 'ausgezeichnet', 'und', 'soll', 'zum', 'Schatzmeister', 'ernannt', 'werden', '.']
Actual entities:
['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted entites:
['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Sentence n.2:
['Mit', 'Jänner', '2007', 'wurde', 'Robert', 'Schörgenhofer', ',', 'als', 'Nachfolger', 'des', 'ausgeschiedenen', 'Dietmar', 'Drabek', ',', 'in', 'die', 'Kaderliste', 'der', 'FIFA-S