Inspired from https://www.depends-on-the-definition.com/named-entity-recognition-with-residual-lstm-and-elmo/

In [None]:
from google.colab import drive
drive.mount('/content/drive',  force_remount=True)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

# Reading the dataset and dealing with missing values 

In [0]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")
data.tail(10)

Unnamed: 0.1,Unnamed: 0,Sentence #,Word,Tag,POS
2486,2486,Sentence: 77,the,O,POS
2487,2487,Sentence: 77,Fertiliser,O,POS
2488,2488,Sentence: 77,Industry,O,POS
2489,2489,Sentence: 77,antonella..harrison@gmail.com,B-eml,POS
2490,2490,Sentence: 77,+44,B-phn,POS
2491,2491,Sentence: 77,7799,I-phn,POS
2492,2492,Sentence: 77,895082,I-phn,POS
2493,2493,Sentence: 77,https://uk.linkedin.com/in/antonellaharrison,B-web,POS
2494,2494,Sentence: 77,https://twitter.com/antonellaharr,B-web,POS
2495,2495,Sentence: 77,.,O,POS


# Counting individual words

In [0]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

1491

# Counting individual tags

In [0]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

15

# Using the SentenceGetter class

In [0]:
getter = SentenceGetter(data)

# Retrieve a sentence and go to the next one

In [0]:
sent = getter.get_next()
sent

[('Yassine', 'POS', 'B-person'),
 ('Hamdouch', 'POS', 'I-person'),
 ('Colliers', 'POS', 'O'),
 ('General', 'POS', 'B-job'),
 ('Manager', 'POS', 'I-job'),
 ('Head', 'POS', 'B-job'),
 ('of', 'POS', 'I-job'),
 ('Hotels', 'POS', 'I-job'),
 ('&', 'POS', 'I-job'),
 ('Hospitality', 'POS', 'I-job'),
 ('INTERNATIONAL', 'POS', 'O'),
 ('63', 'POS', 'O'),
 ('Bd', 'POS', 'O'),
 ('Moulay', 'POS', 'O'),
 ('Youssef', 'POS', 'O'),
 ('20100', 'POS', 'O'),
 ('Casablanca', 'POS', 'B-gpe'),
 ('Morocco', 'POS', 'B-gpe'),
 ('+212', 'POS', 'B-phn'),
 ('(0)', 'POS', 'I-phn'),
 ('5', 'POS', 'I-phn'),
 ('20', 'POS', 'I-phn'),
 ('30', 'POS', 'I-phn'),
 ('36', 'POS', 'I-phn'),
 ('31', 'POS', 'I-phn'),
 ('MAIN', 'POS', 'O'),
 ('MOBILE', 'POS', 'O'),
 ('+212', 'POS', 'B-phn'),
 ('(0)', 'POS', 'I-phn'),
 ('6', 'POS', 'I-phn'),
 ('65', 'POS', 'I-phn'),
 ('64', 'POS', 'I-phn'),
 ('65', 'POS', 'I-phn'),
 ('78', 'POS', 'I-phn'),
 ('+212', 'POS', 'B-phn'),
 ('(0)', 'POS', 'I-phn'),
 ('5', 'POS', 'I-phn'),
 ('20', 'POS', '

# Retrieve all sentences

In [0]:
sentences = getter.sentences

In [0]:
sentences

[[('Yassine', 'POS', 'B-person'),
  ('Hamdouch', 'POS', 'I-person'),
  ('Colliers', 'POS', 'O'),
  ('General', 'POS', 'B-job'),
  ('Manager', 'POS', 'I-job'),
  ('Head', 'POS', 'B-job'),
  ('of', 'POS', 'I-job'),
  ('Hotels', 'POS', 'I-job'),
  ('&', 'POS', 'I-job'),
  ('Hospitality', 'POS', 'I-job'),
  ('INTERNATIONAL', 'POS', 'O'),
  ('63', 'POS', 'O'),
  ('Bd', 'POS', 'O'),
  ('Moulay', 'POS', 'O'),
  ('Youssef', 'POS', 'O'),
  ('20100', 'POS', 'O'),
  ('Casablanca', 'POS', 'B-gpe'),
  ('Morocco', 'POS', 'B-gpe'),
  ('+212', 'POS', 'B-phn'),
  ('(0)', 'POS', 'I-phn'),
  ('5', 'POS', 'I-phn'),
  ('20', 'POS', 'I-phn'),
  ('30', 'POS', 'I-phn'),
  ('36', 'POS', 'I-phn'),
  ('31', 'POS', 'I-phn'),
  ('MAIN', 'POS', 'O'),
  ('MOBILE', 'POS', 'O'),
  ('+212', 'POS', 'B-phn'),
  ('(0)', 'POS', 'I-phn'),
  ('6', 'POS', 'I-phn'),
  ('65', 'POS', 'I-phn'),
  ('64', 'POS', 'I-phn'),
  ('65', 'POS', 'I-phn'),
  ('78', 'POS', 'I-phn'),
  ('+212', 'POS', 'B-phn'),
  ('(0)', 'POS', 'I-phn'),
  ('

# Creating a dictionary of tags

In [0]:
max_len = 73
tag2idx = {t:i for i, t in enumerate(tags)}
tag2idx

{'B-eml': 7,
 'B-gpe': 10,
 'B-job': 14,
 'B-org': 3,
 'B-person': 1,
 'B-phn': 4,
 'B-web': 11,
 'I-eml': 5,
 'I-gpe': 8,
 'I-job': 6,
 'I-org': 12,
 'I-person': 0,
 'I-phn': 13,
 'I-web': 9,
 'O': 2}

# Creating a list of all sentences word by word

In [0]:
X = [[w[0] for w in s] for s in sentences]

In [0]:
X

[['Yassine',
  'Hamdouch',
  'Colliers',
  'General',
  'Manager',
  'Head',
  'of',
  'Hotels',
  '&',
  'Hospitality',
  'INTERNATIONAL',
  '63',
  'Bd',
  'Moulay',
  'Youssef',
  '20100',
  'Casablanca',
  'Morocco',
  '+212',
  '(0)',
  '5',
  '20',
  '30',
  '36',
  '31',
  'MAIN',
  'MOBILE',
  '+212',
  '(0)',
  '6',
  '65',
  '64',
  '65',
  '78',
  '+212',
  '(0)',
  '5',
  '20',
  '30',
  '29',
  '93',
  'FAX',
  'Yassine.Hamdouch@Colliers.com',
  'EMAIL',
  'www.colliers.com',
  '.'],
 ['STANFORD',
  'Electrical',
  'Engineering',
  '17',
  'Comstock',
  'Circle',
  'ELECTRICAL',
  'ENGINEERING',
  'Apt',
  '101',
  'Stanford',
  ',',
  'CA',
  '94305',
  'Phone:',
  '916.221.0411',
  'VIJAY',
  'CHANDRASEKHAR',
  'E-mail:',
  'vijayc@stanford.edu',
  '.'],
 ['Hilton',
  'Feras',
  'FERAS',
  'HASBINI',
  'DEVELOPMENT',
  'DIRECTOR',
  ',',
  'MIDDLE',
  'EAST',
  '&',
  'NORTH',
  'AFRICA',
  'feras.hasbini@hiton.com',
  'M',
  '+212',
  '6',
  '0766',
  '8765',
  'M',
  '

# Padding sentences to a length of 73

In [0]:
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("__PAD__")
    new_X.append(new_seq)
X = new_X

In [0]:
len(X)

77

# Doing the same for tags but saving it in an numpy array

In [0]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y

[[1,
  0,
  2,
  14,
  6,
  14,
  6,
  6,
  6,
  6,
  2,
  2,
  2,
  2,
  2,
  2,
  10,
  10,
  4,
  13,
  13,
  13,
  13,
  13,
  13,
  2,
  2,
  4,
  13,
  13,
  13,
  13,
  13,
  13,
  4,
  13,
  13,
  13,
  13,
  13,
  13,
  2,
  7,
  2,
  11,
  2],
 [3, 12, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 1, 0, 2, 7, 2],
 [3,
  2,
  1,
  0,
  14,
  6,
  2,
  2,
  2,
  2,
  2,
  2,
  7,
  2,
  4,
  13,
  13,
  13,
  2,
  4,
  13,
  13,
  2,
  4,
  13,
  13,
  13,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  10,
  2,
  10,
  11,
  2],
 [3, 12, 3, 12, 1, 0, 14, 6, 2, 4, 13, 13, 2, 4, 13, 13, 2, 4, 13, 13, 7, 2],
 [2,
  2,
  1,
  0,
  14,
  6,
  6,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  4,
  13,
  2,
  2,
  4,
  13,
  13,
  7,
  10,
  2,
  8,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 [3,
  12,
  12,
  12,
  2,
  1,
  0,
  14,
  6,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  10,
  2,
  10,
  2,
  2,
  2,
  2,
  4,
  13,
  13,
  13,
  13,
  13,


In [0]:
from keras.preprocessing.sequence import pad_sequences
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

Using TensorFlow backend.


# Setting batch size

In [0]:
batch_size = 2

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

# Creating a TensorFlow session

In [0]:
sess = tf.Session()
K.set_session(sess)

# Downloading the pretrained ELMo model

In [0]:
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

# Creating a function that takes a sequence of strings a returns a sequence of 1024-dimensional vectors

In [0]:
def ElmoEmbedding(x):
    return elmo_model(inputs={
                            "tokens": tf.squeeze(tf.cast(x, tf.string)),
                            "sequence_len": tf.constant(batch_size*[max_len])
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [0]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

# Training the model and visualising results

In [0]:
L_class_report = [] # List containing all the metrics for evaluation

In [0]:
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
X = np.array(X)
kf = KFold(n_splits=7, shuffle=True)
for train_index, test_index in kf.split(X):
  print("TRAIN:", train_index, "TEST:", test_index)
  X_tr, X_te = X[train_index], X[test_index]
  y_tr, y_te = y[train_index], y[test_index]
  X_tr = X_tr.tolist()
  X_te = X_te.tolist()
  
  # Creating a residual LSTM network with an ELMo embedding layer
  
  input_text = Input(shape=(max_len,), dtype=tf.string)
  embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
  x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
  x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
  x = add([x, x_rnn])  # residual connection to the first biLSTM
  out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
  
  model = Model(input_text, out)
  
  model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['categorical_accuracy'])
  
  # Training the model
  
  y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
  history = model.fit(np.array(X_tr), y_tr, validation_split=0, batch_size=batch_size, epochs=20, verbose=1)
  
  # Evaluating the model
  
  predicted_values = []
  true_values = []
  for i in range(10):
    p = model.predict(np.array(X_te[i:i+batch_size]))[0]
    p = np.argmax(p, axis=-1)
    for w, true, pred in zip(X_te[i], y_te[i], p):
      if w != "__PAD__":
        predicted_values += [tags[pred]]
        true_values += [tags[true]]
  
  L_class_report.append(classification_report(true_values, predicted_values, output_dict=True))
  
  # Printing model predictions
  
  for i in range(10):
    p = model.predict(np.array(X_te[i:i+batch_size]))[0]
    p = np.argmax(p, axis=-1)
    print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
    print("="*30)
    for w, true, pred in zip(X_te[i], y_te[i], p):
      if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))

W0808 14:36:38.925683 139654838101888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0808 14:36:38.926905 139654838101888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



TRAIN: [ 0  2  3  4  5  6  8  9 10 11 12 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 31 32 33 34 35 37 38 39 40 42 43 45 46 47 48 49 52 53 54 55
 56 57 58 59 60 61 62 63 64 65 68 69 70 71 72 74 75 76] TEST: [ 1  7 13 36 41 44 50 51 66 67 73]


W0808 14:36:39.357119 139654838101888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0808 14:36:40.498467 139654838101888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0808 14:36:40.508963 139654838101888 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0808 14:36:42.645157 139654838101888 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimize

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Word            Pred : (True)
STANFORD       :O     (B-org)
Electrical     :O     (I-org)
Engineering    :O     (I-org)
17             :O     (O)
Comstock       :O     (O)
Circle         :O     (O)
ELECTRICAL     :O     (O)
ENGINEERING    :O     (O)
Apt            :O     (O)
101            :O     (O)
Stanford       :O     (O)
,              :O     (O)
CA             :B-gpe (O)
94305          :O     (O)
Phone:         :O     (O)
916.221.0411   :B-phn (B-phn)
VIJAY          :O     (B-person)
CHANDRASEKHAR  :O     (I-person)
E-mail:        :O     (O)
vijayc@stanford.edu:B-eml (B-eml)
.              :O     (O)
Word            Pred : (True)
SAP            :B-org (B-org)
Nabil          :B-person (B-person)
BADDOU         :I-person (I-person)
Senior         :B-job (

  'precision', 'predicted', average, warn_for)


Word            Pred : (True)
packetvideo    :B-org (B-org)
Richard        :B-person (B-person)
Svienty        :I-person (I-person)
member         :B-job (B-job)
of             :I-job (I-job)
technical      :I-job (I-job)
staff          :I-job (I-job)
5407           :O     (O)
trillium       :O     (O)
boulevard      :O     (O)
,              :O     (O)
suite          :O     (O)
130            :O     (O)
t              :O     (O)
847            :B-phn (B-phn)
273-5617       :I-phn (I-phn)
hoffman        :O     (O)
estates        :O     (O)
,              :O     (O)
illinois       :B-gpe (B-gpe)
60192          :I-phn (O)
f              :O     (O)
847            :B-phn (B-phn)
273-5610       :I-phn (I-phn)
svienty@pv.com :B-eml (B-eml)
www.pv.com     :B-web (B-web)
.              :O     (O)
Word            Pred : (True)
ANTEC          :B-org (B-org)
SYSTEMS        :O     (I-org)
AVTEC          :O     (O)
SYSTEMS        :O     (O)
,              :O     (O)
INC.           :O     (O)
14432 

  'recall', 'true', average, warn_for)


Word            Pred : (True)
Hilton         :B-org (B-org)
Feras          :O     (O)
FERAS          :O     (B-person)
HASBINI        :O     (I-person)
DEVELOPMENT    :O     (B-job)
DIRECTOR       :O     (I-job)
,              :O     (O)
MIDDLE         :O     (O)
EAST           :O     (O)
&              :O     (O)
NORTH          :O     (O)
AFRICA         :O     (O)
feras.hasbini@hiton.com:B-web (B-eml)
M              :O     (O)
+212           :B-phn (B-phn)
6              :I-phn (I-phn)
0766           :I-phn (I-phn)
8765           :I-phn (I-phn)
M              :O     (O)
+337           :B-phn (B-phn)
6226           :I-phn (I-phn)
4459           :I-phn (I-phn)
M              :O     (O)
+971           :B-phn (B-phn)
56             :I-phn (I-phn)
413            :I-phn (I-phn)
8810           :I-phn (I-phn)
Casablanca     :O     (O)
Twin           :O     (O)
Towers         :O     (O)
Center         :O     (O)
Tour           :O     (O)
Ouest          :O     (O)
16eme          :O     (O)
etag

  'precision', 'predicted', average, warn_for)


Word            Pred : (True)
Yassine        :B-person (B-person)
Hamdouch       :I-person (I-person)
Colliers       :I-person (O)
General        :B-job (B-job)
Manager        :I-job (I-job)
Head           :I-job (B-job)
of             :I-job (I-job)
Hotels         :O     (I-job)
&              :O     (I-job)
Hospitality    :I-job (I-job)
INTERNATIONAL  :O     (O)
63             :O     (O)
Bd             :O     (O)
Moulay         :O     (O)
Youssef        :O     (O)
20100          :O     (O)
Casablanca     :B-gpe (B-gpe)
Morocco        :B-gpe (B-gpe)
+212           :B-phn (B-phn)
(0)            :I-phn (I-phn)
5              :I-phn (I-phn)
20             :I-phn (I-phn)
30             :I-phn (I-phn)
36             :I-phn (I-phn)
31             :I-phn (I-phn)
MAIN           :O     (O)
MOBILE         :O     (O)
+212           :B-phn (B-phn)
(0)            :I-phn (I-phn)
6              :I-phn (I-phn)
65             :I-phn (I-phn)
64             :I-phn (I-phn)
65             :I-phn (I-phn)
7

  'precision', 'predicted', average, warn_for)


Word            Pred : (True)
lpsos          :B-org (B-org)
Morocco        :B-org (I-org)
&              :I-org (I-org)
Algeria        :I-org (I-org)
Ipsos          :I-org (O)
Nabil          :B-person (B-person)
Abouzaid       :I-person (I-person)
Managing       :B-job (B-job)
Director       :I-job (I-job)
16             :O     (O)
,              :O     (O)
Rue            :O     (O)
des            :O     (O)
Asphodeles     :O     (O)
,              :O     (O)
Maarif         :O     (O)
Casablanca     :B-gpe (B-gpe)
-              :O     (O)
Morocco        :B-gpe (B-gpe)
,              :O     (O)
20             :O     (O)
380            :I-phn (O)
Tel            :O     (O)
212            :B-phn (B-phn)
522            :I-phn (I-phn)
98             :I-phn (I-phn)
57             :I-phn (I-phn)
02             :I-phn (I-phn)
/              :I-phn (I-phn)
12             :I-phn (I-phn)
Fax            :O     (O)
:              :O     (O)
+212           :B-phn (B-phn)
522            :I-phn (I-phn