<img align=center src="https://rhyme.com/assets/img/logo-dark.png"></img>
<h2 align=center> Named Entity Recognition (NER) using LSTMs with Keras</h2>

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ner_dataset.csv


In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
np.random.seed(0)
plt.style.use("ggplot")

import tensorflow as tf
print('Tensorflow version:', tf.__version__)

Tensorflow version: 2.1.0


### Loading and Exploring the NER Dataset

*Essential info about the tagged entities*:
- geo = Geographical Entity
- org = Organization
- per = Person
- gpe = Geopolitical Entity
- tim = Time indicator
- art = Artifact
- eve = Event
- nat = Natural Phenomenon

In [4]:
df=pd.read_csv("/kaggle/input/ner_dataset.csv",encoding='latin1')
df.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [6]:
df.isnull().sum()

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64

Looking at the dataset here we will use ffill feature to fill the null values.

In [7]:
df=df.fillna(method='ffill')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [8]:
df.isnull().sum()

Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [9]:
df['Tag'].value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [10]:
n_tags=df['Tag'].nunique()
n_tags

17

In [11]:
n_words=df['Word'].nunique()
n_words

35178

In [30]:
words=list(set(df['Word']))

In [32]:
# words

In [28]:
tags=list(set(df['Tag']))

In [29]:
tags

['I-tim',
 'I-nat',
 'I-geo',
 'I-art',
 'B-geo',
 'B-per',
 'B-gpe',
 'I-per',
 'B-org',
 'B-nat',
 'B-tim',
 'I-gpe',
 'O',
 'I-org',
 'B-eve',
 'I-eve',
 'B-art']

In [23]:
class GetSentence(object):
    def __init__(self, data):
        self.n_sentence=1
        self.data=data
        self.empty = False
        function=lambda d:[(w, p, t) for w, p, t in zip(d["Word"].values.tolist(),
                                                        d["POS"].values.tolist(),
                                                        d["Tag"].values.tolist())]
        
        self.group_sent = self.data.groupby("Sentence #").apply(function)
        self.all_sentences = [d for d in self.group_sent] 

In [24]:
get=GetSentence(df)
sentences=get.all_sentences

In [25]:
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [26]:
sentences[6]

[('He', 'PRP', 'O'),
 ('said', 'VBD', 'O'),
 ('last', 'JJ', 'O'),
 ('week', 'NN', 'O'),
 ("'s", 'POS', 'O'),
 ('tsunami', 'NN', 'O'),
 ('and', 'CC', 'O'),
 ('the', 'DT', 'O'),
 ('massive', 'JJ', 'O'),
 ('underwater', 'NN', 'O'),
 ('earthquake', 'NN', 'O'),
 ('that', 'WDT', 'O'),
 ('triggered', 'VBD', 'O'),
 ('it', 'PRP', 'O'),
 ('has', 'VBZ', 'O'),
 ('affected', 'VBN', 'O'),
 ('millions', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('Asia', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('Africa', 'NNP', 'B-geo'),
 ('.', '.', 'O')]

In [34]:
len(sentences)

47959

# For the input of LSTM model all the sentences must be padded to same length,for that we must know the maximum length of the sequence in th elist of sentences.

In [36]:
maxl = max([len(s) for s in sentences])
print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 104


# We had splitted each sentences as a list of tuples of the word,POS and tags.

In [42]:
w_index={w:i+1 for i,w in enumerate(words)}
t_index={t:j+1 for j,t in enumerate(tags)}

In [44]:
# w_index

In [45]:
# t_index

### Task 6: Build and Compile a Bidirectional LSTM Model

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

### Task 7: Train the Model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from livelossplot.tf_keras import PlotLossesCallback

### Task 8: Evaluate Named Entity Recognition Model