<a href="https://colab.research.google.com/github/aham-uzoma/named_entity_recognition/blob/main/NER__Named_Entity_Recorgnition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Problem**

Real World Named Entity Recognition (NER) with some entity tags in 1354149 total word count

In [None]:
import os
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Embedding, Dropout, Dense, TimeDistributed
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
file = os.path.join('drive','MyDrive','ML_Datasets', 'NER','NER_dataset.csv' )
df = pd.read_csv(file, encoding='ISO-8859-1')

In [None]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048565 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [None]:
len(df)

1048575

In [None]:
df['Tag'].unique()

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [None]:
data = df.copy()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
len(data['Word'].tolist())

1048575

In [None]:
len(list(set(data['Word'].tolist())))

35178

In [None]:
testVocab = list(set(data['Word'].tolist()))
token_id = {tokn:id for id, tokn in enumerate(testVocab)}
token_id

{'Though': 0,
 'recovered': 1,
 'modified': 2,
 'Existence': 3,
 'Ciampino': 4,
 'colliding': 5,
 'swiftly': 6,
 'intercepts': 7,
 'Fuad': 8,
 'magnate': 9,
 'surrendered': 10,
 'Apr-29': 11,
 'MIddle': 12,
 'Yarkas': 13,
 'Fratto': 14,
 'public-private': 15,
 're-energize': 16,
 'Serious': 17,
 'Khazaee': 18,
 'Vines': 19,
 'al-Fahd': 20,
 'careless': 21,
 '2,40,000': 22,
 'fireworks': 23,
 'implicate': 24,
 'Branco': 25,
 'Buro': 26,
 'squeezed': 27,
 '20,000': 28,
 'Nadia': 29,
 'Aviv': 30,
 '6.5': 31,
 'Dynamics': 32,
 'Qom': 33,
 '1494': 34,
 'job': 35,
 'feat': 36,
 'Ipsos': 37,
 'mayor': 38,
 'superstar': 39,
 'Annexation': 40,
 'clay': 41,
 'spring': 42,
 'grossly': 43,
 'Newspapers': 44,
 'Geelani': 45,
 'Mogotio': 46,
 'unproven': 47,
 'Munoz': 48,
 'online': 49,
 'Sudanese': 50,
 'Martyrs': 51,
 'carve': 52,
 'biggest': 53,
 'quarterfinals': 54,
 'militia': 55,
 'landmark': 56,
 'Abdulatif': 57,
 'wrought': 58,
 'Juarez': 59,
 'Jamia': 60,
 'Facebook': 61,
 'basis': 62,
 'pu

Create function to set up a dictionary

In [None]:
def token_tag_dict(data, token_or_tag):
  tok2idx = {}
  idx2tok = {}

  if token_or_tag == 'token':
    vocab = list(set(data['Word'].tolist()))
  else:
    vocab = list(set(data['Tag'].tolist()))

  tok2idx = {tok:idx for idx, tok in enumerate(vocab)}
  idx2tok = {idx:tok for idx, tok in enumerate(vocab)}

  return tok2idx, idx2tok



In [None]:
token2idx, idx2token = token_tag_dict(data, 'token')
tag2idx,  idx2tag = token_tag_dict(data, 'tag')

In [None]:
tag2idx

{'I-nat': 0,
 'B-eve': 1,
 'B-org': 2,
 'O': 3,
 'B-nat': 4,
 'B-tim': 5,
 'I-per': 6,
 'B-per': 7,
 'I-tim': 8,
 'B-gpe': 9,
 'I-art': 10,
 'I-geo': 11,
 'B-art': 12,
 'I-eve': 13,
 'B-geo': 14,
 'I-gpe': 15,
 'I-org': 16}



1.   Map token to the word column and create a new numerical column
2.   Map tag to the tag column and create a new numerical column





In [None]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

In [None]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,17766,3
1,,of,IN,O,17535,3
2,,demonstrators,NNS,O,25196,3
3,,have,VBP,O,18382,3
4,,marched,VBN,O,26333,3


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048565 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
 4   Word_idx    1048575 non-null  int64 
 5   Tag_idx     1048575 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 48.0+ MB


In [None]:
data_fillna = data.ffill()
data_fillna

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,17766,3
1,Sentence: 1,of,IN,O,17535,3
2,Sentence: 1,demonstrators,NNS,O,25196,3
3,Sentence: 1,have,VBP,O,18382,3
4,Sentence: 1,marched,VBN,O,26333,3
...,...,...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O,8990,3
1048571,Sentence: 47959,responded,VBD,O,6320,3
1048572,Sentence: 47959,to,TO,O,12998,3
1048573,Sentence: 47959,the,DT,O,23551,3


In [None]:
data_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  1048575 non-null  object
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
 4   Word_idx    1048575 non-null  int64 
 5   Tag_idx     1048575 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 48.0+ MB


In [None]:
data_group = data_fillna.groupby(['Sentence #'],as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']].agg(lambda x:list(x))
data_group

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[17766, 17535, 25196, 18382, 26333, 22409, 171...","[3, 3, 3, 3, 3, 3, 14, 3, 3, 3, 3, 3, 14, 3, 3..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[31220, 11629, 27973, 8990, 18092, 12998, 3082...","[9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[14940, 831, 10730, 32508, 30531, 19843, 7099,...","[3, 3, 5, 3, 3, 3, 3, 3, 14, 3, 3, 3, 3, 3, 2,..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[26259, 6328, 28018, 3354, 13243, 32052, 15756...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[9346, 22471, 541, 8772, 32526, 20361, 9418, 2...","[14, 3, 3, 7, 6, 3, 5, 3, 14, 3, 9, 3, 9, 3, 3..."
...,...,...,...,...,...,...
47954,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[NNP, NN, NNP, NNP, NNP, VBZ, VBN, PRP, VBZ, T...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O...","[12752, 17351, 903, 15457, 16721, 31976, 20361...","[3, 3, 3, 7, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
47955,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[IN, NNP, ,, JJ, NN, NNS, VBN, DT, NN, IN, DT,...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B...","[10791, 13613, 21699, 31220, 1816, 917, 21591,...","[3, 5, 3, 9, 3, 3, 3, 3, 3, 3, 3, 3, 2, 16, 3,..."
47956,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[VBG, NNP, POS, JJ, NNP, CD, NNS, ,, NNS, NNS,...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ...","[13308, 11222, 26649, 13180, 24126, 30880, 322...","[3, 14, 3, 3, 5, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
47957,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[IN, RB, ,, NNS, VBP, VBN, JJ, NNS, IN, DT, VB...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[9154, 31247, 21699, 8678, 18382, 26734, 2662,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."


Create a function for padding encoding and splitting to train test val sets

In [None]:
n_toks = len(list(set(data['Word'].to_list())))
toks = data_group['Word_idx'].tolist()
maxlen = max([len(s) for s in toks])
#maxlen
paddy = pad_sequences(toks, maxlen, dtype='int32', padding='post', value= n_toks -1)
paddy

array([[17766, 17535, 25196, ..., 35177, 35177, 35177],
       [31220, 11629, 27973, ..., 35177, 35177, 35177],
       [14940,   831, 10730, ..., 35177, 35177, 35177],
       ...,
       [13308, 11222, 26649, ..., 35177, 35177, 35177],
       [ 9154, 31247, 21699, ..., 35177, 35177, 35177],
       [29521, 10155,  5315, ..., 35177, 35177, 35177]], dtype=int32)

In [None]:
maxlen

104

In [None]:
taggs = data_group['Tag_idx'].tolist()
maxlen = max([len(s) for s in taggs])
maxlen
taggy = pad_sequences(taggs, maxlen, dtype='int32', padding='post',
                      value= tag2idx["O"])
taggy

array([[ 3,  3,  3, ...,  3,  3,  3],
       [ 9,  3,  3, ...,  3,  3,  3],
       [ 3,  3,  5, ...,  3,  3,  3],
       ...,
       [ 3, 14,  3, ...,  3,  3,  3],
       [ 3,  3,  3, ...,  3,  3,  3],
       [ 3,  2, 16, ...,  3,  3,  3]], dtype=int32)

In [None]:
n_tagg = len(tag2idx)
taggy_encoded = [to_categorical(x, num_classes=n_tagg) for x in taggy]
taggy_encoded

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 arr

In [None]:
#The function
def pad_encode_split(data, data_group):
  n_token = len(list(set(data['Word'].to_list())))
  n_tag = len(list(set(data['Tag'].to_list())))

  #Padding Tokens (X)
  tokens = data_group['Word_idx'].tolist()
  maxlen = max([len(s) for s in tokens])
  pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32',
                             padding='post', value= n_token-1)
  #Paddin Tags (y)
  tags = data_group['Tag_idx'].tolist()
  pad_tags = pad_sequences(tags, maxlen, dtype='int32',
                           padding='post', value=tag2idx['O'])
  #Convert Tags to numbers with one hot encoder
  n_tags = len(tag2idx)
  pad_tags_encoded = [to_categorical(x, num_classes = n_tags) for x in pad_tags]

  #train test val splits
  tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens,
                                                           pad_tags_encoded,
                                                           train_size=0.8,
                                                           random_state=42)
  train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,
                                                                    tags_,
                                                                    train_size=0.8,
                                                                    random_state=42)
  # #print
  print(
        'train_tokens length:'  , len(train_tokens),
        '\ntrain_tags:         ', len(train_tags),
        '\ntest_tokens length: ', len(test_tokens),
        '\ntest_tags:          ', len(test_tags),
        '\nval_tokens:         ', len(val_tokens),
        '\nval_tags:           ', len(val_tags),
    )

  #return
  return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

In [None]:
train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags=pad_encode_split(data, data_group)

train_tokens length: 30693 
train_tags:          30693 
test_tokens length:  9592 
test_tags:           9592 
val_tokens:          7674 
val_tags:            7674


# Training the Deep-Learning Model

In [None]:
len(list(set(data['Word'].tolist()))) +1

35179

In [None]:
len(tag2idx)

17

In [None]:
tag2idx

{'I-nat': 0,
 'B-eve': 1,
 'B-org': 2,
 'O': 3,
 'B-nat': 4,
 'B-tim': 5,
 'I-per': 6,
 'B-per': 7,
 'I-tim': 8,
 'B-gpe': 9,
 'I-art': 10,
 'I-geo': 11,
 'B-art': 12,
 'I-eve': 13,
 'B-geo': 14,
 'I-gpe': 15,
 'I-org': 16}

In [None]:
input_dim= len(list(set(data['Word'].tolist()))) +1
output_dim= 64
input_length= max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags=len(tag2idx)

model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=output_dim,
                    input_length=input_length, mask_zero=True))
model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(units=output_dim, return_sequences=True))
model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(n_tags, activation='softmax')))

#Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.build(input_shape=(None, input_length))

model.summary();



In [None]:
# model.fit(X_train, y_train)