# Reading the data


In [None]:
# NOTE(review): these three handles are not read or closed anywhere in the
# visible notebook; read_data() below opens the same files again by path.
train_data = open(file='/content/train.txt',mode='r',encoding='utf-8')
test_data = open(file='/content/test.txt',mode='r',encoding='utf-8')
val_data = open(file='/content/validation.txt',mode='r',encoding='utf-8')

In [None]:
def read_data(file_path):
  """Read a two-column, blank-line-separated tagging file into parallel lists.

  Every non-empty line must hold exactly two whitespace-separated fields,
  ``token tag``; an empty line ends the current tweet. Tokens starting with
  ``@`` are replaced by ``<USR>`` and tokens starting with ``http://`` or
  ``https://`` are replaced by ``<URL>``.

  Args:
    file_path: path of the UTF-8 encoded file to read.

  Returns:
    ``(tokens, tags)``: two lists of lists where ``tokens[i][j]`` is the j-th
    (normalised) token of the i-th tweet and ``tags[i][j]`` is its tag.

  Raises:
    ValueError: if a non-empty line does not split into exactly two fields.
  """
  tokens = []
  tags = []

  tweet_token = []
  tag_token = []

  # The context manager closes the file even when a malformed line raises.
  with open(file_path, encoding='utf-8') as data_file:
    for line in data_file:
      line = line.strip()   # drop the newline and surrounding whitespace
      if not line:
        # An empty line terminates the current tweet.
        tokens.append(tweet_token)
        tags.append(tag_token)
        tweet_token = []
        tag_token = []
      else:
        token, tag = line.split()
        if token.startswith('@'):
          token = '<USR>'
        elif token.startswith(('http://', 'https://')):
          token = '<URL>'
        tweet_token.append(token)
        tag_token.append(tag)

  # Keep the final tweet when the file does not end with an empty line;
  # without this flush the last tweet would be silently dropped.
  if tweet_token:
    tokens.append(tweet_token)
    tags.append(tag_token)

  return tokens, tags
     

In [None]:
# Parse each split into parallel per-tweet lists of tokens and tags.
train_tokens, train_tags = read_data('/content/train.txt')
validation_tokens, validation_tags = read_data('/content/validation.txt')
test_tokens, test_tags = read_data('/content/test.txt')

In [None]:
# Show the first two training tweets, one "token<TAB>tag" pair per line.
for i in range(2):
  for token,tag in zip(train_tokens[i],train_tags[i]):
    print('%s\t%s '%(token,tag))

RT	O 
<USR>	O 
:	O 
Online	O 
ticket	O 
sales	O 
for	O 
Ghostland	B-musicartist 
Observatory	I-musicartist 
extended	O 
until	O 
6	O 
PM	O 
EST	O 
due	O 
to	O 
high	O 
demand	O 
.	O 
Get	O 
them	O 
before	O 
they	O 
sell	O 
out	O 
...	O 
Apple	B-product 
MacBook	I-product 
Pro	I-product 
A1278	I-product 
13.3	I-product 
"	I-product 
Laptop	I-product 
-	I-product 
MD101LL/A	I-product 
(	O 
June	O 
,	O 
2012	O 
)	O 
-	O 
Full	O 
read	O 
by	O 
eBay	B-company 
<URL>	O 
<URL>	O 


# building the dictionaries


In [None]:
from collections import defaultdict
import numpy as np

In [None]:
from tqdm import tqdm
# setting special token for padding and unknown words
def build_dict(tokens_or_tags, special_tokens):
    """Build the item -> index and index -> item mappings.

    Args:
        tokens_or_tags: iterable of sequences (one per sample) of hashable
            items, e.g. tokens or tags.
        special_tokens: items that receive the lowest indices, in the order
            given, before anything from ``tokens_or_tags`` is added.

    Returns:
        ``(tok2idx, idx2tok)``: ``tok2idx`` is a ``defaultdict`` from item to
        index that yields 0 for unseen items, and ``idx2tok`` is the list of
        items ordered by index.
    """
    # Unseen keys resolve to index 0.
    tok2idx = defaultdict(int)
    idx2tok = []

    # Special entries come first; the next free index is always len(idx2tok).
    for special in special_tokens:
        tok2idx[special] = len(idx2tok)
        idx2tok.append(special)

    # Remaining items are numbered in order of first appearance.
    for sequence in tqdm(tokens_or_tags):
        for item in sequence:
            if item in tok2idx:
                continue
            tok2idx[item] = len(idx2tok)
            idx2tok.append(item)

    return tok2idx, idx2tok

In [None]:
# build_dict indexes special entries first, so '<UNK>' gets index 0 (the
# default returned for unseen tokens) and '<PAD>' gets index 1.
special_tokens = ['<UNK>', '<PAD>']
# 'O' gets tag index 0, which is also the default returned for unseen tags.
special_tags = ['O']


In [None]:
# The token vocabulary covers the training and validation tweets; the tag set
# is collected from the training tags only.
token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

100%|██████████| 6519/6519 [00:00<00:00, 173770.84it/s]
100%|██████████| 5795/5795 [00:00<00:00, 385075.91it/s]


In [None]:
# Sanity check: '<UNK>' and 'O' are both expected at index 0.
print(token2idx['<UNK>'],tag2idx['O'])

0 0


In [None]:
def words2idxs(tokens_list):
    """Map a sequence of tokens to their vocabulary indices.

    Tokens missing from ``token2idx`` get the index stored for ``'<UNK>'``
    (0 when that entry is absent as well).

    Args:
        tokens_list: iterable of tokens.

    Returns:
        List of integer indices, one per token.
    """
    # Use .get() instead of indexing: token2idx is a defaultdict, so indexing
    # it with an unseen word would insert that word and silently grow
    # len(token2idx), which is used as the vocabulary size of the model.
    unk_idx = token2idx.get('<UNK>', 0)
    return [token2idx.get(word, unk_idx) for word in tokens_list]

def tags2idxs(tags_list):
    """Map a sequence of tags to their indices.

    Tags missing from ``tag2idx`` get the index stored for ``'O'`` (0 when
    that entry is absent as well).

    Args:
        tags_list: iterable of tag strings.

    Returns:
        List of integer indices, one per tag.
    """
    # Use .get() instead of indexing: tag2idx is a defaultdict, so indexing it
    # with an unseen tag would insert that tag and change len(tag2idx), which
    # is used as the number of output classes.
    outside_idx = tag2idx.get('O', 0)
    return [tag2idx.get(tag, outside_idx) for tag in tags_list]

def idxs2words(idxs):
    """Translate vocabulary indices back into tokens via ``idx2token``."""
    words = []
    for position in idxs:
        words.append(idx2token[position])
    return words

def idxs2tags(idxs):
    """Translate tag indices back into tag strings via ``idx2tag``."""
    decoded = []
    for tag_position in idxs:
        decoded.append(idx2tag[tag_position])
    return decoded

# Generating mini-batches

In [None]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Yield padded mini-batches of index-encoded tokens and tags.

    Args:
        batch_size: number of samples per batch.
        tokens: list of token sequences.
        tags: list of tag sequences, parallel to ``tokens``.
        shuffle: visit the samples in a random permutation when true,
            otherwise in their original order.
        allow_smaller_last_batch: when true, the samples left over after the
            last full batch are yielded as a final, smaller batch; when false
            they are skipped.

    Yields:
        ``(x, y, lengths)``: ``x`` and ``y`` are int32 arrays of shape
        ``(rows_in_batch, longest_sequence_in_batch)`` with token and tag
        indices, padded with ``token2idx['<PAD>']`` and ``tag2idx['O']``
        respectively; ``lengths`` is an int32 vector holding the unpadded
        length of every row.
    """
    n_samples = len(tokens)
    order = np.random.permutation(n_samples) if shuffle else np.arange(n_samples)

    n_batches, leftover = divmod(n_samples, batch_size)
    if allow_smaller_last_batch and leftover:
        n_batches += 1

    for batch_no in range(n_batches):
        first = batch_no * batch_size
        last = min(first + batch_size, n_samples)
        rows = last - first

        # Encode every sample of the batch and track the longest sequence.
        encoded_tokens, encoded_tags = [], []
        widest = 0
        for sample_id in order[first:last]:
            encoded_tokens.append(words2idxs(tokens[sample_id]))
            encoded_tags.append(tags2idxs(tags[sample_id]))
            widest = max(widest, len(tags[sample_id]))

        # Start from arrays filled with the padding indices, then copy each
        # encoded sequence into the left part of its row.
        x = np.full([rows, widest], token2idx['<PAD>'], dtype=np.int32)
        y = np.full([rows, widest], tag2idx['O'], dtype=np.int32)
        lengths = np.zeros(rows, dtype=np.int32)
        for row, (token_ids, tag_ids) in enumerate(zip(encoded_tokens, encoded_tags)):
            seq_len = len(token_ids)
            x[row, :seq_len] = token_ids
            y[row, :seq_len] = tag_ids
            lengths[row] = seq_len
        yield x, y, lengths

# Building a network / Model

In [None]:
import tensorflow as tf
import numpy as np

In [None]:
# importing libraries
import keras
import keras.utils
from keras import initializers
from keras.layers import Bidirectional,Input,LSTM
from keras.layers import Dense,Activation,Lambda,Embedding,Dropout,concatenate
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.metrics import PrecisionAtRecall
from keras.models import Model, Sequential
from keras.backend import placeholder
import keras.backend as K
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Symbolic placeholders for a batch of token indices, its ground-truth tags,
# the sequence lengths and the learning rate.
# NOTE(review): none of these names is used by the other cells visible in this
# notebook — confirm they are still needed.
input_batch = placeholder(dtype=tf.int32 , shape=[None , None], name = 'input_batch' )
ground_truth_tags = placeholder(dtype=tf.int32 , shape=[None ,None],name='ground-truth_tags')
lengths = placeholder(dtype=tf.int32, shape= [None], name ='lengths') #for the length of sequences
#dropout_ph = placeholder_with_default(tf.cast(1.0, tf.float32), shape=[]) # dropout placeholder
learning_rate_ph = placeholder(dtype=tf.float32,name='learning_rate_ph')

In [None]:
def ner(vocabulary_size,embedding_dim,rnn_units,n_tags,dropout_rate=0.8):
  """Build the (uncompiled) BiLSTM sequence tagger.

  Architecture: Embedding -> Dropout -> bidirectional LSTM (forward and
  backward outputs concatenated) -> per-timestep Dense -> softmax.

  Args:
    vocabulary_size: number of rows of the embedding table.
    embedding_dim: size of each token embedding.
    rnn_units: units of each LSTM direction; the concatenated BiLSTM output
      has 2 * rnn_units features per timestep.
    n_tags: number of output classes per timestep.
    dropout_rate: value passed as ``rate`` to the Dropout layer applied to
      the embeddings. Defaults to 0.8, the value that used to be hard-coded.

  Returns:
    A Keras ``Model`` mapping a (batch, time) tensor of token indices to a
    (batch, time, n_tags) tensor of per-tag probabilities.
  """
  # Variable-length sequences of token indices.
  sentence_indices = Input(shape=(None,))

  # The embedding table is initialised by RandomNormal. A hand-built random
  # matrix used to be computed here as well but was never handed to the
  # layer, so that dead vocabulary_size x embedding_dim allocation is gone.
  embedding_layer = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_dim,
                              embeddings_initializer=initializers.RandomNormal())
  embeddings = embedding_layer(sentence_indices)

  # NOTE(review): Keras' `rate` is the fraction of units that is dropped, so
  # the default of 0.8 zeroes 80% of the embedding values during training —
  # confirm this was not meant as a keep probability.
  dropped = Dropout(rate=dropout_rate)(embeddings)

  # Bi-LSTM whose output is the concatenation of the forward and backward
  # hidden states at every timestep.
  blstm = Bidirectional(LSTM(units=rnn_units,return_sequences=True),merge_mode='concat')(dropped)

  # Per-timestep projection onto the tag set, followed by softmax.
  dense = Dense(units=n_tags)(blstm)
  out = Activation('softmax')(dense)

  model = Model(inputs=sentence_indices,outputs=out)

  return model



In [None]:
# One embedding row per vocabulary entry, embedding_dim=200, rnn_units=200,
# one output class per tag.
model = ner(len(token2idx),200,200,len(tag2idx))

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 200)         4101000   
_________________________________________________________________
dropout (Dropout)            (None, None, 200)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 400)         641600    
_________________________________________________________________
dense (Dense)                (None, None, 21)          8421      
_________________________________________________________________
activation (Activation)      (None, None, 21)          0         
Total params: 4,751,021
Trainable params: 4,751,021
Non-trainable params: 0
____________________________________________

In [None]:
# NOTE(review): `opt` is built here but compile() below receives the string
# 'adam', so this learning-rate / decay configuration is never handed to the
# model — confirm which optimizer setup is intended.
opt = Adam(learning_rate=0.005,decay=np.sqrt(2))
#loss= categorical_crossentropy(y_true=y_train,)
#loss = categorical_crossentropy(from_logits=True)
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'],)

# Training of Model

In [None]:
n_epochs = 4  # number of passes over the training set
batch_size = 32  # tweets per mini-batch handed to batches_generator

In [None]:
# Number of entries in the tag dictionary; used as the number of classes.
len(tag2idx)

21

In [None]:
print('Start training... \n')

for epoch in tqdm(range(n_epochs)):
  print('-' * 20 + ' Epoch {} '.format(epoch+1) + 'of {} '.format(n_epochs) + '-' * 20)

  # Every epoch draws freshly shuffled mini-batches from the training set.
  for x_batch,y_batch,_ in batches_generator(batch_size,train_tokens,train_tags):
    # One-hot encode the tag indices to match the categorical_crossentropy loss.
    y_batch = tf.keras.backend.one_hot(y_batch,num_classes=len(tag2idx))
    # A single gradient update per mini-batch. train_on_batch is the Keras
    # call meant for manual batch loops; fit() would set up a complete
    # one-epoch training run (dataset, callbacks, progress log) on each call.
    model.train_on_batch(x_batch,y_batch)


  0%|          | 0/4 [00:00<?, ?it/s]

Start training... 

-------------------- Epoch 1 of 4 --------------------


 25%|██▌       | 1/4 [00:56<02:50, 56.69s/it]

-------------------- Epoch 2 of 4 --------------------


 50%|█████     | 2/4 [01:45<01:48, 54.47s/it]

-------------------- Epoch 3 of 4 --------------------


 75%|███████▌  | 3/4 [02:35<00:52, 52.85s/it]

-------------------- Epoch 4 of 4 --------------------


100%|██████████| 4/4 [03:25<00:00, 51.36s/it]


# Prediction of the model

In [None]:
def prediction_on_bacth(model,tokens,tags):
  """Predict the tags of every sample, one sample at a time.

  Samples are fed in their original order as batches of size 1, so no
  padding is added to any sequence.

  Args:
    model: trained model exposing ``predict_on_batch``; its output must hold
      one score per tag for every token (tag axis last).
    tokens: list of token sequences.
    tags: list of tag sequences, parallel to ``tokens``; only forwarded to
      ``batches_generator``.

  Returns:
    ``(pred_tags_batch, tokens_batch)``: both lists have one entry per
    sample, and each entry is a one-element list (the size-1 batch) holding
    the predicted tag strings, respectively the tokens decoded back from
    their vocabulary indices.
  """
  pred_tags_batch = []
  tokens_batch = []

  for x_batch,_,_ in batches_generator(1,tokens,tags,shuffle=False):
    # predict_on_batch runs the single batch directly; predict() is built for
    # large inputs and is needlessly heavy when called once per sample.
    prediction = model.predict_on_batch(x_batch)
    # Most probable tag index for every token.
    tags_idxs_batch = np.argmax(prediction,axis=-1)
    pred_tags_batch.append([idxs2tags(tags_idxs) for tags_idxs in tags_idxs_batch])
    tokens_batch.append([idxs2words(token_idxs) for token_idxs in x_batch])

  return pred_tags_batch,tokens_batch




In [None]:
# Predict tags for every split; each call also returns the tokens decoded
# back from their vocabulary indices.
train_pred,pad_train = prediction_on_bacth(model,train_tokens,train_tags)
validation_pred,pad_val = prediction_on_bacth(model,validation_tokens,validation_tags)
test_pred,pad_test = prediction_on_bacth(model,test_tokens,test_tags)

In [None]:
def pred_lookup(tokens,pred_tags,actual_tags,n_samples=3):
  """Print token, predicted tag and actual tag for the first few samples.

  Args:
    tokens: per-sample decoded tokens; ``tokens[i][0]`` is the token list of
      sample ``i``.
    pred_tags: per-sample predicted tags; ``pred_tags[i][0]`` is the tag list
      of sample ``i``.
    actual_tags: per-sample ground-truth tags; ``actual_tags[i]`` is the tag
      list of sample ``i``.
    n_samples: maximum number of samples to print (default 3).
  """
  # Never index past the shortest input: a fixed range(3) raised IndexError
  # whenever fewer than three samples were available.
  shown = min(n_samples, len(tokens), len(pred_tags), len(actual_tags))
  for i in range(shown):
    for token,pred,actual in zip(tokens[i][0],pred_tags[i][0],actual_tags[i]):
      print('%s\t%s\t%s' % (token, pred, actual))
    print('--------------')

In [None]:
# Compare predicted and actual tags on the first training tweets.
pred_lookup(pad_train,train_pred,train_tags)

RT	O	O
<USR>	O	O
:	O	O
Online	O	O
ticket	O	O
sales	O	O
for	O	O
Ghostland	B-other	B-musicartist
Observatory	I-other	I-musicartist
extended	O	O
until	O	O
6	O	O
PM	O	O
EST	O	O
due	O	O
to	O	O
high	O	O
demand	O	O
.	O	O
Get	O	O
them	O	O
before	O	O
they	O	O
sell	O	O
out	O	O
...	O	O
--------------
Apple	B-company	B-product
MacBook	I-product	I-product
Pro	I-product	I-product
A1278	I-product	I-product
13.3	I-product	I-product
"	I-product	I-product
Laptop	I-product	I-product
-	O	I-product
MD101LL/A	I-product	I-product
(	O	O
June	O	O
,	O	O
2012	O	O
)	O	O
-	O	O
Full	O	O
read	O	O
by	O	O
eBay	O	B-company
<URL>	O	O
<URL>	O	O
--------------
Happy	O	O
Birthday	O	O
<USR>	O	O
!	O	O
May	O	O
Allah	B-other	B-person
s.w.t	O	O
bless	O	O
you	O	O
with	O	O
goodness	O	O
and	O	O
happiness	O	O
.	O	O
--------------


# Evaluation 

In [None]:
from sklearn.metrics import accuracy_score, f1_score , recall_score , average_precision_score

In [None]:
def eval(tags,pred):
  """Print accuracy, weighted F1 and weighted recall averaged over samples.

  Each metric is computed separately for every sample (tweet) and the
  per-sample values are then averaged.

  Args:
    tags: list of ground-truth tag sequences, one per sample.
    pred: predictions as returned by ``prediction_on_bacth``; ``pred[i][0]``
      is the predicted tag sequence of sample ``i``.

  Returns:
    None. The three averaged scores are printed.
  """
  # NOTE(review): this function shadows the built-in eval(); the name is kept
  # because other cells call it.
  acc, f1, recall = [], [], []
  for i in range(len(tags)):
    predicted = pred[i][0]
    acc.append(accuracy_score(tags[i],predicted))
    # zero_division=0 returns the same value as the default ("warn" also
    # scores 0) but without an UndefinedMetricWarning for every sample whose
    # gold and predicted label sets differ.
    f1.append(f1_score(tags[i],predicted,average='weighted',zero_division=0))
    recall.append(recall_score(tags[i],predicted,average='weighted',zero_division=0))
  print('acc:',np.mean(acc),' f1_score :',np.mean(f1),'recall :',np.mean(recall))

In [None]:
# Report the same metrics for every split; eval() prints the scores.
for header, gold_tags, predicted_tags in (
    (' Train Set quality:', train_tags, train_pred),
    (' validation Set quality:', validation_tags, validation_pred),
    (' Test Set quality:', test_tags, test_pred),
):
  print('-'*20 + header + '-'*20)
  eval(gold_tags, predicted_tags)

-------------------- Train Set quality:--------------------


  _warn_prf(average, modifier, msg_start, len(result))


acc: 0.9359819580002369  f1_score : 0.9280366734042187 recall : 0.9359819580002369
-------------------- validation Set quality:--------------------
acc: 0.9209864380034457  f1_score : 0.9103569844285917 recall : 0.9209864380034457
-------------------- Test Set quality:--------------------
acc: 0.9130420368235233  f1_score : 0.9030675345765647 recall : 0.9130420368235233
