<a href="https://colab.research.google.com/github/astromad/MyDeepLearningRepo/blob/master/NER_MODEL_NOCRF_CORRECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sklearn-crfsuite

In [None]:
!pip install seqeval

In [None]:
import numpy as np
import tensorflow.keras as k
import tensorflow as tf
from sklearn_crfsuite.metrics import flat_classification_report
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint
#from keras_contrib.layers import CRF
# from keras_contrib.losses import crf_loss
# from keras_contrib.metrics import crf_accuracy
#from tf2CRF import CRF
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input
from tensorflow.keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from future.utils import iteritems
from math import nan
import random
import os
import pandas as pd
from nltk import word_tokenize


In [None]:
class SentenceGetter(object):
    """Group a token-level table into per-sentence lists of ``(word, tag)`` pairs.

    ``dataset`` must provide the columns ``sentence_idx``, ``word`` and ``tag``.

    Attributes set by the constructor:
        n_sent:    1-based cursor consumed by :meth:`get_next`.
        dataset:   the table that was passed in (kept by reference).
        empty:     always initialised to ``False``; not modified by this class.
        grouped:   result of grouping ``dataset`` by ``sentence_idx``; each
                   entry is the list of ``(word, tag)`` tuples of one group.
        sentences: plain list with the entries of ``grouped``.
    """

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def agg_func(s):
            # One group (= one sentence) -> [(word, tag), ...] in row order.
            return list(zip(s["word"].values.tolist(), s["tag"].values.tolist()))

        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the group labelled ``"Sentence: <n_sent>"`` and advance the cursor.

        Returns ``None`` (without advancing) when no group carries that label.
        Only the missing-label case is absorbed; any other error propagates
        instead of being silently turned into ``None``.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s


In [None]:
def loadData(csv_path="/content/drive/My Drive/ColabData/data_keras.csv"):
    """Read the token-level NER table and build padded train/test arrays.

    Args:
        csv_path: CSV file with at least the columns ``sentence_idx``,
            ``word`` and ``tag``. Defaults to the path this notebook has
            always used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, words, tags, maxlen,
        word2idx, tag2idx)`` where ``X_*`` are post-padded word-index
        sequences, ``y_*`` are lists of one-hot tag matrices, ``words`` /
        ``tags`` are the vocabularies, ``maxlen`` is the longest sentence
        length and ``word2idx`` / ``tag2idx`` map vocabulary entries to their
        position in ``words`` / ``tags``.

    Raises:
        KeyError: if the data contains no ``"O"`` tag (it is used as the
            padding label).
    """
    try:
        # pandas >= 1.3: "warn" reports and skips malformed rows, which is
        # what error_bad_lines=False (with its default warning) used to do.
        df = pd.read_csv(csv_path, encoding="ISO-8859-1", on_bad_lines="warn")
    except TypeError:
        # Older pandas does not know on_bad_lines.
        df = pd.read_csv(csv_path, encoding="ISO-8859-1", error_bad_lines=False)

    data = df[['sentence_idx', 'word', 'tag']]
    print(data.head(30))
    print('total tags',df['tag'].value_counts())

    # Missing tags arrive as float NaN. Rewrite them to 'unk' in the data
    # itself so the tag vocabulary and the per-token lookup below agree
    # (previously 'unk' was only added to the vocabulary, so looking up the
    # raw NaN failed, and every distinct NaN object added another 'unk').
    data = data.assign(
        tag=['unk' if isinstance(t, float) else t for t in data["tag"].values])

    getter = SentenceGetter(data)
    sentences = getter.sentences
    print('Printing sentences')
    for sentence in sentences[:3]:
        print(sentence)

    # Sorted so that index assignment does not depend on set iteration order;
    # key=str keeps the sort well-defined if a column mixes value types.
    words = sorted(set(data["word"].values), key=str)
    tags = sorted(set(data["tag"].values), key=str)
    n_tags = len(tags)
    print('number of tags=',n_tags)
    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    maxlen = max([len(s) for s in sentences])
    random.shuffle(sentences)

    # NOTE(review): pad_sequences pads X with its default value 0, which is
    # also the index of words[0]; padding and that word are indistinguishable
    # to the model. Left unchanged to keep indices compatible - confirm intent.
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=maxlen, sequences=X, padding="post")
    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    # Split train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    print(X_train.shape)
    print(np.array(y_train).shape)
    print("&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&",len(words))
    return (X_train, X_test, y_train, y_test, words, tags, maxlen,word2idx,tag2idx)


In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    ``history`` is expected to expose a ``history`` mapping with the keys
    ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``; the x axis is
    the 1-based position within the ``accuracy`` series.
    """
    plt.style.use('ggplot')

    # Read all four series first so a missing key fails before any drawing.
    recorded = history.history
    train_acc, valid_acc = recorded['accuracy'], recorded['val_accuracy']
    train_loss, valid_loss = recorded['loss'], recorded['val_loss']
    epochs = range(1, len(train_acc) + 1)

    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for slot, (train, valid, train_label, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(epochs, train, 'b', label=train_label)
        plt.plot(epochs, valid, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [None]:
def predict_Model(model1,X_test,y_test,word2idx,tag2idx,tags):
    """Evaluate ``model1`` on the test split and print summary statistics.

    Prints the F1 score, a flat classification report, and per-tag
    TN/FP/FN/TP counters accumulated over every token of every test sentence.

    Args:
        model1: object with a ``predict(X, verbose=...)`` method returning
            per-token score vectors for each sentence.
        X_test: test inputs; one entry per sentence.
        y_test: one-hot ground truth, indexable in parallel with ``X_test``.
        word2idx: unused here; kept for call compatibility.
        tag2idx: mapping tag -> class index.
        tags: unused here; kept for call compatibility (labels are resolved
            through the inverse of ``tag2idx`` so they always agree with the
            labels used for the report).
    """
    idx2tag = {index: tag for tag, index in tag2idx.items()}
    test_pred = model1.predict(X_test, verbose=1)
    pred_labels = pred2label(test_pred,idx2tag)
    test_labels = pred2label(y_test,idx2tag)
    print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
    report = flat_classification_report(y_pred=pred_labels, y_true=test_labels)
    print(report)

    TP = {tag: 0 for tag in tag2idx}
    TN = {tag: 0 for tag in tag2idx}
    FP = {tag: 0 for tag in tag2idx}
    FN = {tag: 0 for tag in tag2idx}

    # Walk sentence i's own prediction and ground truth (the previous version
    # re-read sentence 0 on every iteration).
    for i in range(len(X_test)):
        y_hat = np.argmax(test_pred[i], axis=-1)
        gt = np.argmax(y_test[i], axis=-1)
        for gt_idx, pred_idx in zip(gt, y_hat):
            accumulate_score_by_tag(idx2tag[gt_idx], idx2tag[pred_idx],TP,TN,FP,FN)

    for tag in tag2idx.keys():
        print(f'tag:{tag}')
        print('\t TN:{:10}\tFP:{:10}'.format(TN[tag], FP[tag]))
        print('\t FN:{:10}\tTP:{:10}'.format(FN[tag], TP[tag]))


In [None]:
def classify(str,model,word2idx,words,maxlen,idx2Label):
    """Tag one tokenised sentence and return ``[(token, label), ...]``.

    Args:
        str: sequence of tokens (the parameter name shadows the builtin inside
            this function; it is kept because it is part of the signature).
        model: object with ``predict(array, verbose=...)`` returning per-token
            score vectors; only the first (and only) sentence is used.
        word2idx: token -> index mapping handed to ``getSenIndex``.
        words: vocabulary list handed to ``getSenIndex`` (which may extend it
            for tokens it does not know).
        maxlen: sequence length the model expects.
        idx2Label: class index -> label mapping; labels are stripped.

    Returns:
        ``list(zip(str, labels))`` - at most ``min(len(str), maxlen)`` pairs.
    """
    print('number of words',len(words))
    X = getSenIndex(str,word2idx,words)
    print('number of words',len(words))
    print(X)
    # truncating="post": when the sentence is longer than maxlen keep its
    # *first* maxlen tokens, so predictions line up with zip(str, ...) below.
    # The default ("pre") would drop leading tokens and shift every label.
    X = pad_sequences(maxlen=maxlen, sequences=X, padding="post", truncating="post")
    print(np.array(X))
    pred = model.predict(np.array(X), verbose=False)[0]
    pred = pred.argmax(axis=-1)
    pred = [idx2Label[x].strip() for x in pred]
    return list(zip(str,pred))

In [17]:
def build_Embedding(n_words,n_tags,maxlen,word_embedding_size = 150,oov_slots = 100):
    """Build the trainable word-embedding sub-model.

    Args:
        n_words: size of the known vocabulary.
        n_tags: unused; kept so the signature matches ``build_RNN``.
        maxlen: length of the (padded) index sequences fed to the model.
        word_embedding_size: dimensionality of each word vector.
        oov_slots: number of extra embedding rows reserved beyond ``n_words``.
            ``getSenIndex`` hands out indices >= ``len(words)`` to tokens it
            has not seen, so the table needs headroom for them. The default
            (100) is the value that used to be hard-coded.

    Returns:
        ``tf.keras.Model`` mapping ``(maxlen,)`` index sequences to
        ``(maxlen, word_embedding_size)`` embeddings.
    """
    inputs = Input(shape=(maxlen,))
    embedding = Embedding(input_dim=n_words+oov_slots,
                    output_dim=word_embedding_size, trainable = True, input_length=maxlen)(inputs)
    return tf.keras.Model(inputs,embedding)

def build_RNN(n_words,n_tags,maxlen,word_embedding_size = 150):
    """Build the recurrent tagging head that consumes pre-computed embeddings.

    Stack: Bidirectional LSTM (``word_embedding_size`` units per direction)
    -> LSTM (``2 * word_embedding_size`` units) -> per-timestep Dense with
    ``n_tags`` sigmoid outputs. ``n_words`` is accepted but not used.

    Returns a ``tf.keras.Model`` from ``(maxlen, word_embedding_size)``
    inputs to ``(maxlen, n_tags)`` scores.
    """
    def _sequence_lstm(units):
        # Both recurrent layers share every setting except the unit count.
        # A fresh initializer is created per layer, as before.
        return LSTM(units=units, unroll=False, activation='tanh',
                    recurrent_activation='sigmoid', use_bias=True,
                    return_sequences=True, dropout=0.5, recurrent_dropout=0.0,
                    kernel_initializer=k.initializers.he_normal())

    embedded = Input(shape=(maxlen, word_embedding_size))

    hidden = Bidirectional(_sequence_lstm(word_embedding_size))(embedded)
    hidden = _sequence_lstm(word_embedding_size * 2)(hidden)

    # NOTE(review): sigmoid scores are independent per tag; the caller decodes
    # with argmax. Confirm this (rather than softmax) is intended.
    tag_scores = TimeDistributed(Dense(n_tags, activation="sigmoid"))(hidden)

    return tf.keras.Model(embedded, tag_scores)

def run_model(model,X_train,y_train,batch_size=128,epochs=5,validation_split=0.1,callbacks=None):
    """Fit ``model`` on the training split and return what ``model.fit`` returns.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        X_train: training inputs, passed through unchanged.
        y_train: training targets; converted with ``np.array`` before fitting.
        batch_size, epochs, validation_split: forwarded to ``fit``. The
            defaults are the values that used to be hard-coded.
        callbacks: optional list of callbacks forwarded to ``fit`` (for
            example a ``ModelCheckpoint``); ``None`` means no callbacks.

    Training always runs with ``verbose=1`` and ``shuffle=True``.
    """
    history = model.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                    validation_split=validation_split,verbose=1,shuffle=True,
                    callbacks=callbacks)
    return history



# def idx2tag(tag2idx):
#     return {v: k for k, v in iteritems(tag2idx)}

def pred2label(pred,idx2tag):
    """Convert per-token score vectors into tag labels.

    ``pred`` is iterated as sentences -> tokens -> score vector; each token is
    mapped to ``idx2tag`` of its highest-scoring position. Returns a list of
    label lists, one per sentence.
    """
    return [
        [idx2tag[np.argmax(token_scores)] for token_scores in sentence]
        for sentence in pred
    ]

def accumulate_score_by_tag(gt,pred,TP,TN,FP,FN):
    """Update per-tag one-vs-rest confusion counters for a single token.

    Args:
        gt: ground-truth tag of the token.
        pred: predicted tag of the token.
        TP, TN, FP, FN: dicts tag -> count, updated in place.

    For one token exactly one counter changes per tag known to ``TN``:
      * ``gt == pred``: a true positive for that tag;
      * otherwise: a false negative for ``gt`` (it was missed) and a false
        positive for ``pred`` (it was wrongly emitted);
      * every other tag gets a true negative.

    The previous version booked false positives on ``gt`` (always ``'O'``)
    rather than on the predicted tag, and counted confusions between two
    entity tags as true negatives.
    """
    if gt == pred:
        TP[gt] = TP.get(gt, 0) + 1
    else:
        FN[gt] = FN.get(gt, 0) + 1
        FP[pred] = FP.get(pred, 0) + 1

    # Tags that are neither the truth nor the prediction were correctly absent.
    for tag in TN:
        if tag != gt and tag != pred:
            TN[tag] += 1


def getSenIndex(sen,word2Idx,words):
    """Map the tokens of one sentence to vocabulary indices.

    Lookup order per token: exact match in ``word2Idx``, then its lowercase
    form. A token found by neither is registered as a new vocabulary entry:
    it is appended to ``words`` and recorded in ``word2Idx`` under the index
    it occupies in ``words``, so a repeated unknown token reuses one index.

    Args:
        sen: iterable of string tokens.
        word2Idx: token -> index mapping; extended in place for new tokens.
        words: vocabulary list; extended in place for new tokens.

    Returns:
        A single-element list ``[indices]`` (a batch of one sentence).
    """
    wordIndices = []
    for word in sen:
        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            print('word NOT found, but adding', word)
            # Take the index *before* appending so it is the slot the word
            # actually lands in (previously it pointed one past it).
            wordIdx = len(words)
            words.append(word)
            word2Idx[word] = wordIdx

        wordIndices.append(wordIdx)
    print(wordIndices)
    return [wordIndices]

In [None]:
# --- Training cell -----------------------------------------------------------
# Seeds the NumPy, Python and TensorFlow random generators, loads the data,
# assembles embedding + recurrent sub-models into one model, trains it, saves
# weights and plots the training curves. Every name assigned here (model,
# X_test, word2idx, tag2idx, words, maxlen, ...) is read by later cells.
np.random.seed(1)
random.seed(1)
tf.random.set_seed(1)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts;
# setting it from inside a running kernel presumably does not change this
# process's hashing - confirm whether it is still needed.
os.environ['PYTHONHASHSEED']= '0'
# print(X_test[0],X_test[0].shape)
# print(X_test[1],X_test[1].shape)
# idx2word = {v: k for k, v in iteritems(word2idx)}
# word2idx = {w: i for i, w in enumerate(words)}
# sentenses = ['my name is  and my ssn is 123-98-1122','my name is  and my ssn is 123-98-1122']
# X = [[word2idx[w] for w in s.split()] for s in sentenses]
# X = pad_sequences(maxlen=100, sequences=X, padding="post", value=100)
# print(X,X.shape)
# exit(0)

X_train, X_test, y_train, y_test, words, tags, maxlen, word2idx,tag2idx = loadData()
print("after from data call &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&",len(words))
#model= build_Model(len(words),len(tags),maxlen,word_embedding_size = 150)
# The two sub-models must agree on word_embedding_size: the recurrent model's
# input shape is (maxlen, word_embedding_size).
embedding_model = build_Embedding(len(words),len(tags),maxlen,word_embedding_size = 150)
RNN_model = build_RNN(len(words),len(tags),maxlen,word_embedding_size = 150)
# End-to-end model: index sequence -> embedding -> recurrent tagger.
model = tf.keras.Model(embedding_model.input, RNN_model(embedding_model.output))
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
history = run_model(model,X_train,y_train)
print("after running model&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&",len(words))
print('Saving Model Weights')
# NOTE(review): only RNN_model's weights are written here. embedding_model is
# built with trainable=True, so its trained weights are not part of this file,
# and word2idx / tag2idx are not persisted either - confirm that is intended.
RNN_model.save_weights('/content/drive/My Drive/ColabData/NER-MODEL-WEIGHTS')
plot_history(history)
plt.show()
# commenting for now predict_Model(model,X_test,y_test,word2idx,tag2idx,tags)
    
# tf.keras.models.save_model(
#     model, "/content/drive/My Drive/ColabData/NER-MODEL-WEIGHTS-NOCRF", overwrite=True, include_optimizer=True, save_format="tf",
#     signatures=None, options=None
# )

# sentences = [idx2word[x] for x in X_test[0]]
# print(sentences)
# X = [word2idx[s]  for s in sentences]
# print(np.array(X))
# print(X_test[0])
#print('---------------',word2idx.shape[0])
#print('---------------',word2idx.shape[1])

#     tf.keras.backend.clear_session()
#     print("-----------------------------------------------")
#     X_train, X_test, y_train, y_test, words, tags, maxlen, word2idx,tag2idx = loadData()
#     classifier = k.models.load_model('/home/radmad/Desktop/OLD-DATA/MY-DeepLearning/KERAS/CUSTOM-NER/NER-MODEL-WEIGHTS-NOCRF',compile=False)
#     predict_Model(classifier,X_test,y_test,word2idx,tag2idx,tags)


In [11]:
# Invert the lookup tables so class / word indices can be turned back into
# readable labels and tokens (both are reused by the next cell).
idx2tag = dict((index, tag) for tag, index in iteritems(tag2idx))
idx2word = dict((index, word) for word, index in iteritems(word2idx))

print('predicting-----------------------------')
# Run the model on a batch holding only the first test sentence and decode it.
test = np.array([X_test[0]])
test_pred = pred2label(model.predict(test), idx2tag)[0]
test_words = [idx2word[word_index].strip() for word_index in X_test[0]]
print(test_pred)
print(test_words)
# One "token label" line per position.
for x, y in zip(test_words, test_pred):
    print(x, y)
print('Done predicting-----------------------------')

predicting-----------------------------
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-NAME', 'I-NAME', 'O', 'O', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'I-SSN', 'O', 'O', 'O', 'O', 'I-CCARD', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PHONE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-NAME', 'I-NAME', 'O', 'O', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'I-SSN', 'O', 'O', 'O', 'O', 'I-CCARD', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PHONE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-NAME', 'I-NAME', 'O', 'O', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'I-SSN', 'O', 'O', 'O', 'I-CCARD', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PHONE', 'O', 'O', 'O', 'O', 'O', 'O', 'I-NAME', 'I-NAME', 'O', 'O', 'I-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-CCARD', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PHONE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['House', 'interest', 'have', 'there.,', 'Her', 'name', 'is', 'Janice', 'Berry', 'Born', 'on', '1990-03-16', ',', 'Her', 'social', 'security', 'number', '

In [19]:
# Round-trip the first test sentence through classify(): indices -> tokens ->
# classify -> (token, label) pairs.
print('predicting orbitrary text')
# Named `tokens`, not `str`: assigning to `str` at cell level would replace
# the builtin for every cell executed afterwards in this kernel.
tokens = [idx2word[x] for x in X_test[0]]
print('99999999999', tokens)
output = classify(tokens, model, word2idx, words, maxlen, idx2tag)
print(output)
# str = 'my name is madhava avvari my ssn is 435-33-0687 and my phone number is 408-306-1500'.split()
# print('99999999999',str)
# output =classify(str,model,word2idx,words,maxlen,idx2tag)
# print(output)


predicting orbitrary text
99999999999 ['House', 'interest', 'have', 'there.,', 'Her', 'name', 'is', 'Janice', 'Berry', 'Born', 'on', '1990-03-16', ',', 'Her', 'social', 'security', 'number', 'is', '097-13-9852', 'Her', 'card', 'number', 'is', '676203221368', ',', 'and', 'her', 'phone', 'number', 'is', '(901)471-5843x32893', 'Seem', 'may', 'seek', 'fire', 'paper.,', 'Her', 'name', 'is', 'Alexandra', 'Edwards', 'Born', 'on', '1988-08-21', ',', 'Her', 'S', 'S', 'N', 'is', '388-27-7260', 'Her', 'card', 'number', 'is', '180027084371470', ',', 'and', 'her', 'phone', 'number', 'is', '001-430-615-2852x3637', 'Step', 'order', 'now', 'hot', 'various', 'western', 'look', 'fire.,', 'Her', 'name', 'is', 'Thomas', 'Simmons', 'Born', 'on', '2014-02-21', ',', 'Her', 'S', 'S', 'N', 'is', '232 52 7780', 'Her', 'credit', 'is', '30566540963869', ',', 'and', 'her', 'phone', 'number', 'is', '987-277-6275', 'Choice', 'much', 'anyone.,', 'Her', 'name', 'is', 'Kevin', 'Hester', 'Born', 'on', '2007-05-09', ',',