# Predicting Entities<br>
Adam Klie<br>
11/02/2019<br>
Script to predict entities from a trained model

## Set-up

### Import necessary packages

In [1]:
# Data processing
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Data visualization
from tqdm import tqdm
import matplotlib
import seaborn as sns

# NLP
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from string import punctuation

# Neural nets
from keras.models import load_model

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


### Helper functions

In [2]:
# Function to map the tokens of each document to row IDs of the word-embedding table
def get_features(docs, max_length):
    """Build a fixed-width matrix of embedding-row IDs for a batch of documents.

    Parameters
    ----------
    docs : iterable
        Documents to encode (any iterable, including a generator). Each
        document is iterated token by token; every token must provide
        ``token.vocab.vectors.find(key=...)`` and ``token.orth``.
    max_length : int
        Number of columns of the output. Longer documents are truncated,
        shorter ones are right-padded with 0.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(len(docs), max_length)``. A token whose
        lookup returns a negative ID is stored as 0, i.e. the same value
        that is used for padding.
    """
    docs = list(docs)  # materialise so the number of rows is known
    Xs = np.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        # Pairing tokens with range(max_length) truncates long documents and
        # never indexes past the last column, so max_length == 0 simply
        # yields an empty (n, 0) matrix instead of raising IndexError.
        for j, token in zip(range(max_length), doc):
            vector_id = token.vocab.vectors.find(key=token.orth)
            if vector_id >= 0:
                Xs[i, j] = vector_id
            # negative IDs keep the 0 the matrix was initialised with
    return Xs

### Import trained model, word embedding, and data to build validation data from

#### Trained NER model

In [3]:
# Identify the trained model to use: run name and training date
model_iter = '11_class'
model_date = '2020_07_09'

# Load the entity-merging table that belongs to this model run
grouping_path = '../results/embedding/{model}/entity_merging.csv'.format(model=model_iter)
grouping = pd.read_csv(grouping_path, index_col=0)

# Keep the group names of the rows whose "I" column equals 0
is_zero_I = grouping["I"] == 0
groups = grouping.loc[is_zero_I, "GroupName"].values

In [4]:
# Locations of the artefacts saved for this model run
classes_path = '../results/training/{model}/classes.npy'.format(model = model_iter)
model_path = '../models/{model}_{date}.h5'.format(model = model_iter, date=model_date)

# Rebuild the label encoder from the stored class array instead of refitting it
le = preprocessing.LabelEncoder()
le.classes_ = np.load(classes_path)

# Load the trained Keras model
model = load_model(model_path)

#### Word embedding model

In [5]:
# Load the spaCy pipeline stored under ../data; its vocabulary vectors are what
# get_features looks tokens up in (via token.vocab.vectors).
nlp = spacy.load('../data/wikipedia-pubmed-and-PMC-w2v')

  return f(*args, **kwds)
  return f(*args, **kwds)


#### Metadata

In [6]:
# Path to the pickled SRS metadata and the object loaded from it.
# NOTE(review): read_pickle executes arbitrary code from the file - only load
# pickles from a trusted source.
# NOTE(review): allSRS is not referenced again in the cells visible here -
# confirm whether this load is still needed.
SRS_dir = "../data/allSRS_05_15_2018.pickle"
allSRS = pd.read_pickle(SRS_dir)

## Predict metadata for each class

In [51]:
# One-shot iterator over the group names; the cell that calls next() on it
# advances one class per execution. Re-run this cell to start over.
iter_groups = iter(groups)

### Iterate through each class (run the following chunk for each class)

In [207]:
# Advance to the next class and make its name usable inside a file name by
# turning spaces and slashes into underscores.
# next() raises StopIteration once every class has been consumed.
validation_class = next(iter_groups).replace(' ', '_').replace('/', '_')
validation_class

StopIteration: 

In [None]:
# Load the validation set that belongs to the class currently being processed
validation_template = '../results/validation/{model}/{myclass}_validation_set.pickle'
filename = validation_template.format(model = model_iter, myclass = validation_class)
validation_data = pd.read_pickle(filename)

### Breaking up into sentences

In [None]:
# Split every text on ';', '.' or ',' and stack the pieces into one long
# Series (the stack adds one index level numbering the pieces of each text).
processed_test = validation_data.str.split('[;.,]', expand = True).stack()

# Collapse every run of whitespace into a single space.
# The pattern is a raw string and regex=True is passed explicitly: since
# pandas 2.0 str.replace treats the pattern as a literal by default, which
# would leave the whitespace untouched.
processed_test = processed_test.str.replace(r'\s+', ' ', regex=True)

#### Baseline emission

In [None]:
# Baseline emission: the class probabilities the model outputs for a blank
# input. N-gram predictions are later compared against this vector.
# NOTE(review): nlp.pipe receives a bare string here, so it iterates over its
# characters; that is a single ' ', hence exactly one document.
val_docs = [doc for doc in nlp.pipe(' ')]
val_X = get_features(val_docs, max_length = model.input_shape[1])
baseline_predictions = model.predict_proba(val_X)
emptyState = baseline_predictions[0]

#### Prediction loop

In [None]:
# Prediction loop: for every sentence, score all 2- to 7-grams with the model
# and collect one probability row per n-gram (rows) plus a matching key tuple
# (key_list) that later becomes the row's MultiIndex entry.
stopWords = set(stopwords.words('english'))
rows = []
key_list = []
for i, (key, sent) in enumerate(tqdm(processed_test.items(), total=len(processed_test))):

    # Sentence preprocessing: split on single spaces, then drop empty strings
    # and stopwords (exact, case-sensitive match against stopWords).
    tokens = [token for token in sent.split(' ')
              if token != '' and token not in stopWords]
    sent = ' '.join(tokens)  # start_char_pos below refers to this rebuilt sentence

    n_gram_max = min(len(tokens), 7)
    for n_gram in range(2, n_gram_max + 1):

        # Get prediction for all current n-grams
        grams = [" ".join(gram) for gram in ngrams(tokens, n_gram)]
        val_docs = list(nlp.pipe(grams))  # one spaCy doc per n-gram
        val_X = get_features(val_docs, max_length = model.input_shape[1])
        predictM = model.predict_proba(val_X)

        # Keep only those n-grams whose probabilities differ from the empty
        # state by at least 0.01 in total AND that have at least two tokens
        # present in the word embedding; every other row is zeroed.
        # The previous condition (empty & enough-tokens) zeroed only a subset
        # of these rows, so n-grams with fewer than two embedded tokens
        # slipped through untouched.
        tmp_df = pd.DataFrame(data = predictM, columns = le.classes_, index = grams)
        empty_mask = ((tmp_df - emptyState).abs().sum(axis=1) < 0.01).values
        moreThanTwoValToken_mask = (val_X != 0).sum(axis=1) >= 2
        # Positional boolean mask: safe even when the same n-gram text occurs
        # more than once in the index.
        tmp_df.loc[empty_mask | ~moreThanTwoValToken_mask, :] = 0

        # Set up keys for dataframe with probabilities of each n-gram.
        # key is assumed to be the index tuple of the sentence in
        # processed_test (it is concatenated with a tuple below).
        for j, gram in enumerate(grams):
            i_end = j + n_gram
            # Character offset of the n-gram inside sent: length of the tokens
            # before it plus the separating space (none when j == 0)
            textBefore = " ".join(tokens[:j]) + ('' if j == 0 else ' ')
            start_char_pos = len(textBefore)
            key_list.append(key + (i, sent, n_gram, j, i_end, gram, start_char_pos))
            rows.append(tmp_df.iloc[j])

In [None]:
# Names of the MultiIndex levels, in the order the key tuples were built
proba_index_names = [
    'srs', 'attribute', 'sentence_number', 'kthSrs',
    'orig_text', 'n-gram_length', 'word_start', 'word_end', 'token',
    'starting_char_pos',
]

# Combine the collected rows column-wise under their keys, then transpose so
# that each n-gram is a row and each class a column
stacked_by_key = pd.concat(rows, keys = key_list, axis = 1)
proba_df = stacked_by_key.transpose()
proba_df.index.names = proba_index_names

#### Get only those greater than 2-grams

In [None]:
# Distinct preprocessed sentences ('orig_text' level) present in proba_df
textS = pd.Series(proba_df.index.get_level_values('orig_text').unique())
# NOTE(review): a count is never negative, so this mask is True for every
# text and the selection below keeps all of them - confirm the intended
# minimum number of spaces.
textM = textS.str.count(' ') >= 0
selectedTexts = textS[textM].values # get the original texts

In [None]:
# Minimum n-gram length to keep
n_threshold = 2

# Row filter: n-gram long enough and coming from one of the selected texts
ngram_lengths = proba_df.index.get_level_values('n-gram_length')
source_texts = proba_df.index.get_level_values('orig_text')
keep_rows = (ngram_lengths >= n_threshold) & source_texts.isin(selectedTexts)
proba_sub = proba_df[keep_rows]

#### Take highest probability class

In [None]:
# Highest class probability of every n-gram
max_proba = proba_sub.max(axis=1)

# Runner-up probability: with interpolation='lower' the 0.999 quantile of a
# row appears to resolve to its second-largest value for the number of
# classes used here - TODO confirm if the class count changes drastically
second_proba = proba_sub.quantile(0.999, interpolation='lower', axis = 1)

# The winning class must beat the runner-up by more than 0.1
margin = max_proba - second_proba
scoreMargin_m = margin > 0.1

# Discard n-grams whose text contains digit/space runs on both sides of a space
token_labels = proba_sub.index.get_level_values('token')
has_numeric_run = token_labels.str.contains('[0-9 ]+ [0-9 ]+')
m_val = scoreMargin_m & (~has_numeric_run)

In [None]:
# For every n-gram that passed the filters, record the winning class and its probability
confident_rows = proba_sub[m_val]
tmpDf = pd.DataFrame({
    'predicted': confident_rows.idxmax(axis=1),
    'score': confident_rows.max(axis=1),
})

In [None]:
# Order the hits by sentence, start word and score (all descending) and turn
# the index levels into ordinary columns.
# tmpDf was built from proba_sub[m_val], so it already contains only the rows
# selected by m_val; indexing it with m_val again only forced pandas to
# reindex the longer boolean Series (with a warning) and selected nothing new.
scoreSortedDf = tmpDf.sort_values(['orig_text','word_start','score'], ascending = False).reset_index()

#### Take highest probability overlapping n-gram

In [None]:
# Give every candidate n-gram an integer OverlapGroup id, used afterwards to
# keep one hit per group and predicted class.
# The id is the sum of two parts:
#   * len(processed_test) * kthSrs - an offset per sentence number
#   * a running counter (cumsum) that increases on every row where the
#     previous row's word_end is smaller than this row's word_start
# NOTE(review): scoreSortedDf was sorted with ascending=False on word_start,
# so inside one sentence the previous row starts at or after the current one
# and the lt(0) test looks like it can only fire between sentences - confirm
# that the sort direction matches the intended overlap detection.
# NOTE(review): the counter is not bounded by len(processed_test), so ids of
# different sentences could in principle coincide - verify.
v = scoreSortedDf.copy()
scoreSortedDf = scoreSortedDf.assign(OverlapGroup=(len(processed_test)*(v.kthSrs)+ 
                                          (v.word_end - v.word_start.shift(-1)).shift().lt(0).cumsum()))

In [None]:
# Within every overlap group keep the highest-scoring hit per predicted class:
# sort descending so the best score comes first, then drop later duplicates
ranked_hits = scoreSortedDf.sort_values(['OverlapGroup', 'score'], ascending=False)
best_hits = ranked_hits.drop_duplicates(['OverlapGroup', 'predicted'])
hitDf = best_hits.sort_values('orig_text')

# Length of the n-gram text in characters
hitDf['token_len'] = hitDf['token'].str.len()

# Slice the n-gram back out of the sentence using its recorded character
# offset and length
hitDf['recovered_txt'] = hitDf.apply(
    lambda row: row['orig_text'][row['starting_char_pos']:(row['starting_char_pos'] + row['token_len'])],
    axis=1,
)

#### Save to pickle object

In [None]:
# Persist the hits of the current class next to the other prediction results
prediction_template = '../results/prediction/{model}/{myclass}_prediction.pickle'
prediction_path = prediction_template.format(model = model_iter, myclass = validation_class)
hitDf.to_pickle(prediction_path)