## 0 Library Import

In [1]:
import pandas as pd
import numpy as np
import re
import pickle

from collections import defaultdict
import ast

from numpy import array
from numpy import argmax
from matplotlib import pyplot

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Embedding, Input, TimeDistributed, Activation
from tensorflow.keras.layers import MaxPooling1D, SpatialDropout1D, Dropout, Concatenate, Flatten, RepeatVector, Permute
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Multiply, Lambda
from tensorflow.keras.backend import sum as Ksum

from tensorflow.keras.models import load_model, model_from_json
from tensorflow.keras.utils import plot_model

### 1 Data

#### 1.1 Data load

In [2]:
labeled_data_df=pd.read_csv('labeled_data.txt',sep='|')
labeled_data_df[['Addresses']]=pd.DataFrame(labeled_data_df['Addresses'].apply(lambda t:ast.literal_eval(t)))
labeled_data_df[['Notice']]=pd.DataFrame(labeled_data_df['Notice'].apply(lambda t:t.replace('\n',' ')))
labeled_data_df[['Notice']]=pd.DataFrame(labeled_data_df['Notice'].apply(lambda t:' '.join(t.split())))

In [3]:
def unnest(df, tile, explode):
    vals = df[explode].sum(1)
    rs = [len(r) for r in vals]
    a = np.repeat(df[tile].values, rs, axis=0)
    b = np.concatenate(vals.values)
    d = np.column_stack((a, b))
    return pd.DataFrame(d, columns = tile +  ['_'.join(explode)])
def join_textseq(seq_list,start_char='<',end_char='>'):
    new_seq_list=[]
    for seq in seq_list:
        new_seq_list.append('<'+seq+'>')
    return ''.join(new_seq_list)

In [4]:
data_df=unnest(labeled_data_df, ['Notice'], ['Addresses'])

In [5]:
data_df.head()

Unnamed: 0,Notice,Addresses
0,"An Eviction Notice, also known as a Notice to ...","PSC 8763, Box 5203 APO AE 49080"
1,This record may be useful in case of future le...,"311 Rosario Haven Suite 194 Jessicamouth, HI 1..."
2,This record may be useful in case of future le...,"81965 Harris Fall Suite 885 Hallhaven, WV 96622"
3,"At the end, you receive it in Word and PDF for...","81888 Hart Turnpike Apt. 641 West Jesse, MT 11649"
4,"At the end, you receive it in Word and PDF for...","5770 Martinez Mountain Apt. 482 Matthewland, K..."


In [6]:
data_df_m2=labeled_data_df.copy(deep=True)

In [7]:
data_df_m2['Addresses']=labeled_data_df['Addresses'].apply(lambda l:list(l)).apply(lambda l:join_textseq(l))

In [8]:
labeled_data_df.iloc[1413]

Notice       to hany renter bradford address subject evicti...
Addresses                                                   ()
Name: 1413, dtype: object

#### 1.2 Feature Dictionary

In [9]:
## Add hash and dash sign as well
features='1234567890abcdefghijklmnopqrstuvwxyz <>'
feature_dict=defaultdict(int)
count=0
for f in features:
    count+=1
    feature_dict[f] += count # increment element's value by 1

inv_feature_dict = {v: k for k, v in feature_dict.items()}

#### 1.3 Encode/Decode

In [91]:
def encode_label_seq(text, feature_dict=feature_dict):
    code=[]
    text=text.lower()
    text='<'+text+'>'
    for charac in text:
        code.append(feature_dict[charac])
    return code

def decode_label_seq(seq, mapping=inv_feature_dict):
    seq_d=[]
    for num in seq:
        n=int(np.round(num))
        if n!= 0:
            seq_d.append(mapping[n])
        else:
            seq_d.append('_')
    return ''.join(seq_d)

#### 1.4 Utility function

In [11]:
def get_max_length(seq_array):
    MAX_LABEL_SEQ_LEN=0
    for seq in seq_array:
        if len(seq) > MAX_LABEL_SEQ_LEN:
            MAX_LABEL_SEQ_LEN=len(seq) 
    return MAX_LABEL_SEQ_LEN

In [12]:
def onehot_encode_vector(np_arr, onehot_length=800):
    np_arr=np.array(np_arr)
    #flat_arr=[item for seq in np_arr for item in seq]
    if onehot_length < max(np_arr)+1:
        onehot_length=max(np_arr)+1
    encoded_matrix=np.zeros((np_arr.shape[0],onehot_length))
    for i in range(np_arr.shape[0]):
            encoded_matrix[i,np_arr[i]]
    return encoded_matrix
def onehot_encode_matrix(np_arr):
    flat_arr=[item for seq in np_arr for item in seq]
    len_enc=max(flat_arr)+1
    encoded_matrix=np.zeros((np_arr.shape[0],np_arr.shape[1],len_enc))
    for i in range(np_arr.shape[0]):
        for j in range(np_arr.shape[1]):
            encoded_matrix[i,j,np_arr[i,j]]=1
    return encoded_matrix   

### 2 Feature Prep

- M1 - Modelling approach: Spearate Out multiple addresses 
- M2 - Modelling approach: Do not spearate Out multiple addresses 

### M1

#### 2.1.1 Encoder Input

In [14]:
inp_encoder_seq=data_df['Notice'].apply(lambda l:encode_label_seq(l))

In [15]:
print("Max sequence length:",get_max_length(inp_encoder_seq))

Max sequence length: 8040


In [16]:
X=pad_sequences(inp_encoder_seq, 10000, padding='post')

#### 2.1.2 Decoder Input

In [17]:
decoder_seq=data_df['Addresses'].apply(lambda l:encode_label_seq(l))

In [18]:
print("Max decoder sequence length:",get_max_length(decoder_seq))

Max decoder sequence length: 67


In [19]:
D_in=pad_sequences([s[:-1] for s in decoder_seq],100,padding='post')
D_ou=onehot_encode_matrix(pad_sequences([s[1:] for s in decoder_seq],100,padding='post'))

### M2


#### 2.2.1 Encoder Input

In [20]:
inp_encoder_seq_m2=data_df_m2['Notice'].apply(lambda l:encode_label_seq(l))

In [21]:
X_m2=pad_sequences(inp_encoder_seq_m2, 10000, padding='post')

#### 2.2.2 Decoder Input 

In [22]:
decoder_seq_m2=data_df_m2['Addresses'].apply(lambda l:encode_label_seq(l))

In [23]:
print("Max decoder sequence length:",get_max_length(decoder_seq_m2))

Max decoder sequence length: 175


In [24]:
D_in_m2=pad_sequences([s[:-1] for s in decoder_seq_m2],200,padding='post')
D_ou_m2=onehot_encode_matrix(pad_sequences([s[1:] for s in decoder_seq_m2],200,padding='post'))

### 3 Modelling

In [25]:
def save_model(model,name_suffix=''):
    model_json = model.to_json()
    model_name='model'+name_suffix
    json_filename=model_name+'.json'
    hdf5_filename=model_name+'.h5'
    with open(json_filename, "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(hdf5_filename)
    print("Saved model to disk")

In [38]:
def reload_model(name_suffix):
    model_name='model'+name_suffix
    json_filename=model_name+'.json'
    hdf5_filename=model_name+'.h5'
    json_file = open(json_filename, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(hdf5_filename)
    print("Loaded model from disk")
    return loaded_model

### M1

In [26]:
num_encoder_tokens=len(feature_dict)+1 # +1 to default character
num_decoder_tokens=len(feature_dict)+1

Embedding size rule based on https://forums.fast.ai/t/embedding-layer-size-rule/50691

In [27]:
min(600, round(1.6 * num_encoder_tokens ** .56))

19

In [28]:
latent_dim=20

In [29]:
print('# Encoder tokens',num_encoder_tokens)
print('# Decoder tokens',num_decoder_tokens)
print('# embdedding dims',latent_dim)
print('# Encoder Input',X.shape)
print('# Decoder Input',D_in.shape)
print('# Decoder Input',D_ou.shape)

# Encoder tokens 85
# Decoder tokens 85
# embdedding dims 20
# Encoder Input (2514, 10000)
# Decoder Input (2514, 100)
# Decoder Input (2514, 100, 40)


In [30]:
encoder_inputs = Input(shape=(None,))
x = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x, _,_ = LSTM(latent_dim, return_sequences=True, return_state=True)(x, initial_state=encoder_states)
#x = Lambda(lambda xin: Ksum(xin, axis=1))(x)
decoder_outputs = Dense(40, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!

In [31]:
epochs=120
batch_size=96

In [32]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 20)     1700        input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 20)     1700        input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
history=model.fit([X, D_in], D_ou,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [None]:
save_model(model,'m1_')
with open('m1_history_dict', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

### M2

In [33]:
print('# Encoder Input',X_m2.shape)
print('# Decoder Input',D_in_m2.shape)
print('# Decoder Input',D_ou_m2.shape)

# Encoder Input (1443, 10000)
# Decoder Input (1443, 200)
# Decoder Input (1443, 200, 40)


In [34]:
encoder_inputs = Input(shape=(None,))
x = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
x, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(x)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
x = Embedding(num_decoder_tokens, latent_dim)(decoder_inputs)
x, _,_ = LSTM(latent_dim, return_sequences=True, return_state=True)(x, initial_state=encoder_states)
#x = Lambda(lambda xin: Ksum(xin, axis=1))(x)
decoder_outputs = Dense(40, activation='softmax')(x)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model_m2 = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile & run training
model_m2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Note that `decoder_target_data` needs to be one-hot encoded,
# rather than sequences of integers like `decoder_input_data`!

In [35]:
epochs=120
batch_size=96

In [36]:
model_m2.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 20)     1700        input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 20)     1700        input_4[0][0]                    
____________________________________________________________________________________________

In [None]:
history=model_m2.fit([X_m2, D_in_m2], D_ou_m2,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

In [None]:
save_model(model_m2,'m2_')
with open('m1_history_dict', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)

In [None]:
m1=reload_model('m1_')

### 4 Validation

#### 4.1 Inference Setup

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

#### 4.1 Validate over Train Data

In [125]:
def compare_output(seq_num, model, train_df=labeled_data_df):
    tr=train_df.iloc[1]['Notice']
    tl=train_df.iloc[1]['Addresses'][0]
    tr_in=np.array(encode_label_seq(tr))
    tr_in2=np.zeros(99)
    tr_in2[0]=38
    #tr_in2=np.array(encode_label_seq('311 Rosario Haven Suite 194 Jessicamouth, HI 19398'))
    print('Train Label:',tl)
    tp_ou=m1.predict([[tr_in],[tr_in2]])
    tp=argmax(tp_ou,axis=2)
    print('Predicted Label\n',decode_label_seq(tp[0]))

In [171]:
tr=labeled_data_df.iloc[1]['Notice']
tl=labeled_data_df.iloc[1]['Addresses'][0]
tr_in=np.array(encode_label_seq(tr))
tr_in2=np.zeros(99)
tr_in2[0]=38
#tr_in2=np.array(encode_label_seq('311 Rosario Haven Suite 194 Jessicamouth, HI 19398'))
print('Train Label:',tl)
tp_ou=m1.predict([[tr_in],[tr_in2]])
tp=argmax(tp_ou,axis=2)
print('Predicted Label\n',decode_label_seq(tp[0]))

Train Label: 311 Rosario Haven Suite 194 Jessicamouth, HI 19398
Predicted Label
 83                                                                _________________________________


In [172]:
#def predict_feed(model, encoder_input, decoder_input)
encoder_input=tr_in
decoder_input=tr_in2

for i in range(len(decoder_input)-1):
    next_char=argmax(m1.predict([[encoder_input],[decoder_input]]),axis=2)[0][i+1]
    decoder_input[i+1]=next_char


In [175]:
decode_label_seq(decoder_input)

'<3 777 a pa 777 777________________________________________________________________________________'

In [165]:
argmax(m1.predict([[encoder_input],[decoder_input]]),axis=2)[0]

array([ 8,  8,  8, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [156]:
argmax(m1.predict([[encoder_input],[decoder_input]]),axis=2)[0][0]

8

In [135]:
tp[0]

array([ 8,  3, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
       37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [126]:
compare_output(11,m1)

Train Label: 311 Rosario Haven Suite 194 Jessicamouth, HI 19398
Predicted Label
 83                                                                _________________________________
