# Purpose
The purpose of this notebook is to evaluate using a neural network to identify up to three different variable phrases in a sentence.

## Import

### Packages

In [1]:
# General
import codecs, io, os, re, sys, time
from collections import OrderedDict 
from scipy.stats import uniform
from tqdm import tqdm

# Analysis
import numpy as np
import pandas as pd
from sklearn.metrics import \
    accuracy_score, classification_report, confusion_matrix, \
    precision_recall_fscore_support
from sklearn.model_selection import \
    ParameterGrid, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

# Visual
import matplotlib.pyplot as plt
import seaborn as sn

# Deep Learning
import tensorflow as tf
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from keras.layers.experimental.preprocessing import TextVectorization

### Custom Functions

In [2]:
sys.path.append('./')
from source_entity_extraction import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\canfi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\canfi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data
The training data is imported and the necessary columns are converted to lists.

In [3]:
#import data
path_dir_data ="./../data/input/"
file_training_data = 'causality_extraction_2021-02-27 -human_validation_and_correction_erc.xlsx'
path_training_data = os.path.join(path_dir_data, file_training_data)
df = pd.read_excel(path_training_data,  sheet_name = "GroundTruth", engine='openpyxl')

# inspect
df.sample(5)

Unnamed: 0,file_name,hypothesis_num,hypothesis,variable_1,variable_2,direction_12,variable_3,direction_3,causal_relationship
512,wb93smj.pdf,h_5c,environmental complexity will be positively as...,environmental complexity,strategic change,pos,,,0.0
22,bb95jibs.pdf,h_6,japanese subsidiaries with ideal matches betwe...,japanese subsidiaries with ideal matches betwe...,subsidiaries with mismatches,pos,,,0.0
148,ct05crr.pdf,h_2b,Organizational reputation is positively relate...,organizational reputation,customers’ orders (ACO),pos,,,0.0
408,ntll98ijhrm.pdf,,Japanese and local Chinese mncs will emphasize...,,,,,,
453,sgd05amj.pdf,h_2a,the direct relationship between voluntary turn...,voluntary turnover,financial performance,neg,workforce performance,non-lin,1.0


## Randomness
To better control and compare results of the Entity Extraction model between the environments where the model is trainined (Python) and where it will be implemented (R/Shiny), we will attempt to control any random actions by the process to maintain consistent results.

In [4]:
random_state = 5590
np.random.seed(random_state)
tf.random.set_seed(random_state)

# Functions

In [5]:
def text_processing(string):
    # Remove commas
    string = string.replace(",","")
    
    # Remove non-compound word hyphens
    string = string.replace(" - ","")
    
    return string


def encode_hypothesis_str(hypo, var_map):
    '''
    Identifies variable segments in the hypothesis statement and returns a numerical list
    with each word in the hypothesis labeled 0, 1, 2, or 3.
    '''

    # Convert hypothesis to list of words
    hypo_w = hypo.split(" ")
    
    # Create numerical encoding of hypothesis
    hypo_enc = [0] * len(hypo_w)
    
    # Extract variable
    for var_num, var_text in var_map.items():
        # stop if text is na
        if pd.isna(var_text): break
        
        # Determine variable length
        l_var = len(var_text.split(" "))
    
        # Search for variable in hypothesis by inspecting each 
        # segment equal in word length to the variable
        for i in range(l_var, len(hypo_w) + 1):
            
            # Extract segment
            segment = " ".join(hypo_w[i-l_var:i])
           
            # Capture start/stop indices and replace numerical hypothesis
            # label if the segment matches the variable
            if segment == var_text:
                idx_s = i-l_var
                idx_e = i

                hypo_enc[idx_s:idx_e] = [var_num] * l_var
            
                # Once identified move on to next variable
                break
    return hypo_enc


def encode_hypothesis(row):
    # Extract inputs
    ## Hypothesis
    hypothesis = row['hypothesis']
    
    ## Variable maps
    var_map = {
        1: row['variable_1'],
        2: row['variable_2'], 
        3: row['variable_3'],
    }
    
    # Perform text processing
    hypothesis = text_processing(hypothesis)
    
    for var_num, var_text in var_map.items():
        if not pd.isna(var_text):
            var_map[var_num] = text_processing(var_text)
    
    # Generate encoded hypothesis
    hypothesis_encoded = encode_hypothesis_str(hypothesis, var_map)
    
    return hypothesis_encoded


# Target Generation

In [22]:
df['hypothesis_encoded'] = df.apply(lambda row: encode_hypothesis(row), axis=1)

In [32]:
df['hypothesis_encoded_check'] = df['hypothesis_encoded'].apply(lambda x:set(x))
df['n_var_actual'] = df['hypothesis_encoded_check'].apply(lambda x: len(x))

# How many variable numbers we should see
df['n_var_goal'] = [3 if pd.isna(x) else 4 for x in  df['variable_3']]

In [None]:
[f(x) if condition else g(x) for x in sequence]

In [34]:
df['verify_var_count'] = df['n_var_actual'] == df['n_var_goal']

In [39]:
df[~df['verify_var_count'] ]

Unnamed: 0,file_name,hypothesis_num,hypothesis,variable_1,variable_2,direction_12,variable_3,direction_3,causal_relationship,hypothesis_encoded,hypothesis_encoded_check,n_var,n_var_actual,n_var_goal,verify_var_count
12,ay14ijbs.pdf,h_5,The level of reverse logistics adoption has a ...,level of reverse logistics adoption,firm performance,pos,,,1.0,"[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]","{0, 1}",2,2,3,False
13,azw06meq.pdf,h_1,There is a positive relationship between a fir...,There is a positive relationship between a fir...,performance,pos,,,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{0},1,1,3,False
14,azw06meq.pdf,h_2,There is a positive relationship between a fir...,There is a positive relationship between a fir...,performance,pos,,,0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","{0, 1}",2,2,3,False
15,azw06meq.pdf,h_3,There is a positive relationship between a fir...,There is a positive relationship between a fir...,performance,pos,,,0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","{0, 1}",2,2,3,False
16,azw06meq.pdf,h_4,The implementation cost of ISO 14001 is well w...,implementation cost of iso 14001,firm's performance,pos,,,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",{0},1,1,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,zi06amj.pdf,h_2,Continued investments in\nhigh-involvement wor...,,practices lessen the negative consequences of ...,neg,,,1.0,"[0, 0, 0, 0, 0]",{0},1,1,3,False
534,zmzttj11jcp.pdf,h_2a,Government’s driving forces impact significant...,Government’s driving forces,economic performance of smes,non_lin,,,1.0,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0]","{0, 1}",2,2,3,False
540,zmzttj11jcp.pdf,h_3c,for smes of the two pollution levels: high-pol...,,relationship between environmental performance...,non_lin,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",{0},1,1,3,False
544,zn02smj.pdf,h_4,use of external technical manufacturing source...,use of external technical manufacturing,successful TC,pos,,,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 2, 2]","{0, 2}",2,2,3,False
