# Experiment 2: Using text in the dataset CSV
## Approach: 
* Universal Sentence Encoder Large 
* Template matrix built from text encodings 
* Quantiphi text preprocessing.
* Similarity measured between template and document.
* Documents whose similarity is below 0.7 are classified as 'other'.

## Dataset:
* interim_output_template_500_documents_rev4.xlsx.
* 491 documents tested.
* 19 templates used - [acdbcf, ahwcf, ahwcf_v3, ahwcf_v4, aicf_pg1, aicf_pg2, aicf_v1, aicf_v2, aicf_v3, canscr, clmapp, hicf_pg1, hicf_pg2, init_pg1_v2, init_pg3, phystmt, ptscf, pvbcf]
* Available [here](s3://aaca-ani-experiments-data/aaca-docdig-test/offline_cf_classification/templates/template_images/)

# Result: 75% F1 Score -- the preprocessing used to extract the text in the CSV appears to be detrimental.

In [26]:
from utils import (s3_2_pil, text_from_response, 
                   load_pickle_from_s3, read_template_labels,
                   fix_others)
import configparser
import boto3
import pandas as pd
import tensorflow_hub as hub
from offline_cf_classification import get_similar_form, classify_offline_cf
from sklearn.metrics import classification_report

In [9]:
# Load the experiment settings from config.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing or
# unreadable. Fail here with a clear message instead of a confusing KeyError
# on the first section lookup below.
CONFIG_FILE = 'config.ini'
config = configparser.ConfigParser()
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_FILE!r}")

# S3 bucket used by this experiment.
BUCKET_NAME = config['TextTest']['UPLOAD_BUCKET_NAME']

# Template artifacts live under a common prefix; the file names are appended
# to it verbatim, so `upload_path` is expected to already end with a separator
# (TODO confirm against config.ini).
MATRIX_DIR = config['Templates']['upload_path']
MATRIX_PATH = MATRIX_DIR + config['Templates']['matrix_file']
JSON_PATH = MATRIX_DIR + config['Templates']['templates_file']

# Location of the spreadsheet with the documents to classify.
dataset_path = config['TextTest']['DATASET']

s3resource = boto3.resource('s3')

In [7]:
# Fetch the template matrix and the template labels from the experiment bucket.
# NOTE(review): load_pickle_from_s3 presumably unpickles the downloaded object;
# unpickling can execute arbitrary code, so confirm the bucket is trusted.
use_matrix = load_pickle_from_s3(BUCKET_NAME, MATRIX_PATH)
# NOTE(review): the labels are presumably ordered to match the rows of
# use_matrix -- verify against read_template_labels.
template_labels = read_template_labels(BUCKET_NAME, JSON_PATH, s3resource)

In [14]:
# Read the encoder location from the [Templates] config section and load it
# through TensorFlow Hub. The notebook header names Universal Sentence Encoder
# Large as the intended model; the model actually loaded is whatever MODEL_URL
# points to.
MODEL_URL = config['Templates']['MODEL_URL']
model = hub.load(MODEL_URL)

In [31]:
# Read the dataset workbook with the openpyxl engine, using the first
# spreadsheet column as the DataFrame index, then preview the first rows.
df = pd.read_excel(
    dataset_path,
    engine='openpyxl',
    index_col=0,
)
df.head()

Unnamed: 0,json,template,textarct_key_value_dictionary,text,png_path,template_from_interim_logic,Unnamed: 7,revised_template
0,doc-digitization-pipeline/AD/P0X024W1_02751999...,ahwcf,"{'ZIP:': '3 2 5 0 4', 'Primary Pollcyholder': ...",02-26-21:11:174M; ;11 # 2/ 2 20 ACCIDENT WELLN...,doc-digitization-pipeline/AD/P0X024W1_02751999...,ahwcf,,ahwcf_v3
1,doc-digitization-pipeline/AD/PX372696_02641791...,aicf_pg2,"{'*Date of Birth (mm/dd/yy)': '/ /', 'DATE': '...",PX372696 Policyholder Information: *Last Name ...,doc-digitization-pipeline/AD/PX372696_02641791...,aicf_pg2,,aicf_pg2
2,doc-digitization-pipeline/AD/P0342280_02744692...,aicf_v1,"{'State': 'IL', 'Initial': 'R.', 'ZIP': '61234...",ACCIDENTAL INJURY CLAIM FORM Failure to comple...,doc-digitization-pipeline/AD/P0342280_02744692...,other,,aicf_v1
3,doc-digitization-pipeline/AD/P0L700T5_02844807...,aicf_pg2,"{'No': 'X', '*First Name': 'M I C H A E L', 'D...",if you have additional bills or medical docume...,doc-digitization-pipeline/AD/P0L700T5_02844807...,aicf_pg2,,aicf_pg2
4,doc-digitization-pipeline/AD/P0P9L5Y4_02716028...,aicf_pg2,"{'""Date of Birth (mm/dd/yy)': 'D 8 / I 8 / 0 6...",02-10-21:02:42PM; AFLAC Claims ;7149869600 3/ ...,doc-digitization-pipeline/AD/P0P9L5Y4_02716028...,aicf_pg2,,aicf_pg2


In [43]:
# Classify every document and record the predicted class and its score.
results, score = [], []

# Only the `text` column feeds the classifier, so iterate that column directly
# instead of materialising a full row Series per document with iterrows().
for text in df['text']:
    # classify_offline_cf returns a (class, score) pair for one document's
    # text, given the encoder, the template matrix and the template labels.
    predicted_class, probability = classify_offline_cf(text, model, use_matrix, template_labels)

    results.append(predicted_class)
    score.append(probability)

# Both lists are in row order, so they line up positionally with df.
df["results"] = results
df["score"] = score

In [45]:
# Preview the first rows, which now carry the `results` and `score` columns.
df.head()

Unnamed: 0,json,template,textarct_key_value_dictionary,text,png_path,template_from_interim_logic,Unnamed: 7,revised_template,results,score
0,doc-digitization-pipeline/AD/P0X024W1_02751999...,ahwcf,"{'ZIP:': '3 2 5 0 4', 'Primary Pollcyholder': ...",02-26-21:11:174M; ;11 # 2/ 2 20 ACCIDENT WELLN...,doc-digitization-pipeline/AD/P0X024W1_02751999...,ahwcf,,ahwcf_v3,ahwcf_v3,0.825596
1,doc-digitization-pipeline/AD/PX372696_02641791...,aicf_pg2,"{'*Date of Birth (mm/dd/yy)': '/ /', 'DATE': '...",PX372696 Policyholder Information: *Last Name ...,doc-digitization-pipeline/AD/PX372696_02641791...,aicf_pg2,,aicf_pg2,aicf_pg2,0.926063
2,doc-digitization-pipeline/AD/P0342280_02744692...,aicf_v1,"{'State': 'IL', 'Initial': 'R.', 'ZIP': '61234...",ACCIDENTAL INJURY CLAIM FORM Failure to comple...,doc-digitization-pipeline/AD/P0342280_02744692...,other,,aicf_v1,aicf_v1,0.939173
3,doc-digitization-pipeline/AD/P0L700T5_02844807...,aicf_pg2,"{'No': 'X', '*First Name': 'M I C H A E L', 'D...",if you have additional bills or medical docume...,doc-digitization-pipeline/AD/P0L700T5_02844807...,aicf_pg2,,aicf_pg2,aicf_pg2,0.893503
4,doc-digitization-pipeline/AD/P0P9L5Y4_02716028...,aicf_pg2,"{'""Date of Birth (mm/dd/yy)': 'D 8 / I 8 / 0 6...",02-10-21:02:42PM; AFLAC Claims ;7149869600 3/ ...,doc-digitization-pipeline/AD/P0P9L5Y4_02716028...,aicf_pg2,,aicf_pg2,ptscf,0.775516


In [46]:
# Run fix_others on each row (axis=1) and overwrite the predictions with
# whatever it returns for that row.
df['results'] = df.apply(fix_others, axis=1)

In [47]:
# Per-class precision / recall / F1 of the predictions against the revised
# ground-truth labels.
# Some classes have no predicted or no true samples, which makes precision or
# recall undefined. zero_division=0 reports 0.0 for those cases -- the same
# value the default ("warn") produces -- without emitting an
# UndefinedMetricWarning for each of them.
report = classification_report(
    df['revised_template'],
    df['results'],
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

      acdbcf       0.11      1.00      0.20         1
       ahwcf       0.00      0.00      0.00         6
    ahwcf_v2       0.00      0.00      0.00         1
    ahwcf_v3       1.00      0.50      0.67         2
    ahwcf_v4       0.50      1.00      0.67         2
    ahwcf_v5       0.00      0.00      0.00         0
    aicf_pg1       1.00      0.43      0.60       204
    aicf_pg2       1.00      0.94      0.97       192
     aicf_v1       0.92      0.92      0.92        25
     aicf_v2       0.05      1.00      0.09         5
     aicf_v3       1.00      1.00      1.00         1
      canscr       1.00      1.00      1.00         1
      clmapp       1.00      1.00      1.00         5
    hicf_pg1       0.00      0.00      0.00         0
    hicf_pg2       1.00      0.86      0.92         7
 hicf_pg2_v2       0.00      0.00      0.00         1
 init_pg1_v2       1.00      1.00      1.00         1
    init_pg3       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
