In [1]:
import os
import importlib
import logging
importlib.reload(logging)
import framework
importlib.reload(framework)
import bert_ner
importlib.reload(bert_ner)
import infer_bert_classifier
importlib.reload(infer_bert_classifier)
import bert_utils
importlib.reload(bert_utils)
import pandas as pd
import webbrowser
from framework import DataCuration, FeatureEngineering
from bert_ner import TaskNER, FeatureEngineeringNER, BERTNER

# Logging and credentials.
# Raise the root logger to INFO so the INFO-level progress messages emitted while
# the cells run are shown in the cell output.
logging.getLogger().setLevel(logging.INFO)

# Security: never commit an API token to a notebook. The token is read from the
# IB_ACCESS_TOKEN environment variable instead (export it before starting Jupyter).
# It falls back to an empty string when the variable is unset.
# NOTE(review): the configs in this notebook set 'is_local': True, so the token is
# presumably unused for local runs -- confirm against DataCuration.
# NOTE(review): the token that used to be hardcoded here remains in version
# history and should be revoked/rotated.
ACCESS_TOKEN = os.environ.get('IB_ACCESS_TOKEN', '')

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahsaasbajaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahsaasbajaj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Set up the task details. This notebook handles NER (for labeling person and company names)

In [2]:
# Dataset selector; per the original note, 'w2' and 'resume' are the supported values.
# NOTE(review): DATASET is not referenced by any other cell visible in this notebook.
DATASET = 'w2'

# Task definition handed to TaskNER: the task name, the number of labels, and
# the mapping from label name to integer id (person / org / none).
TASK_CONFIG = dict(
    task='ner',
    num_labels=3,
    labels_dict=dict(person=0, org=1, none=2),
)

task = TaskNER(TASK_CONFIG)

## Set paths for datasets and goldens (local or ib, both work).
### Specify configurations

In [3]:
# Root folder of the W2 dataset. Set the W2_DATA_ROOT environment variable to run
# the notebook on another machine; the default is the original author's location,
# so behaviour is unchanged when the variable is not set.
W2_ROOT = os.environ.get(
    'W2_DATA_ROOT', '/Users/ahsaasbajaj/Documents/Data/w2-instabase'
)

# Both configs take their 'path' as a list.
W2_DATA = [os.path.join(W2_ROOT, 'flow', 's2_map_records')]
W2_GOLDEN = [os.path.join(W2_ROOT, 'golden', 'goldens.csv')]

# Goldens: a local CSV indexed by its 'filename' column.
GOLDEN_CONFIG = {
    'path': W2_GOLDEN,
    'is_local': True,
    'index_field_name': 'filename',
    'file_type': 'csv',
    'identifier': 'file'
}

# Dataset: local ibocr files.
DATASET_CONFIG = {
    'path': W2_DATA,
    'is_local': True,
    'file_type': 'ibocr',
    # Identifier of a file = its base name with everything from the first
    # '.ibocr' onwards removed.
    # NOTE(review): presumably this must match the goldens' 'filename' index -- confirm.
    'identifier': lambda path: os.path.basename(path).split('.ibocr')[0],
    'convert2txt': True
}

data = DataCuration(ACCESS_TOKEN, DATASET_CONFIG, GOLDEN_CONFIG)

INFO:root:Loading dataset from /Users/ahsaasbajaj/Documents/Data/w2-instabase/flow/s2_map_records
INFO:root:142 files loaded
INFO:root:Loading goldens from /Users/ahsaasbajaj/Documents/Data/w2-instabase/golden/goldens.csv
INFO:root:Total files Goldens: (154, 25)
INFO:root:Total files found in the source with unique index: (142, 25)


In [4]:
# Preview the first rows of the loaded goldens table.
# NOTE(review): the rendered output of this cell contains employee SSNs and names;
# clear or redact the output before sharing or committing the notebook.
data.golden.head()

Unnamed: 0_level_0,employee_ssn,box5_medicare_wages,box3_ss_wage,box6_medicare_withholding,box4_ss_withholding,box2_fed_withhold,box17_state_income_tax,box1_wage,box8_allocated_tips,box14_other,...,box12c_amount,box12d_code,box12d_amount,employer_federal_ein,document_type,template_name,employer_name,employee_name,w2_year,gross_pay
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
last_year_w2_1493334985571.PDF,561-87-0728,36505.83,36505.83,529.33,2263.36,4093.92,519.22,36505.83,,"[""328.55""]",...,,,,01-0726495,W2,general_w2,BROKER SOLUTIONS,PATRICIA HEREDIA,2016.0,39105.41
last_year_w2_1493334989467.PDF,408-31-3195,51350.25,51350.25,744.58,3183.72,6940.69,,47242.23,,,...,,,,06-1102358,W2,general_w2,FORMAN INDUSTRIES INC,THOMAS V. MOORE,2016.0,51350.25
last_year_w2_1493334998968.PDF,261-77-1595,105916.49,105916.49,1535.82,6566.82,24471.02,,105916.49,,,...,,,,36-4248787,W2,general_w2,"YASH-LUJAN CONSULTING INC Y & L CONSULTING, INC",STACY L STUMETZ,2016.0,110240.0
last_year_w2_1493335006405.PDF,452-93-6475,35987.53,35987.53,521.82,2231.23,2814.31,,35987.53,,,...,,,,74-2482708,W2,general_w2,TECO-WESTINGHOUSE MOTOR COMPANY,HENRY COTTLE,2016.0,43827.05
last_year_w2_1493752474038.PDF,365-04-7683,85245.86,85245.86,1236.06,5285.24,13629.89,3129.87,77722.96,,"[""2069.50"", ""9.00""]",...,10815.96,,,75-2778918,W2,general_w2,FLOWSERVE US INC,JASON ALLEN JERZ,2016.0,88420.2


In [5]:
# Settings passed to candidate-phrase generation.
PROCESSING_CONFIG = dict(X_DIST_THRESHOLD=200)

# Shared arguments for feature engineering and the model: the task, the curated
# dataset, and which golden column holds the expected value for each label.
DATA_ARGS = dict(
    task=task,
    dataset=data,
    candidates_fields=dict(person='employee_name', org='employer_name'),
)

# Build candidate phrases for every file, then report how many golden names
# appear among those candidates.
data.generate_candidates_phrases(PROCESSING_CONFIG)
data.compare_candidates_and_goldens(DATA_ARGS['candidates_fields'])

INFO:root:Generating candidates for 142 files
INFO:root:For X_DIST_THRESHOLD configuraion: 200
INFO:root:total files: 142
person names found in candidates: 130
org names found in candidates: 69



### Generate test data from goldens (from actual person and company names) or from ibocr (using candidate phrases generated by processIBOCR2candidatePhrases())

In [6]:
fe = FeatureEngineeringNER(DATA_ARGS)

# Two test sets, built in this order:
#   - from goldens: a single dataframe
#   - from candidates: a dict of {'filename': dataframe}
test_data_from_goldens, test_data_from_candidates = (
    fe.generate_test_samples_from_goldens(),
    fe.generate_test_samples_from_candidates(),
)

### Load the fine-tuned model for inference. These models were trained separately on GPUs.

In [7]:
# Root folder of the fine-tuned checkpoints. Set the NER_MODEL_ROOT environment
# variable to point elsewhere; the default is the original author's location, so
# behaviour is unchanged when the variable is not set.
MODEL_ROOT = os.environ.get(
    'NER_MODEL_ROOT',
    '/Users/ahsaasbajaj/Documents/Code/ner-hf/sequence-classification',
)

MODEL_PATHS = {
    # trained on public w2 from Kaggle
    'w2': os.path.join(MODEL_ROOT, 'w2', 'no-address', '5', 'model.pt'),
    # trained on public names repo
    'public': os.path.join(MODEL_ROOT, 'public', 'no-address', '200', 'model.pt'),
}

TRAINING_ARGS = {
    'model_file_or_path': MODEL_PATHS['w2'],
    'model_type': 'bert-large-cased',
    # Kept in sync with the task definition rather than repeated as a literal.
    'num_labels': TASK_CONFIG['num_labels'],
    'gpu': False,
}

model = BERTNER(DATA_ARGS, TRAINING_ARGS)

### Set up the model evaluator and evaluate using either the test data generated from goldens (test_data_from_goldens) or all candidate strings (test_data_from_candidates).
#### The code below runs BERT inference and performs extraction, then calculates recall, precision, and F1 by comparing with the goldens.

In [8]:
# Quick debugging run: evaluate on a small subset of the per-file candidate
# samples instead of every file.
#
# To evaluate on the golden-derived samples instead, run
# model.predict(test_data_from_goldens) (which yields a single dataframe) and pass
# the result to model.analyze_golden_result().
DEBUG_SAMPLE_SIZE = 5
test_data = FeatureEngineering.get_subset_for_debugging(
    test_data_from_candidates,
    sample_size=DEBUG_SAMPLE_SIZE,
)

# predict() returns a dictionary here; its number of keys is reported as the
# number of files.
output = model.predict(test_data)
print('Number of files: ', len(output.keys()))

# Evaluation results; reused by the demo cells further down.
results = model.analyze_result(output)

INFO:root:inferring BERT classifier for file last_year_w2_1495562738481.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1493919897445.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1494974543429.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1494271195603.PDF
INFO:root:inferring BERT classifier for file last_year_w2_1494967235432.PDF
INFO:root:For field person, recall: 0.8000, precision: 0.4000, F1: 0.5333 
INFO:root:For field org, recall: 0.2000, precision: 0.0667, F1: 0.1000 
Number of files:  5


In [12]:
# Folder holding the source PDFs. The dataset root can be overridden with the
# W2_DATA_ROOT environment variable; the default is the original author's
# location, so behaviour is unchanged when the variable is not set.
DIR_PATH = os.path.join(
    os.environ.get('W2_DATA_ROOT', '/Users/ahsaasbajaj/Documents/Data/w2-instabase'),
    'pdf',
)

# Choose one file from the list printed above (Samples)
DEMO_FILE = 'last_year_w2_1495562738481.PDF'

# Build the path with os.path.join rather than manual '/' concatenation.
FILE_PATH = os.path.join(DIR_PATH, DEMO_FILE)

# Open the PDF in the default browser; the call's boolean result is the cell output.
webbrowser.open_new(r'file:' + FILE_PATH)

True

In [13]:
# Log, for each field type (person, org), the strings extracted for the demo file.
model.demo(results, DEMO_FILE)

INFO:root:Field type: person
INFO:root:filename: last_year_w2_1495562738481.PDF
INFO:root:{'cated Tips', 'AMY Y GHOLSTON'}
INFO:root:Field type: org
INFO:root:filename: last_year_w2_1495562738481.PDF
INFO:root:{'DFAS ATTN : DFASIN/ JARE', 'Third-party', 'DFAS ATTN : DFASIN/ JAREA'}


In [14]:
# Show which files have entries in the 'person' results (keys are file names).
results['person'].keys()

dict_keys(['last_year_w2_1495562738481.PDF', 'last_year_w2_1493919897445.PDF', 'last_year_w2_1494974543429.PDF', 'last_year_w2_1494271195603.PDF', 'last_year_w2_1494967235432.PDF'])