In [1]:
## import
import os
import docxpy
from glob import glob
import pandas as pd
import spacy 
  
nlp = spacy.load('en_core_web_sm') 

In [2]:
# Load the labelled agreements table; later cells read its 'File Name' column to locate the matching .docx files.
tdf = pd.read_csv("../data/TrainingTestSet.csv")

In [22]:
# Split the rows of the training dataframe by whether the matching .docx file exists on disk.
# `present` keeps (row position, path) pairs so the rows can be pulled back out with `tdf.iloc`;
# `absent` keeps only the paths that were not found.
present, absent = [], []
for index, file_name in enumerate(tdf['File Name'].values):
    file_path = fr"../data/Training_data/{file_name}.pdf.docx"
    if os.path.exists(file_path):
        present.append((index, file_path))
    else:
        absent.append(file_path)
print(f"Total unique files in training dataframe: {tdf['File Name'].nunique()}")
# Count distinct paths rather than (row, path) pairs, so a file name that is
# repeated across rows is not counted more than once.
print(f"Total unique files in docx format: {len({path for _, path in present})}")

Total unique files in training dataframe: 55
Total unique files in docx fmrat: 43


In [28]:
# `present` holds (row position, path) pairs; keep only the positions and preview those rows.
indices = [i for i, _ in present]
tdf.iloc[indices].head()

Unnamed: 0,File Name,Aggrement Value,Aggrement Start Date,Aggrement End Date,Renewal Notice (Days),Party One,Party Two
8,6683127-House-Rental-Contract-GERALDINE-GALINA...,6500.0,20.05.2007,20.05.2008,15.0,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO
9,6683129-House-Rental-Contract-Geraldine-Galina...,6500.0,20.05.2007,20.05.2008,15.0,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO
10,18325926-Rental-Agreement-1,4000.0,05.12.2008,31.11.2009,90.0,MR.K.Kuttan,P.M. Narayana Namboodri
12,36199312-Rental-Agreement,3800.0,01.05.2010,31.04.2011,30.0,Balaji.R,Kartheek R
13,44737744-Maddireddy-Bhargava-Reddy-Rental-Agre...,3000.0,20.09.2010,19.07.2011,,M.V.V. VIJAYA SHANKAR,MADDIREDDY BHARGAVA REDDY


In [60]:
# Read one sample agreement through docxpy; the result is bound to `text` (later cells pass such values to `nlp` as plain text).
text = docxpy.process("../data/Training_data/6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1.pdf.docx")

In [57]:
# ## for all training data, extract and save the txt files
# for present_file in present:
#     text = docxpy.process(present_file[1])
#     new_file_path = present_file[1].replace("Training_data", "Training_data_text") + '.txt'
#     with open(new_file_path, "wb") as f:
#         f.write(text.encode("utf-8"))

## Run NER on the data

In [39]:
def print_all_entities(text_string):
    """Run the module-level ``nlp`` pipeline on ``text_string`` and print its entities.

    One line is printed per entity, in document order: the entity text, its
    start and end character offsets, and its label.
    """
    for entity in nlp(text_string).ents:
        print(entity.text, entity.start_char, entity.end_char, entity.label_)
        
# Smoke-test the helper on a short example sentence.
sentence = "Apple is looking at buying U.K. startup for $1 billion"
print_all_entities(sentence)



Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [91]:
# Find the most closely matching phrase -- flagged by the author as not working.
# For each labelled field of one training row (the first column is skipped), pick
# the spaCy entity whose lower-cased text is most similar to the field value
# according to difflib's SequenceMatcher ratio.
import difflib
index = 14
# The document and its entities do not depend on the field being matched, so the
# file is read and parsed once up front instead of once per column.
text = docxpy.process(present[index][1])
doc = nlp(text)
for x in tdf.iloc[present[index][0]].values[1:]:
    x=str(x).lower()
    best_match_score, best_match = 0, None
    for ent in doc.ents:
        match_score = difflib.SequenceMatcher(None, ent.text.lower(), x).ratio()
        # strict '>' keeps the first entity among equally scored candidates
        if match_score > best_match_score:
            best_match_score = match_score
            best_match = ent
    print(x, "--best match-->", best_match)
        

9000.0 --best match--> 01-09-2011
01.09.2011 --best match--> 01-09-2011
31.08.2012 --best match--> 01-09-2011
nan --best match--> Tenant
s parthasarathy --best match--> S Parthasarathy
hari kiran tholeti --best match--> Hari Kiran Tholeti


In [92]:
# print_all_entities(text)
# difflib.SequenceMatcher(None, ent.text.lower()).ratio()

## read txt file and extract new annotations

In [95]:
# with open("../data/Training_data_text/100999172-House-Rental-Agreement.pdf.docx.txt", "r") as f:
#     text = f.read()

In [3]:
import re
from glob import glob

new_labels = ['[start]', '[partyone]', '[partytwo]', '[rent]', '[end]', '[duration]']

# One inline annotation has the form ``{{value}}[label]``.
#   group 1: the value, which may not run past the first ``}}`` (so a stray,
#            unlabeled ``{{..}}`` earlier on the line is never swallowed);
#   group 2: the bracketed label that directly follows the closing ``}}``.
_ANNOTATION_RE = re.compile(r"\{\{((?:(?!\}\}).)*)\}\}(\[.*?\])")


def clean_text(text_str):
    """Return ``text_str`` with every annotation marker removed.

    The markers are the label tokens listed in ``new_labels`` and the ``{{`` /
    ``}}`` value delimiters. Labels are removed first, then the delimiters.
    """
    for marker in new_labels + ['{{', '}}']:
        text_str = text_str.replace(marker, "")
    return text_str


def get_training_example(text):
    """Extract ``{{value}}[label]`` annotations from an annotated document.

    Parameters
    ----------
    text : str
        Document text containing inline annotations such as ``{{5000}}[rent]``.

    Returns
    -------
    tuple
        ``(training_example, cleaned_text)``. ``cleaned_text`` is
        ``clean_text(text)``. ``training_example`` is a list of
        ``(start, end, value, label)`` tuples where ``start``/``end`` are
        character offsets of ``value`` inside ``cleaned_text`` and ``label``
        keeps its surrounding brackets (for example ``'[rent]'``).
    """
    training_example = []
    cursor = 0        # position in the raw text just after the previous annotation
    cleaned_len = 0   # length of the cleaned text produced so far
    for m in _ANNOTATION_RE.finditer(text):
        # Measure the gap in its *cleaned* form so that marker text removed
        # outside of annotations (e.g. a stray "[end]") cannot shift the offsets.
        cleaned_len += len(clean_text(text[cursor:m.start()]))
        value = clean_text(m.group(1))
        start = cleaned_len
        end = start + len(value)
        training_example.append((start, end, value, m.group(2)))
        # Advance by whatever the annotation leaves behind after cleaning: the
        # value, plus the label itself when it is not one of ``new_labels``.
        cleaned_len += len(clean_text(m.group(0)))
        cursor = m.end()
    return training_example, clean_text(text)

# Parse every annotated text file. A file whose parsing raises is reported and
# skipped instead of being dropped silently; files without any annotation are
# filtered out in a later step.
all_training_examples = []
for file_path in glob("../data/Training_data_text/*"):
    with open(file_path, 'rb') as f:
        text = str(f.read(), "utf-8")
    try:
        example = get_training_example(text)
    except Exception as e:
        # Best-effort: keep going, but say which file was dropped and why.
        print(f"Skipping {file_path}: {e!r}")
        example = None
    if example is not None:
        all_training_examples.append(example)

In [4]:
# Drop the files whose text yielded no annotations, then show how many usable examples remain.
all_training_examples = list(
    filter(lambda example: len(example[0]) > 0, all_training_examples)
)
len(all_training_examples)

15

In [5]:
# Reshape each (annotations, cleaned_text) pair into the
# (text, {"entities": [(start, end, label), ...]}) tuples consumed by the training cells.
TRAIN_DATA = [
    (cleaned, {"entities": [(span[0], span[1], span[3]) for span in spans]})
    for spans, cleaned in all_training_examples
]

15

## Training NER model

source: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

In [6]:
# Load pre-existing spacy model
# (this rebinds the notebook-wide `nlp`, so every later cell uses this pipeline)
import spacy
nlp=spacy.load('en_core_web_sm')

# Getting the pipeline component
ner=nlp.get_pipe("ner")

# Add the new label to ner
# (the bracketed annotation labels from `new_labels`, e.g. '[rent]')
for label in new_labels:
    ner.add_label(label)

# Resume training
# `optimizer` is passed as `sgd=` to nlp.update in the training cell.
optimizer = nlp.resume_training()
# NOTE(review): `move_names` is not referenced anywhere else in the visible notebook.
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
# (every pipeline component not named above; these are disabled while training)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [7]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :
    # Batch-size schedule. The generator is created once, outside the iteration
    # loop, so it is shared by every iteration rather than restarted each time.
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 200 iterations
    for itn in range(200):
        # shuffle examples before training (in place, so TRAIN_DATA order changes for later cells too)
        random.shuffle(TRAIN_DATA)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # dictionary to store losses (reset at the start of every iteration)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            # Calling update() over the iteration
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        # report the losses on every 10th iteration only
        if itn%10==0:
            print("Losses", losses)

Losses {'ner': 9899.401126128156}
Losses {'ner': 7544.85429251194}
Losses {'ner': 7483.520290374756}
Losses {'ner': 7121.713675275445}
Losses {'ner': 7388.790061235428}
Losses {'ner': 6944.072084188461}
Losses {'ner': 7312.4352350234985}
Losses {'ner': 7461.889449357986}
Losses {'ner': 7356.691516757011}
Losses {'ner': 7421.382605552673}
Losses {'ner': 7160.302977561951}
Losses {'ner': 7210.4822244644165}
Losses {'ner': 7289.49045753479}
Losses {'ner': 7094.23055934906}
Losses {'ner': 7381.638561248779}
Losses {'ner': 6797.321216583252}
Losses {'ner': 7244.598215103149}
Losses {'ner': 7048.762075424194}
Losses {'ner': 7175.387539863586}
Losses {'ner': 7185.857667922974}


In [46]:
# TRAIN_DATA[1][0]
len(TRAIN_DATA)

15

In [12]:
# Testing the model
# Sanity check on a document taken from TRAIN_DATA itself (not held-out data).
# TRAIN_DATA is shuffled in place by the training loop, so index 0 is an arbitrary document.
doc = nlp(TRAIN_DATA[0][0])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ['[start]', '[partyone]', '[partytwo]', '[rent]', '[end]', '[duration]', '{{', "}}"] ])

Entities [('Namashivayam', '[partyone]'), ('Mrs.', '[partyone]'), ('14500', '[rent]'), ('twelve months', '[duration]'), ('Jan 10, 2011.', '[start]'), ('Mrs.', '[partyone]')]


In [45]:
# spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)

# Save and load
# Save the  model to directory
# from pathlib import Path
# output_dir = Path('../models/blank_learned_ner/')
# nlp.to_disk(output_dir)
# print("Saved model to", output_dir)

# Load the saved model and predict
# print("Loading from", output_dir)
# nlp_updated = spacy.load(output_dir)
# doc = nlp_updated("Fridge can be ordered in FlipKart" )
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

## Training blank Spacy NER

In [35]:
# Train NER from a blank spacy model
# (this rebinds the notebook-wide `nlp` to a fresh English pipeline that contains only an NER component)
import spacy

nlp=spacy.blank("en")

nlp.add_pipe(nlp.create_pipe('ner'))

# Getting the pipeline component
ner=nlp.get_pipe("ner")

# Register every label that occurs in the training data *before* the model is
# initialised, so the NER component is built with its full label set.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Initialise the model weights only now that all labels are known (this is the
# order used by spaCy's own v2 NER training example).
nlp.begin_training()

# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [39]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
# Components listed in `unaffected_pipes` are disabled for the duration of the block.
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 400 iterations
  for iteration in range(400):

    # shuffling examples before every iteration (in place, so TRAIN_DATA order changes for later cells too)
    random.shuffle(TRAIN_DATA)
    # losses are reset at the start of every iteration
    losses = {}
    # batch up the examples using spaCy's minibatch
    # (a new compounding(...) schedule is created on every iteration, so the batch-size schedule restarts each time)
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        # NOTE(review): no `sgd=` optimizer is passed here, unlike the resumed-training loop -- confirm this is intended
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    # report the losses on every 10th iteration only
    if iteration%10==0:
        print("Losses", losses)

Losses {'ner': 5951.016310214996}
Losses {'ner': 5989.326205253601}
Losses {'ner': 6076.603830337524}
Losses {'ner': 6092.696279525757}
Losses {'ner': 6176.659318447113}
Losses {'ner': 5905.225286483765}
Losses {'ner': 6000.822365760803}
Losses {'ner': 5973.484649658203}
Losses {'ner': 5974.219406604767}
Losses {'ner': 6037.844934463501}
Losses {'ner': 6028.957887649536}
Losses {'ner': 5945.0911955833435}
Losses {'ner': 6037.5488867759705}
Losses {'ner': 6018.058259963989}
Losses {'ner': 5984.575963020325}
Losses {'ner': 6007.937124252319}
Losses {'ner': 5963.392322540283}
Losses {'ner': 5950.990727424622}
Losses {'ner': 5990.005153179169}
Losses {'ner': 5999.3633670806885}
Losses {'ner': 5989.132791519165}
Losses {'ner': 6098.023772239685}
Losses {'ner': 6000.967232704163}
Losses {'ner': 6048.398564815521}
Losses {'ner': 5942.407207489014}
Losses {'ner': 6024.710395812988}
Losses {'ner': 5993.76183795929}
Losses {'ner': 6014.270380973816}
Losses {'ner': 5859.240726470947}
Losses {'ner

In [42]:
# Testing the model
# Sanity check on a document taken from TRAIN_DATA itself (not held-out data).
# TRAIN_DATA is shuffled in place by the training loop, so index 2 is an arbitrary document.
doc = nlp(TRAIN_DATA[2][0])
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
# print("Entities", [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ['[start]', '[partyone]', '[partytwo]', '[rent]', '[end]', '[duration]', '{{', "}}"] ])

Entities [('M. Geetha', '[partyone]'), ('Siruvani Traders Private Limited', '[partytwo]'), ('Rs.2500', '[rent]'), ('11 Months', '[duration]')]


In [27]:
# TRAIN_DATA[6][0]

## Prepare test data and run model to generate output

In [43]:
# Run the trained pipeline over every validation document and collect one
# result record per file.
validation_result = []
for file in glob("../data/Validation_Data/*"):
    # read and convert the doc to text
    text = docxpy.process(file)
    # extract entities with the pipeline trained in this notebook; `nlp_updated`
    # was only ever assigned in commented-out code, so referring to it fails on
    # a freshly restarted kernel
    doc = nlp(text)
    # one entry per label; when a label is predicted several times, the last occurrence wins
    result = {ent.label_:ent.text for ent in doc.ents}
    result['file_name'] = file
    # save them to df
    validation_result.append(result)

In [44]:
pd.DataFrame(validation_result)

Unnamed: 0,file_name,[start],[partyone],[partytwo],[duration]
0,../data/Validation_Data\156155545-Rental-Agree...,,,,
1,../data/Validation_Data\195231682-This-RENTAL-...,06th day of March 2013 at Hyderabad.,C.BHAGYAMMA,"JP INTERIO,”",
2,../data/Validation_Data\228094620-Rental-Agree...,,,,eleven months
3,../data/Validation_Data\239419594-Rental-Agree...,,,,11 months
4,../data/Validation_Data\24158401-Rental-Agreem...,1st day of April 2008 (1-04-08) by and between...,,,Twelve thousand
5,../data/Validation_Data\269135973-Udaya-Rental...,,,,
6,../data/Validation_Data\63793679-Rental-Agreem...,01-09-2011) at Bangalore by and between Mr. S ...,,,eleven months
7,../data/Validation_Data\95980236-Rental-Agreem...,,,,


## Overlap between training and test set -- we ignore these files while testing

In [300]:
# Compare bare file names between the training and validation folders.
# os.path.basename strips the directory part whatever its length or separator,
# replacing the hard-coded prefix lengths that had to match each folder path exactly.
tfiles = [os.path.basename(x) for x in glob("../data/Training_data/*")]
vfiles = [os.path.basename(x) for x in glob("../data/Validation_Data/*")]
# file names that occur in both folders
set(tfiles) & set(vfiles)

{'195231682-This-RENTAL-AGREEMENT-is-Made-and-Executed-on-24th-Day-of-September.pdf.docx',
 '269135973-Udaya-Rental-Agreement.pdf.docx',
 '63793679-Rental-Agreement.pdf.docx',
 '95980236-Rental-Agreement.pdf.docx'}

In [302]:
vfiles

['156155545-Rental-Agreement-Kns-Home.pdf.docx',
 '195231682-This-RENTAL-AGREEMENT-is-Made-and-Executed-on-24th-Day-of-September.pdf.docx',
 '228094620-Rental-Agreement.pdf.docx',
 '239419594-Rental-Agreement.pdf.docx',
 '24158401-Rental-Agreement.pdf.docx',
 '269135973-Udaya-Rental-Agreement.pdf.docx',
 '63793679-Rental-Agreement.pdf.docx',
 '95980236-Rental-Agreement.pdf.docx']