# Obtaining DocBin for training with spaCy

In [1]:
import spacy
from spacy.tokens import DocBin
import pandas as pd
import re
from spacy import displacy

In [2]:
def massage_data(address):
    """Normalise punctuation in an address string and return the result.

    The substitutions run in this order, each on the output of the previous one:

    1. a comma that is not directly followed by whitespace becomes ``', '``;
    2. the two-character sequence backslash + ``n`` becomes ``', '``
       (an actual newline character is not matched by this pattern);
    3. a hyphen that is not directly followed by whitespace becomes ``' - '``;
    4. every period is removed.
    """
    # (pattern, replacement) pairs; order matters because each step sees the
    # text produced by the one before it.
    substitutions = (
        (r'(,)(?!\s)', ', '),
        (r'(\\n)', ', '),
        (r'(?!\s)(-)(?!\s)', ' - '),
        (r'\.', ''),
    )
    cleaned = address
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [3]:
def get_address_span(address=None,address_component=None,label=None):
    """Locate one address component inside an address string.

    Parameters
    ----------
    address : str
        Address text to search.
    address_component : str
        Text of the component to find. Periods are removed and a hyphen not
        followed by whitespace is rewritten as ``' - '`` before searching.
    label : str
        Entity label to attach to the span.

    Returns
    -------
    tuple or None
        ``(start, end, label)`` character offsets of the first whole-word
        occurrence of the component in ``address``; ``None`` when the address
        or component is missing (NaN/None, or the literal string ``'nan'`` for
        the component), when the component is empty after normalisation, or
        when it does not occur in the address.
    """
    if pd.isna(address) or pd.isna(address_component) or str(address_component)=='nan':
        return None

    # Apply the same period/hyphen normalisation to the component that the
    # original implementation applied before searching.
    component=re.sub(r'\.','',str(address_component))
    component=re.sub(r'(?!\s)(-)(?!\s)',' - ',component)
    if not component.strip():
        # An empty pattern would match a zero-width span, which is useless.
        return None

    # re.escape: the component is data, not a regex, so characters such as
    # '(', '+' or '#' must match literally.
    # Lookarounds instead of \b: identical for components that start/end with
    # a word character, but also work when the edge is punctuation.
    pattern=r'(?<!\w)'+re.escape(component)+r'(?!\w)'
    match=re.search(pattern,str(address))
    if match is None:
        # Component not present in the address: no span rather than a crash.
        return None
    return (match.start(),match.end(),label)

In [4]:
def extend_list(entity_list,entity):
    """Append ``entity`` to ``entity_list`` in place and return the same list.

    Nothing is appended when ``pd.isna(entity)`` is true (e.g. ``None`` or NaN);
    the list is then returned unchanged.
    """
    if not pd.isna(entity):
        entity_list.append(entity)
    return entity_list

In [5]:
def create_entity_spans(df,tag_list):
    """Build ``(address, [(start, end, label), ...])`` tuples for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``Address`` column plus the component columns
        ``Building_Name``, ``Building_Number``, ``Recipient``, ``Street_Name``,
        ``Zip_Code``, ``City``, ``Country`` and ``State``.
        The frame is modified in place: ``Address`` is replaced by its cleaned
        form, and one span column per component plus ``EntitySpans`` are added.
    tag_list : list of str
        Names of the span columns (e.g. ``"CityTag"``) to collect, in order,
        into each row's entity list.

    Returns
    -------
    pandas.Series
        The ``EntitySpans`` column: one ``(address, spans)`` tuple per row,
        where ``spans`` holds the non-missing spans from ``tag_list``.
    """
    # (span column to create, component column to read, entity label)
    tag_specs=[
        ("BuildingTag","Building_Name","BUILDING_NAME"),
        ("BuildingNoTag","Building_Number","BUILDING_NO"),
        ("RecipientTag","Recipient","RECIPIENT"),
        ("StreetNameTag","Street_Name","STREET_NAME"),
        ("ZipCodeTag","Zip_Code","ZIP_CODE"),
        ("CityTag","City","CITY"),
        ("CountryTag","Country","COUNTRY"),
        ("StateTag","State","STATE"),
    ]

    # Spans are character offsets into the cleaned address, so clean it first.
    df['Address']=df['Address'].apply(massage_data)

    for tag_column,component_column,label in tag_specs:
        df[tag_column]=df.apply(
            # Defaults bind the current loop values to this particular lambda.
            lambda row,component_column=component_column,label=label: get_address_span(
                address=row['Address'],
                address_component=row[component_column],
                label=label,
            ),
            axis=1,
        )

    def collect_spans(row):
        # A fresh list per row: no reliance on lists shared between columns.
        spans=[]
        for tag in tag_list:
            extend_list(spans,row[tag])
        return (row['Address'],spans)

    df['EntitySpans']=df.apply(collect_spans,axis=1)
    return df['EntitySpans']

In [6]:
def get_doc_bin(training_data,nlp):
    """Convert ``(text, annotations)`` pairs into a spaCy ``DocBin``.

    Parameters
    ----------
    training_data : iterable
        Pairs of ``(text, annotations)`` where ``annotations`` is an iterable
        of ``(start, end, label)`` character-offset triples.
    nlp : callable
        Called as ``nlp(text)`` to build the Doc for each text.

    Returns
    -------
    DocBin
        One Doc per input text, with ``doc.ents`` set from the annotations.
        Annotations whose offsets do not map onto token boundaries, and spans
        removed to resolve overlaps, are reported with ``print`` and left out.
    """
    db = DocBin()
    for text, annotations in training_data:
        doc = nlp(text)  # Construct a Doc object
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span yields None when the offsets do not line up with
                # token boundaries; assigning None to doc.ents would raise.
                print(f"Skipping misaligned entity {label!r} [{start}:{end}] in {text!r}")
                continue
            ents.append(span)
        # doc.ents rejects overlapping spans; filter_spans keeps the longest
        # span of each overlapping group and drops duplicates.
        kept = spacy.util.filter_spans(ents)
        if len(kept) != len(ents):
            print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in {text!r}")
        doc.ents = kept
        db.add(doc)
    return db

In [7]:
# Blank English pipeline, passed to get_doc_bin below to build a Doc for each address.
nlp = spacy.blank("en")

In [8]:
# Span columns (created inside create_entity_spans) to collect into each row's entity list.
tag_list=["BuildingTag","BuildingNoTag","RecipientTag","StreetNameTag","ZipCodeTag","CityTag","StateTag","CountryTag"]

In [9]:
# Training split. dtype=str reads values as strings instead of inferring numeric types
# (presumably so fields such as zip codes keep their exact text -- confirm against the data).
df_train=pd.read_csv(filepath_or_buffer="us-train-dataset.csv",sep=",",dtype=str)

In [10]:
# astype(str) returns a new frame, so the columns create_entity_spans adds do not land on df_train.
# NOTE(review): get_address_span treats the literal string 'nan' as missing, presumably to cover
# missing values stringified here -- confirm for the pandas version in use.
df_entity_spans= create_entity_spans(df_train.astype(str),tag_list)
# List of (address, [(start, end, label), ...]) tuples, the shape get_doc_bin iterates over.
training_data= df_entity_spans.values.tolist()

In [11]:
# Annotate the training addresses and write them to train.spacy,
# the file passed as --paths.train to the training command further down.
doc_bin_train= get_doc_bin(training_data,nlp)
doc_bin_train.to_disk("train.spacy")

In [12]:
# Held-out split, read with the same options as the training split.
df_test=pd.read_csv(filepath_or_buffer="us-test-dataset.csv",sep=",",dtype=str)

In [13]:
# Same span extraction as for the training split; note this rebinds df_entity_spans.
df_entity_spans= create_entity_spans(df_test.astype(str),tag_list)
# List of (address, [(start, end, label), ...]) tuples for the held-out split.
validation_data= df_entity_spans.values.tolist()

In [14]:
# Annotate the held-out addresses and write them to test.spacy,
# the file passed as --paths.dev to the training command further down.
doc_bin_test= get_doc_bin(validation_data,nlp)
doc_bin_test.to_disk("test.spacy")

# Initializing and training with spaCy

In [15]:
# Expand base_config.cfg into the complete config.cfg that the training command reads.
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2022-07-02 13:24:03.765361: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-07-02 13:24:03.765404: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [16]:
# Train from config.cfg on train.spacy, evaluating on test.spacy; results are written to ./output.
# The overrides set evaluation every 10 steps and a cap of 300 training steps.
!python -m spacy train config.cfg --paths.train train.spacy --paths.dev test.spacy --output output --training.eval_frequency 10 --training.max_steps 300

[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['ner']
[i] Initial learn rate: 0.001
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0     66.65    6.46    4.40   12.21    0.06
  0      10    824.19    0.00    0.00    0.00    0.00
  1      20    466.36   25.49   35.62   19.85    0.25
  1      30    388.29   58.18   71.91   48.85    0.58
  2      40    340.52   71.73   80.19   64.89    0.72
  2      50    274.83   74.70   78.81   70.99    0.75
  3      60    197.16   90.00   90.70   89.31    0.90
  3      70     94.56   92.72   93.08   92.37    0.93
  4      80     58.78   93.85   94.57   93.13    0.94
  4      90     35.37   93.13   93.13   93.13    0.93
  5     100     39.69   96.55   96.92   96.18    0.97
  5     110     22.34   96.55   96.92   96.18    0.97
  6     120     23.52   97.34   96.97   97.71    0.97
  7     130     10.35   97.32   97.69   96.95    0

2022-07-02 13:24:10.544392: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-07-02 13:24:10.544428: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2022-07-02 13:24:15,499] [INFO] Set up nlp object from config
[2022-07-02 13:24:15,508] [INFO] Pipeline: ['ner']
[2022-07-02 13:24:15,512] [INFO] Created vocabulary
[2022-07-02 13:24:15,514] [INFO] Finished initializing nlp object
[2022-07-02 13:24:15,668] [INFO] Initialized pipeline components: ['ner']


# Prediction from model

In [17]:
# Load the best checkpoint written by the training run above (this rebinds `nlp`,
# replacing the blank pipeline used to build the DocBins).
# A forward slash works on every OS and avoids the invalid "\m" escape sequence.
nlp=spacy.load("output/model-best")

In [18]:
# Sample addresses rendered below as a qualitative check of the loaded model.
address_list=["130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA",
              "8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA",
              "PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA",
              "C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA",
              "Seven Edgeway Plaza, C/O Mac Dermott Inc, OAKBROOK TERRACE, IL, 60181, USA"]

In [19]:
# Highlight colour per entity label. Built once, outside the loop, because it
# is the same for every address.
colors = {
    'BUILDING_NAME': 'Aquamarine',
    'BUILDING_NO': 'lavender',
    'RECIPIENT': 'Coral',
    'STREET_NAME': 'yellow',
    'ZIP_CODE': 'DarkSeaGreen',
    'CITY': 'orange',
    'COUNTRY': 'MistyRose',
    'STATE': 'pink',
}
options = {"colors": colors}

# Run the model on each sample address and render its predicted entities.
for address in address_list:
    doc=nlp(address)
    displacy.render(doc, style="ent", options=options)

# Pass a sentence to this function to visualize its entities

In [20]:
def get_visuals(text):
    """Render the entities found in ``text`` with displaCy.

    Runs the module-level ``nlp`` pipeline on ``text`` and renders the result
    in displaCy's ``"ent"`` style with a fixed colour for each address label.
    The function itself returns nothing.
    """
    label_colours = {
        'BUILDING_NAME': 'Aquamarine',
        'BUILDING_NO': 'lavender',
        'RECIPIENT': 'Coral',
        'STREET_NAME': 'yellow',
        'ZIP_CODE': 'DarkSeaGreen',
        'CITY': 'orange',
        'COUNTRY': 'MistyRose',
        'STATE': 'pink',
    }
    displacy.render(nlp(text), style="ent", options={"colors": label_colours})

In [22]:
# An address embedded in ordinary prose, i.e. input unlike the address-only training data.
get_visuals("My professor Dr. Ruth works for NASA. She lives at PO BOX 363, HERBRONVILLE, TX 78361, UNITED STATES.")

- The model is trained only on addresses, so it will not perform well on a sentence that mixes an address with ordinary text.

- To do the same for a document, read the document as a string and pass it to this function, i.e. `get_visuals()`.