# Obtaining DocBin for training with spaCy

In [1]:
import spacy
from spacy.tokens import DocBin
import pandas as pd
import re

In [2]:
def massage_data(address):
    """Normalise the punctuation of a raw address string.

    The substitutions below run in order, each one on the result of the
    previous one:

    1. a comma that is not directly followed by whitespace becomes ', '
    2. the two-character text backslash + 'n' (an escaped newline inside the
       data, not a real line break) becomes ', '
    3. a hyphen that is not directly followed by whitespace becomes ' - '
       (the leading lookahead in the pattern always succeeds, so whitespace
       already present before the hyphen is kept)
    4. every period is removed

    Returns the cleaned address string.
    """
    substitutions=(
        (r'(,)(?!\s)',', '),
        (r'(\\n)',', '),
        (r'(?!\s)(-)(?!\s)',' - '),
        (r'\.',''),
    )
    cleansed_address=address
    for pattern,replacement in substitutions:
        cleansed_address=re.sub(pattern,replacement,cleansed_address)
    return cleansed_address

In [3]:
def get_address_span(address=None,address_component=None,label=None):
    """Locate an address component inside an address string.

    Parameters
    ----------
    address : str
        Address text to search. It is expected to have been cleaned with
        massage_data already, because the component receives the same period
        and hyphen treatment before the search.
    address_component : str
        Text of the component to look for (missing values are allowed).
    label : str
        Entity label to attach to the span.

    Returns
    -------
    tuple or None
        (start, end, label) with the character offsets of the first
        occurrence of the component in the address, or None when the address
        or the component is missing, the component is empty after cleaning,
        or the component does not occur in the address.
    """
    # Missing values arrive either as real NaN/None or, after astype(str), as the text 'nan'
    if pd.isna(address) or pd.isna(address_component) or str(address_component)=='nan':
        return None

    # Mirror the period removal and hyphen spacing that massage_data applies to the address
    address_component1=re.sub(r'\.','',str(address_component))
    address_component2=re.sub(r'(?!\s)(-)(?!\s)',' - ',address_component1)
    if not address_component2:
        return None

    # re.escape keeps characters such as '(' or '+' literal. The lookarounds
    # demand that the match is not glued to a neighbouring word character and,
    # unlike \b, also work when the component starts or ends with punctuation.
    pattern=r'(?<!\w)(?:'+re.escape(address_component2)+r')(?!\w)'
    span=re.search(pattern,str(address))
    if span is None:
        return None
    return (span.start(),span.end(),label)

In [4]:
def extend_list(entity_list,entity):
    """Append `entity` to `entity_list` unless pandas reports it as missing.

    The list is modified in place and the same list object is returned.
    """
    if not pd.isna(entity):
        entity_list.append(entity)
    return entity_list

In [5]:
def create_entity_spans(df,tag_list):
    """Build (address, entity spans) training pairs from labelled address rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Address', 'Building_Name',
        'Building_Number', 'Recipient', 'Street_Name', 'Zip_Code', 'City',
        'Country' and 'State'. The frame is modified in place: 'Address' is
        overwritten with its cleaned form, and one '*Tag' column per
        component plus 'EntitySpans' are added.
    tag_list : list of str
        Names of the '*Tag' columns whose spans go into the result, in the
        order they should appear in each row's span list.

    Returns
    -------
    pandas.Series
        The 'EntitySpans' column: one (address, [(start, end, label), ...])
        tuple per row. Components without a span are left out of the list.
    """
    # Tag column -> (source column holding the component text, entity label)
    tag_sources={
        "BuildingTag":("Building_Name","BUILDING_NAME"),
        "BuildingNoTag":("Building_Number","BUILDING_NO"),
        "RecipientTag":("Recipient","RECIPIENT"),
        "StreetNameTag":("Street_Name","STREET_NAME"),
        "ZipCodeTag":("Zip_Code","ZIP_CODE"),
        "CityTag":("City","CITY"),
        "CountryTag":("Country","COUNTRY"),
        "StateTag":("State","STATE"),
    }

    df['Address']=df['Address'].apply(massage_data)
    for tag_column,(source_column,label) in tag_sources.items():
        # Default arguments bind the current loop values to this iteration's lambda
        df[tag_column]=df.apply(
            lambda row,source_column=source_column,label=label: get_address_span(
                address=row['Address'],
                address_component=row[source_column],
                label=label),
            axis=1)

    def _collect_spans(row):
        # Gather the row's spans into a fresh list, skipping missing ones
        spans=[]
        for tag in tag_list:
            extend_list(spans,row[tag])
        # Label-based access: positional row[0]/row[1] on a labelled Series is deprecated
        return (row['Address'],spans)

    df['EntitySpans']=df.apply(_collect_spans,axis=1)
    return df['EntitySpans']

In [6]:
def get_doc_bin(training_data,nlp):
    """Convert (text, annotations) pairs into a spaCy DocBin.

    Parameters
    ----------
    training_data : iterable of (str, iterable of (int, int, str))
        Each item is the text plus its (start, end, label) character spans.
    nlp : spaCy Language object
        Called on every text to build the Doc that receives the entities.

    Returns
    -------
    DocBin
        Holds one Doc per input text with `doc.ents` set from the spans.

    Spans whose character offsets do not line up with token boundaries
    cannot be turned into entities; they are skipped with a warning instead
    of aborting the whole conversion.
    """
    import warnings

    db = DocBin()
    for text, annotations in training_data:
        doc = nlp(text) #Construct a Doc object
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span yields None for misaligned offsets, and doc.ents rejects None entries
                warnings.warn(
                    f"Skipping {label} span ({start}, {end}) in {text!r}: "
                    "offsets do not align with token boundaries",
                    stacklevel=2,
                )
                continue
            ents.append(span)
        doc.ents = ents
        db.add(doc)
    return db

In [7]:
# Blank English pipeline; it is passed to get_doc_bin below to turn each address string into a Doc
nlp = spacy.blank("en")

In [8]:
# Span columns collected for every address, in the order they are appended
tag_list = [
    "BuildingTag",
    "BuildingNoTag",
    "RecipientTag",
    "StreetNameTag",
    "ZipCodeTag",
    "CityTag",
    "StateTag",
    "CountryTag",
]

In [9]:
# Read the labelled training addresses, keeping every column as text
df_train = pd.read_csv("us-train-dataset.csv", sep=",", dtype=str)

In [10]:
# astype(str) returns a copy (missing values become the text 'nan'), so df_train itself is left untouched
df_entity_spans = create_entity_spans(df_train.astype(str), tag_list)
# Plain list of (address, spans) tuples, the shape get_doc_bin iterates over
training_data = df_entity_spans.tolist()

In [11]:
# Build the training DocBin and write it to the file that the `spacy train` cell reads via --paths.train
doc_bin_train= get_doc_bin(training_data,nlp)
doc_bin_train.to_disk("train.spacy")

In [12]:
# Read the labelled evaluation addresses, keeping every column as text
df_test = pd.read_csv("us-test-dataset.csv", sep=",", dtype=str)

In [13]:
# Same span extraction as for the training set; note that df_entity_spans is rebound here
df_entity_spans = create_entity_spans(df_test.astype(str), tag_list)
# Plain list of (address, spans) tuples, the shape get_doc_bin iterates over
validation_data = df_entity_spans.tolist()

In [14]:
# Build the evaluation DocBin and write it to the file that the `spacy train` cell reads via --paths.dev
doc_bin_test= get_doc_bin(validation_data,nlp)
doc_bin_test.to_disk("test.spacy")

# Initializing and training with spaCy

In [16]:
# Expand base_config.cfg into a complete training config and save it as config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2022-06-05 19:01:57.012184: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found
2022-06-05 19:01:57.012698: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [17]:
# Train from config.cfg on train.spacy, evaluating on test.spacy every 10 steps for at most 300 steps; results go to ./output
# NOTE(review): test.spacy is used as the dev set, so the reported scores are not from data held out of model selection - confirm this is intended
!python -m spacy train config.cfg --paths.train train.spacy --paths.dev test.spacy --output output --training.eval_frequency 10 --training.max_steps 300

✔ Created output directory: output
ℹ Saving to output directory: output
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['ner']
ℹ Initial learn rate: 0.001
E    #       LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --------  ------  ------  ------  ------
  0       0     66.65    6.46    4.40   12.21    0.06
  0      10    824.19    0.00    0.00    0.00    0.00
  1      20    466.36   25.49   35.62   19.85    0.25
  1      30    388.29   58.18   71.91   48.85    0.58
  2      40    340.52   71.73   80.19   64.89    0.72
  2      50    274.83   74.70   78.81   70.99    0.75
  3      60    197.17   90.00   90.70   89.31    0.90
  3      70     94.29   92.72   93.08   92.37    0.93
  4      80     58.15   94.62   95.35   93.89    0.95
  4      90     35.47   93.13   93.13   93.13    0.93
  5     100     39.30   96.55   96.92   96.18    0.97
  5     110     22.16   96.18   96.18   96.18    0.96
  6     120     22.06   97.34   96.97   97.71    0.97
  7     130      9.73   9

2022-06-05 19:04:01.593217: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found
2022-06-05 19:04:01.593746: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2022-06-05 19:04:05,219] [INFO] Set up nlp object from config
[2022-06-05 19:04:05,228] [INFO] Pipeline: ['ner']
[2022-06-05 19:04:05,232] [INFO] Created vocabulary
[2022-06-05 19:04:05,239] [INFO] Finished initializing nlp object
[2022-06-05 19:04:05,661] [INFO] Initialized pipeline components: ['ner']


# Prediction from model

In [18]:
# Load the best pipeline saved under the training output directory.
# A forward slash is used because "\m" is not a valid string escape and a
# backslash separator only resolves on Windows; "/" works on every platform.
nlp=spacy.load("output/model-best")

In [19]:
# Sample US addresses used to spot-check the trained pipeline
address_list = [
    "130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA",
    "8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA",
    "PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA",
    "C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA",
    "Seven Edgeway Plaza, C/O Mac Dermott Inc, OAKBROOK TERRACE, IL, 60181, USA",
]

In [20]:
# Run every sample address through the loaded pipeline and print the (text, label) pairs it found
for address in address_list:
    doc = nlp(address)
    ent_list = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Address string -> {address}")
    print(f"Parsed address -> {ent_list}")
    print("******")

Address string -> 130 W BOSE ST STE 100, PARK RIDGE, IL, 60068, USA
Parsed address -> [('130', 'BUILDING_NO'), ('W BOSE ST', 'STREET_NAME'), ('PARK RIDGE', 'CITY'), ('IL', 'STATE'), ('60068', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> 8311 MCDONALD RD, HOUSTON, TX, 77053-4821, USA
Parsed address -> [('8311', 'BUILDING_NO'), ('MCDONALD RD', 'STREET_NAME'), ('HOUSTON', 'CITY'), ('TX', 'STATE'), ('77053-4821', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> PO Box 317, 4100 Hwy 20 E Ste 403, NICEVILLE, FL, 32578-5037, USA
Parsed address -> [('4100 Hwy 20 E Ste', 'ZIP_CODE'), ('NICEVILLE', 'CITY'), ('FL', 'STATE'), ('32578-5037', 'ZIP_CODE'), ('USA', 'COUNTRY')]
******
Address string -> C/O Elon Musk Innovations Inc, 1548 E Florida Avenue, Suite 209, TAMPA, FL, 33613, USA
Parsed address -> [('C/O Elon Musk Innovations Inc', 'RECIPIENT'), ('1548', 'BUILDING_NO'), ('E Florida Avenue', 'STREET_NAME'), ('TAMPA', 'CITY'), ('FL', 'STATE'), ('33613', 'ZIP_CODE'), ('USA', 