In [None]:
""" Imports and Global Variables """
import os
import re
import glob
import spacy
import subprocess
import pandas as pd
from colorama import init

# ANSI SGR escape sequences used to colour the console messages printed below
ERROR = "\033[5;91m"  # blink + bright red
WARNING = "\033[5;93m"  # blink + bright yellow
SUCCESS = "\033[92m"  # bright green
LABEL = "\033[94m"  # bright blue
INFO = "\033[97m"  # bright white
RESET = "\033[0m"  # reset all attributes
# Initialize colorama (presumably so the ANSI codes above also render on
# Windows consoles -- see the colorama documentation)
init()

# Set pandas option: a max_colwidth of None disables truncation of long cell
# values when a DataFrame/Series is displayed
pd.set_option("display.max_colwidth", None)

# Define global variables
# Training data location: either one CSV file or a directory searched for *.csv
DATASET_PATH = "./dataset/"
# Where the serialized training set is written; when this is a directory the
# file "training.spacy" is created inside it
TRAINER_PATH = "./trainer/"

# Addresses to parse: either one file or a directory of files
INPUT_PATH = "./data/input/"
# Output directory handed to `spacy train`; the trained pipeline is later
# loaded from its "model-best" sub-directory
MODEL_PATH = "./data/models/"

In [None]:
def extend(list: list, item: str) -> list:
    """
    Append an item to a list unless the item represents a missing value

    Parameters
    ----------
    list: list
        List that receives the item (it is modified in place)
    item: str
        Candidate item; it is skipped when pandas reports it as missing
        or when its string form is 'nan'

    Returns
    ----------
    list
        The very same list object that was passed in
    """

    # Missing values show up either as real NaN/None or as the text 'nan'
    is_missing = pd.isna(item) | (str(item) == 'nan')
    if not is_missing:
        list.append(item)
    return list

In [None]:
def strip_address(address: str) -> str:
    """
    Normalize an address string into a comma separated, csv-like layout
    by applying a fixed sequence of regex substitutions

    Parameters
    ----------
    address: str
        String containing the address

    Returns
    ----------
    str
        Address string after all substitutions have been applied
    """

    # Applied strictly in this order; each rule sees the previous rule's output
    substitutions = (
        # a comma that is not followed by whitespace gets a trailing space
        (r"(,)(?!\s)", ", "),
        # a literal backslash followed by 'n' becomes a comma separator
        (r"(\\n)", ", "),
        # a hyphen that is not followed by whitespace is padded with spaces
        (r"(?!\s)(-)(?!\s)", " - "),
        # periods are removed altogether
        (r"\.", ""),
    )

    cleaned = address
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
def address_span(address: str = None, component: str = None, label: str = None) -> tuple:
    """
    Return a tuple containing the span of the address component in the
    address string and the classification label of the component

    The component is normalized the same way the address is (periods
    removed, hyphens padded with spaces) and then searched for literally,
    as a whole word, in the address. Only the first occurrence is used.

    Parameters
    ----------
    address: str
        String containing the (already normalized) address
    component: str
        String containing the address component
    label: str
        String containing the classification label of the address component

    Returns
    ----------
    tuple
        Tuple (start, end, label) of the span of the address component and
        the classification label, or None when the component is missing
        (NaN / 'nan') or cannot be found in the address
    """

    if pd.isna(component) or str(component) == 'nan':
        return None

    component = re.sub(r"\.", "", str(component))
    component = re.sub(r"(?!\s)(-)(?!\s)", " - ", component)

    # Escape the component so characters such as '(' or '+' are matched
    # literally instead of being interpreted as regex syntax
    span = re.search(r"\b(?:" + re.escape(component) + r")\b", address)
    if span is None:
        # Component does not occur in the address: report "no span" rather
        # than failing with an AttributeError on None
        return None
    return (span.start(), span.end(), label)

In [None]:
def create_entity_spans(dataset: pd.core.frame.DataFrame, tags: list) -> pd.core.series.Series:
    """
    Create a pandas Series with entity spans for the training dataset

    The passed DataFrame is modified in place: "Address" is normalized,
    every component column is replaced by its (start, end, label) span
    (or None when the component is missing), a "Street" column is derived
    from "Street_Name", and the result is stored in "EntitySpans".

    Parameters
    ----------
    dataset: pandas.core.frame.DataFrame
        pandas DataFrame containing the training dataset
    tags: list
        List of data tags, i.e. names of the span columns whose values
        are collected for every row (in the given order)

    Returns
    ----------
    pandas.core.series.Series
        pandas Series of training dataset entity spans; every element is
        a tuple (address, [(start, end, label), ...])
    """

    # (destination column, source column, entity label)
    components = (
        ("Recipient", "Recipient", "RECIPIENT"),
        ("Building_Name", "Building_Name", "BUILDING_NAME"),
        ("Building_Number", "Building_Number", "BUILDING_NUMBER"),
        ("Street", "Street_Name", "STREET"),
        ("City", "City", "CITY"),
        ("State", "State", "STATE"),
        ("Zip_Code", "Zip_Code", "ZIP_CODE"),
        ("Country", "Country", "COUNTRY"),
    )

    dataset["Address"] = dataset["Address"].apply(strip_address)
    for target, source, label in components:
        # source/label are bound as defaults so each lambda keeps its own values
        dataset[target] = dataset.apply(
            lambda row, source=source, label=label: address_span(
                address=row["Address"], component=row[source], label=label
            ),
            axis=1,
        )

    def collect(row):
        # Gather the non-missing spans of one row; columns are accessed by
        # label, never by integer position
        spans = []
        for tag in tags:
            extend(spans, row[tag])
        return (row["Address"], spans)

    dataset["EntitySpans"] = dataset.apply(collect, axis=1)
    return dataset["EntitySpans"]

In [None]:
def create_docbin(data: list, NLP: spacy.Language) -> spacy.tokens._serialize.DocBin:
    """
    Return a DocBin (ie. serialization of information) used by spaCy
    as a training set, using training data and an empty spaCy English model

    Parameters
    ----------
    data: list
        List containing training data; every element is a pair
        (text, annotations) where annotations is an iterable of
        (start, end, label) character offsets
    NLP: spacy.Language
        An empty English spaCy model, used to tokenize each text

    Returns
    ----------
    spacy.tokens._serialize.DocBin
        DocBin object for building a training set
    """

    docbin = spacy.tokens.DocBin()
    for text, annotations in data:
        doc = NLP(text)
        ents = []
        for start, end, label in annotations:
            # char_span yields None when the offsets do not line up with
            # token boundaries; such annotations are skipped
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
        # doc.ents rejects overlapping spans (e.g. two components that were
        # located at the same position), so keep a non-overlapping subset
        doc.ents = spacy.util.filter_spans(ents)
        docbin.add(doc)
    return docbin

In [None]:
def parse_address(nlp: spacy.Language, address: str) -> list:
    """
    Parses the passed address string, prints the recognized components
    and returns them as a list of tuples

    Parameters
    ----------
    nlp: spacy.Language
        spaCy pipeline used to recognize the address components
    address: str
        String containing the address

    Returns
    ----------
    List
        List of (text, label) tuples, one per recognized address component
    """

    doc = nlp(strip_address(address))
    entities = [(entity.text, entity.label_) for entity in doc.ents]

    # Only strip a trailing line terminator for display; slicing off the last
    # character would truncate addresses that do not end with a newline
    shown = address.rstrip("\r\n")
    print(f"{LABEL}Address: {INFO}{shown}{RESET}")
    for text, label in entities:
        print(f"  {LABEL}{label}: {INFO}{text}{RESET}")
    print("")

    return entities

In [None]:
""" Create NLP model and initialize dataframe """
NLP = spacy.blank('en')

# DATASET_PATH may name a single CSV file or a directory containing CSV files
if os.path.isfile(DATASET_PATH):
    DATASETS = [DATASET_PATH]
else:
    DATASETS = sorted(glob.glob(os.path.join(DATASET_PATH, "*.csv")))
    print(f"{INFO}Found {len(DATASETS)} datasets{RESET}")

DATASET = None
try:
    if not DATASETS:
        raise FileNotFoundError(f"no CSV dataset found at {DATASET_PATH}")
    # Read every CSV as text and stack them into one frame; the result of
    # pd.concat must be kept, otherwise only the first file would be used
    DATASET = pd.concat(
        [pd.read_csv(filepath_or_buffer=path, sep=",", dtype=str) for path in DATASETS],
        ignore_index=True,
    )
except Exception as exc:
    print(f"{ERROR}✘ Dataset not found: {exc}{RESET}")
    # Stop here: the following cells cannot work without a dataset
    raise

print(f"{SUCCESS}✔ Successfully loaded dataset(s){RESET}")

In [None]:
""" Create entity spans and save to file """
try:
    TAGS = ["Recipient", "Building_Name", "Building_Number", "Street", "City", "State", "Zip_Code", "Country"]
    # astype(str) hands over a copy in which missing values read 'nan'
    SPANS = create_entity_spans(DATASET.astype(str), TAGS)
    TRAINING_DATA = SPANS.tolist()
    print(f"{SUCCESS}✔ Successfully created entity spans{RESET}")
except Exception as exc:
    print(f"{ERROR}✘ Failed to create entity spans: {exc}{RESET}")
    # Stop here: without entity spans there is nothing to serialize
    raise

try:
    # A not-yet-existing path without a file extension is taken to be the
    # output directory and is created on demand
    if not os.path.exists(TRAINER_PATH) and not os.path.splitext(TRAINER_PATH)[1]:
        os.makedirs(TRAINER_PATH, exist_ok=True)
    if os.path.isdir(TRAINER_PATH):
        TRAINER_PATH = os.path.join(TRAINER_PATH, "training.spacy")
    DOCBIN = create_docbin(TRAINING_DATA, NLP)
    DOCBIN.to_disk(TRAINER_PATH)
    print(f"{SUCCESS}✔ Successfully created training set{RESET}")
except Exception as exc:
    print(f"{ERROR}✘ Failed to create training set: {exc}{RESET}")
    # Stop here: training must not start without a training set
    raise

print(f"{INFO} Ready to start training{RESET}")

In [None]:
""" Build config and train model """
import sys

# Each command is an argument list, so no shell quoting is involved and paths
# reach spaCy verbatim. sys.executable runs spaCy with the interpreter (and
# therefore the environment) this code itself is running in.
COMMANDS = [
    [sys.executable, "-m", "spacy", "init", "fill-config", "config/base.cfg", "config/config.cfg"],
    [
        sys.executable, "-m", "spacy", "train", "config/config.cfg",
        "--output", MODEL_PATH,
        "--paths.train", TRAINER_PATH,
        "--paths.dev", TRAINER_PATH,
    ],
]

try:
    for command in COMMANDS:
        print(f"{INFO} Running command: {' '.join(command)}{RESET}")
        process = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
        # stderr is merged into stdout above, so `error` is always None
        output, error = process.stdout, process.stderr
        print(output)
        # Raises CalledProcessError when the command exited with a failure
        process.check_returncode()
    print(f"{SUCCESS}✔ Successfully trained model{RESET}")
except Exception as e:
    print(f"{ERROR}✘ Failed to train model: {e}{RESET}")
    # Stop here: there is no trained model to load afterwards
    raise

In [None]:
""" Use trained model to parse address """
NLP = spacy.load(os.path.join(MODEL_PATH, "model-best"))
print(f"{SUCCESS}✔ Successfully loaded model{RESET}")

# CONTENT holds one list of lines per input file, for both a single input
# file and a directory of input files
CONTENT = []
if os.path.isfile(INPUT_PATH):
    try:
        with open(INPUT_PATH, "r") as FILE:
            CONTENT.append(FILE.readlines())
        print(f"{SUCCESS}✔ Successfully loaded input file{RESET}")
    except Exception as exc:
        print(f"{ERROR}✘ Failed to load input file: {exc}{RESET}")
        # Nothing else to process when the only input file is unreadable
        raise
else:
    FILES = next(os.walk(INPUT_PATH), (None, None, []))[2]
    for file in FILES:
        PATH = os.path.join(INPUT_PATH, file)
        try:
            with open(PATH, "r") as DATA:
                CONTENT.append(DATA.readlines())
        except Exception as exc:
            # Best effort: report the unreadable file and go on with the rest
            print(f"{ERROR}✘ Failed to load input file {PATH}: {exc}{RESET}")
    print(f"{SUCCESS}✔ Successfully loaded {len(CONTENT)} of {len(FILES)} input file(s)\n{RESET}")

for LINES in CONTENT:
    # Only files starting with an XML declaration need unwrapping; any other
    # file is used line by line as it is
    if not LINES or "<?xml" not in LINES[0]:
        continue
    for step, LINE in enumerate(LINES):
        if "<string>" in LINE:
            # Keep the text between <string> and </string>
            LINES[step] = LINE.split("<string>")[1].split("</string>")[0]
        else:
            # Lines without a <string> element carry no address
            LINES[step] = ""

for FILE in CONTENT:
    for ADDRESS in FILE:
        # Skip blanked-out XML lines and lines holding only whitespace
        if not ADDRESS.strip():
            continue
        try:
            parse_address(NLP, ADDRESS)
        except Exception as exc:
            # Best effort: one bad address must not stop the remaining ones
            print(f"{ERROR}✘ Failed to parse address: {exc}{RESET}")