In [4]:
!pip install compress_pickle



In [5]:
!pip  install deepparse



In [6]:
import os
import compress_pickle
import pickle
from deepparse import download_from_public_repository
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser
import shutil
from poutyne import set_seeds
import poutyne
import timeit

# Fixed seed handed to poutyne's set_seeds so that repeated runs are comparable
seed = 42
set_seeds(seed)

In [None]:
# Retrain an Address Parser for Single Country Uses

In this example, we will retrain a pre-trained model to maximize its performance for a specific country (e.g. the UK or Canada; this notebook uses Poland).

## Retrain a Model

First, to retrain our supervised model, we need parsed address examples, as shown in the following figure. Fortunately, we have access to a public dataset of such parsed examples, the [Structured Multinational Address Dataset](https://github.com/GRAAL-Research/deepparse-address-data).

![parsing](https://github.com/GRAAL-Research/deepparse/blob/master/docs/source/_static/img/address_parsing.png?raw=1)

For our example, we will focus on Polish addresses since we want to parse addresses only from Poland. So let's first download the dataset directly from the public repository using Deepparse's `download_from_public_repository` function.

In [4]:
# exist_ok=True keeps this cell re-runnable when the folder is already there
os.makedirs("dataset", exist_ok=True)


In [5]:
# Fetch the dataset zip; the extraction cell below expects it at dataset/data.zip
download_from_public_repository("dataset/data", "", file_extension="zip")

The dataset archive is a zip archive of subdirectories in which each country's data is compressed into an LZMA file (a more aggressive compression algorithm). The dataset public repository offers a [script](https://github.com/GRAAL-Research/deepparse-address-data/blob/main/lzma_decompress.py) to decompress the LZMA-compressed dataset. We will use the basic idea of that script to decompress the dataset in the next code cells (the original script also handles CLI parameters).

In [6]:
# Mount Google Drive (Colab only); the retraining cell writes its checkpoints under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# The downloaded zip sits inside the folder it gets extracted into
archive_root_path = "dataset"
archive_path = os.path.join(archive_root_path, "data.zip")

# Extract in place, then drop the zip since it is no longer needed
shutil.unpack_archive(archive_path, extract_dir=archive_root_path)
os.remove(archive_path)

In [8]:
# The script's functions, with minor modifications to take regular arguments
# instead of CLI-parsed arguments


# Function to handle the files paths
def absolute_file_paths(directory):
    """
    Lazily yield the absolute path of every ``.lzma`` file found under ``directory``.

    The directory is walked recursively with ``os.walk``; files with any other
    extension are ignored.
    """
    for current_dir, _subdirs, names in os.walk(directory):
        yield from (
            os.path.abspath(os.path.join(current_dir, name))
            for name in names
            if name.endswith(".lzma")
        )


# Function to LZMA decompress the files_directory into the path_to_save directory
def lzma_decompress(files_directory, root_path_to_save) -> None:
    """
    Convert every LZMA-compressed file under ``files_directory`` into a plain pickle.

    Each ``<name>.lzma`` file is loaded, dumped again with ``pickle`` as ``<name>.p`` and the
    compressed original is then deleted. The output folder is ``root_path_to_save`` joined
    with the (up to) three directory names that directly contain the source file.

    Args:
        files_directory: Directory searched recursively for ``.lzma`` files.
        root_path_to_save: Root directory under which the ``.p`` files are written.
    """
    for path in absolute_file_paths(files_directory):
        # NOTE(security): loading deserializes pickled content, so only run this on
        # archives obtained from a trusted source.
        pickled_data = compress_pickle.load(path, compression="lzma")

        # Swap only the trailing extension; str.replace would also rewrite an ".lzma"
        # appearing earlier in the file name.
        stem, _ = os.path.splitext(os.path.basename(path))
        filename = stem + ".p"

        # Keep the three parent directory names of the source file as the sub-path
        file_path = os.path.join(*path.split(os.path.sep)[-4:-1])
        path_to_save = os.path.join(root_path_to_save, file_path)
        os.makedirs(path_to_save, exist_ok=True)

        with open(os.path.join(path_to_save, filename), "wb") as file:
            pickle.dump(pickled_data, file)

        # The compressed source is no longer needed once the pickle is written
        os.remove(path)

In [9]:
# Locations of the dataset splits inside the extracted archive.
# (The decompression itself is launched in the next cell and takes several minutes.)
root_dir = os.path.join("dataset", "data")
clean_root_dir = os.path.join(root_dir, "clean_data")
clean_train_directory, clean_test_directory = (
    os.path.join(clean_root_dir, split_name) for split_name in ("train", "test")
)

In [10]:
# Decompress the whole dataset (this takes several minutes).
# lzma_decompress re-appends the last three folder names of each source file
# (e.g. data/clean_data/train), so saving under "dataset" writes the .p files
# next to where the .lzma files were.
lzma_decompress(root_dir, "dataset")

Now, let's import our train and test datasets into memory to retrain our parser model.

In [11]:
# Directory layout of the decompressed dataset
clean_root_dir = os.path.join(root_dir, "clean_data")
clean_train_directory, clean_test_directory = (
    os.path.join(clean_root_dir, split_name) for split_name in ("train", "test")
)

# The "pl" pickles hold the addresses of the country we retrain on
pl_training_data_path = os.path.join(clean_train_directory, "pl.p")
pl_test_data_path = os.path.join(clean_test_directory, "pl.p")

# Wrap each pickle in a deepparse dataset container
training_container = PickleDatasetContainer(pl_training_data_path)
test_container = PickleDatasetContainer(pl_test_data_path)

We will use the FastText model as our base pre-trained model since it is faster to retrain.

In [12]:
# Start from deepparse's pre-trained FastText parser (weights are downloaded on first use).
# NOTE(review): device=0 presumably selects the first GPU — confirm a GPU runtime is enabled.
address_parser = AddressParser(model_type="fasttext", device=0)



Downloading the pre-trained weights for the network fasttext.
The fastText pretrained word embeddings will be downloaded (6.8 GO), this process will take several minutes.
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz
Loading the embeddings model


But first, let's see what the performance is before retraining.

In [16]:
# Baseline: score the pre-trained parser on the test container before any retraining
address_parser.test(test_container, batch_size=256)

Running test


  self.pid = os.fork()


Step: 1796/1796 100.00% |████████████████████|ETA: 0.00s test_loss: 0.035425 test_accuracy: 100.000000

  self.pid = os.fork()


Test steps: 1796 30m13.08s test_loss: 0.137768 test_accuracy: 99.637872                               


{'time': 1813.0773357389999,
 'test_loss': 0.1377678936901547,
 'test_accuracy': 99.63787163770765}

In [13]:
# Fine-tune the pre-trained parser on the training container
_ = address_parser.retrain(
    training_container,
    train_ratio=0.8,  # share of the container used for training; the rest is used for validation
    epochs=5,
    batch_size=32,
    num_workers=2,
    learning_rate=0.001,
    logging_path="/content/drive/MyDrive/dane/modele/DEEPPARSER/pl_retrain",  # checkpoints are written here (mounted Drive)
    name_of_the_retrain_parser="PLParser",  # the retrained model is later loaded from PLParser.ckpt in logging_path
)

  self.pid = os.fork()


Epoch: 1/5 Train steps: 2500 Val steps: 625 1m12.85s loss: 0.068997 accuracy: 99.727499 val_loss: 0.074176 val_accuracy: 99.704808
Epoch 1: val_loss improved from inf to 0.07418, saving file to /content/drive/MyDrive/dane/modele/DEEPPARSER/pl_retrain/checkpoint_epoch_1.ckpt
Epoch: 2/5 Train steps: 2500 Val steps: 625 1m10.55s loss: 0.065163 accuracy: 99.741057 val_loss: 0.071739 val_accuracy: 99.719749
Epoch 2: val_loss improved from 0.07418 to 0.07174, saving file to /content/drive/MyDrive/dane/modele/DEEPPARSER/pl_retrain/checkpoint_epoch_2.ckpt
Epoch: 3/5 Train steps: 2500 Val steps: 625 1m12.18s loss: 0.062108 accuracy: 99.750257 val_loss: 0.070241 val_accuracy: 99.724146
Epoch 3: val_loss improved from 0.07174 to 0.07024, saving file to /content/drive/MyDrive/dane/modele/DEEPPARSER/pl_retrain/checkpoint_epoch_3.ckpt
Epoch: 4/5 Train steps: 2500 Val steps: 625 1m10.97s loss: 0.059310 accuracy: 99.766909 val_loss: 0.067749 val_accuracy: 99.741775
Epoch 4: val_loss improved from 0.07

In [7]:
# Build a fresh FastText parser and load the weights saved by the retraining step
retrained_checkpoint_path = "/content/drive/MyDrive/dane/modele/DEEPPARSER/pl_retrain/PLParser.ckpt"
adres = AddressParser(
    model_type="fasttext",
    device=0,
    path_to_retrained_model=retrained_checkpoint_path,
)

Loading the embeddings model


In [None]:
# Score the parser on the test container again to compare with the baseline above.
# NOTE(review): this evaluates `address_parser` (the object `retrain` was called on),
# not `adres`, the parser reloaded from the saved checkpoint — confirm which one is intended.
address_parser.test(test_container, batch_size=256)

Running test
[35mTest steps: [36m57 [32m1.74s [35mtest_loss:[94m 0.120875[35m test_accuracy:[94m 99.575062[0m                                                


{'time': 1.7367468271404505,
 'test_loss': 0.12087451704787924,
 'test_accuracy': 99.5750624169449}

To further improve performance, we could train for longer, increase the training dataset size (currently 100,000 addresses), or rework the Seq2Seq hidden sizes. See the [retrain interface documentation](https://deepparse.org/parser.html#deepparse.parser.AddressParser.retrain) for all the training parameters.

In [9]:
# Parse one sample address with the reloaded, retrained parser
sample_address = "Dzikiego Wina 1, 05-500 Józefosław, Mazowieckie"
addres_parsed = adres(sample_address)
print(addres_parsed)

The unparsed address is 'Dzikiego Wina 1, 05-500 Józefosław, Mazowieckie' and the parsed address is '('dzikiego', 'StreetName') ('wina', 'StreetName') ('1', 'StreetNumber') ('05-500', 'PostalCode') ('józefosław', 'Municipality') ('mazowieckie', 'Province')'
