## Fine tune LayoutLM on SROIE 

**links:**

*  [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318)

* [GitHub repo](https://github.com/microsoft/unilm/tree/master/layoutlm).

* [SROIE Dataset](https://rrc.cvc.uab.es/?ch=13).

* [Source Kaggle Notebook](https://www.kaggle.com/code/ammarnassanalhajali/fine-tune-layoutlm-on-sroie-dataset)

* [huggingface](https://huggingface.co/docs/transformers/model_doc/layoutlm)


**What does this notebook include?**


1.   Pre-process the SROIE dataset.
2.   Fine-tune and evaluate the model using the GitHub script.
3.   Predict new examples using Hugging Face (and the fine-tuned model).

**Drive Links:**

1. [SROIE](https://drive.google.com/drive/folders/1kNHB43H5l6M1A1Ay9c58g4k_aSaFGEYh?usp=share_link)
2. [Pre-processed SROIE](https://drive.google.com/drive/folders/1aBDLIAFytM2jCddsdA3u-BIPadan4XVD?usp=share_link)
3. [Fine tuned model](https://drive.google.com/drive/folders/1-SFJAtIcv_DK4D4lUoRQuqMx_gYWJ3_Z?usp=share_link)
4. [Predicted results](https://drive.google.com/drive/folders/10B0Ga7fb--qzReY0TTR9lPuXwF6W7sc3?usp=share_link)





# 1.Pre-process SROIE dataset

In [None]:
!pip install transformers
! pip install pillow

import glob
import json
import os

import pandas as pd
# import Image
from PIL import Image
from google.colab import drive

# Mount Google Drive so the SROIE files are reachable from the Colab runtime.
# Dataset download page: https://rrc.cvc.uab.es/?ch=13&com=downloads
# (trick: add the shared folder to your Drive as a shortcut instead of downloading it).
drive.mount('/content/drive')
base_dir = '/content/drive/MyDrive/SROIE2019 (1)/'
ds_dir = '/content/drive/MyDrive/SROIE2019 (1)/dataset/'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tqdm
# Use the %cd magic: `!cd` runs in a throwaway subshell and never changes the
# notebook's working directory (and the path was mangled to "/ content").
%cd /content

# Entity keys looked up in each receipt's entities JSON; these are also the
# label names written to labels.txt (words matching none of them are tagged "O").
labels = [
    'date',
    'company',
    'address',
    'total',
]


def _label_word(word, entities):
    """Return the entity label for one OCR text line, or "O" when nothing matches.

    Checks are applied in the order company, address, total, date and a later
    match overrides an earlier one. Empty entity values are ignored for the
    ``total``/``date`` checks: the empty string is a substring of every word,
    so an absent entity would otherwise label every line.
    """
    label = "O"
    if word in entities['company']:
        label = 'company'
    if word in entities['address']:
        label = 'address'
    if entities['total'] and entities['total'] in word:
        label = 'total'
    if entities['date'] and entities['date'] in word:
        label = 'date'
    return label


def read_data(part):
    """Load one SROIE split into a single labelled DataFrame.

    For every ``*.txt`` file in ``{base_dir}{part}/box/`` this reads the
    quadrilateral boxes and text, opens the matching image in
    ``{base_dir}{part}/img/`` to obtain its size, scales the box extents to a
    0-1000 range relative to the image width/height, and assigns each line a
    label using the matching JSON file in ``{base_dir}{part}/entities/``.

    Args:
        part: Name of the split sub-directory under ``base_dir``
            (the notebook uses ``'train'`` and ``'test'``).

    Returns:
        A DataFrame with the columns ``filename``, ``x0``..``y3``, ``words``,
        ``x_min``, ``x_max``, ``y_min``, ``y_max``, ``width``, ``height`` and
        ``label``; rows containing NaN are dropped and the index is reset.

    Files that raise any error while being processed are reported with a
    ``skip`` message and left out (deliberate best-effort behaviour).
    """
    task1_dir = f'{base_dir}{part}/box/'
    box_columns = ['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'words']
    scaled_columns = ['x_min', 'x_max', 'y_min', 'y_max']
    out_columns = ['filename'] + box_columns + scaled_columns + ['width', 'height', 'label']

    # Collect per-file frames and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    for file_name in tqdm.tqdm(glob.glob(f"{task1_dir}*.txt")):
        try:
            # NOTE(review): text lines containing commas or quotes may be
            # mis-parsed by read_csv - confirm against the raw files.
            df_file = pd.read_csv(file_name, header=None, names=box_columns)

            # Build sibling paths from the file stem instead of a global
            # str.replace("box"/"txt"), which would also rewrite those
            # substrings anywhere else in the path.
            stem = os.path.splitext(os.path.basename(file_name))[0]
            with Image.open(f'{base_dir}{part}/img/{stem}.jpg') as img:
                width, height = img.width, img.height

            xs = df_file[["x0", "x1", "x2", "x3"]]
            ys = df_file[["y0", "y1", "y2", "y3"]]
            df_file['x_min'] = (1000 * xs.min(axis=1) / width).astype(int)
            df_file['x_max'] = (1000 * xs.max(axis=1) / width).astype(int)
            df_file['y_min'] = (1000 * ys.min(axis=1) / height).astype(int)
            df_file['y_max'] = (1000 * ys.max(axis=1) / height).astype(int)

            # Skip the whole file unless the largest value of every scaled
            # column lies strictly between 0 and 1000.
            if any(not 0 < df_file[col].max() < 1000 for col in scaled_columns):
                continue

            df_file['filename'] = os.path.basename(file_name).split(".")[0]
            df_file['width'] = width
            df_file['height'] = height

            with open(f'{base_dir}{part}/entities/{stem}.txt', 'r', encoding='utf8') as fileread:
                data = json.load(fileread)
            # Missing entities become "" so the lookups below never raise.
            entities = {label: str(data[label]) if label in data else "" for label in labels}

            df_file['label'] = [_label_word(str(word), entities) for word in df_file['words']]

            frames.append(df_file)
        except Exception as e:
            print('skip', file_name, e)

    if frames:
        df = pd.concat(frames, ignore_index=True)[out_columns]
    else:
        df = pd.DataFrame(columns=out_columns)
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df


def write_dataset(dataset: pd.DataFrame, output_dir: str, name: str):
    """Write a labelled DataFrame as three tab-separated text files.

    Creates ``{name}.txt`` (word and label), ``{name}_box.txt`` (word and the
    0-1000 scaled box ``x_min y_min x_max y_max``) and ``{name}_image.txt``
    (word, pixel box ``x0 y0 x2 y2``, image ``width height`` and filename) in
    ``output_dir``. Rows are grouped by ``filename``; each group is followed by
    a blank line in all three files.

    Args:
        dataset: DataFrame with the columns ``filename``, ``width``,
            ``height``, ``words``, ``label``, ``x0``, ``y0``, ``x2``, ``y2``,
            ``x_min``, ``y_min``, ``x_max`` and ``y_max``.
        output_dir: Directory the files are written to.
        name: Base name of the output files (also used in the progress message).
    """
    print(f"Writing {name}ing dataset:")
    # Backslash continuations keep the three-file `with` valid on every
    # Python 3 version; the whole write loop must stay inside it so the
    # files are still open while rows are written.
    with open(f"{output_dir}/{name}.txt", "w", encoding="utf8") as file, \
            open(f"{output_dir}/{name}_box.txt", "w", encoding="utf8") as file_bbox, \
            open(f"{output_dir}/{name}_image.txt", "w", encoding="utf8") as file_image:
        # Go through each document (one per unique filename)
        for filename in tqdm.tqdm(dataset.filename.unique()):
            data = dataset[dataset.filename == filename]
            width = int(data.iloc[0]['width'])
            height = int(data.iloc[0]['height'])

            # Go through every text line of the document
            for index, row in data.iterrows():
                bbox = [int(p) for p in row[['x0', 'y0', 'x2', 'y2']]]
                normalized_bbox = [int(p) for p in row[['x_min', 'y_min', 'x_max', 'y_max']]]

                file.write("{}\t{}\n".format(row['words'], row["label"]))
                file_bbox.write("{}\t{} {} {} {}\n".format(row['words'], *normalized_bbox))
                file_image.write("{}\t{} {} {} {}\t{} {}\t{}\n".format(row['words'], *bbox, width, height, filename))

            # Write a second newline to separate this document from the next
            file.write("\n")
            file_bbox.write("\n")
            file_image.write("\n")


write_ds = True  #@param {type:"boolean"}
if write_ds:
    # Convert one split at a time: read the raw annotations, then write its text files.
    for split in ('train', 'test'):
        write_dataset(read_data(split), ds_dir, split)
    # labels.txt: one entity label per line, then "O" with no trailing newline.
    with open(f'{ds_dir}/labels.txt', 'w') as f:
        f.writelines(f"{label}\n" for label in labels)
        f.write("O")

  2%|▏         | 12/636 [00:00<00:18, 33.95it/s]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51006557185 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51006557185 (1).jpg'


  5%|▍         | 30/636 [00:07<04:30,  2.24it/s]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51008099084 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51008099084 (1).jpg'


 13%|█▎        | 83/636 [00:34<03:57,  2.33it/s]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51007339643 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51007339643 (1).jpg'


 16%|█▌        | 101/636 [00:45<08:32,  1.04it/s]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51007103692 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51007103692 (1).jpg'


 23%|██▎       | 146/636 [01:33<09:49,  1.20s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51008114262 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51008114262 (1).jpg'


 28%|██▊       | 181/636 [02:10<07:56,  1.05s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51006555125 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51006555125 (1).jpg'


 35%|███▌      | 223/636 [02:54<08:06,  1.18s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51006392299 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51006392299 (1).jpg'


  df_file['y_min']=(1000*df_file[["y0", "y1","y2","y3"]].min(axis=1)/img.height).astype(int)
  df_file['y_max']=(1000*df_file[["y0", "y1","y2","y3"]].max(axis=1)/img.height).astype(int)
 45%|████▍     | 286/636 [04:00<06:07,  1.05s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51007339118 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51007339118 (1).jpg'


 49%|████▉     | 313/636 [04:27<05:31,  1.03s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51005715010 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51005715010 (1).jpg'


 56%|█████▌    | 354/636 [05:09<05:21,  1.14s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51005685357 (1).txt [Errno 2] No such file or directory: '/content/drive/MyDrive/SROIE2019 (1)/train/img/X51005685357 (1).jpg'


 68%|██████▊   | 435/636 [06:33<03:36,  1.08s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51006619545.txt Error tokenizing data. C error: EOF inside string starting at row 78


  df_file['x_min']=(1000*df_file[["x0", "x1","x2","x3"]].min(axis=1)/ img.width).astype(int)
  df_file['x_max']=(1000*df_file[["x0", "x1","x2","x3"]].max(axis=1)/ img.width).astype(int)
 87%|████████▋ | 551/636 [08:34<01:41,  1.20s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/train/box/X51006619785.txt Error tokenizing data. C error: EOF inside string starting at row 77


100%|██████████| 636/636 [10:08<00:00,  1.04it/s]


Writing training dataset:


100%|██████████| 621/621 [00:26<00:00, 23.87it/s]
 55%|█████▍    | 190/347 [03:09<02:48,  1.08s/it]

skip /content/drive/MyDrive/SROIE2019 (1)/test/box/X51006619503.txt 'utf-8' codec can't decode byte 0xa3 in position 407: invalid start byte


100%|██████████| 347/347 [05:43<00:00,  1.01it/s]


Writing testing dataset:


100%|██████████| 345/345 [00:13<00:00, 24.90it/s]


In [None]:
%%bash
# Clone the unilm repository only when it is not already present, so re-running
# the cell does not fail with "destination path 'unilm' already exists".
[ -d unilm ] || git clone https://github.com/microsoft/unilm.git
cd unilm/layoutlm/deprecated
pip install .

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing /content/unilm/layoutlm/deprecated
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: layoutlm
  Building wheel for layoutlm (setup.py): started
  Building wheel for layoutlm (setup.py): finished with status 'done'
  Created wheel for layoutlm: filename=layoutlm-0.0-py3-none-any.whl size=11481 sha256=913a03f5f3015e8b41ec5ab7f505130ef195dac8e09d27c4181e6778de5edfc8
  Stored in directory: /tmp/pip-ephem-wheel-cache-61sic6ao/wheels/31/2d/25/ecc19c4d92e12c3abaed9d94883bf8cee481e067d9635b66d1
Successfully built layoutlm
Installing collected packages: layoutlm
  Attempting uninstall: layoutlm
    Found existing installation: layoutlm 0.0
    Uninstalling layoutlm-0.0:
      Successfully uninstalled layoutlm-0.0
Successfully installed layoutlm-0.0


fatal: destination path 'unilm' already exists and is not an empty directory.


# 2.Fine tune and evaluate the model using the GitHub script.


In [None]:
os.chdir("/content/unilm/layoutlm/deprecated/examples/seq_labeling")
#! sed -i 's/"num_attention_heads": 16,/"num_attention_heads": 12,/' "{pretrained_model_folder}/"config.json
! rm -rf "{ds_dir}"cached*
!mkdir "{base_dir}"predict
!mkdir "{base_dir}"output

# Fine-tune: run run_seq_labeling.py with --do_train for 10 epochs on the files in
# ds_dir, starting from the layoutlm-base-uncased folder under base_dir and
# writing to base_dir/output (overwriting any existing output directory).
! python run_seq_labeling.py \
                            --data_dir "{ds_dir}" \
                            --labels "{ds_dir}"/labels.txt \
                            --model_name_or_path "{base_dir}"/layoutlm-base-uncased/ \
                            --model_type layoutlm \
                            --max_seq_length 512 \
                            --do_lower_case \
                            --do_train \
                            --num_train_epochs 10 \
                            --logging_steps 50 \
                            --save_steps -1 \
                            --output_dir "{base_dir}"/output \
                            --overwrite_output_dir \
                            --per_gpu_train_batch_size 8 \
                            --per_gpu_eval_batch_size 16


# Evaluate on the test set and make predictions: same script, run with
# --do_predict and the same data, labels and output directories as the training run.
! python run_seq_labeling.py \
                            --data_dir "{ds_dir}" \
                            --labels "{ds_dir}"/labels.txt \
                            --model_name_or_path "{base_dir}"/layoutlm-base-uncased/ \
                            --model_type layoutlm \
                            --do_lower_case \
                            --max_seq_length 512 \
                            --do_predict \
                            --logging_steps 10 \
                            --save_steps -1 \
                            --output_dir "{base_dir}"/output \
                            --per_gpu_eval_batch_size 8


mkdir: cannot create directory ‘/content/drive/MyDrive/SROIE2019 (1)/predict’: File exists
mkdir: cannot create directory ‘/content/drive/MyDrive/SROIE2019 (1)/output’: File exists
Epoch:   0% 0/10 [00:00<?, ?it/s]
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1420.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)

Iteration:   1% 1/78 [00:03<04:27,  3.47s/it][A
Iteration:   3% 2/78 [00:04<02:19,  1.83s/it][A
Iteration:   4% 3/78 [00:04<01:38,  1.31s/it][A
Iteration:   5% 4/78 [00:05<01:18,  1.06s/it][A
Iteration:   6% 5/78 [00:06<01:07,  1.08it/s][A
Iteration:   8% 6/78 [00:06<01:01,  1.18it/s][A
Iteration:   9% 7/78 [00:07<00:56,  1.25it/s][A
Iteration:  10% 8/78 [00:08<00:53,  1.31it/s][A
Iteration:  12% 9/78 [00:08<00:51,  1.35it/s][A
Iteration:  13% 10/78 [00:09<00:49,  1.38it/s][A
Iteration:  14% 11/78 [00:10<00:47,  1.40it/s]