In [None]:
# Dataset: clone the ICDAR-2019-SROIE repository into the current working directory.
# The preprocessing cell below reads from ICDAR-2019-SROIE/data/ (box/, img/, key/).
!git clone https://github.com/zzzDavid/ICDAR-2019-SROIE.git

## Preprocess Dataset

We preprocess the dataset into the format expected by PICK-pytorch.<br>
Reference: https://github.com/wenwenyu/PICK-pytorch/blob/master/data/README.md

In [None]:
## Creating folders for preprocessed dataset
## (one folder each for box/transcript files, images and entity files;
##  the paths are relative to the current working directory)
!mkdir boxes_and_transcripts images entities

In [None]:
## Script for preprocessing dataset
##
## For every box CSV in the SROIE dataset this cell:
##   1. reads the box coordinates and transcript of each text line,
##   2. tags each line with an entity key when its transcript contains a
##      fragment of that entity's value (default tag: 'other'),
##   3. writes the tagged table, the image and the entity JSON into the
##      output folders,
## and finally saves the list of processed samples to train_samples_list.csv.
import os
import re
import pandas
import json
import csv
import shutil

## Input dataset
data_path = "ICDAR-2019-SROIE/data/"
box_path = data_path + "box/"
img_path = data_path + "img/"
key_path = data_path + "key/"

## Output dataset
out_boxes_and_transcripts = "boxes_and_transcripts/"
out_images = "images/"
out_entities  = "entities/"


def _entity_pattern(value):
  """Build a regex that matches any comma-separated fragment of `value` literally.

  Fragments are stripped and regex-escaped so characters such as '.', '(' or
  '+' in an entity value are matched as plain text instead of being
  interpreted as regex syntax. Empty fragments are dropped, because an empty
  alternative would match every transcript.

  Returns None when `value` contains no non-empty fragment.
  """
  fragments = [part.strip() for part in value.split(',')]
  escaped = [re.escape(part) for part in fragments if part]
  return '|'.join(escaped) if escaped else None


train_samples_list =  []
## sorted(): os.listdir returns entries in arbitrary order; a stable order keeps
## the sample list (and any seeded split made from it) reproducible.
for file in sorted(os.listdir(box_path)):
  ## Ignore anything in the folder that is not a box CSV
  if not file.endswith(".csv"):
    continue
  name = file[:-len(".csv")]

  ## Reading csv
  with open(box_path + file, "r") as fp:
    reader = csv.reader(fp, delimiter=",")
    ## arranging dataframe index ,coordinates x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1, transcript
    ## The transcript itself may contain commas, so everything after the 8th
    ## field is re-joined. Blank lines (empty rows) are skipped.
    rows = [[1] + x[:8] + [','.join(x[8:]).strip(',')] for x in reader if x]
  if not rows:
    ## Nothing to tag or train on in an empty box file
    continue
  df = pandas.DataFrame(rows)

  ## including ner label dataframe index ,coordinates x1_1,y1_1,x2_1,y2_1,x3_1,y3_1,x4_1,y4_1, transcript , ner tag
  df[10] = 'other'

  ## Tag every line whose transcript (column 9) contains a fragment of an entity value
  with open(key_path + name + ".json") as fp:
    entities = json.load(fp)
  for key, value in sorted(entities.items()):
    pattern = _entity_pattern(value)
    if pattern is None:
      continue
    ## na=False: treat a missing transcript as "no match" instead of failing the mask
    df.loc[df[9].str.contains(pattern, na=False), 10] = key

  ##saving file into new dataset folder
  jpg = name + ".jpg"
  shutil.copy(img_path + jpg, out_images)
  with open(out_entities + name + ".txt", "w") as j:
    print(json.dumps(entities), file=j)

  ## No quoting; commas inside transcripts are escaped with a backslash instead
  df.to_csv(out_boxes_and_transcripts + name + ".tsv", index=False, header=False, quotechar='', escapechar='\\', quoting=csv.QUOTE_NONE, )
  train_samples_list.append(['receipt', name])

## document_type, file_name for every processed sample
train_samples_list = pandas.DataFrame(train_samples_list)
train_samples_list.to_csv("train_samples_list.csv")

In [None]:
## Preview of the sample list: column 0 is the document type ('receipt'),
## column 1 is the file name without its extension
train_samples_list.head()

**Splitting the dataset into train and test sets**

In [None]:
from sklearn.model_selection import train_test_split
# Read the sample list back with every column as a string (presumably so file
# names are not parsed as numbers). The file was saved with its index, so the
# frame has three columns: 'Unnamed: 0' (saved index), '0' (document type) and
# '1' (file name).
train_test = pandas.read_csv("train_samples_list.csv",dtype=str)
# Hold out 20% of the samples for testing; random_state is fixed for the split.
train, test= train_test_split(train_test,test_size=0.2,random_state = 42)

In [None]:
# Clone the PICK-pytorch repository; later cells copy the preprocessed data into
# PICK-pytorch/data/ and run its train.py and test.py scripts.
!git clone https://github.com/wenwenyu/PICK-pytorch.git

**Copy train data into PICK-pytorch data folder**

In [None]:
# Copy the three files of every training sample into PICK-pytorch's training
# data folders. The file name is looked up by its column label ('1') rather than
# by an integer position: integer keys on a string-labelled Series rely on a
# deprecated positional fallback, and the label stays valid after the index
# column is dropped below, so the cell can be re-run.
for index, row in train.iterrows():
  name = str(row['1'])
  shutil.copy(out_boxes_and_transcripts+name+".tsv",'PICK-pytorch/data/data_examples_root/boxes_and_transcripts/')
  shutil.copy(out_images+name+".jpg",'PICK-pytorch/data/data_examples_root/images/')
  shutil.copy(out_entities +name+".txt", 'PICK-pytorch/data/data_examples_root/entities/')

# Drop the saved index column and renumber the rows from 0.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# idempotent: a second run no longer raises KeyError on the missing column.
train = train.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)
# Written without a header as: row number, document type, file name
train.to_csv("PICK-pytorch/data/data_examples_root/train_samples_list.csv",header = False)

**Copy test data into PICK-pytorch data folder**

In [None]:
# Create the entities folder for the test split; the copy loop in the next cell
# writes the .txt entity files into it.
# NOTE(review): presumably the boxes_and_transcripts/ and images/ folders already
# exist in the cloned repository - verify.
!mkdir 'PICK-pytorch/data/test_data_example/entities/'

In [None]:
# Copy the three files of every held-out sample into PICK-pytorch's test data
# folders. The file name is looked up by its column label ('1') rather than by
# an integer position, which relies on a deprecated positional fallback.
for index, row in test.iterrows():
  name = str(row['1'])
  shutil.copy(out_boxes_and_transcripts+name+".tsv",'PICK-pytorch/data/test_data_example/boxes_and_transcripts/')
  shutil.copy(out_images+name+".jpg",'PICK-pytorch/data/test_data_example/images/')
  shutil.copy(out_entities +name+".txt", 'PICK-pytorch/data/test_data_example/entities/')

# Fix: this section previously operated on `train` (copy-paste from the training
# cell), which raised KeyError on the already-dropped column and would have
# written the training list into the test folder. It now writes the test split.
# errors='ignore' plus rebinding keeps the cell safe to re-run.
test = test.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)
# The output file keeps its original name (train_samples_list.csv) so the
# location on disk is unchanged; it contains the TEST samples as:
# row number, document type, file name.
test.to_csv("PICK-pytorch/data/test_data_example/train_samples_list.csv",header = False)

In [None]:
## Removing data once it is copied into PICK-pytorch data folder
!rm /content/boxes_and_transcripts/*.tsv
!rm /content/images/*.jpg
!rm /content/entities/*.txt

In [22]:
# Switch into the cloned repository; the relative paths used in the following
# cells (utils/, requirements.txt, config.json, data/, saved/) are resolved from here.
%cd PICK-pytorch/

C:\Users\Mula Ram\NLP\in\PICK-pytorch


In [23]:
%%writefile utils/entities_list.py
# -*- coding: utf-8 -*-
# @Author: Wenwen Yu
# @Created Time: 7/8/2020 9:34 PM

# Entity classes for this dataset.
# NOTE(review): these presumably have to match the keys of the entity JSON files,
# which preprocessing uses as the per-line tags - confirm against the dataset.
Entities_list = [
    "company",
    "address",
    "date",
    "total"
]

Overwriting utils/entities_list.py


In [None]:
## Installing requirements for running PICK-pytorch
## `!pip` runs whichever pip is first on PATH, the same way `!python` is resolved
## in the training and testing cells.
## NOTE(review): if that interpreter differs from the notebook kernel's, the
## packages are not importable from the kernel - confirm the environments match.
!pip install -r requirements.txt
## Pinned PyTorch / torchvision builds (+cu101 wheels) from the PyTorch wheel index
!pip install torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html

## Training

In [24]:
#!/bin/bash
# Launch train.py through torch.distributed.launch with a single node and a
# single process per node. Everything after train.py is passed to that script;
# -c presumably selects the configuration file - see the PICK-pytorch README.
# This requires torch to be importable by the `python` found on PATH.
!python -m torch.distributed.launch --nnode=1 --node_rank=0 --nproc_per_node=1 \
   train.py -c config.json -d 0 --local_world_size 1
  # --resume /content/PICK-pytorch/saved/models/PICK_Default/test_0917_074722/model_best.pth ##uncomment for resume training

C:\Users\Mula Ram\NLP\in\Scripts\python.exe: Error while finding module specification for 'torch.distributed.launch' (ModuleNotFoundError: No module named 'torch')


## Testing

In [None]:
##creating testing folders
## (absolute /content/ paths; the next cell copies the test images and box files
##  into these two folders)
!mkdir /content/test_img /content/test_boxes_and_transcripts

In [None]:
## copy a small subset (up to 10 files) from the test sample
import os
import shutil
data_path = "data/test_data_example/boxes_and_transcripts/"
image_path = "data/test_data_example/images/"

out_img_path = "/content/test_img/"
out_box_path = "/content/test_boxes_and_transcripts/"

## sorted(): os.listdir returns entries in arbitrary order, so sorting first
## makes the chosen subset the same on every run.
for file in sorted(os.listdir(data_path))[:10]:
  shutil.copy(data_path+file,out_box_path)
  ## the image shares the box file's name, with a .jpg extension
  shutil.copy(image_path+file.replace(".tsv",".jpg"),out_img_path)

In [None]:
## change model_best.pth path
## (point --checkpoint at the model_best.pth written by your own training run)
## {out_box_path} and {out_img_path} are filled in from the Python variables set
## in the previous cell.
!python test.py --checkpoint saved/models/PICK_Default/test_1003_053713/model_best.pth \
                --boxes_transcripts {out_box_path} \
                --images_path {out_img_path} --output_folder /content/output/ \
                --gpu 0 --batch_size 2