# **I. Clone Data**

In [None]:
# Mount Google Drive at /content/drive so the project folder under MyDrive can be read and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Make the project folder on the mounted Drive the working directory, so the
# relative paths and the `models.*` imports used by later cells resolve.
# NOTE(review): this raises FileNotFoundError if the folder does not exist yet
# (e.g. before the repository has been cloned) — confirm the intended cell order.
os.chdir("/content/drive/MyDrive/KIE_invoice_minimal/")

In [None]:
# Fetch the KIE_invoice_minimal sources from GitHub into the current working directory.
# NOTE(review): git creates a `KIE_invoice_minimal/` sub-folder under the current directory — confirm the working directory is the one you expect before running.
!git clone https://github.com/huyhoang17/KIE_invoice_minimal.git

Cloning into 'KIE_invoice_minimal'...
remote: Enumerating objects: 114, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 114 (delta 0), reused 1 (delta 0), pack-reused 111[K
Receiving objects: 100% (114/114), 11.33 MiB | 11.51 MiB/s, done.
Resolving deltas: 100% (25/25), done.


In [None]:
ls

api.py   [0m[01;34mbackend[0m/    graph.py     LICENSE   [01;34m__pycache__[0m/      [01;34mresults[0m/
app.py   configs.py  [01;34mimages[0m/      Makefile  README.md         [01;34mweights[0m/
[01;34massets[0m/  [01;34mdata[0m/       __init__.py  [01;34mmodels[0m/   requirements.txt


In [None]:
# Download the pretrained weights archive (weights.zip) from Google Drive by file id into the current directory.
# NOTE(review): the recorded output shows the file landing in .../KIE_invoice_minimal/weights/, so the working directory was presumably changed in a cell not shown here — confirm.
!gdown 1VA9hbj3rFlvWimzqoNSrwInIK43qYBLv

Downloading...
From: https://drive.google.com/uc?id=1VA9hbj3rFlvWimzqoNSrwInIK43qYBLv
To: /content/drive/MyDrive/KIE_invoice_minimal/weights/weights.zip
100% 131M/131M [00:00<00:00, 185MB/s]


In [None]:
# Unpack the downloaded archive; it creates a weights/ folder with the kie, saliency and text_detect checkpoints.
!unzip weights.zip

Archive:  weights.zip
   creating: weights/
   creating: weights/kie/
  inflating: weights/kie/kie_mcocr.pkl  
   creating: weights/saliency/
  inflating: weights/saliency/u2netp.pth  
   creating: weights/text_detect/
  inflating: weights/text_detect/craft_mlt_25k_1.pth  


# **II. Visualize Data**

# **III. Training**

In [None]:
# Report the GPU attached to this runtime; the command fails (as in the recorded output) when the runtime has no NVIDIA driver/GPU.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
!pip3 install torch-scatter -f https://data.pyg.org/whl/torch-1.12.0+11.2.html
!pip3 install torch-sparse -f https://data.pyg.org/whl/torch-1.12.0+11.2.html
!pip3 install torch-geometric
!pip3 install sentence-transformers
!pip3 install igraph
!pip3 install bpemb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.0+11.2.html
Collecting torch-scatter
  Downloading torch_scatter-2.0.9.tar.gz (21 kB)
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl size=3567067 sha256=f5570f3a1a22d0c42330342b91c59e89c2541ef040ce3df7ecddb473cc3d0285
  Stored in directory: /root/.cache/pip/wheels/dd/57/a3/42ea193b77378ce634eb9454c9bc1e3163f3b482a35cdee4d1
Successfully built torch-scatter
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.12.0+11.2.html
Collecting torch-sparse
  Downloading torch_sparse-0.6.14.tar.gz (51 k

In [None]:
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import LSTM
from torch.nn.utils.rnn import pack_padded_sequence
# import torch_geometric
from torch_geometric.nn import GCNConv
from torch_geometric.nn.conv.cheb_conv import ChebConv
from tqdm import *

from models.kie.graph_norm import GraphNorm

In [None]:
class InvoiceGCN(nn.Module):
    """Four-layer graph-convolutional network that scores every node of a graph.

    The layer widths are fixed at ``input_dim -> 64 -> 32 -> 16 -> n_classes``.
    The layers are either Chebyshev convolutions (``ChebConv``) or ``GCNConv``
    layers, selected by ``chebnet``.

    Args:
        input_dim: Number of input features per node.
        chebnet: If True build ``ChebConv`` layers, otherwise ``GCNConv`` layers.
        n_classes: Number of output classes (width of the last layer).
        dropout_rate: Dropout probability applied after each of the first three
            layers; only active while the module is in training mode.
        K: Filter size forwarded to ``ChebConv``. Ignored when ``chebnet`` is False.
        cached: Forwarded to ``GCNConv``. Ignored when ``chebnet`` is True.
            When True the layer reuses the normalised adjacency computed on its
            first call, which is only valid if the model always sees the same
            graph. It defaults to False because this notebook runs the model on
            separate train and test batches.
    """

    def __init__(self, input_dim, chebnet=False, n_classes=5, dropout_rate=0.2, K=3, cached=False):
        super().__init__()

        self.input_dim = input_dim
        self.n_classes = n_classes
        self.dropout_rate = dropout_rate

        if chebnet:
            self.conv1 = ChebConv(self.input_dim, 64, K=K)
            self.conv2 = ChebConv(64, 32, K=K)
            self.conv3 = ChebConv(32, 16, K=K)
            self.conv4 = ChebConv(16, self.n_classes, K=K)
        else:
            # The first layer must consume `input_dim` features; the attribute
            # `first_dim` referenced here before never existed on this class.
            self.conv1 = GCNConv(self.input_dim, 64, improved=True, cached=cached)
            self.conv2 = GCNConv(64, 32, improved=True, cached=cached)
            self.conv3 = GCNConv(32, 16, improved=True, cached=cached)
            self.conv4 = GCNConv(16, self.n_classes, improved=True, cached=cached)

    def forward(self, data):
        """Run a full-batch forward pass.

        Args:
            data: Object exposing ``x`` (node features), ``edge_index`` and
                ``edge_attr``; ``edge_attr`` is handed to every layer as the
                edge weight and may be None.

        Returns:
            Log-probabilities over the classes (``log_softmax`` along dim 1).
        """
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        # Hidden layers: convolution -> ReLU -> dropout.
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x, edge_index, edge_weight))
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        # Output layer: raw class scores, normalised below.
        x = self.conv4(x, edge_index, edge_weight)

        return F.log_softmax(x, dim=1)

# 1. Using Torch-Geometric & Sentence Transformers to define the graph

In [None]:
# Clean the raw SROIE folders so that raw/img holds only .jpg files and raw/box
# only .csv files. WARNING: this permanently deletes the offending files from Drive.
for src, wanted_ext in (
    ("/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/raw/img/", ".jpg"),
    ("/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/raw/box/", ".csv"),
):
  for file in os.listdir(src):
    target = os.path.join(src, file)
    # Match on the real extension: a substring test would keep e.g. "x.jpg.bak".
    if file.endswith(wanted_ext):
      continue
    # os.remove() raises on directories (such as ".ipynb_checkpoints"); leave them alone.
    if not os.path.isfile(target):
      continue
    print(file)
    os.remove(target)


# test input

In [None]:
import os
import random

import cv2
import pandas as pd
from tqdm.auto import tqdm

# Sanity check of the raw SROIE inputs: every box file must be readable and must
# have a readable image with the same stem. Problems are printed, nothing is modified.
data_fd = "/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/"
path = "/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/raw/box/"
files = [i.split('.')[0] for i in os.listdir(path)]
files.sort()
# NOTE(review): the first sorted entry is dropped unconditionally — presumably to skip
# a hidden entry such as ".ipynb_checkpoints"; confirm this is not dropping a real receipt.
all_files = files[1:]

# Not used by this check; kept so the names defined by this cell stay the same.
list_of_graphs = []
train_list_of_graphs, test_list_of_graphs = [], []

files = all_files.copy()
random.shuffle(files)

# List the image folder once instead of once (or twice) per receipt.
available_images = set(os.listdir(os.path.join(data_fd, "raw/img")))

for index, file in enumerate(tqdm(all_files)):
  file_path = os.path.join(data_fd, "raw/box", file + '.csv')
  interim_path = os.path.join(data_fd, "interim", file + '.csv')
  image_path = os.path.join(data_fd, "raw/img", file + '.jpg')
  image_missing = file + '.jpg' not in available_images
  if image_missing:
    print(file, " cannot find !!!")
  # One row per non-empty line. pandas >= 1.3 rejects read_csv(sep='\n'), so the
  # lines are read directly instead.
  with open(file_path, encoding="utf-8") as box_file:
    df = pd.DataFrame([line for line in box_file.read().splitlines() if line])
  # cv2.imread returns None (it does not raise) when the image cannot be loaded.
  image = cv2.imread(image_path)
  if image is None and image_missing:
    print(file_path)

NameError: ignored

## Load data into train/test

In [None]:
import torch
import torch_geometric
from torch_geometric.utils.convert import from_networkx
from bpemb import BPEmb
from models.kie import graph
from sentence_transformers import SentenceTransformer
import traceback

# English byte-pair-encoding embeddings with 100 dimensions (the recorded output shows the model files being downloaded).
# NOTE(review): `bpemb_en` is not referenced by any cell visible in this notebook — confirm it is still needed.
bpemb_en = BPEmb(lang="en", dim=100)
# Sentence encoder used by make_sent_bert_features to embed the text of each box.
sent_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

def make_sent_bert_features(text):
    """Embed one string with the module-level ``sent_model``.

    The string is wrapped in a one-element list for ``sent_model.encode`` and
    the single resulting embedding is returned.
    """
    return sent_model.encode([text])[0]

def get_data(save_fd, seed=None):
    """
    Build one graph per receipt and save the batched train/test splits under ``save_fd``.

    Every file stem found in ``raw/box`` is turned into a graph with
    ``graph.Grapher``. The stems are shuffled; the first 550 form the training
    split and the rest the test split. Each split is saved as one big batch of
    unconnected per-receipt graphs with:
    - x: node feature matrix; 10 layout/character-count columns followed by the
      sentence embedding of the ``Object`` text
    - edge_index: graph connectivity, as produced by ``from_networkx``
    - edge_attr: reset to None before saving
    - y: integer node labels, 1=company, 2=address, 3=date, 4=total, 5=undefined
    - text / img_id: the raw ``Object`` strings and the file stem of each receipt

    A receipt that fails at any step (including the label and node-count checks
    below) is skipped and its traceback is printed.

    Args:
        save_fd: Folder that receives ``train_data.dataset`` and
            ``test_data.dataset``; created if missing.
        seed: Optional seed for the shuffle that defines the split. With the
            default ``None`` the global ``random`` state is used, as before.

    Returns:
        None. The two batches are written to disk with ``torch.save``.
    """
    from tqdm.auto import tqdm  # replaces the deprecated tqdm_notebook

    data_fd = "/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/"
    path = "/content/drive/MyDrive/KIE_invoice_minimal/data/SROIE_2019/raw/box"

    # Fail early (before the slow embedding loop) if the output folder is unusable.
    os.makedirs(save_fd, exist_ok=True)

    files = [i.split('.')[0] for i in os.listdir(path)]
    files.sort()
    # NOTE(review): the first sorted entry is dropped unconditionally — presumably to skip
    # a hidden entry such as ".ipynb_checkpoints"; confirm this is not dropping a real receipt.
    all_files = files[1:]

    train_list_of_graphs, test_list_of_graphs = [], []

    files = all_files.copy()
    if seed is None:
        random.shuffle(files)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(files)

    # Resulting in 550 receipts for training. Sets make the per-file lookup O(1).
    training, testing = set(files[:550]), set(files[550:])

    feature_cols = ['rd_b', 'rd_r', 'rd_t', 'rd_l', 'line_number',
                    'n_upper', 'n_alpha', 'n_spaces', 'n_numeric', 'n_special']
    label_ids = {'company': 1, 'address': 2, 'date': 3, 'total': 4, 'undefined': 5}

    for index, file in enumerate(tqdm(all_files)):
        try:
            connect = graph.Grapher(file, data_fd)
            G, _, _ = connect.graph_formation()
            df = connect.relative_distance()
            if index == 0:
                print(df.columns.tolist())
                print(df.loc[0])
            individual_data = from_networkx(G)

            text_features = np.array(df["Object"].map(make_sent_bert_features).tolist()).astype(np.float32)
            numeric_features = df[feature_cols].values.astype(np.float32)

            features = np.concatenate((numeric_features, text_features), axis=1)
            features = torch.tensor(features)

            # A graph whose feature rows do not match its node count would shift
            # every later graph's edge_index against x once the graphs are batched.
            if features.shape[0] != G.number_of_nodes():
                raise ValueError(
                    f'{file}.csv: {features.shape[0]} feature rows but '
                    f'{G.number_of_nodes()} graph nodes'
                )

            # Strip whitespace from text columns; non-string columns raise
            # AttributeError on `.str` and are left untouched.
            for col in df.columns:
                try:
                    df[col] = df[col].str.strip()
                except AttributeError:
                    pass

            df['labels'] = df['labels'].fillna('undefined')
            if 'num_labels' not in df.columns:
                df['num_labels'] = np.nan
            for label_name, label_id in label_ids.items():
                df.loc[df['labels'] == label_name, 'num_labels'] = label_id

            # Check the numeric column: `labels` was just filled and can no longer be
            # null, whereas an unknown label leaves NaN here, which the integer cast
            # below would turn into a garbage class id.
            if df['num_labels'].isnull().values.any():
                raise ValueError(f'labeling error! Invalid label(s) present in {file}.csv')
            # np.int was removed in NumPy 1.24; int64 is what it resolved to on Linux.
            labels = torch.tensor(df['num_labels'].values.astype(np.int64))
            text = df['Object'].values

            if index == 0:
                print("df[0]: ", df.loc[0])
                print("features: ", features)
                print("labels: ", labels)
                print("text: ", text)
                print("file: ", file)

            individual_data.x = features
            individual_data.y = labels
            individual_data.text = text
            individual_data.img_id = file

            if file in training:
                train_list_of_graphs.append(individual_data)
            elif file in testing:
                test_list_of_graphs.append(individual_data)
        except Exception:
            # Best effort: report the receipt and continue. KeyboardInterrupt is
            # deliberately not caught so the loop can still be stopped.
            print(traceback.format_exc())

    train_data = torch_geometric.data.Batch.from_data_list(train_list_of_graphs)
    train_data.edge_attr = None
    test_data = torch_geometric.data.Batch.from_data_list(test_list_of_graphs)
    test_data.edge_attr = None

    torch.save(train_data, os.path.join(save_fd, 'train_data.dataset'))
    torch.save(test_data, os.path.join(save_fd, 'test_data.dataset'))

# Build the graphs and write train_data.dataset / test_data.dataset into data/processed (slow: embeds every text box).
get_data(save_fd="/content/drive/MyDrive/KIE_invoice_minimal/data/processed")

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 997960.86B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:00<00:00, 5242327.32B/s]


Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
def load_train_test_split(save_fd):
    """Load the saved train and test datasets from ``save_fd``.

    Reads ``train_data.dataset`` and then ``test_data.dataset`` with
    ``torch.load`` and returns them as a ``(train, test)`` tuple.
    """
    loaded = [
        torch.load(os.path.join(save_fd, file_name))
        for file_name in ('train_data.dataset', 'test_data.dataset')
    ]
    return loaded[0], loaded[1]

# Load the batches written by get_data and print their summaries.
train_data, test_data = load_train_test_split(save_fd="/content/drive/MyDrive/KIE_invoice_minimal/data/processed")
print(train_data)
print(test_data)
# Reference summaries from an earlier run (the output recorded for this cell differs from them):
# Batch(batch=[29704], edge_index=[2, 40638], img_id=[550], text=[550], x=[29707, 778], y=[29707])
# Batch(batch=[3919], edge_index=[2, 5347], img_id=[76], text=[76], x=[3919, 778], y=[3919])

DataBatch(edge_index=[2, 79720], num_nodes=29153, x=[29156, 778], y=[29156], text=[547], img_id=[547], batch=[29153], ptr=[548])
DataBatch(edge_index=[2, 11504], num_nodes=4195, x=[4195, 778], y=[4195], text=[75], img_id=[75], batch=[4195], ptr=[76])


In [None]:
from sklearn.utils import class_weight

# Label value carried by nodes without a usable class: the smallest int64
# (-9223372036854775808). Presumably a NaN label that was cast to an integer
# when the dataset was built — confirm against the data preparation step.
INVALID_LABEL = int(np.iinfo(np.int64).min)

print(train_data)
print(train_data[0])

x = train_data  # alias kept for cells that refer to the training batch as `x`
labels_all = train_data.y.cpu().numpy()
print("y shape old: ", labels_all.shape)
indices = np.where(labels_all == INVALID_LABEL)
y = np.delete(labels_all, indices)
print("y shape new: ", y.shape)

# Single nodes cannot be popped out of a batched graph (that would also
# invalidate edge_index), so invalid nodes are masked out instead of removed.
valid_mask = train_data.y != INVALID_LABEL
print("valid nodes: ", int(valid_mask.sum()), "of", valid_mask.numel())

# Take the classes from the filtered labels. Slicing unique()[1:] only worked
# while an invalid label happened to be present and to sort first.
valid_classes = np.unique(y)
_class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=valid_classes,
    y=y
)

# Report any remaining label outside the five known classes.
for label in y[~np.isin(y, [1, 2, 3, 4, 5])]:
  print(label)
print("train labels: ", valid_classes)
print("y: ", y)
print(_class_weights)

DataBatch(edge_index=[2, 79720], num_nodes=29153, x=[29156, 778], y=[29156], text=[547], img_id=[547], batch=[29153], ptr=[548])
Data(edge_index=[2, 130], x=[48, 778], y=[48], text=[48], img_id='001', num_nodes=48)
y shape old:  (29156,)
y shape new:  (28635,)
8
x.size old:  (29153, 29153)
DataBatch(edge_index=[2, 79720], num_nodes=29153, x=[29156, 778], y=[29156], text=[547], img_id=[547], batch=[29153], ptr=[548])
<class 'int'>
8


KeyError: ignored

# 2. Model training (Huấn luyện model)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
# !set CUDA_LAUNCH_BLOCKING=1


def _valid_label_mask(labels, n_classes=5):
    """Return a boolean mask of the entries whose 1-based label lies in [1, n_classes]."""
    return (labels >= 1) & (labels <= n_classes)


model = InvoiceGCN(input_dim=train_data.x.shape[1], chebnet=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(), lr=0.001, weight_decay=0.9
)
train_data = train_data.to(device)
test_data = test_data.to(device)

# Nodes without a usable label (e.g. the int64-min sentinel) stay in the graph
# for message passing but are excluded from the loss and from the metrics.
train_mask = _valid_label_mask(train_data.y, model.n_classes)
test_mask = _valid_label_mask(test_data.y, model.n_classes)

# class weights for imbalanced data, computed on the valid training labels only
y = train_data.y[train_mask].cpu().numpy()
_class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y),
    y=y
)
if len(_class_weights) != model.n_classes:
    # nll_loss needs exactly one weight per output class, indexed by label - 1.
    raise ValueError(
        "expected {} classes in the training labels, found {}".format(
            model.n_classes, len(_class_weights)
        )
    )
print(train_data)
print(train_data.y.unique().cpu().numpy().shape)
print(train_data.y.cpu().numpy().shape)
print("_class_weights: ", _class_weights)

class_weights_t = torch.tensor(_class_weights, dtype=torch.float, device=device)
# Targets must be an integer tensor on the model's device, 0-based, and aligned
# with the masked rows of the model output.
train_target = train_data.y[train_mask] - 1

no_epochs = 2000
for epoch in range(1, no_epochs + 1):
    model.train()
    optimizer.zero_grad()

    # Full-batch update: forward the whole training graph, then select the
    # labelled nodes with boolean indexing before computing the loss.
    # https://github.com/rusty1s/pytorch_geometric/issues/1928
    loss = F.nll_loss(
        model(train_data)[train_mask],
        train_target,
        weight=class_weights_t
    )
    loss.backward()
    optimizer.step()

    # calculate acc on 5 classes
    if epoch % 200 == 0:
        model.eval()
        with torch.no_grad():
            for name, _data, _mask in (
                ('train', train_data, train_mask),
                ('test', test_data, test_mask),
            ):
                log_probs = model(_data)[_mask]
                y_pred_t = log_probs.max(dim=1)[1]
                y_true_t = _data.y[_mask] - 1
                acc = y_pred_t.eq(y_true_t).sum().item() / max(y_pred_t.shape[0], 1)

                y_pred = y_pred_t.cpu().numpy()
                y_true = y_true_t.cpu().numpy()
                print("\t{} acc: {}".format(name, acc))
                # confusion matrix
                if name == 'test':
                    cm = confusion_matrix(y_true, y_pred)
                    class_accs = cm.diagonal() / cm.sum(axis=1)
                    print(classification_report(y_true, y_pred))
                    # Unweighted validation loss on the labelled test nodes.
                    loss_val = F.nll_loss(log_probs, y_true_t)

        fmt_log = "Epoch: {:03d}, train_loss:{:.4f}, val_loss:{:.4f}"
        print(fmt_log.format(epoch, loss.item(), loss_val.item()))
        print(">" * 50)
            print(">" * 50)

DataBatch(edge_index=[2, 79720], num_nodes=29153, x=[29156, 778], y=[29156], text=[547], img_id=[547], batch=[29153], ptr=[548])
(6,)
(29156,)
_class_weights:  [ 9.51328904  3.93337912 10.50825688 10.46983547  0.22472042]


ValueError: ignored