In [16]:
import pandas as pd
import numpy as np

## Data Exploration

### OCR results

In [17]:
train_dataset = pd.DataFrame(pd.read_pickle("../data/train_set_ocr.pkl").items(), columns="filename text".split())
train_dataset.head()

Unnamed: 0,filename,text
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD..."
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...


In [18]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10884 entries, 0 to 10883
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  10884 non-null  object
 1   text      10884 non-null  object
dtypes: object(2)
memory usage: 170.2+ KB


### Classes

In [19]:
filename_to_class = pd.read_pickle("../data/train_labels_final.pkl")

classes_not_found = []
for filename in train_dataset["filename"].to_list():
    if filename not in filename_to_class:
        classes_not_found.append(filename)

print(f"Classes not found count: {len(classes_not_found)}")
print(f"Equals to {len(classes_not_found)/train_dataset.shape[0]} of train data")

Classes not found count: 35
Equals to 0.003215729511209114 of train data


The number of classes not found is relatively very small. We can ignore it.

In [20]:
train_dataset = train_dataset[~train_dataset["filename"].isin(classes_not_found)]
train_dataset.shape[0]

10849

In [21]:
train_dataset["class"] = train_dataset["filename"].apply(lambda f: filename_to_class[f])
train_dataset.head()

Unnamed: 0,filename,text,class
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD...",pit37_v1
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...,pit37_v1
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...,pit37_v1
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...,pit37_v1
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...,pit37_v1


### Numerical Classes

In [22]:
label_to_label_id = pd.read_pickle("../data/id2label_final.pkl")
label_to_label_id

{'advertisement': 0,
 'budget': 1,
 'email': 2,
 'file_folder': 3,
 'form': 4,
 'handwritten': 5,
 'invoice': 6,
 'letter': 7,
 'memo': 8,
 'news_article': 9,
 'pit37_v1': 10,
 'pozwolenie_uzytkowanie_obiektu_budowlanego': 11,
 'presentation': 12,
 'questionnaire': 13,
 'resume': 14,
 'scientific_publication': 15,
 'scientific_report': 16,
 'specification': 17,
 'umowa_na_odleglosc_odstapienie': 18,
 'umowa_o_dzielo': 19,
 'umowa_sprzedazy_samochodu': 20}

In [23]:
train_dataset["class_id"] = train_dataset["class"].apply(lambda c: label_to_label_id[c])
train_dataset.head()

Unnamed: 0,filename,text,class,class_id
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD...",pit37_v1,10
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...,pit37_v1,10
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...,pit37_v1,10
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...,pit37_v1,10
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...,pit37_v1,10


### Classes Summary

In [24]:
train_dataset["class class_id".split()].value_counts()

class                                       class_id
handwritten                                 5           618
presentation                                12          592
scientific_report                           16          584
advertisement                               0           580
scientific_publication                      15          577
file_folder                                 3           566
form                                        4           566
resume                                      14          565
email                                       2           563
memo                                        8           558
budget                                      1           556
invoice                                     6           555
letter                                      7           551
questionnaire                               13          551
specification                               17          529
news_article                                9  

In [25]:
len(train_dataset.class_id.unique())

21

### Tokenization

In [26]:
import string

def tokenize(text: str):
    text = text.lower()

    for punctuation_mark in set(string.punctuation) - {"-"}:
        text = text.replace(punctuation_mark, "")
    text = "".join(c for c in text if not c.isdigit())

    word_list = [word.strip().strip("-") for word in text.split()]
    word_list = [word for word in word_list if word]
    return " ".join(word_list)

train_dataset["text"] = train_dataset["text"].apply(tokenize)
train_dataset.head()

Unnamed: 0,filename,text,class,class_id
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,pola lalwe poda podatnik pola ciemne urzad nip...,pit37_v1,10
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,pltaxpl poua wypelnia podatnik pola ciemne wyp...,pit37_v1,10
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,mypfenl wypelnic dulymi daukowanluiteram clrny...,pit37_v1,10
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,pltaxpl pola jasne wypełnia podatnik pola ciem...,pit37_v1,10
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,pitaxpl la jash€ wypelnia poditńik pcu ciemne ...,pit37_v1,10


In [27]:
print("Mean number of characters:", train_dataset["text"].apply(lambda s: len(s)).mean())
print("Mean number of words:", train_dataset["text"].apply(lambda s: len(s.split())).mean())

Mean number of characters: 1023.0436906627339
Mean number of words: 154.70946631025902


### Language Split

Based on inspection of the JSON file we can see that the language of the file changes.

In [28]:
first_english_index = train_dataset["filename"].to_list().index("./test_hashed/advertisement/a64b06e6-8f0a-4fd7-ba12-0779c5560d9a.tiff")
train_dataset_pl = train_dataset.iloc[:first_english_index]
train_dataset_eng = train_dataset.iloc[first_english_index:]

print("Polish language in training dataset", train_dataset_pl.shape[0] / train_dataset.shape[0])
print("English language in training dataset", train_dataset_eng.shape[0] / train_dataset.shape[0])

Polish language in training dataset 0.16914001290441516
English language in training dataset 0.8308599870955848


## Model Creation

### Shared Functions

In [56]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [57]:
def pad(sequences):
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

def create_all_categories_columns(input_array: np.array):
    result = np.zeros(shape=(input_array.shape[0], LABEL_N), dtype=int)
    result[np.arange(0, input_array.shape[0]), input_array] = 1
    return result

def prepare_input(input_list, tokenizer):
    return np.array(pad(tokenizer.texts_to_sequences(input_list)))

### English

#### Input Preparation

In [40]:
x_columns = "text"
y_columns = "class_id"
x, y = train_dataset_eng[x_columns], train_dataset_eng[y_columns]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4, stratify=y)

LABEL_N = 21

# Optimized
VOCAB_SIZE = 2_000
EMBEDDING_DIM = 64
DENSE_LAYER_COUNT = 128
max_length = 400
trunc_type='post'
padding_type='post'

english_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
english_tokenizer.fit_on_texts(x_train.to_list())
print(english_tokenizer.word_index)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### Creation

In [41]:
english_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(DENSE_LAYER_COUNT, activation="relu"),
        tf.keras.layers.Dense(LABEL_N, activation='softmax'),
])
adam = tf.keras.optimizers.Adam(
    learning_rate=0.002,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False
)
english_model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])
english_model.summary()

num_epochs = 20
history = english_model.fit(
    x=prepare_input(x_train.to_list()),
    y=create_all_categories_columns(np.array(y_train)),
    epochs=num_epochs,
    validation_data=(prepare_input(x_test.to_list(), english_tokenizer), create_all_categories_columns(np.array(y_test))),
    verbose=2
)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 64)          128000    
                                                                 
 global_average_pooling1d_2   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 21)                2709      
                                                                 
Total params: 139,029
Trainable params: 139,029
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
212/212 - 3s - loss: 2.5588 - accuracy: 0.1636 - val_loss: 2.3753 - val_accuracy: 0.2413 - 3s/epoch - 1

KeyboardInterrupt: 

### Polish

In [58]:
x_columns = "text"
y_columns = "class_id"
x, y = train_dataset_pl[x_columns], train_dataset_pl[y_columns]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4, stratify=y)

# Optimized
LEARNING_RATE = 0.005
VOCAB_SIZE = 2_000
EMBEDDING_DIM = 128 # 16
DENSE_LAYER_COUNT = 128
max_length = 400
trunc_type='post'
padding_type='post'

polish_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
polish_tokenizer.fit_on_texts(x_train.to_list())

In [59]:
polish_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(DENSE_LAYER_COUNT, activation="relu"),
        tf.keras.layers.Dense(LABEL_N, activation='softmax'),
])
adam = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, amsgrad=False)
polish_model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])

polish_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 128)         256000    
                                                                 
 global_average_pooling1d_6   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_12 (Dense)            (None, 128)               16512     
                                                                 
 dense_13 (Dense)            (None, 21)                2709      
                                                                 
Total params: 275,221
Trainable params: 275,221
Non-trainable params: 0
_________________________________________________________________


In [63]:
num_epochs = 10
history = polish_model.fit(
    x=prepare_input(x_train.to_list(), polish_tokenizer),
    y=create_all_categories_columns(np.array(y_train)),
    epochs=num_epochs,
    validation_data=(prepare_input(x_test.to_list(), polish_tokenizer), create_all_categories_columns(np.array(y_test))),
    verbose=2
)

Epoch 1/10
43/43 - 2s - loss: 1.4172 - accuracy: 0.5938 - val_loss: 0.5063 - val_accuracy: 0.8845 - 2s/epoch - 51ms/step
Epoch 2/10
43/43 - 1s - loss: 0.1765 - accuracy: 0.9797 - val_loss: 0.0422 - val_accuracy: 0.9913 - 568ms/epoch - 13ms/step
Epoch 3/10
43/43 - 1s - loss: 0.0183 - accuracy: 0.9993 - val_loss: 0.0152 - val_accuracy: 0.9978 - 578ms/epoch - 13ms/step
Epoch 4/10
43/43 - 1s - loss: 0.0070 - accuracy: 1.0000 - val_loss: 0.0084 - val_accuracy: 1.0000 - 564ms/epoch - 13ms/step
Epoch 5/10
43/43 - 1s - loss: 0.0044 - accuracy: 1.0000 - val_loss: 0.0051 - val_accuracy: 1.0000 - 560ms/epoch - 13ms/step
Epoch 6/10
43/43 - 1s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.0049 - val_accuracy: 1.0000 - 572ms/epoch - 13ms/step
Epoch 7/10
43/43 - 1s - loss: 0.0018 - accuracy: 1.0000 - val_loss: 0.0035 - val_accuracy: 1.0000 - 585ms/epoch - 14ms/step
Epoch 8/10
43/43 - 1s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.0028 - val_accuracy: 1.0000 - 574ms/epoch - 13ms/step
Epoch 9/10


## Models in Action

In [64]:
def get_model_prediction(text, model, tokenizer):
    text_representation = np.array(
        pad(
            tokenizer.texts_to_sequences(
                [str(text)]
            )
        )
    )
    prediction = list(model.predict(text_representation)[0])
    column = prediction.index(max(prediction))

    print(model.predict(text_representation)[0], column)
    return column

In [68]:
from py3langid.langid import LanguageIdentifier, MODEL_FILE
from langdetect import detect, detect_langs

testing_output_df = pd.DataFrame(
    pd.read_pickle("../data/test_ocr_clean.pkl").items(),
    columns="filename text".split()
)

identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE)
identifier.set_languages(['pl', 'en'])

testing_output_df["lang"] = testing_output_df["text"].apply(lambda text: identifier.classify(str(text))[0])

pl_filter = testing_output_df["lang"] == "pl"
testing_output_df.loc[pl_filter, "predict"] = testing_output_df.loc[pl_filter, "text"].apply(
    get_model_prediction,
    args=[polish_model, polish_tokenizer]
)

eng_filter = testing_output_df["lang"] == "en"
testing_output_df.loc[eng_filter, "predict"] = testing_output_df.loc[eng_filter, "text"].apply(
    get_model_prediction,
    args=[english_model, english_tokenizer]
)
print(testing_output_df.head())

testing_output_df["predict"] = testing_output_df["predict"].astype(int)
testing_output_df = testing_output_df.sort_values(by="filename")
testing_output_df["filename predict".split()].head()

[3.38806691e-11 1.43022356e-11 1.29107211e-11 8.81846887e-11
 1.67957800e-10 4.54881445e-11 2.05348932e-11 1.26812894e-10
 1.20053429e-11 3.28690339e-11 9.99769866e-01 1.03498525e-04
 5.69937916e-11 2.28555178e-11 4.28323627e-11 1.05555079e-11
 4.22469594e-11 2.80407936e-11 3.88966203e-12 9.73234637e-06
 1.16943571e-04] 10
[1.7683356e-14 3.9477605e-15 4.3196868e-15 2.3630517e-14 1.8668525e-13
 1.3195155e-14 6.1718005e-15 4.1817739e-14 2.1438234e-15 1.3945450e-14
 9.9999857e-01 5.1772327e-07 2.9619075e-14 4.7052812e-15 1.3879535e-14
 1.7075888e-15 1.1731381e-14 1.8138962e-14 7.6672683e-20 1.7946954e-08
 9.8729959e-07] 10
[1.19031141e-13 2.33782144e-14 3.08991724e-14 1.58437120e-13
 9.26160624e-13 7.11350602e-14 3.75181623e-14 2.76520370e-13
 1.17141429e-14 9.58758084e-14 9.99985337e-01 7.16343629e-06
 1.54892131e-13 2.78650750e-14 1.01207115e-13 1.13796666e-14
 7.78214028e-14 9.16974220e-14 2.71472488e-18 1.01830139e-07
 7.43338160e-06] 10
[5.93303593e-15 1.39747434e-15 1.33476691e-15 7

NameError: name 'english_tokenizer' is not defined