In [131]:
import pandas as pd

## Data Exploration

### OCR results

In [132]:
# Build a two-column frame (filename, OCR text) from the pickled OCR results.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train_dataset = pd.DataFrame(
    list(pd.read_pickle("../data/train_set_ocr.pkl").items()),
    columns=["filename", "text"],
)
train_dataset.head()

Unnamed: 0,filename,text
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD..."
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...


In [133]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10884 entries, 0 to 10883
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  10884 non-null  object
 1   text      10884 non-null  object
dtypes: object(2)
memory usage: 170.2+ KB


### Classes

In [135]:
# Lookup from a filename to its document class.
filename_to_class = pd.read_pickle("../data/train_labels_final.pkl")

# Collect every OCR'd file that has no entry in the label lookup.
classes_not_found = [
    name
    for name in train_dataset["filename"].to_list()
    if name not in filename_to_class
]

print(f"Classes not found count: {len(classes_not_found)}")
print(f"Equals to {len(classes_not_found)/train_dataset.shape[0]} of train data")

Classes not found count: 35
Equals to 0.003215729511209114 of train data


Only 35 files (about 0.3% of the training data) have no class label, so we can safely drop them.

In [136]:
# Keep only the rows whose filename has a class label.
# .copy() turns the filtered result into an independent frame, so the column
# assignments made in later cells do not trigger SettingWithCopyWarning.
train_dataset = train_dataset[~train_dataset["filename"].isin(classes_not_found)].copy()
train_dataset.shape[0]

10849

In [137]:
# Look up the class of every file; a filename missing from the lookup raises KeyError.
train_dataset["class"] = train_dataset["filename"].apply(filename_to_class.__getitem__)
train_dataset.head()

Unnamed: 0,filename,text,class
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD...",pit37_v1
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...,pit37_v1
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...,pit37_v1
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...,pit37_v1
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...,pit37_v1


### Numerical Classes

In [138]:
# Load the lookup used below to turn class names into integer ids.
# NOTE(review): despite the "id2label" file name, the displayed object maps label -> id.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
label_to_label_id = pd.read_pickle("../data/id2label_final.pkl")
label_to_label_id

{'advertisement': 0,
 'budget': 1,
 'email': 2,
 'file_folder': 3,
 'form': 4,
 'handwritten': 5,
 'invoice': 6,
 'letter': 7,
 'memo': 8,
 'news_article': 9,
 'pit37_v1': 10,
 'pozwolenie_uzytkowanie_obiektu_budowlanego': 11,
 'presentation': 12,
 'questionnaire': 13,
 'resume': 14,
 'scientific_publication': 15,
 'scientific_report': 16,
 'specification': 17,
 'umowa_na_odleglosc_odstapienie': 18,
 'umowa_o_dzielo': 19,
 'umowa_sprzedazy_samochodu': 20}

In [139]:
# Translate each class name into its integer id; an unknown class name raises KeyError.
train_dataset["class_id"] = train_dataset["class"].apply(label_to_label_id.__getitem__)
train_dataset.head()

Unnamed: 0,filename,text,class,class_id
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,"POLA Lalwe poda PODATNIK POLA , CIEMNE _ URZAD...",pit37_v1,10
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,PlTaxpl POUA 'WYPELNIA PODATNIK POLA CIEMNE WY...,pit37_v1,10
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,MyPfenl Wypelnic DulyMi DaukowanlUiTERAM ClRnY...,pit37_v1,10
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,PlTaxpl POLA JASNE WYPEŁNIA PODATNIK _POLA CIE...,pit37_v1,10
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,PITaxpl LA Jash€ WyPelNIA PODiTŃIK PCU CieMne ...,pit37_v1,10


### Classes Summary

In [140]:
# Number of documents per (class, class_id) pair, most frequent first.
train_dataset[["class", "class_id"]].value_counts()

class                                       class_id
handwritten                                 5           618
presentation                                12          592
scientific_report                           16          584
advertisement                               0           580
scientific_publication                      15          577
file_folder                                 3           566
form                                        4           566
resume                                      14          565
email                                       2           563
memo                                        8           558
budget                                      1           556
invoice                                     6           555
letter                                      7           551
questionnaire                               13          551
specification                               17          529
news_article                                9  

In [141]:
# How many distinct class ids occur in the training data.
len(train_dataset["class_id"].unique())

21

## Tokenization

In [142]:
import string

def tokenize(text: str):
    """Normalise raw OCR text into one space-separated string of words.

    The text is lower-cased, every ASCII punctuation character except the
    hyphen is deleted, and all digit characters are deleted. What remains is
    split on whitespace, leading/trailing hyphens are trimmed from each word,
    and words that end up empty are discarded.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned words joined by single spaces (an empty string when no
        word survives).
    """
    # The hyphen is spared here so that hyphenated words stay in one piece.
    dropped_marks = set(string.punctuation) - {"-"}

    # Characters are deleted rather than replaced by a space, so the
    # neighbours of a removed character merge into one word.
    kept_chars = [
        char
        for char in text.lower()
        if char not in dropped_marks and not char.isdigit()
    ]

    cleaned_words = []
    for chunk in "".join(kept_chars).split():
        trimmed = chunk.strip("-")
        if trimmed:
            cleaned_words.append(trimmed)
    return " ".join(cleaned_words)

# Replace the raw OCR text with its cleaned form, row by row.
train_dataset["text"] = train_dataset["text"].map(tokenize)
train_dataset.head()

Unnamed: 0,filename,text,class,class_id
0,test_hashed\pit37_v1\9d6f61c6-52ad-4546-85b7-b...,pola lalwe poda podatnik pola ciemne urzad nip...,pit37_v1,10
1,test_hashed\pit37_v1\f14ea197-70bd-45ae-9a22-d...,pltaxpl poua wypelnia podatnik pola ciemne wyp...,pit37_v1,10
2,test_hashed\pit37_v1\070a521a-7038-4551-bc6a-b...,mypfenl wypelnic dulymi daukowanluiteram clrny...,pit37_v1,10
3,test_hashed\pit37_v1\f4d15b30-6e96-4452-b614-a...,pltaxpl pola jasne wypełnia podatnik pola ciem...,pit37_v1,10
4,test_hashed\pit37_v1\64d085a8-39a0-4887-859e-8...,pitaxpl la jash€ wypelnia poditńik pcu ciemne ...,pit37_v1,10


In [143]:
# Average document length after cleaning, in characters and in whitespace-separated words.
print("Mean number of characters:", train_dataset["text"].map(len).mean())
print("Mean number of words:", train_dataset["text"].map(lambda doc: len(doc.split())).mean())

Mean number of characters: 1023.0436906627339
Mean number of words: 154.70946631025902


### Language Split

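Inspecting the JSON file shows that the document language changes partway through the dataset, so we split the rows at the first English document: everything before it is treated as Polish, everything from it onwards as English.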

In [145]:
# Position of the row taken to be the first English document.
# NOTE(review): the split is purely positional, so it assumes every row before
# this file is Polish and every row from it onwards is English - confirm ordering.
first_english_index = list(train_dataset["filename"]).index(
    "./test_hashed/advertisement/a64b06e6-8f0a-4fd7-ba12-0779c5560d9a.tiff"
)
train_dataset_pl, train_dataset_eng = (
    train_dataset.iloc[:first_english_index],
    train_dataset.iloc[first_english_index:],
)

# Share of each language in the training data.
print("Polish language in training dataset", len(train_dataset_pl) / len(train_dataset))
print("English language in training dataset", len(train_dataset_eng) / len(train_dataset))

Polish language in training dataset 0.16914001290441516
English language in training dataset 0.8308599870955848
