In [5]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append('../../../')
from collections import Counter
from typing import List, Tuple

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import load_dataset
from multitask_nlp.settings import DATASETS_DIR

tqdm.pandas()

dataset_path = DATASETS_DIR / 'poleval2018'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def read_file(filepath: str) -> List[Tuple[str, List[str], List[str]]]:
    """Read a tab-separated token file and group its lines into sentences.

    Every data line must contain at least three tab-separated columns; the
    first column is used as the token and the third one as its tag. A blank
    (whitespace-only) line or a line starting with ``-DOCSTART`` closes the
    sentence collected so far.

    Args:
        filepath: Location of the UTF-8 encoded file to read.

    Returns:
        One ``(sentence, tokens, tags)`` tuple per sentence, where ``sentence``
        is the tokens joined by single spaces and ``tokens``/``tags`` are
        parallel lists.

    Raises:
        AssertionError: If a data line has fewer than three columns.
    """
    data = []
    sentence_tokens = []
    tags = []

    # `with` guarantees the handle is closed even if a malformed line raises.
    with open(filepath, encoding='UTF-8') as f:
        for i, line in enumerate(f, 1):
            # Sentence boundary: whitespace-only line or a document marker.
            if not line.strip() or line.startswith('-DOCSTART'):
                if sentence_tokens:
                    data.append((' '.join(sentence_tokens), sentence_tokens, tags))
                    sentence_tokens = []
                    tags = []
                continue

            splits = line.split('\t')
            # Index 2 is read below, so three columns are the real minimum.
            assert len(splits) >= 3, "error on line {}. Found {} splits".format(i, len(splits))
            word, tag = splits[0], splits[2]
            sentence_tokens.append(word.strip())
            tags.append(tag.strip())

    # The file may end without a trailing blank line; keep the last sentence.
    if sentence_tokens:
        data.append((' '.join(sentence_tokens), sentence_tokens, tags))

    return data

# Parallel accumulators: entry k of each list describes the k-th sentence.
text_ids, texts, texts_tokens, tags = [], [], [], []

# Keep only '*.iob' files. endswith() copes with names that have no dot or
# several dots, and sorting makes the sentence order (and therefore the ids
# assigned below) reproducible, since os.listdir() returns arbitrary order.
files = sorted(name for name in os.listdir(dataset_path) if name.endswith('.iob'))

# Running 1-based sentence id across all files.
text_id = 1

for f_name in tqdm(files):
    data = read_file(dataset_path / f_name)
    for sentence, sentence_tokens, sentence_tags in data:
        # Record the id as well; previously the counter advanced but
        # `text_ids` stayed empty.
        text_ids.append(text_id)
        texts.append(sentence)
        texts_tokens.append(sentence_tokens)
        tags.append(sentence_tags)
        text_id += 1

  0%|          | 0/1828 [00:00<?, ?it/s]

In [7]:
# zip() stops at the shorter input, so verify the outer lists line up first.
assert len(texts) == len(texts_tokens) == len(tags), \
    "per-sentence lists have different lengths"
# Every sentence must carry exactly one tag per token.
assert all(len(tokens) == len(sent_tags)
           for tokens, sent_tags in zip(texts_tokens, tags)), \
    "a sentence has a different number of tokens and tags"

In [8]:
# Number of sentences collected from all files.
len(texts)

36096

In [9]:
# Flatten the per-sentence tag lists into one list with a tag per token.
all_tags = [t for sent_tags in tags for t in sent_tags]

In [10]:
# Total number of tags, i.e. tagged tokens, in the corpus.
len(all_tags)

540545

In [11]:
# Frequency of each full tag string over all tokens.
counter = Counter(all_tags)

In [12]:
# All (tag, count) pairs, most frequent first (no limit is passed).
counter.most_common()

[('interp', 95364),
 ('qub', 29958),
 ('conj', 19503),
 ('fin:sg:ter:imperf', 14412),
 ('subst:sg:gen:f', 13684),
 ('prep:loc:nwok', 12074),
 ('subst:sg:nom:m1', 11337),
 ('subst:sg:nom:f', 10855),
 ('subst:sg:gen:m3', 10308),
 ('adv', 9078),
 ('adv:pos', 8829),
 ('comp', 8493),
 ('prep:gen', 8082),
 ('prep:acc', 7739),
 ('subst:sg:nom:m3', 7389),
 ('prep:loc', 7078),
 ('subst:sg:acc:f', 6545),
 ('subst:sg:loc:m3', 6356),
 ('prep:gen:nwok', 6202),
 ('subst:sg:acc:m3', 6182),
 ('subst:sg:gen:n', 6108),
 ('subst:sg:nom:n', 5586),
 ('subst:sg:loc:f', 5252),
 ('inf:perf', 4991),
 ('brev:pun', 4942),
 ('fin:pl:ter:imperf', 4917),
 ('subst:pl:gen:m3', 4760),
 ('subst:pl:gen:f', 4674),
 ('subst:sg:acc:n', 4590),
 ('adj:sg:nom:f:pos', 4545),
 ('adj:sg:gen:f:pos', 4462),
 ('prep:inst:nwok', 4419),
 ('adj:sg:gen:m3:pos', 4174),
 ('inf:imperf', 4136),
 ('praet:sg:m1:perf', 4089),
 ('subst:sg:gen:m1', 3953),
 ('adj:sg:nom:m3:pos', 3861),
 ('pred', 3252),
 ('prep:acc:nwok', 3222),
 ('praet:sg:m1:im

In [13]:
# Sorted array of the distinct full tag strings.
unique_tags = np.unique(all_tags)

In [14]:
# Number of distinct full tags.
len(unique_tags)

779

In [15]:
# Sorted, de-duplicated tag prefixes (the text before the first ':').
# NOTE(review): the following cell rebinds `grammatic_classes` to a Counter,
# so this array is discarded as soon as that cell runs.
grammatic_classes = np.unique([t.split(':')[0] for t in unique_tags])

In [16]:
# Token-level frequency of each tag prefix (the text before the first ':').
grammatic_classes = Counter([t.split(':')[0] for t in all_tags])

In [17]:
# For each tag prefix, print a banner with its name followed by every full
# tag that starts with that prefix.
separator = '#' * 100
for grammar_class in grammatic_classes:
    print('\n' + separator)
    print(grammar_class)
    print(separator + '\n')
    class_tags = [tag for tag in unique_tags
                  if tag.partition(':')[0] == grammar_class]
    print(' | '.join(class_tags))



####################################################################################################
praet
####################################################################################################

praet:pl:f:imperf | praet:pl:f:perf | praet:pl:m1:imperf | praet:pl:m1:perf | praet:pl:m2:imperf | praet:pl:m2:perf | praet:pl:m3:imperf | praet:pl:m3:perf | praet:pl:n:imperf | praet:pl:n:perf | praet:sg:f:imperf | praet:sg:f:perf | praet:sg:m1:imperf | praet:sg:m1:imperf:agl | praet:sg:m1:imperf:nagl | praet:sg:m1:perf | praet:sg:m1:perf:agl | praet:sg:m1:perf:nagl | praet:sg:m2:imperf | praet:sg:m2:imperf:nagl | praet:sg:m2:perf | praet:sg:m2:perf:nagl | praet:sg:m3:imperf | praet:sg:m3:imperf:nagl | praet:sg:m3:perf | praet:sg:m3:perf:nagl | praet:sg:n:imperf | praet:sg:n:perf

####################################################################################################
subst
##############################################################################################