# NKJP - 1M

## POS tagging

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import xml.etree.ElementTree as ET
sys.path.append('../../../')
from collections import Counter
from typing import List, Tuple

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import load_dataset
from multitask_nlp.settings import DATASETS_DIR

# Register tqdm with pandas (provides `progress_apply` and related methods).
tqdm.pandas()

# Directory holding the NKJP-1M dataset files.
dataset_path = DATASETS_DIR / 'nkjp1m'

In [4]:
def read_xml_file(filepath: str) -> List[List[Tuple[str, List[str], List[str]]]]:
    """Parse an XML corpus file into paragraphs of tagged sentences.

    The first child of the document root is treated as the chunk list. Its
    children are iterated as paragraphs and each paragraph's children as
    sentences. Every ``tok`` element found inside a sentence contributes:

    * a token: the text of the ``tok`` element's first child
      (presumably the ``orth`` element -- TODO confirm against the corpus),
    * at most one tag: the text of the second child of the first ``lex``
      descendant carrying a ``disamb`` attribute (presumably the ``ctag``
      element -- TODO confirm against the corpus).

    Only the first ``disamb``-marked ``lex`` is used so that a token can
    never contribute more than one tag. A token without any
    ``disamb``-marked ``lex`` contributes no tag, so callers should still
    verify that tokens and tags have equal lengths.

    Args:
        filepath: Source handed to ``ET.parse``.

    Returns:
        One list per paragraph; each holds ``(text, tokens, tags)`` tuples,
        one per sentence, where ``text`` is the tokens joined with single
        spaces.
    """
    tree = ET.parse(filepath)
    chunk_list_root = tree.getroot()[0]

    data = []
    for paragraph_chunk in chunk_list_root:
        paragraph_data = []
        for sentence_chunk in paragraph_chunk:
            sentence_tokens = []
            sentence_tags = []
            for token in sentence_chunk.iter('tok'):
                sentence_tokens.append(token[0].text)

                # Take only the first disambiguated interpretation; appending
                # one tag per marked `lex` would misalign tags and tokens.
                disamb_lex = next(
                    (lex for lex in token.iter('lex') if 'disamb' in lex.attrib),
                    None,
                )
                if disamb_lex is not None:
                    sentence_tags.append(disamb_lex[1].text)

            text = ' '.join(sentence_tokens)
            paragraph_data.append((text, sentence_tokens, sentence_tags))

        data.append(paragraph_data)

    return data


In [5]:
# Location of the XCES XML corpus inside the dataset directory.
filepath = dataset_path / 'nkjp1m-1.2-xces-xml'
# Parse the corpus into per-paragraph lists of (text, tokens, tags) sentence tuples.
all_documents_data = read_xml_file(filepath)

In [9]:
# Flatten the per-paragraph structure into parallel, sentence-level lists:
# one id, one text, one token list and one tag list per sentence.
text_ids, texts, texts_tokens, tags = [], [], [], []
text_id = 1  # sentence ids are assigned sequentially, starting at 1

for document_data in all_documents_data:
    for sentence, sentence_tokens, sentence_tags in document_data:
        # Record the id so `text_ids` stays parallel to the other lists
        # (previously the counter advanced but the list was never filled).
        text_ids.append(text_id)
        texts.append(sentence)
        texts_tokens.append(sentence_tokens)
        tags.append(sentence_tags)
        text_id += 1

In [10]:
# Sanity check: every sentence must have exactly one tag per token.
assert all(
    len(token_seq) == len(tag_seq)
    for token_seq, tag_seq in zip(texts_tokens, tags)
)

In [11]:
# Number of sentences collected from the corpus.
len(texts)

85663

In [12]:
# Flatten the per-sentence tag lists into a single list of all tag occurrences.
all_tags = [t for sent_tags in tags for t in sent_tags]

In [13]:
# Total number of tag occurrences across all sentences.
len(all_tags)

1215513

In [20]:
# Sorted array of the distinct tags occurring in the corpus.
unique_tags = np.unique(all_tags)

In [21]:
# Number of distinct tags.
len(unique_tags)

1129

In [22]:
# The part before the first ':' of each tag, deduplicated and sorted.
grammatic_classes = np.unique([t.split(':')[0] for t in unique_tags])

In [23]:
# Display the distinct tag prefixes.
grammatic_classes

array(['adj', 'adja', 'adjc', 'adjp', 'adv', 'aglt', 'bedzie', 'brev',
       'burk', 'comp', 'conj', 'depr', 'fin', 'ger', 'ign', 'imps',
       'impt', 'inf', 'interj', 'interp', 'num', 'numcol', 'pact', 'pant',
       'pcon', 'ppas', 'ppron12', 'ppron3', 'praet', 'pred', 'prep',
       'qub', 'siebie', 'subst', 'winien', 'xxx'], dtype='<U7')

In [25]:
# For each tag prefix, print a banner with its name followed by all
# distinct tags that start with that prefix, separated by ' | '.
for grammar_class in grammatic_classes:
    # One call emits the same three banner lines as separate prints would.
    print('\n' + '#'*100, grammar_class, '#'*100 + '\n', sep='\n')
    class_tags = [
        tag for tag in unique_tags
        if tag.partition(':')[0] == grammar_class
    ]
    print(' | '.join(class_tags))



####################################################################################################
adj
####################################################################################################

adj:pl:acc:f:com | adj:pl:acc:f:pos | adj:pl:acc:f:sup | adj:pl:acc:m1:com | adj:pl:acc:m1:pos | adj:pl:acc:m1:sup | adj:pl:acc:m2:com | adj:pl:acc:m2:pos | adj:pl:acc:m2:sup | adj:pl:acc:m3:com | adj:pl:acc:m3:pos | adj:pl:acc:m3:sup | adj:pl:acc:n:com | adj:pl:acc:n:pos | adj:pl:acc:n:sup | adj:pl:dat:f:com | adj:pl:dat:f:pos | adj:pl:dat:f:sup | adj:pl:dat:m1:com | adj:pl:dat:m1:pos | adj:pl:dat:m1:sup | adj:pl:dat:m2:com | adj:pl:dat:m2:pos | adj:pl:dat:m2:sup | adj:pl:dat:m3:com | adj:pl:dat:m3:pos | adj:pl:dat:m3:sup | adj:pl:dat:n:com | adj:pl:dat:n:pos | adj:pl:dat:n:sup | adj:pl:gen:f:com | adj:pl:gen:f:pos | adj:pl:gen:f:sup | adj:pl:gen:m1:com | adj:pl:gen:m1:pos | adj:pl:gen:m1:sup | adj:pl:gen:m2:com | adj:pl:gen:m2:pos | adj:pl:gen:m2:sup | adj:pl:gen:m3:com | adj:pl: