# CCPL

## POS tagging

In [63]:
%load_ext autoreload
%autoreload 2

import os
import sys
import xml.etree.ElementTree as ET
sys.path.append('../../../')
from collections import Counter
from typing import List, Tuple

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import load_dataset
from multitask_nlp.settings import DATASETS_DIR

# Register tqdm with pandas so the `progress_*` variants (e.g. `progress_apply`)
# become available on DataFrames/Series.
tqdm.pandas()

# Location of the CCPL corpus inside the project's datasets directory.
dataset_path = DATASETS_DIR / 'ccpl'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
def _disambiguated_tag(token, filepath) -> str:
    """Return the tag of the token's disambiguated ``lex`` entry.

    The tag is read from the second child of a ``lex`` element carrying a
    ``disamb`` attribute; when several entries qualify, the last one wins.
    Falls back to ``'ign'`` when the token has no ``lex`` entries at all or
    none of them is disambiguated (the latter case is reported on stdout).
    """
    has_lex = False
    pos_tag = None
    for lex_tag in token.iter('lex'):
        has_lex = True
        if 'disamb' in lex_tag.attrib and len(lex_tag) > 1 and lex_tag[1].text:
            pos_tag = lex_tag[1].text

    if pos_tag is not None:
        return pos_tag

    if has_lex:
        # Report once per token instead of once per non-disambiguated entry.
        print('No disamb found.')
        print(filepath)
    return 'ign'


def read_xml_file(filepath: str) -> List[Tuple[str, List[str], List[str]]]:
    """Read tagged sentences from a single XML file.

    Only the first child of the XML root is processed (``'Warning'`` is
    printed if the root has more children). Inside it, every ``sentence``
    element is scanned for ``tok`` elements; the token text is taken from the
    first child of ``tok`` and tokens with empty text are skipped.

    Special cases:

    * a tag whose grammatical class is ``@www`` is replaced by
      ``'subst:sg:nom:n'``;
    * a token whose grammatical class is ``blank`` is not emitted on its own:
      its text is glued (without a space) onto the previous token. If it is
      the first token of a sentence, the sentence is merged into the previous
      sentence of the file;
    * a ``blank`` token with no preceding token at all is kept as a separate
      token tagged ``'ign'`` so that no text is lost;
    * a token without a disambiguated ``lex`` entry is tagged ``'ign'``.

    Args:
        filepath: Path of the XML file (anything accepted by
            ``xml.etree.ElementTree.parse``).

    Returns:
        A list with one ``(text, tokens, tags)`` tuple per sentence, where
        ``text`` is the tokens joined with single spaces and ``tokens`` and
        ``tags`` have equal length.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    tree = ET.parse(filepath)
    root = tree.getroot()

    if len(root) == 0:
        # Nothing to read; avoids an IndexError on ``root[0]``.
        return []
    if len(root) > 1:
        print('Warning')
    chunk_node = root[0]

    data = []
    for sentence_chunk in chunk_node.iter('sentence'):
        replace_previous = False

        sentence_tokens = []
        sentence_tags = []

        for token in sentence_chunk.iter('tok'):
            orth = token[0].text if len(token) > 0 else None
            if not orth:
                continue

            # Resolved per token, so a tag never leaks from the previous token.
            pos_tag = _disambiguated_tag(token, filepath)

            grammatic_class = pos_tag.split(':')[0]
            if grammatic_class == '@www':
                pos_tag = 'subst:sg:nom:n'

            if grammatic_class != 'blank':
                sentence_tokens.append(orth)
                sentence_tags.append(pos_tag)
                continue

            if not sentence_tokens and data:
                # The blank token continues the last token of the previous
                # sentence: keep working on that sentence's lists and
                # overwrite its entry once this sentence is finished.
                _, sentence_tokens, sentence_tags = data[-1]
                replace_previous = True

            if sentence_tokens:
                sentence_tokens[-1] = sentence_tokens[-1] + orth
            else:
                # No token to attach to; keep the text as its own token.
                sentence_tokens.append(orth)
                sentence_tags.append('ign')

        text = ' '.join(sentence_tokens)
        if replace_previous:
            data[-1] = (text, sentence_tokens, sentence_tags)
        else:
            data.append((text, sentence_tokens, sentence_tags))

    return data

In [73]:
# Directory holding the per-document XML files of the corpus.
files_path = dataset_path / 'anonimizacja_xml_out_ver'

# Keep only the XML documents.
files = os.listdir(files_path)
files = [f_name for f_name in files if f_name.endswith('.xml')]

# Parallel, sentence-level lists: entry i of each list describes sentence i.
text_ids, texts, texts_tokens, tags = [], [], [], []
text_to_file_name = []
# Files that could not be parsed, kept so the data loss is visible.
failed_files = []

text_id = 0
for f_name in tqdm(files):
    try:
        data = read_xml_file(files_path / f_name)
    except ET.ParseError:
        # Malformed XML: skip the file, but remember it instead of hiding it.
        failed_files.append(f_name)
        continue

    sent_id = 0
    for sentence, sentence_tokens, sentence_tags in data:
        text_to_file_name.append(f_name)
        texts.append(sentence)
        texts_tokens.append(sentence_tokens)
        tags.append(sentence_tags)
        text_id += 1
        sent_id += 1

if failed_files:
    print(f'Skipped {len(failed_files)} file(s) with malformed XML.')
    
    


  0%|          | 0/7561 [00:00<?, ?it/s]

In [74]:
# Number of XML files found in the corpus directory.
len(files)

7561

In [75]:
# Total number of tokens over all collected sentences.
sum(map(len, texts_tokens))

396445

In [77]:
# Total number of tags over all collected sentences.
sum(map(len, tags))

396445

In [78]:
# Every sentence must have exactly one tag per token.
assert all(
    len(sentence_tokens_) == len(sentence_tags_)
    for sentence_tokens_, sentence_tags_ in zip(texts_tokens, tags)
)

In [79]:
# Number of sentence-to-file entries (one source file name is stored per sentence).
len(text_to_file_name)

24305

In [80]:
# Number of collected sentences.
len(texts)

24305

In [81]:
# Flatten the per-sentence tag lists into a single list of tags.
all_tags = [
    tag
    for sentence_tag_list in tags
    for tag in sentence_tag_list
]
len(all_tags)

396445

In [54]:
# Sorted array of the distinct full tags occurring in the corpus.
unique_tags = np.unique(all_tags)
len(unique_tags)

889

In [55]:
# The grammatical class is the part of a tag before its first ':'.
grammatic_classes = np.unique(
    [full_tag.partition(':')[0] for full_tag in unique_tags]
)

In [56]:
# Number of distinct grammatical classes.
len(grammatic_classes)

36

In [57]:
# Display the distinct grammatical classes.
grammatic_classes

array(['adj', 'adja', 'adjc', 'adjp', 'adv', 'aglt', 'bedzie', 'brev',
       'burk', 'comp', 'conj', 'depr', 'fin', 'ger', 'ign', 'imps',
       'impt', 'inf', 'interj', 'interp', 'num', 'numcol', 'pact', 'pant',
       'pcon', 'ppas', 'ppron12', 'ppron3', 'praet', 'pred', 'prep',
       'qub', 'siebie', 'subst', 'winien', 'xxx'], dtype='<U7')

In [58]:
# Count how many tokens fall into each grammatical class.
counter = Counter(full_tag.partition(':')[0] for full_tag in all_tags)

In [59]:
# Display the token count per grammatical class.
counter

Counter({'ign': 2014,
         'prep': 36714,
         'subst': 104791,
         'ger': 3976,
         'adj': 36073,
         'fin': 25713,
         'interp': 70802,
         'comp': 6532,
         'adv': 15370,
         'inf': 6973,
         'conj': 14748,
         'num': 10087,
         'qub': 22355,
         'ppas': 3182,
         'praet': 11221,
         'aglt': 4526,
         'ppron3': 2601,
         'ppron12': 3981,
         'pred': 2788,
         'imps': 321,
         'pcon': 546,
         'interj': 1719,
         'winien': 224,
         'brev': 4564,
         'bedzie': 823,
         'burk': 70,
         'pact': 621,
         'impt': 1480,
         'siebie': 471,
         'xxx': 885,
         'adjp': 191,
         'adja': 43,
         'adjc': 15,
         'numcol': 1,
         'depr': 24,
         'pant': 10})

In [60]:
# For every grammatical class print a banner followed by all of its full tags.
for grammar_class in grammatic_classes:
    print('\n' + '#'*100, grammar_class, '#'*100 + '\n', sep='\n')
    class_tags = [
        full_tag for full_tag in unique_tags
        if full_tag.partition(':')[0] == grammar_class
    ]
    print(' | '.join(class_tags))



####################################################################################################
adj
####################################################################################################

adj:pl:acc:f:com | adj:pl:acc:f:pos | adj:pl:acc:f:sup | adj:pl:acc:m1:com | adj:pl:acc:m1:pos | adj:pl:acc:m1:sup | adj:pl:acc:m2:com | adj:pl:acc:m2:pos | adj:pl:acc:m3:com | adj:pl:acc:m3:pos | adj:pl:acc:m3:sup | adj:pl:acc:n:com | adj:pl:acc:n:pos | adj:pl:acc:n:sup | adj:pl:dat:f:pos | adj:pl:dat:m1:com | adj:pl:dat:m1:pos | adj:pl:dat:m1:sup | adj:pl:dat:m2:pos | adj:pl:dat:m3:com | adj:pl:dat:m3:pos | adj:pl:dat:m3:sup | adj:pl:dat:n:pos | adj:pl:gen:f:com | adj:pl:gen:f:pos | adj:pl:gen:f:sup | adj:pl:gen:m1:com | adj:pl:gen:m1:pos | adj:pl:gen:m1:sup | adj:pl:gen:m2:com | adj:pl:gen:m2:pos | adj:pl:gen:m3:com | adj:pl:gen:m3:pos | adj:pl:gen:m3:sup | adj:pl:gen:n:com | adj:pl:gen:n:pos | adj:pl:gen:n:sup | adj:pl:inst:f:com | adj:pl:inst:f:pos | adj:pl:inst:f:sup | adj:pl