# NKJP Podkorpus milionowy

Wykorzystano oryginalne dane z <http://clip.ipipan.waw.pl/NationalCorpusOfPolish?action=AttachFile&do=get&target=NKJP-PodkorpusMilionowy-1.2.tar.gz>.

Celem jest utworzenie pliku `.iob`, który zawiera dane ze wszystkich dokumentów wraz z anotacją NER oraz POS (już po dezambiguacji).


In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import xml.etree.ElementTree as ET
sys.path.append('../../../')
from collections import Counter
from typing import List, Tuple
from dataclasses import dataclass
from collections import OrderedDict

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import load_dataset
from multitask_nlp.settings import DATASETS_DIR

# Enable tqdm's pandas integration.
tqdm.pandas()

# Directory in which the resulting `nkjp1m.iob` file is written.
data_path  = DATASETS_DIR / 'nkjp1m' 
# Root of the unpacked corpus; each sub-directory is treated as one document.
dataset_path = data_path / 'NKJP-PodkorpusMilionowy-1.2'

In [14]:
# Names of all document sub-directories of the corpus. os.listdir() returns
# entries in arbitrary order, so the names are sorted to make the document
# order (and therefore the generated .iob file) reproducible between runs.
document_dirs = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)

In [15]:
@dataclass
class Token:
    """A single corpus segment together with its POS and NER annotation."""

    # Surface form of the segment.
    orth: str
    # Base form taken from the disambiguated interpretation.
    base: str
    # Remainder of the disambiguated interpretation after the base form.
    pos_tag: str = None
    # Individual NER tags of the token; None means "no named entity".
    ner_tags: List[str] = None
    # All NER tags merged into one label (see get_full_ner_tag).
    ner_tag: str = None

    def ner_from_tag_list(self, tag_list):
        """Store *tag_list* as ``ner_tags`` and recompute the merged ``ner_tag``."""
        self.ner_tags = tag_list
        self.ner_tag = self.get_full_ner_tag()

    def get_full_ner_tag(self):
        """Return all NER tags of the token joined with ``'#'``.

        Tags are ordered alphabetically by entity type, i.e. a leading
        ``B-``/``I-`` prefix is ignored for ordering but kept in the result.
        The sort is stable, so tags of equal type keep their input order.

        Returns:
            The merged label, or ``'O'`` when the token has no NER tags
            (``ner_tags`` is ``None`` or empty).
        """
        # A token without tags is outside of any entity; iterating None would
        # raise and an empty join would yield an empty (invalid) IOB label.
        if not self.ner_tags:
            return 'O'

        def entity_type(tag):
            # Strip the IOB prefix only when one is actually present.
            prefix, sep, rest = tag.partition('-')
            return rest if sep and prefix in ('B', 'I') else tag

        return '#'.join(sorted(self.ner_tags, key=entity_type))

@dataclass
class NER_Tag:
    """One pointer of a named-entity segment read from ``ann_named.xml``."""

    # xml:id of the named-entity segment that owns the pointer.
    id: str
    # Entity label: the segment's type, optionally followed by '-<subtype>'.
    tag: str
    # Value of the pointer's `target` attribute (what the entity points at).
    target: str

# Namespace prefixes in ElementTree's '{uri}local-name' notation.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Fully qualified element / attribute names used while walking the XML files.
sentence_tag = f'{TEI_NS}s'
segment_tag = f'{TEI_NS}seg'
id_tag = f'{XML_NS}id'
ptr_tag = f'{TEI_NS}ptr'

# Collected result: one (document directory name, sentences) pair per document,
# where `sentences` maps sentence id -> OrderedDict(segment id -> Token).
documents_data = []

# Qualified name of the feature (<f>) elements that carry the annotations.
f_tag = '{http://www.tei-c.org/ns/1.0}f'


def _split_disamb(disamb_text):
    """Split a disambiguated interpretation 'base:tag[:...]' into (base, pos_tag)."""
    # Search from index 1 so the base is never empty: for an interpretation
    # whose base itself starts with ':' (e.g. '::interp') a plain split(':')
    # would return an empty base and a tag with a stray leading colon.
    sep = disamb_text.find(':', 1)
    if sep == -1:
        return disamb_text, ''
    return disamb_text[:sep], disamb_text[sep + 1:]


def _read_morphosyntax(morpho_filepath):
    """Parse ann_morphosyntax.xml into sentence id -> (segment id -> Token)."""
    tree = ET.parse(morpho_filepath)
    # Second child of the root's second child; presumably the text body.
    text_node = tree.getroot()[1][1]

    sentences = OrderedDict()
    for sentence_chunk in text_node.iter(sentence_tag):
        sentence_id = sentence_chunk.attrib[id_tag]
        tokens_in_sentence = OrderedDict()

        for seg in sentence_chunk.iter(segment_tag):
            seg_id = seg.attrib[id_tag]

            orth = None
            base = None
            pos_tag = None
            # The first child of a segment holds its feature (<f>) elements.
            for f_node in seg[0].findall(f_tag):
                name = f_node.attrib['name']
                if name == 'orth':
                    orth = f_node[0].text
                elif name == 'disamb':
                    # Fixed path down to the text of the chosen interpretation;
                    # relies on the layout of the corpus files.
                    base, pos_tag = _split_disamb(f_node[0][1][0].text)

            tokens_in_sentence[seg_id] = Token(orth=orth, base=base, pos_tag=pos_tag)

        sentences[sentence_id] = tokens_in_sentence

    return sentences


def _apply_named_annotations(named_filepath, sentences):
    """Attach the NER tags found in ann_named.xml to the tokens in *sentences*.

    Raises:
        FileNotFoundError: if *named_filepath* does not exist.
    """
    tree = ET.parse(named_filepath)
    text_node = tree.getroot()[1][1]

    for sentence_chunk in text_node.iter(sentence_tag):
        sentence_id = sentence_chunk.attrib[id_tag]
        # Sentence ids of both annotation layers differ only by this infix.
        morph_sentence_id = sentence_id.replace('named', 'morph')

        ner_tags = []
        for seg in sentence_chunk.iter(segment_tag):
            seg_id = seg.attrib[id_tag]

            ner_tag_type = None
            ner_tag_subtype = None
            for f_node in seg[0].findall(f_tag):
                name = f_node.attrib['name']
                if name == 'type':
                    ner_tag_type = f_node[0].attrib['value']
                elif name == 'subtype':
                    ner_tag_subtype = f_node[0].attrib['value']

            # A segment without a type cannot be labelled; previously it was
            # formatted into the literal tag 'None'.
            if ner_tag_type is None:
                continue

            ner_tag = ner_tag_type
            if ner_tag_subtype is not None:
                ner_tag += f'-{ner_tag_subtype}'

            # One NER_Tag per pointer of the entity segment.
            for ptr in seg.findall(ptr_tag):
                ner_tags.append(NER_Tag(id=seg_id, tag=ner_tag, target=ptr.attrib['target']))

        # Targets containing '#' are resolved to a morphosyntactic segment id
        # (the part after '#'); the others are matched against entity ids.
        basic_ner_tags = [nt for nt in ner_tags if len(nt.target.split('#')) > 1]
        sub_ner_tags = [nt for nt in ner_tags if len(nt.target.split('#')) == 1]

        for nt in basic_ner_tags:
            morph_seg_id = nt.target.split('#')[1]

            # Tags of entities pointing at this entity come first, the
            # entity's own tag last.
            ner_tag_list = [sub_nt.tag for sub_nt in sub_ner_tags if sub_nt.target == nt.id]
            ner_tag_list.append(nt.tag)

            sentences[morph_sentence_id][morph_seg_id].ner_from_tag_list(ner_tag_list)


def _assign_iob_tags(sentences):
    """Rewrite the raw NER tags of every token as IOB tags ('B-'/'I-' or 'O')."""
    for s_tokens in sentences.values():
        prev_token = None
        for curr_token in s_tokens.values():
            if curr_token.ner_tags is None:
                curr_token.ner_tag = 'O'
            else:
                # The previous token already carries IOB-prefixed tags; strip
                # the prefix to compare against the raw tags of this token.
                if prev_token is not None and prev_token.ner_tags is not None:
                    prev_original_tags = {'-'.join(t.split('-')[1:]) for t in prev_token.ner_tags}
                else:
                    prev_original_tags = set()

                # NOTE: continuation is decided by tag equality only, so two
                # directly adjacent, distinct entities with the same tag are
                # emitted as one B-/I- sequence.
                curr_token.ner_from_tag_list([
                    f'I-{t}' if t in prev_original_tags else f'B-{t}'
                    for t in curr_token.ner_tags
                ])

            prev_token = curr_token


for document_dir in tqdm(document_dirs):
    morpho_filepath = dataset_path / document_dir / 'ann_morphosyntax.xml'
    named_filepath = dataset_path / document_dir / 'ann_named.xml'

    sentences = _read_morphosyntax(morpho_filepath)

    try:
        _apply_named_annotations(named_filepath, sentences)
    except FileNotFoundError:
        # A document without ann_named.xml simply has no entities: all of its
        # tokens end up tagged 'O'.
        pass

    _assign_iob_tags(sentences)

    documents_data.append((document_dir, sentences))

  0%|          | 0/3889 [00:00<?, ?it/s]

In [16]:
# Dump every document to one tab-separated IOB file: a config header, then per
# document a file marker followed by its sentences (one token per line, an
# empty line after each sentence).
iob_path = data_path / 'nkjp1m.iob'
with open(iob_path, 'w', encoding='utf-8') as out:
    out.write('-DOCSTART CONFIG FEATURES orth base ctag\n')
    for doc_name, doc_sentences in documents_data:
        out.write(f'-DOCSTART FILE ./{doc_name}\n')
        for sentence_tokens in doc_sentences.values():
            out.writelines(
                f'{tok.orth}\t{tok.base}\t{tok.pos_tag}\t{tok.ner_tag}\n'
                for tok in sentence_tokens.values()
            )
            out.write('\n')