In [113]:
from nlp_common.acts_reader import ActsReader
from nlp_common.morphological_tagging import tag_morphologically
from collections import Counter
import requests
import os
import regex
import xml.etree.ElementTree as ET
import pandas as pd

In [59]:
def longest_bills(n=50):
    r = ActsReader('../ustawy')
    bills = [ (n,c) for n,y,c in r.all_acts() ]
    bills = sorted(bills, key=lambda x: -len(x[1]))
    new_line_re = regex.compile(r'\n+|\s+|\t+')
    return [(name, new_line_re.sub(' ', bill)) for name,bill in bills[:n]]

named_bills = longest_bills()
bills = [ c for n,c in named_bills]

In [6]:
tags = tag_morphologically(bills)

Reading cached response


### Basic NER

In [24]:
def split_sentence(tagged_bills):
    res = []
    c = [ tagged_bills[0] ]
    for i,b in enumerate(tagged_bills[1:]):
        if tagged_bills[i-1][1] == '.' and tagged_bills[i][0][0].isupper():
            res.append(c)
            c = []
        c.append(tagged_bills[i])
    return res

def basic_NER(tagged_bills):
    res = []
    c = [ ]
    for i,b in enumerate(tagged_bills[1:]):
        if tagged_bills[i-1][1] != '.' and tagged_bills[i][0][0].isupper():
            c.append(tagged_bills[i][1])
        elif len(c) > 0:
            res.append(" ".join(c))
            c = []
    if len(c) > 0:
            res.append(" ".join(c))
    return res

entities = basic_NER(tags)

In [25]:
entities[:10]

['ustawa',
 'artykuł',
 'dziennik',
 'minister',
 'artykuł',
 'dziennik',
 'artykuł',
 'minister sprawa wewnętrzny',
 'minister',
 'minister sprawa wewnętrzny']

In [30]:
ctr = Counter(entities)
sorted([ (k, ctr[k]) for k in ctr ], key=lambda x: -x[1])[:50]

[('numer', 3831),
 ('artykuł', 1848),
 ('dziennik', 1602),
 ('Rzeczpospolita polski', 611),
 ('kodeks', 510),
 ('policja', 463),
 ('skarb państwo', 397),
 ('prawo', 332),
 ('kasa chory', 328),
 ('unia europejski', 293),
 ('straż graniczny', 290),
 ('minister', 276),
 ('rada minister', 276),
 ('państwowy straż pożarny', 240),
 ('zakład', 230),
 ('państwowy komisja wyborczy', 212),
 ('rada', 195),
 ('prezes urząd', 183),
 ('fundusz', 182),
 ('minister obrona narodowy', 174),
 ('zmiana', 173),
 ('urząd patentowy', 172),
 ('I', 168),
 ('minister sprawiedliwość', 168),
 ('sprawiedliwość', 164),
 ('azot', 155),
 ('II', 150),
 ('rozdział', 148),
 ('pozostały', 143),
 ('urząd ochrona państwo', 141),
 ('tkanina', 141),
 ('prezes rada minister', 134),
 ('obrona narodowy', 129),
 ('minister finanse', 124),
 ('finanse', 116),
 ('inspektor nadzór wewnętrzny', 115),
 ('EFTA', 114),
 ('nawóz', 111),
 ('P2O5', 110),
 ('biuro', 109),
 ('Art', 107),
 ('komisja', 107),
 ('administracja', 106),
 ('europej

### Clarin NER

In [127]:
def start_clarin_task(bill):
    r = requests.post('https://ws.clarin-pl.eu/nlprest2/base/startTask', json={ 
        "lpmn":'any2txt|wcrft2|liner2({"model":"n82"})',
        "text": bill
    })

    return r.text

def start_clarin_tasks(named_bills):
    return { name: start_clarin_task(bill) for name,bill in named_bills }

bills_requests_ids = start_clarin_tasks(named_bills)
print(bills_requests_ids)

{'2000_696.txt': 'a01d9118-dfa5-4a93-9d65-e6e246472238', '2001_627.txt': '8fa35b81-9a6e-467e-bad9-edad19213b6d', '1996_465.txt': '30f17271-bbba-4cf6-b24d-a5f1ca4a982a', '1997_555.txt': '924af644-4157-4a5e-98be-993c2720f9e8', '2002_1689.txt': 'eca20285-5fa9-46a7-bc3c-6ad47eae00f8', '2000_1186.txt': '0a83b0b4-4c99-40f9-b4db-eec340e23063', '1998_1118.txt': '3e445891-aa42-45d5-8d3d-5a3cd6dad22e', '1997_117.txt': '0335f770-f5de-437b-a3ae-c6fd1995247b', '2001_1070.txt': 'd70ec89d-3349-4e29-8890-e169fc93b7e3', '2001_1368.txt': '5f100b0d-790e-4d61-8ad2-ddc8a2d67837', '1997_714.txt': '07ee9a50-e98d-4cd0-947d-e8cfab32212c', '2001_499.txt': 'd282c1de-8c41-4718-8ac1-0304f278a8fd', '2000_991.txt': '9a08f4ac-c060-45f0-92a9-735f83152143', '2003_1750.txt': 'bfe48e93-4185-454a-a793-13302d5c3ea7', '2001_1545.txt': 'e5eaf76c-45ad-4c05-bc9a-7dd93c60fa67', '2001_1229.txt': 'e04002ee-72ff-42fc-bbef-da55430a24ac', '2000_1268.txt': '3605540d-67aa-451e-88b2-31d51300e3b6', '1994_195.txt': '83391bdc-a0ca-4d1c-a1

In [130]:
def get_response(request_id, file_name):
    if os.path.exists(file_name):
        print(f"{file_name} already exists. Skipping...")
        return 
    
    status = requests.get('https://ws.clarin-pl.eu/nlprest2/base/getStatus/'+request_id).json()
    
    if status["status"] == "DONE":
        print(f"{request_id} for {file_name} is ready. Downloading response...")
        fileID = status["value"][0]["fileID"]
        t = requests.get("https://ws.clarin-pl.eu/nlprest2/base/download"+fileID).text
        with open(file_name, "w") as f:
            f.write(t)
    else:
        print(f"{request_id} for {file_name} is not yet ready")
    
def get_responses(bills_requests_ids):
    for name in bills_requests_ids:
        get_response(bills_requests_ids[name], f"clarin_output/clarin_output_{name}")
        
get_responses(bills_requests_ids)

a01d9118-dfa5-4a93-9d65-e6e246472238 for clarin_output/clarin_output_2000_696.txt is ready. Downloading response...
8fa35b81-9a6e-467e-bad9-edad19213b6d for clarin_output/clarin_output_2001_627.txt is ready. Downloading response...
30f17271-bbba-4cf6-b24d-a5f1ca4a982a for clarin_output/clarin_output_1996_465.txt is ready. Downloading response...
924af644-4157-4a5e-98be-993c2720f9e8 for clarin_output/clarin_output_1997_555.txt is ready. Downloading response...
eca20285-5fa9-46a7-bc3c-6ad47eae00f8 for clarin_output/clarin_output_2002_1689.txt is ready. Downloading response...
0a83b0b4-4c99-40f9-b4db-eec340e23063 for clarin_output/clarin_output_2000_1186.txt is ready. Downloading response...
3e445891-aa42-45d5-8d3d-5a3cd6dad22e for clarin_output/clarin_output_1998_1118.txt is ready. Downloading response...
0335f770-f5de-437b-a3ae-c6fd1995247b for clarin_output/clarin_output_1997_117.txt is ready. Downloading response...
d70ec89d-3349-4e29-8890-e169fc93b7e3 for clarin_output/clarin_output_

In [131]:
def read_clarin_output(bill_name):
    file_name = f"clarin_output/clarin_output_{bill_name}"
    tree = ET.parse(file_name)
    return tree.getroot()

def parse_single_clarin_output(root):
    res = []
    sent_id = 0
    for chunk in root:
        for sentence in chunk:
            for token in sentence:
                if token.tag != "tok":
                    continue
                    
                orth_t = token.find("orth")
                orth = orth_t.text if orth_t is not None else None
                lex_t = token.find("lex")
                lex = lex_t.find("base").text if lex_t is not None else None
                ne_attrs = set([])
                    
                for token_lex in token:
                    if token_lex.tag == "ann":
                        for attr_name in token_lex.attrib:
                            if attr_name == "chan" and token_lex.attrib[attr_name].startswith("nam_"):
                                if token_lex.text == "1":
                                    ne_attrs.add((token_lex.attrib[attr_name]))
                res.append((orth, lex, sent_id, ne_attrs))
            sent_id += 1
    return res


def clarin_NER_single_bill(bill_name):
    root = read_clarin_output(bill_name)
    lexs = parse_single_clarin_output(root)
    cur_tags = {}
    res = []
    for lex in lexs:
        ctags = set(cur_tags.keys())
        new_tags = lex[3] - ctags
        cont_tags = lex[3] & ctags
        end_tags = ctags - lex[3]
        for t in new_tags:
            cur_tags[t] = [ lex ]
        for t in end_tags:
            res.append(cur_tags[t])
            del cur_tags[t]
        for t in cont_tags:
            cur_tags[t].append(lex)
    
    for t in cur_tags:
        res.append(cur_tags[t])
    
    return res

def clarin_NER(named_bills):
    res = []
    for name, _ in named_bills:
        s = clarin_NER_single_bill(name)
        for en in s:
            cat = next(iter(en[0][3]))
            en_text = " ".join(map(lambda x: x[1], en))
            orig_text = " ".join(map(lambda x: x[0], en))
            res.append((en_text, orig_text, cat))
    return pd.DataFrame(data=res, columns=['lemmatized', 'original', 'category'])

NER_entities = clarin_NER(named_bills)

In [141]:
NER_entities["coarse_grained_category"] = NER_entities["category"].apply(lambda x: f"{x.split('_')[0]}_{x.split('_')[1]}")

In [144]:
NER_entities.groupby("coarse_grained_category").agg({"lemmatized": "count"}).sort_values(["lemmatized"], ascending=[0])

Unnamed: 0_level_0,lemmatized
coarse_grained_category,Unnamed: 1_level_1
nam_org,6720
nam_pro,3098
nam_loc,1087
nam_liv,598
nam_oth,383
nam_adj,312
nam_fac,138
nam_eve,33
nam_num,16


In [150]:
pd.set_option('display.max_colwidth', None)
NER_entities.groupby("coarse_grained_category")["original"].apply(lambda x: list(x)[:10])

coarse_grained_category
nam_adj                                                                                                                                                                       [polski, polskim, polskim, polskiego, warszawski, wojewódzkiego, polskie, polskimi, Wojewódzki, Wojewódzki]
nam_eve                      [Generalny Konserwator Zabytków, Ochrony Roślin, Monitorze Sądowym, Monitorze Sądowym, Monitorze Sądowym i Gospodarczym, Narodowy Bank Polski, Międzynarodowe Standardy Rachunkowości, II wojny światowej, Oddział 2 Tranzyt, X . Opieka domowa nad dziećmi]
nam_fac                                                                                              [Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny, Komendant Główny]
nam_liv     [Komendę Uzupełnień, M, Głównym Inspektorem, Głównego Inspektora, Głównego Inspektora, Głównego Inspektora, Głównego I

In [145]:
NER_entities.groupby("lemmatized").agg({ "original": "count", "category": "first" }).sort_values(["original"], ascending=[0])[:50]

Unnamed: 0_level_0,original,category
lemmatized,Unnamed: 1_level_1,Unnamed: 2_level_1
Dzieje_(Apostolskie) . u .,1081,nam_pro_media_periodic
rzeczpospolita polski,677,nam_loc_gpe_country
dziennik . u .,414,nam_pro_media_periodic
skarb państwo,307,nam_org_institution
minister sprawiedliwość,274,nam_org_institution
rada minister,262,nam_org_institution
minister obrona narodowy,226,nam_org_institution
unia europejski,226,nam_org_organization
państwowy komisja wyborczy,215,nam_org_institution
złoty,210,nam_oth_currency


### Conclusion

```
 <tok>
    <orth>Prezesowi</orth>
    <lex disamb="1"><base>prezes</base><ctag>subst:sg:dat:m1</ctag></lex>
    <ann chan="nam_org_institution" head="1">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Zarządu</orth>
    <lex disamb="1"><base>zarząd</base><ctag>subst:sg:gen:m3</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Państwowego</orth>
    <lex disamb="1"><base>państwowy</base><ctag>adj:sg:gen:m1:pos</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Funduszu</orth>
    <lex disamb="1"><base>fundusz</base><ctag>subst:sg:gen:m3</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Rehabilitacyjnego</orth>
    <lex disamb="1"><base>rehabilitacyjny</base><ctag>adj:sg:gen:m1:pos</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>i</orth>
    <lex disamb="1"><base>i</base><ctag>conj</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Osób</orth>
    <lex disamb="1"><base>osoba</base><ctag>subst:pl:gen:f</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Niepełnosprawnych</orth>
    <lex disamb="1"><base>niepełnosprawna</base><ctag>subst:pl:gen:f</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
```