In [None]:
from nlp_common.acts_reader import ActsReader
from nlp_common.morphological_tagging import tag_morphologically
from collections import Counter
import requests
import os
import regex
import xml.etree.ElementTree as ET

In [59]:
def longest_bills(n=50, acts_dir='../ustawy'):
    """Return the ``n`` longest acts as ``(name, text)`` pairs, longest first.

    Every run of whitespace in an act's text is replaced by a single space.

    Args:
        n: Maximum number of acts to return.
        acts_dir: Directory handed to ``ActsReader``; the default is the path
            this notebook has always used.

    Returns:
        A list of ``(name, normalised_text)`` tuples ordered by decreasing
        length of the original (un-normalised) text.
    """
    reader = ActsReader(acts_dir)
    # all_acts() yields 3-tuples; only the first and third elements are kept.
    # NOTE(review): the middle element looks like the year - confirm in ActsReader.
    named_texts = [(name, content) for name, _, content in reader.all_acts()]
    # A stable sort with reverse=True keeps the original order among equal lengths.
    named_texts.sort(key=lambda pair: len(pair[1]), reverse=True)
    # A single \s+ already covers \n and \t. The previous alternation
    # (\n+|\s+|\t+) tried \n+ first, so a newline followed by other whitespace
    # was replaced in two steps and left a double space behind.
    # NOTE(review): the normalised text therefore differs from earlier runs in
    # those places; anything cached by text content may need refreshing.
    whitespace_re = regex.compile(r'\s+')
    return [(name, whitespace_re.sub(' ', text)) for name, text in named_texts[:n]]

# Load the longest acts once and keep a parallel list holding only their texts.
named_bills = longest_bills()
bills = [text for _, text in named_bills]

In [6]:
# Run the morphological tagger over the normalised act texts.
# NOTE(review): the return shape is not visible here; basic_NER indexes each
# item as item[0] (surface form?) and item[1] (lemma?) - confirm.
tags = tag_morphologically(bills)

Reading cached response


### Basic NER

In [24]:
def split_sentence(tagged_bills):
    """Split a flat token sequence into sentences.

    A new sentence starts at a token whose first element begins with an
    upper-case character and whose predecessor's second element is ``'.'``.

    Args:
        tagged_bills: Sequence of indexable tokens; ``token[0]`` is tested for
            capitalisation and ``token[1]`` is compared with ``'.'``.
            NOTE(review): presumably ``(surface form, lemma, ...)`` - confirm
            against ``tag_morphologically``.

    Returns:
        A list of sentences, each a list of the original token objects.
        Empty input gives an empty list.
    """
    if len(tagged_bills) == 0:
        return []

    sentences = []
    current = [tagged_bills[0]]
    # Pair every token with its real predecessor. The previous version
    # enumerated the [1:] slice starting at 0, so each index was off by one:
    # token 0 was compared with the last token and stored twice, the last
    # token was never visited, and the final sentence was never returned.
    for previous, token in zip(tagged_bills, tagged_bills[1:]):
        # [:1] instead of [0] so an empty surface form cannot raise IndexError.
        if previous[1] == '.' and token[0][:1].isupper():
            sentences.append(current)
            current = []
        current.append(token)
    # Flush the sentence that was still open when the input ended.
    sentences.append(current)
    return sentences

def basic_NER(tagged_bills):
    """Collect runs of capitalised tokens that do not start a sentence.

    A token counts as part of an entity when its first element begins with an
    upper-case character and the preceding token's second element is not
    ``'.'``. Consecutive matching tokens are merged into one entity, built by
    joining their second elements with single spaces.

    Args:
        tagged_bills: Sequence of indexable tokens; ``token[0]`` is tested for
            capitalisation, ``token[1]`` is compared with ``'.'`` and used as
            the entity text.
            NOTE(review): presumably ``(surface form, lemma, ...)`` - confirm
            against ``tag_morphologically``.

    Returns:
        A list of entity strings in order of appearance.
    """
    entities = []
    current = []
    # Pair every token with its real predecessor. The previous version
    # enumerated the [1:] slice starting at 0, which compared token 0 with the
    # *last* token and never looked at the last token at all. The first token
    # opens the text, so it is treated as sentence-initial and skipped.
    for previous, token in zip(tagged_bills, tagged_bills[1:]):
        # [:1] instead of [0] so an empty surface form cannot raise IndexError.
        if previous[1] != '.' and token[0][:1].isupper():
            current.append(token[1])
        elif current:
            entities.append(" ".join(current))
            current = []
    # Flush an entity that runs up to the end of the input.
    if current:
        entities.append(" ".join(current))
    return entities

# Apply the capitalisation heuristic to the tagger output.
entities = basic_NER(tags)

In [25]:
# Peek at the first ten extracted entity strings.
entities[:10]

['ustawa',
 'artykuł',
 'dziennik',
 'minister',
 'artykuł',
 'dziennik',
 'artykuł',
 'minister sprawa wewnętrzny',
 'minister',
 'minister sprawa wewnętrzny']

In [30]:
# Frequency table of the extracted entities; show the 50 most frequent,
# ties kept in first-seen order.
ctr = Counter(entities)
ctr.most_common(50)

[('numer', 3831),
 ('artykuł', 1848),
 ('dziennik', 1602),
 ('Rzeczpospolita polski', 611),
 ('kodeks', 510),
 ('policja', 463),
 ('skarb państwo', 397),
 ('prawo', 332),
 ('kasa chory', 328),
 ('unia europejski', 293),
 ('straż graniczny', 290),
 ('minister', 276),
 ('rada minister', 276),
 ('państwowy straż pożarny', 240),
 ('zakład', 230),
 ('państwowy komisja wyborczy', 212),
 ('rada', 195),
 ('prezes urząd', 183),
 ('fundusz', 182),
 ('minister obrona narodowy', 174),
 ('zmiana', 173),
 ('urząd patentowy', 172),
 ('I', 168),
 ('minister sprawiedliwość', 168),
 ('sprawiedliwość', 164),
 ('azot', 155),
 ('II', 150),
 ('rozdział', 148),
 ('pozostały', 143),
 ('urząd ochrona państwo', 141),
 ('tkanina', 141),
 ('prezes rada minister', 134),
 ('obrona narodowy', 129),
 ('minister finanse', 124),
 ('finanse', 116),
 ('inspektor nadzór wewnętrzny', 115),
 ('EFTA', 114),
 ('nawóz', 111),
 ('P2O5', 110),
 ('biuro', 109),
 ('Art', 107),
 ('komisja', 107),
 ('administracja', 106),
 ('europej

### Clarin NER

In [61]:
def start_clarin_task(bill):
    """Submit one act's text to the CLARIN-PL ``startTask`` endpoint.

    Args:
        bill: Plain text of the act to process.

    Returns:
        The response body as text. The notebook stores it as the task id and
        later appends it to the ``getStatus`` URL.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    response = requests.post('https://ws.clarin-pl.eu/nlprest2/base/startTask', json={
        "lpmn": 'any2txt|wcrft2|liner2({"model":"n82"})',
        # Send the text that was passed in. The earlier version always sent
        # the global bills[0], so every task processed the same act.
        "text": bill,
    })
    # Fail loudly instead of storing an error body as if it were a task id.
    response.raise_for_status()
    return response.text

def start_clarin_tasks(named_bills):
    """Start one CLARIN task per ``(name, text)`` pair.

    Returns a dict mapping each act name to whatever ``start_clarin_task``
    returned for its text.
    """
    task_ids = {}
    for bill_name, bill_text in named_bills:
        task_ids[bill_name] = start_clarin_task(bill_text)
    return task_ids

# Start processing for every selected act and print the name -> task id
# mapping so the ids are visible in the notebook output.
bills_requests_ids = start_clarin_tasks(named_bills)
print(bills_requests_ids)

{'2000_696.txt': '67a5bde5-c15e-4b7a-8004-792107b4dae0', '2001_627.txt': '14c661ca-8f3a-4a81-bea4-6227e82a229a', '1996_465.txt': 'f0a62dc0-748e-4d59-8b71-56a5357679b2', '1997_555.txt': 'a41c972d-4ffc-4e8d-854e-9e953939fd28', '2002_1689.txt': '2971a96b-6694-41fe-94d1-cb9c9c934cfb', '2000_1186.txt': 'f91895ac-0492-4249-8657-0c39b763dc1e', '1998_1118.txt': 'b163fe1f-3b04-4a57-8856-241d214d9e64', '1997_117.txt': '5c705c49-d9df-4e02-8842-c24378d78688', '2001_1070.txt': '2b074c08-3885-4bce-8efb-a4a41e01b433', '2001_1368.txt': 'ffeac07c-86c5-4011-a82f-ecd24be6d261', '1997_714.txt': '154fca1a-b76b-47c8-a616-4dc36fdf52ad', '2001_499.txt': '806299d6-3d22-46b2-89b7-26aa97930abf', '2000_991.txt': 'a52ee00f-45eb-4c8e-bcb9-676f3b0542de', '2003_1750.txt': '68e93200-63c3-4645-a525-1fb84688ad09', '2001_1545.txt': '68eacf90-0629-44b8-8ce4-f8d4ad3254ef', '2001_1229.txt': 'b8d7763e-c351-4d6e-ad95-2f66699f4ad4', '2000_1268.txt': '7805f84f-bb06-4e09-a2cc-de5090758dc5', '1994_195.txt': '16a4f863-aa81-40b8-9e

In [None]:
def get_response(request_id, file_name):
    """Download the result of a finished CLARIN task into ``file_name``.

    Does nothing when ``file_name`` already exists. Otherwise the task status
    is queried once, and the result is downloaded only if the status is
    ``"DONE"``; any other status is just reported.

    Args:
        request_id: Task id to append to the ``getStatus`` URL.
        file_name: Destination path for the downloaded result.

    Returns:
        None. Progress is reported with ``print``.
    """
    if os.path.exists(file_name):
        print(f"{file_name} already exists. Skipping...")
        # Really skip: without this return the task was still polled and the
        # existing file overwritten despite the message above.
        return

    status = requests.get('https://ws.clarin-pl.eu/nlprest2/base/getStatus/'+request_id).json()

    if status["status"] == "DONE":
        print(f"{request_id} for {file_name} is ready. Downloading response...")
        fileID = status["value"][0]["fileID"]
        t = requests.get("https://ws.clarin-pl.eu/nlprest2/base/download"+fileID).text
        # Make sure the target directory exists before writing into it.
        target_dir = os.path.dirname(file_name)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # Pin the encoding rather than relying on the platform default.
        # NOTE(review): assumes the service returns UTF-8 XML - confirm.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(t)
    else:
        print(f"{request_id} for {file_name} is not yet ready")
    
def get_responses(bills_requests_ids):
    """Try to fetch the CLARIN result for every entry of a name -> task id dict.

    Each result is stored under ``clarin_output/clarin_output_<name>``.
    """
    for bill_name, request_id in bills_requests_ids.items():
        target_path = f"clarin_output/clarin_output_{bill_name}"
        get_response(request_id, target_path)
        
# Poll every task once. Tasks that are not finished are only reported, so this
# cell may need to be re-run until all results have been downloaded.
get_responses(bills_requests_ids)

67a5bde5-c15e-4b7a-8004-792107b4dae0 for clarin_output/clarin_output_2000_696.txt is not yet ready
14c661ca-8f3a-4a81-bea4-6227e82a229a for clarin_output/clarin_output_2001_627.txt is not yet ready
f0a62dc0-748e-4d59-8b71-56a5357679b2 for clarin_output/clarin_output_1996_465.txt is ready. Downloading response...
a41c972d-4ffc-4e8d-854e-9e953939fd28 for clarin_output/clarin_output_1997_555.txt is not yet ready
2971a96b-6694-41fe-94d1-cb9c9c934cfb for clarin_output/clarin_output_2002_1689.txt is not yet ready
f91895ac-0492-4249-8657-0c39b763dc1e for clarin_output/clarin_output_2000_1186.txt is not yet ready
b163fe1f-3b04-4a57-8856-241d214d9e64 for clarin_output/clarin_output_1998_1118.txt is not yet ready
5c705c49-d9df-4e02-8842-c24378d78688 for clarin_output/clarin_output_1997_117.txt is not yet ready
2b074c08-3885-4bce-8efb-a4a41e01b433 for clarin_output/clarin_output_2001_1070.txt is not yet ready
ffeac07c-86c5-4011-a82f-ecd24be6d261 for clarin_output/clarin_output_2001_1368.txt is no

In [None]:
def read_clarin_output(bill_name):
    """Parse the saved CLARIN XML for ``bill_name`` and return its root element.

    The file is expected at ``clarin_output/clarin_output_<bill_name>``
    relative to the current working directory.
    """
    return ET.parse(f"clarin_output/clarin_output_{bill_name}").getroot()

def clarin_NER_single_bill(bill_name):
    """Print the named-entity annotations stored in one act's CLARIN output.

    Walks the parsed XML four levels below the root and, for every ``ann``
    element whose ``chan`` attribute starts with ``"nam_"``, prints the literal
    word ``chan``, the channel name and the element's text.

    Args:
        bill_name: Act file name, passed on to ``read_clarin_output``.

    Returns:
        None. Output goes to stdout.
    """
    # The original definition was missing the colon after the parameter list,
    # which made the whole cell a SyntaxError.
    chunk_list = read_clarin_output(bill_name)
    for chunk in chunk_list:
        for sentence in chunk:
            for token in sentence:
                for token_lex in token:
                    if token_lex.tag != "ann":
                        continue
                    channel = token_lex.get("chan")
                    # The text is printed as-is; the XML sample in this
                    # notebook shows values such as "0" and "1".
                    if channel is not None and channel.startswith("nam_"):
                        print("chan", channel, token_lex.text)

# Example run on a single act's downloaded CLARIN output.
clarin_NER_single_bill("2001_1545.txt")

### Conclusion

```
 <tok>
    <orth>Prezesowi</orth>
    <lex disamb="1"><base>prezes</base><ctag>subst:sg:dat:m1</ctag></lex>
    <ann chan="nam_org_institution" head="1">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Zarządu</orth>
    <lex disamb="1"><base>zarząd</base><ctag>subst:sg:gen:m3</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Państwowego</orth>
    <lex disamb="1"><base>państwowy</base><ctag>adj:sg:gen:m1:pos</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Funduszu</orth>
    <lex disamb="1"><base>fundusz</base><ctag>subst:sg:gen:m3</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Rehabilitacyjnego</orth>
    <lex disamb="1"><base>rehabilitacyjny</base><ctag>adj:sg:gen:m1:pos</ctag></lex>
    <ann chan="nam_org_institution">1</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>i</orth>
    <lex disamb="1"><base>i</base><ctag>conj</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Osób</orth>
    <lex disamb="1"><base>osoba</base><ctag>subst:pl:gen:f</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
   <tok>
    <orth>Niepełnosprawnych</orth>
    <lex disamb="1"><base>niepełnosprawna</base><ctag>subst:pl:gen:f</ctag></lex>
    <ann chan="nam_org_institution">0</ann>
    <ann chan="nam_org_organization">0</ann>
   </tok>
```