In [81]:
import elasticsearch
import regex
import spacy
from spacy.tokenizer import Tokenizer
from nlp_common.acts_reader import ActsReader
from collections import Counter

### Data preparation

Data used in the exercise comes from two sources: set of polish bills (from previous exercies) and sgjp dictionary. The second one is loaded to elasticsearch in the attached script (main.py).

In [65]:
acts_reader = ActsReader('../ustawy') 
bills = [ text for _, _, text in acts_reader.all_acts()]

### Remove HTML tags

In [66]:
html_regex = regex.compile(r'<\/?[^>]*>', regex.IGNORECASE)
for bill in bills:
    matches = html_regex.findall(bill)
    for match in matches:
        print(match)

< < tajne >


Nothing to clean 🤷

### Tokenize

In [67]:
new_line_re = regex.compile(r'\n+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

In [93]:
nlp = spacy.load("pl_core_news_sm")
prefix_re = regex.compile(r'''^[\[\("']''')
suffix_re = regex.compile(r'''[\]\)"':;,]$''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search)

In [94]:
bill_docs = list(nlp.tokenizer.pipe(bills))

### Word counters

In [99]:
ctrs = [ Counter([d.text for d in doc]) for doc in bill_docs ]

In [100]:
global_counter = Counter()
for ctr in ctrs:
    global_counter += ctr

In [101]:
global_counter

Counter({' ': 39644,
         'Tekst': 225,
         'ustawy': 11902,
         'ustalony': 210,
         'ostatecznie': 103,
         'po': 12988,
         'rozpatrzeniu': 151,
         'poprawek': 214,
         'Senatu': 287,
         '\xa0 \xa0 \xa0 ': 156,
         'USTAWA': 999,
         'z': 81515,
         'dnia': 17909,
         '11': 2803,
         'lipca': 1337,
         '2014': 251,
         'r.': 32879,
         '\xa0 ': 3834,
         'o': 64134,
         'zmianie': 1498,
         '–': 3979,
         'Prawo': 1603,
         'ochrony': 3264,
         'środowiska': 2514,
         'oraz': 33542,
         'niektórych': 974,
         'innych': 4582,
         'ustaw[1]),[2': 1,
         ']': 1984,
         ')': 99822,
         '\xa0 \xa0 ': 235,
         'Art.': 29918,
         '\xa0': 10723,
         '1.': 19319,
         'W': 13653,
         'ustawie': 5185,
         '27': 1631,
         'kwietnia': 992,
         '2001': 1992,
         '(': 12937,
         'Dz.': 2886,
        