In [95]:
import spacy
from nlp_common.acts_reader import ActsReader
import regex
from spacy.tokenizer import Tokenizer
import numpy as np
from collections import Counter
import math
import pandas as pd

In [85]:
acts_reader = ActsReader('../ustawy')
bills = [ text for _, _, text in acts_reader.all_acts()]

In [86]:
new_line_re = regex.compile(r'\n+|\s+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tokenization

In [87]:
nlp = spacy.load("pl_core_news_sm")
prefix_re = regex.compile(r'''^[\[\("'']+''')
suffix_re = regex.compile(r'''[\]\)"']+$''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search)

In [233]:
bill_docs_tokenized = list(nlp.tokenizer.pipe(bills))

### Bigrams

In [266]:
filter_bigrams_regex = regex.compile(r'^\p{L}+$')
def single_tokens(tokens):
    return [ str(token).lower() for token in tokens if filter_bigrams_regex.match(str(token).lower())]

def bigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1]) for i in range(len(single_tokens) - 1)]

bill_docs_single_tokens = [single_tokens(doc_tokens) for doc_tokens in bill_docs_tokenized]
bill_docs_bigrams = [bigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [267]:
bill_docs_bigrams[0][:10]

[('tekst', 'ustawy'),
 ('ustawy', 'ustalony'),
 ('ustalony', 'ostatecznie'),
 ('ostatecznie', 'po'),
 ('po', 'rozpatrzeniu'),
 ('rozpatrzeniu', 'poprawek'),
 ('poprawek', 'senatu'),
 ('senatu', 'ustawa'),
 ('ustawa', 'z'),
 ('z', 'dnia')]

### PMI

In [268]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(bigrams, filter_fn=None):
    ctrs = [Counter(doc) for doc in bigrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    return { k: global_counter[k]/l for k in global_counter 
            if filter_fn == None or filter_fn(k, global_counter[k])}, global_counter

def pmi_df(filter_fn=None):
    p, pctr = probabilities(bill_docs_single_tokens, None)
    pm, pmctr = probabilities(bill_docs_bigrams, filter_fn)
    
    df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
    df["p(a)"] = df["bigram"].apply(lambda x: p[x[0]])
    df["#a"] = df["bigram"].apply(lambda x: pctr[x[0]])
    df["p(b)"] = df["bigram"].apply(lambda x: p[x[1]])
    df["#b"] = df["bigram"].apply(lambda x: pctr[x[1]])
    df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
    df["#ab"] = df["bigram"].apply(lambda x: pmctr[x])
    df["#ba"] = df["bigram"].apply(lambda x: pmctr[(x[1], x[0])])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df.sort_values(["pmi"], ascending=[0])

In [269]:
df = pmi_df()

In [270]:
df.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi
195776,"(szarańczyn, ceratonia)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614732,"(samaria, człuchów)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614669,"(kanaan, namysłów)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
537754,"(rozgłasza, prawdziwy)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
446750,"(drenażu, odwadniającego)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
467305,"(masami, ziemnymi)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
59532,"(nieprzereagowanym, substracie)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614721,"(genezaret, barlinek)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
252934,"(furfuralu, aldehydu)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
252706,"(sokiem, winogronowym)",2e-05,1,2e-05,1,2e-06,1,0,8.23043


#### PMI filtered
PMI for pairs that occured at least 5 times

In [271]:
df_filtered = pmi_df(lambda _, v: v > 5)

In [272]:
df_filtered.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi
12631,"(chrześcijan, baptystów)",0.000122,6,0.000122,6,9e-06,6,0,6.43867
39734,"(adama, mickiewicza)",0.000122,6,0.000122,6,9e-06,6,0,6.43867
49282,"(lambrekiny, okienne)",0.000142,7,0.000122,6,9e-06,6,0,6.28452
36058,"(schedę, spadkową)",0.000142,7,0.000142,7,1.1e-05,7,0,6.28452
61030,"(zdrowego, stylu)",0.000162,8,0.000142,7,1.1e-05,7,0,6.150988
25134,"(buraka, cukrowego)",0.000162,8,0.000122,6,9e-06,6,0,6.150988
44896,"(zniekształcających, rzeźbę)",0.000142,7,0.000182,9,1.1e-05,7,0,6.033205
19182,"(małżeńską, wspólnością)",0.000182,9,0.000182,9,1.4e-05,9,0,6.033205
64751,"(partiom, politycznym)",0.000162,8,0.000142,7,9e-06,6,0,5.996838
22484,"(ziemiach, zachodnich)",0.000203,10,0.000182,9,1.4e-05,9,0,5.927845


### LLR

In [278]:
def H(a):
    a = np.array(a)
    N = a.sum()
    return np.sum(a/N * np.log(a/N + (a==0)))
                                  
def llr(pmi_df):
    def apply_llr(k):
        return 2*np.sum(k)*(H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))
        
    df = pmi_df.copy()
    sum_all = pmi_df["#ab"].sum()
    df["k11"] = pmi_df["#ab"] + pmi_df["#ba"]
    df["k12"] = pmi_df["#a"].apply(lambda x: x*2) - df["k11"]
    df["k21"] = pmi_df["#b"].apply(lambda x: x*2) - df["k11"] 
    df["k22"] = sum_all - df["k11"]
    df["k"] = df[["k11", "k12","k21", "k22"]].values.tolist()
    df["k"] = df["k"].apply(lambda k: np.array(k).reshape((2,2)))
    df["llr"] = df["k"].apply(apply_llr)
    pmi_df["llr"] = df["llr"]
    return pmi_df

In [279]:
df = llr(df)

In [280]:
df.sort_values(["llr"], ascending=[0]).head(20)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi,llr
9372,"(nr, nr)",0.910521,44926,0.910521,44926,0.019978,12958,12958,-3.725646,86454.534116
738,"(nr, z)",0.910521,44926,1.667761,82289,0.017194,11152,17929,-4.480961,69556.269601
809,"(z, nr)",1.667761,82289,0.910521,44926,0.027642,17929,11152,-4.00616,69556.269601
756,"(których, mowa)",0.362741,17898,0.582862,28759,0.021349,13847,0,-2.292897,66329.024415
757,"(mowa, w)",0.582862,28759,4.073245,200978,0.043871,28455,0,-3.991148,50142.924068
755,"(o, których)",1.306135,64446,0.362741,17898,0.0214,13880,9,-3.097393,43792.684433
99055,"(których, o)",0.362741,17898,1.306135,64446,1.4e-05,9,13880,-10.438373,43792.684433
848,"(którym, mowa)",0.238422,11764,0.582862,28759,0.014132,9166,0,-2.285819,43243.139174
236,"(dodaje, się)",0.17069,8422,0.891389,43982,0.012627,8190,26,-2.489039,36229.037481
36711,"(się, dodaje)",0.891389,43982,0.17069,8422,4e-05,26,8190,-8.241612,36229.037481


### Trigrams

In [276]:
def trigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1], single_tokens[i+2]) for i in range(len(single_tokens) - 2)]

bill_docs_trigrams = [trigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [277]:
bill_docs_trigrams[:10]

[[('tekst', 'ustawy', 'ustalony'),
  ('ustawy', 'ustalony', 'ostatecznie'),
  ('ustalony', 'ostatecznie', 'po'),
  ('ostatecznie', 'po', 'rozpatrzeniu'),
  ('po', 'rozpatrzeniu', 'poprawek'),
  ('rozpatrzeniu', 'poprawek', 'senatu'),
  ('poprawek', 'senatu', 'ustawa'),
  ('senatu', 'ustawa', 'z'),
  ('ustawa', 'z', 'dnia'),
  ('z', 'dnia', 'lipca'),
  ('dnia', 'lipca', 'o'),
  ('lipca', 'o', 'zmianie'),
  ('o', 'zmianie', 'ustawy'),
  ('zmianie', 'ustawy', 'prawo'),
  ('ustawy', 'prawo', 'ochrony'),
  ('prawo', 'ochrony', 'środowiska'),
  ('ochrony', 'środowiska', 'oraz'),
  ('środowiska', 'oraz', 'niektórych'),
  ('oraz', 'niektórych', 'innych'),
  ('niektórych', 'innych', 'w'),
  ('innych', 'w', 'ustawie'),
  ('w', 'ustawie', 'z'),
  ('ustawie', 'z', 'dnia'),
  ('z', 'dnia', 'kwietnia'),
  ('dnia', 'kwietnia', 'prawo'),
  ('kwietnia', 'prawo', 'ochrony'),
  ('prawo', 'ochrony', 'środowiska'),
  ('ochrony', 'środowiska', 'z'),
  ('środowiska', 'z', 'i'),
  ('z', 'i', 'oraz'),
  ('i', 