In [1]:
import spacy
from nlp_common.acts_reader import ActsReader
import regex
from spacy.tokenizer import Tokenizer
import numpy as np
from collections import Counter
import math
import pandas as pd

In [2]:
acts_reader = ActsReader('../ustawy')
bills = [ text for _, _, text in acts_reader.all_acts()]

In [3]:
new_line_re = regex.compile(r'\n+|\s+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tokenization

In [4]:
nlp = spacy.load("pl_core_news_sm")
prefix_re = regex.compile(r'''^[\[\("'']+''')
suffix_re = regex.compile(r'''[\]\)"']+$''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search)

In [5]:
bill_docs_tokenized = list(nlp.tokenizer.pipe(bills))

### Bigrams

In [6]:
filter_bigrams_regex = regex.compile(r'^\p{L}+$')
def single_tokens(tokens):
    return [ str(token).lower() for token in tokens if filter_bigrams_regex.match(str(token).lower())]

def bigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1]) for i in range(len(single_tokens) - 1)]

bill_docs_single_tokens = [single_tokens(doc_tokens) for doc_tokens in bill_docs_tokenized]
bill_docs_bigrams = [bigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [7]:
bill_docs_bigrams[0][:10]

[('tekst', 'ustawy'),
 ('ustawy', 'ustalony'),
 ('ustalony', 'ostatecznie'),
 ('ostatecznie', 'po'),
 ('po', 'rozpatrzeniu'),
 ('rozpatrzeniu', 'poprawek'),
 ('poprawek', 'senatu'),
 ('senatu', 'ustawa'),
 ('ustawa', 'z'),
 ('z', 'dnia')]

### PMI

In [15]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(ngrams, filter_fn=None):
    ctrs = [Counter(doc) for doc in ngrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    probs = { k: global_counter[k]/l for k in global_counter 
            if filter_fn == None or filter_fn(k, global_counter[k])}
    global_counter = { k: global_counter[k] for k in global_counter
            if filter_fn == None or filter_fn(k, global_counter[k]) }
    return probs, global_counter

def probabilities_flat(counter_ngrams, map_key_fn):
    ctr = Counter()
    for k in counter_ngrams:
        mk = map_key_fn(k)
        ctr[mk] += counter_ngrams[k]
    l = len(ctr)
    probs = {
        k: ctr[k]/l for k in ctr
    }
    return probs, ctr

def pmi_df_bigram(filter_fn=None):
    pm, pmctr = probabilities(bill_docs_bigrams, filter_fn)
    pa, pactr = probabilities_flat(pmctr, lambda x: x[0])
    pb, pbctr = probabilities_flat(pmctr, lambda x: x[1])
    
    df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
    df["p(a)"] = df["bigram"].apply(lambda x: pa[x[0]])
    df["#a"] = df["bigram"].apply(lambda x: pactr[x[0]])
    df["p(b)"] = df["bigram"].apply(lambda x: pb[x[1]])
    df["#b"] = df["bigram"].apply(lambda x: pbctr[x[1]])
    df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
    df["#ab"] = df["bigram"].apply(lambda x: pmctr[x])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df.sort_values(["pmi"], ascending=[0])

In [16]:
df = pmi_df_bigram()

In [17]:
df.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
214753,"(aln, odsączona)",2e-05,1,2e-05,1,2e-06,1,8.230369
527027,"(dżumę, płucną)",2e-05,1,2e-05,1,2e-06,1,8.230369
527038,"(gorączek, krwotocznych)",2e-05,1,2e-05,1,2e-06,1,8.230369
527037,"(wirusowych, gorączek)",2e-05,1,2e-05,1,2e-06,1,8.230369
331318,"(rudzie, serpentynity)",2e-05,1,2e-05,1,2e-06,1,8.230369
527036,"(płucnej, wirusowych)",2e-05,1,2e-05,1,2e-06,1,8.230369
527035,"(dżumy, płucnej)",2e-05,1,2e-05,1,2e-06,1,8.230369
556929,"(wyczuwalne, dotykiem)",2e-05,1,2e-05,1,2e-06,1,8.230369
527034,"(cholery, dżumy)",2e-05,1,2e-05,1,2e-06,1,8.230369
289709,"(skrzynek, lęgowych)",2e-05,1,2e-05,1,2e-06,1,8.230369


#### PMI filtered
PMI for pairs that occured at least 5 times

In [18]:
df_filtered = pmi_df_bigram(lambda _, v: v > 5)

In [19]:
df_filtered.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
61059,"(przewoźnicy, kolejowi)",0.00053,6,0.000507,6,9e-06,6,3.5382
42892,"(szpitalu, psychiatrycznym)",0.00053,6,0.000507,6,9e-06,6,3.5382
64958,"(osady, ściekowe)",0.00053,6,0.000507,6,9e-06,6,3.5382
62055,"(jedynym, akcjonariuszem)",0.00053,6,0.000507,6,9e-06,6,3.5382
64432,"(prowadziły, rokowania)",0.00053,6,0.000507,6,9e-06,6,3.5382
56611,"(ozonową, cfc)",0.00053,6,0.000507,6,9e-06,6,3.5382
66799,"(śląskiego, obejmująca)",0.00053,6,0.000507,6,9e-06,6,3.5382
44194,"(rezydenta, długoterminowego)",0.00053,6,0.000507,6,9e-06,6,3.5382
60635,"(opłacają, czynsz)",0.00053,6,0.000507,6,9e-06,6,3.5382
21872,"(izolacji, budowlanej)",0.00053,6,0.000507,6,9e-06,6,3.5382


### LLR

In [20]:
def H(a):
    a = np.array(a)
    N = a.sum()
    return np.sum(a/N * np.log(a/N + (a==0)))
                                  
def llr(pmi_df):
    def apply_llr(k):
        return 2*np.sum(k)*(H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))
        
    df = pmi_df.copy()
    sum_all = pmi_df["#ab"].sum()
    df["k11"] = pmi_df["#ab"]
    df["k12"] = pmi_df["#a"] - df["k11"]
    df["k21"] = pmi_df["#b"] - df["k11"] 
    df["k22"] = sum_all - (pmi_df["#a"] + pmi_df["#b"] - pmi_df["#ab"])
    df["k"] = df[["k11", "k12","k21", "k22"]].values.tolist()
    df["k"] = df["k"].apply(lambda k: np.array(k).reshape((2,2)))
    df["llr"] = df["k"].apply(apply_llr)
    pmi_df["llr"] = df["llr"]
    return pmi_df

In [21]:
df = llr(df)

In [22]:
df.sort_values(["llr"], ascending=[0]).head(20)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi,llr
757,"(mowa, w)",0.582898,28759,4.073245,200978,0.043871,28455,-3.991209,153998.420606
756,"(których, mowa)",0.362763,17898,0.582862,28759,0.021349,13847,-2.292957,117417.333602
755,"(o, których)",1.306214,64446,0.362741,17898,0.0214,13880,-3.097454,90627.890058
848,"(którym, mowa)",0.238437,11764,0.582862,28759,0.014132,9166,-2.28588,75923.816646
809,"(z, nr)",1.667194,82256,0.910521,44926,0.027642,17929,-4.00582,73545.54974
236,"(dodaje, się)",0.1707,8422,0.891389,43982,0.012627,8190,-2.4891,68520.597174
900,"(do, spraw)",1.222486,60315,0.200523,9894,0.013393,8687,-2.907068,61728.4758
9372,"(nr, nr)",0.908022,44800,0.910521,44926,0.019978,12958,-3.722898,59576.428903
847,"(o, którym)",1.306214,64446,0.238422,11764,0.014156,9182,-3.091013,59439.79239
401,"(na, podstawie)",1.020512,50350,0.136965,6758,0.010211,6623,-2.616563,53610.47825


### Trigrams

In [24]:
def trigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1], single_tokens[i+2]) for i in range(len(single_tokens) - 2)]

bill_docs_trigrams = [trigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [25]:
bill_docs_trigrams[0][:10]

[('tekst', 'ustawy', 'ustalony'),
 ('ustawy', 'ustalony', 'ostatecznie'),
 ('ustalony', 'ostatecznie', 'po'),
 ('ostatecznie', 'po', 'rozpatrzeniu'),
 ('po', 'rozpatrzeniu', 'poprawek'),
 ('rozpatrzeniu', 'poprawek', 'senatu'),
 ('poprawek', 'senatu', 'ustawa'),
 ('senatu', 'ustawa', 'z'),
 ('ustawa', 'z', 'dnia'),
 ('z', 'dnia', 'lipca')]

In [26]:
def pmi_df_trigram(filter_fn=None):
    pt, ptctr = probabilities(bill_docs_trigrams, filter_fn)
    pa, pactr = probabilities_flat(ptctr, lambda x: (x[0], x[1]))
    pb, pbctr = probabilities_flat(ptctr, lambda x: x[2])
    
    df = pd.DataFrame(data=[(k,) for k in pt.keys()], columns = ["trigram"])
    df["p(a)"] = df["trigram"].apply(lambda x: pa[(x[0], x[1])])
    df["#a"] = df["trigram"].apply(lambda x: pactr[(x[0], x[1])])
    df["p(b)"] = df["trigram"].apply(lambda x: pb[x[2]])
    df["#b"] = df["trigram"].apply(lambda x: pbctr[x[2]])
    df["p(a,b)"] = df["trigram"].apply(lambda x: pt[x])
    df["#ab"] = df["trigram"].apply(lambda x: ptctr[x])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df.sort_values(["pmi"], ascending=[0])

In [27]:
df_trigram = pmi_df_trigram()
df_trigram_filtered = pmi_df_trigram(lambda _, v: v > 5)

In [28]:
df_trigram.sort_values(["pmi"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
1175441,"(statystyki, rachmistrze, ankieterzy)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
981530,"(uzasadnione, przyczynami, niedotyczącymi)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
426734,"(wyjątkiem, używania, pastuchów)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
768731,"(niepieniężnych, które, objęły)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
1034477,"(głównej, dla, tryków)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
536948,"(ze, skrzynek, lęgowych)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
536947,"(gniazd, ze, skrzynek)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
434076,"(energetycznej, oznaczenie, literowe)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
858254,"(procesem, zarządzania, kadrami)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385
1033965,"(numida, struś, struthio)",2e-06,1,2e-05,1,7.159221e-07,1,10.039385


In [29]:
df_trigram_filtered.sort_values(["pmi"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
33225,"(zasadach, w, niej)",0.000212,6,0.000796,6,4e-06,6,3.237404
46662,"(kandydatów, otrzymało, równą)",0.000212,6,0.000796,6,4e-06,6,3.237404
47826,"(na, podbudowie, programowej)",0.000212,6,0.000796,6,4e-06,6,3.237404
51399,"(treści, programu, dostosowawczego)",0.000212,6,0.000796,6,4e-06,6,3.237404
38760,"(na, sesji, zwołanej)",0.000212,6,0.000796,6,4e-06,6,3.237404
24049,"(ustawie, określenia, towar)",0.000212,6,0.000796,6,4e-06,6,3.237404
12024,"(biura, i, ekspert)",0.000212,6,0.000796,6,4e-06,6,3.237404
51397,"(pomiarów, w, wymaganym)",0.000212,6,0.000796,6,4e-06,6,3.237404
38729,"(otrzymuje, urlop, bezpłatny)",0.000212,6,0.000796,6,4e-06,6,3.237404
50197,"(przypadku, dostawy, wewnątrzwspólnotowej)",0.000212,6,0.000796,6,4e-06,6,3.237404


In [30]:
df_trigram = llr(df_trigram)

In [31]:
df_trigram.sort_values(["llr"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi,llr
852,"(o, których, mowa)",0.0214,13880,0.582862,28759,0.009909,13841,-0.230135,135910.391424
969,"(o, którym, mowa)",0.014157,9182,0.582862,28759,0.00656,9163,-0.229393,88001.557781
12780,"(nr, z, nr)",0.017194,11152,0.891348,43980,0.007118,9943,-0.766856,78580.65434
853,"(których, mowa, w)",0.021349,13847,4.073245,200978,0.009874,13792,-2.175546,74521.896697
12786,"(nr, i, nr)",0.015501,10054,0.891348,43980,0.006222,8691,-0.797788,67119.695605
1338,"(właściwy, do, spraw)",0.007464,4841,0.200523,9894,0.003304,4615,0.791891,53429.958585
1467,"(o, której, mowa)",0.008523,5528,0.582862,28759,0.003938,5501,-0.232218,51812.126103
970,"(którym, mowa, w)",0.014132,9166,4.073245,200978,0.006547,9145,-2.17386,49384.17758
13275,"(nr, nr, nr)",0.019975,12956,0.891348,43980,0.005259,7346,-1.219509,45608.467032
13938,"(zastępuje, się, wyrazami)",0.007364,4776,0.063963,3156,0.002104,2939,1.49679,38604.522082


### Conclusion

##### Why do we have to filter the bigrams, rather than the token sequence?
If we filter tokens instead of bigrams we would get totally different results and bigrams because of removing the word between two others which were not neighbours.

##### Which measure (PMI, PMI with filtering, LLR) works better for the bigrams and which for the trigrams?
The result that we expect matches results for LLR methods in both cases. We expect to see phrases that are really common. PMI checkes rather if the words always come together

##### What types of expressions are discovered by the methods.
LLR finds the expressions that frequently appear in the corpus. It is proven by the results, since phrases such as "o których mowa" are commonly used in the corpus. PMI treats the phrases locally and check how strong collocation is. The best score is for phrases that always appear in determined orded and not in different neighbourhood. Especially when there is only one occurence.