In [2]:
import spacy
from nlp_common.acts_reader import ActsReader
import regex
from spacy.tokenizer import Tokenizer
import numpy as np
from collections import Counter
import math
import pandas as pd

In [3]:
acts_reader = ActsReader('../ustawy')
bills = [ text for _, _, text in acts_reader.all_acts()]

In [4]:
new_line_re = regex.compile(r'\n+|\s+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tokenization

In [5]:
nlp = spacy.load("pl_core_news_sm")
prefix_re = regex.compile(r'''^[\[\("'']+''')
suffix_re = regex.compile(r'''[\]\)"']+$''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search)

In [6]:
bill_docs_tokenized = list(nlp.tokenizer.pipe(bills))

### Bigrams

In [7]:
filter_bigrams_regex = regex.compile(r'^\p{L}+$')
def single_tokens(tokens):
    return [ str(token).lower() for token in tokens if filter_bigrams_regex.match(str(token).lower())]

def bigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1]) for i in range(len(single_tokens) - 1)]

bill_docs_single_tokens = [single_tokens(doc_tokens) for doc_tokens in bill_docs_tokenized]
bill_docs_bigrams = [bigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [8]:
bill_docs_bigrams[0][:10]

[('tekst', 'ustawy'),
 ('ustawy', 'ustalony'),
 ('ustalony', 'ostatecznie'),
 ('ostatecznie', 'po'),
 ('po', 'rozpatrzeniu'),
 ('rozpatrzeniu', 'poprawek'),
 ('poprawek', 'senatu'),
 ('senatu', 'ustawa'),
 ('ustawa', 'z'),
 ('z', 'dnia')]

### PMI

In [9]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(bigrams, filter_fn=None):
    ctrs = [Counter(doc) for doc in bigrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    return { k: global_counter[k]/l for k in global_counter 
            if filter_fn == None or filter_fn(k, global_counter[k])}, global_counter

def pmi_df_bigram(filter_fn=None):
    p, pctr = probabilities(bill_docs_single_tokens, None)
    pm, pmctr = probabilities(bill_docs_bigrams, filter_fn)
    
    df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
    df["p(a)"] = df["bigram"].apply(lambda x: p[x[0]])
    df["#a"] = df["bigram"].apply(lambda x: pctr[x[0]])
    df["p(b)"] = df["bigram"].apply(lambda x: p[x[1]])
    df["#b"] = df["bigram"].apply(lambda x: pctr[x[1]])
    df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
    df["#ab"] = df["bigram"].apply(lambda x: pmctr[x])
    df["#ba"] = df["bigram"].apply(lambda x: pmctr[(x[1], x[0])])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df.sort_values(["pmi"], ascending=[0])

In [10]:
df = pmi_df_bigram()

In [11]:
df.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi
195776,"(szarańczyn, ceratonia)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614732,"(samaria, człuchów)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614669,"(kanaan, namysłów)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
537754,"(rozgłasza, prawdziwy)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
446750,"(drenażu, odwadniającego)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
467305,"(masami, ziemnymi)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
59532,"(nieprzereagowanym, substracie)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
614721,"(genezaret, barlinek)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
252934,"(furfuralu, aldehydu)",2e-05,1,2e-05,1,2e-06,1,0,8.23043
252706,"(sokiem, winogronowym)",2e-05,1,2e-05,1,2e-06,1,0,8.23043


#### PMI filtered
PMI for pairs that occured at least 5 times

In [12]:
df_filtered = pmi_df_bigram(lambda _, v: v > 5)

In [13]:
df_filtered.head(10)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi
12631,"(chrześcijan, baptystów)",0.000122,6,0.000122,6,9e-06,6,0,6.43867
39734,"(adama, mickiewicza)",0.000122,6,0.000122,6,9e-06,6,0,6.43867
49282,"(lambrekiny, okienne)",0.000142,7,0.000122,6,9e-06,6,0,6.28452
36058,"(schedę, spadkową)",0.000142,7,0.000142,7,1.1e-05,7,0,6.28452
61030,"(zdrowego, stylu)",0.000162,8,0.000142,7,1.1e-05,7,0,6.150988
25134,"(buraka, cukrowego)",0.000162,8,0.000122,6,9e-06,6,0,6.150988
44896,"(zniekształcających, rzeźbę)",0.000142,7,0.000182,9,1.1e-05,7,0,6.033205
19182,"(małżeńską, wspólnością)",0.000182,9,0.000182,9,1.4e-05,9,0,6.033205
64751,"(partiom, politycznym)",0.000162,8,0.000142,7,9e-06,6,0,5.996838
22484,"(ziemiach, zachodnich)",0.000203,10,0.000182,9,1.4e-05,9,0,5.927845


### LLR

In [14]:
def H(a):
    a = np.array(a)
    N = a.sum()
    return np.sum(a/N * np.log(a/N + (a==0)))
                                  
def llr(pmi_df):
    def apply_llr(k):
        return 2*np.sum(k)*(H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))
        
    df = pmi_df.copy()
    sum_all = pmi_df["#ab"].sum()
    df["k11"] = pmi_df["#ab"]
    df["k12"] = pmi_df["#a"] - df["k11"]
    df["k21"] = pmi_df["#b"] - df["k11"] 
    df["k22"] = sum_all - df["k11"]
    df["k"] = df[["k11", "k12","k21", "k22"]].values.tolist()
    df["k"] = df["k"].apply(lambda k: np.array(k).reshape((2,2)))
    df["llr"] = df["k"].apply(apply_llr)
    pmi_df["llr"] = df["llr"]
    return pmi_df

In [15]:
df = llr(df)

In [16]:
df.sort_values(["llr"], ascending=[0]).head(20)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,#ba,pmi,llr
757,"(mowa, w)",0.582862,28759,4.073245,200978,0.043871,28455,0,-3.991148,157224.378865
756,"(których, mowa)",0.362741,17898,0.582862,28759,0.021349,13847,0,-2.292897,117593.412969
755,"(o, których)",1.306135,64446,0.362741,17898,0.0214,13880,9,-3.097393,91131.148054
848,"(którym, mowa)",0.238422,11764,0.582862,28759,0.014132,9166,0,-2.285819,76059.977375
809,"(z, nr)",1.667761,82289,0.910521,44926,0.027642,17929,11152,-4.00616,74581.234477
236,"(dodaje, się)",0.17069,8422,0.891389,43982,0.012627,8190,26,-2.489039,68717.914736
900,"(do, spraw)",1.222452,60317,0.200523,9894,0.013393,8687,56,-2.90704,62033.352781
9372,"(nr, nr)",0.910521,44926,0.910521,44926,0.019978,12958,12958,-3.725646,60030.55639
847,"(o, którym)",1.306135,64446,0.238422,11764,0.014156,9182,9,-3.090952,59792.033924
401,"(na, podstawie)",1.02045,50350,0.136965,6758,0.010211,6623,15,-2.616503,53804.464479


### Trigrams

In [32]:
def trigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1], single_tokens[i+2]) for i in range(len(single_tokens) - 2)]

bill_docs_trigrams = [trigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [33]:
bill_docs_trigrams[0][:10]

[('tekst', 'ustawy', 'ustalony'),
 ('ustawy', 'ustalony', 'ostatecznie'),
 ('ustalony', 'ostatecznie', 'po'),
 ('ostatecznie', 'po', 'rozpatrzeniu'),
 ('po', 'rozpatrzeniu', 'poprawek'),
 ('rozpatrzeniu', 'poprawek', 'senatu'),
 ('poprawek', 'senatu', 'ustawa'),
 ('senatu', 'ustawa', 'z'),
 ('ustawa', 'z', 'dnia'),
 ('z', 'dnia', 'lipca')]

In [34]:
def pmi_df_trigram(filter_fn=None):
    p, pctr = probabilities(bill_docs_single_tokens, None)
    pm, pmctr = probabilities(bill_docs_bigrams, None)
    pt, ptctr = probabilities(bill_docs_trigrams, filter_fn)
    
    df = pd.DataFrame(data=[(k,) for k in pt.keys()], columns = ["trigram"])
    df["p(a)"] = df["trigram"].apply(lambda x: pm[(x[0], x[1])])
    df["#a"] = df["trigram"].apply(lambda x: pmctr[(x[0], x[1])])
    df["p(b)"] = df["trigram"].apply(lambda x: p[x[2]])
    df["#b"] = df["trigram"].apply(lambda x: pctr[x[2]])
    df["p(a,b)"] = df["trigram"].apply(lambda x: pt[x])
    df["#ab"] = df["trigram"].apply(lambda x: ptctr[x])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df.sort_values(["pmi"], ascending=[0])

In [35]:
df_trigram = pmi_df_trigram()
df_trigram_filtered = pmi_df_trigram(lambda _, v: v > 5)

In [36]:
df_trigram.sort_values(["pmi"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
684577,"(ładunek, części, frachtem)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
621640,"(katechetyczny, lub, kaplicę)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
892692,"(upinanie, welonów, żałobnych)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
1061039,"(to, niewielkie, odchylenia)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
892691,"(i, upinanie, welonów)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
787190,"(idei, miłości, bliźnich)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
621638,"(na, punkt, katechetyczny)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
320575,"(wzrost, niską, elastyczność)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
1396028,"(strategii, opinii, audytowej)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408
1249859,"(programów, pilotażowych, testujących)",2e-06,1,2e-05,1,7.159221e-07,1,10.039408


In [37]:
df_trigram_filtered.sort_values(["pmi"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi
34958,"(i, lambrekiny, okienne)",9e-06,6,0.000122,6,4e-06,6,8.247648
8312,"(praca, społecznie, użyteczna)",9e-06,6,0.000122,6,4e-06,6,8.247648
48893,"(wytwarzania, dwutlenku, tytanu)",9e-06,6,0.000122,6,4e-06,6,8.247648
14663,"(wyższą, szkołą, teologiczną)",9e-06,6,0.000122,6,4e-06,6,8.247648
15572,"(jako, czynnika, ułatwiającego)",9e-06,6,0.000122,6,4e-06,6,8.247648
38434,"(promieniowania, do, kev)",9e-06,6,0.000122,6,4e-06,6,8.247648
50349,"(gabinetem, ministrów, ukrainy)",9e-06,6,0.000122,6,4e-06,6,8.247648
15027,"(obrębie, terenów, kaplic)",9e-06,6,0.000122,6,4e-06,6,8.247648
7762,"(kościoła, chrześcijan, baptystów)",9e-06,6,0.000122,6,4e-06,6,8.247648
36712,"(wodzie, nazwę, anionu)",9e-06,6,0.000122,6,4e-06,6,8.247648


In [38]:
df_trigram = llr(df_trigram)

In [39]:
df_trigram.sort_values(["llr"], ascending=[0]).head(20)

Unnamed: 0,trigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi,llr
852,"(o, których, mowa)",0.0214,13880,0.582862,28759,0.009909,13841,-0.230112,136049.565555
969,"(o, którym, mowa)",0.014156,9182,0.582862,28759,0.00656,9163,-0.22937,88122.197985
12780,"(nr, z, nr)",0.017194,11152,0.910521,44926,0.007118,9943,-0.788114,78342.951179
853,"(których, mowa, w)",0.021349,13847,4.073245,200978,0.009874,13792,-2.175523,76209.23367
12786,"(nr, i, nr)",0.01551,10060,0.910521,44926,0.006222,8691,-0.819643,66903.419061
1338,"(właściwy, do, spraw)",0.007464,4841,0.200523,9894,0.003304,4615,0.791914,53447.037447
1467,"(o, której, mowa)",0.008523,5528,0.582862,28759,0.003938,5501,-0.232195,51897.995727
970,"(którym, mowa, w)",0.014132,9166,4.073245,200978,0.006547,9145,-2.173837,50528.963605
13275,"(nr, nr, nr)",0.019978,12958,0.910521,44926,0.005259,7346,-1.240922,45477.038471
13938,"(zastępuje, się, wyrazami)",0.007363,4776,0.063963,3156,0.002104,2939,1.496813,38608.581627


### Conclusion

##### Why do we have to filter the bigrams, rather than the token sequence?
Tokens sometimes appear very often but not in specific bigram. We are more error-prone when we filter bigrams, because phrases introduced by mistake are not counted. 

##### Which measure (PMI, PMI with filtering, LLR) works better for the bigrams and which for the trigrams?
The result that we expect matches results for LLR methods in both cases. We expect to see phrases that are really common. PMI checkes rather is the phrase always appear as is (next question)

##### What types of expressions are discovered by the methods.
LLR finds the expressions that frequently appear in the corpus. It is proven by the results, since phrases such as "o których mowa" are commonly used in the corpus. PMI treats the phrases locally and check how strong collocation is. The best score is for phrases that always appear in determined orded and not in different neighbourhood. Especially when there is only one occurence.