In [41]:
from nlp_common.acts_reader import ActsReader
import regex
import requests
import pandas as pd
from collections import Counter
import numpy as np
import os.path

In [42]:
reader = ActsReader('../ustawy')
bills = [ act[2] for act in reader.all_acts() ]

In [43]:
new_line_re = regex.compile(r'\n+|\s+|\t+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tagging

In [None]:
def get_tags():
    file_name = "tags_response.txt"
    if os.path.exists(file_name):
        print("Reading cached response")
        with open(file_name, "r") as f:
            return f.read()
    else:
        with open(file_name, "w") as f:
            for i, bill in enumerate(bills):
                tagged_response = requests.post('http://localhost:9200', bill.encode(encoding='utf-8'))
                f.write(tagged_response.text)
                print(f"Processed: {i+1}/{len(bills)}")
        return get_tags()

tagged_response = get_tags()

Processed: 1/1179
Processed: 2/1179
Processed: 3/1179
Processed: 4/1179
Processed: 5/1179
Processed: 6/1179
Processed: 7/1179
Processed: 8/1179
Processed: 9/1179
Processed: 10/1179
Processed: 11/1179
Processed: 12/1179
Processed: 13/1179
Processed: 14/1179
Processed: 15/1179
Processed: 16/1179
Processed: 17/1179
Processed: 18/1179
Processed: 19/1179
Processed: 20/1179
Processed: 21/1179
Processed: 22/1179
Processed: 23/1179
Processed: 24/1179
Processed: 25/1179
Processed: 26/1179
Processed: 27/1179
Processed: 28/1179
Processed: 29/1179
Processed: 30/1179
Processed: 31/1179
Processed: 32/1179
Processed: 33/1179
Processed: 34/1179
Processed: 35/1179
Processed: 36/1179
Processed: 37/1179
Processed: 38/1179
Processed: 39/1179
Processed: 40/1179
Processed: 41/1179
Processed: 42/1179
Processed: 43/1179
Processed: 44/1179
Processed: 45/1179
Processed: 46/1179
Processed: 47/1179
Processed: 48/1179
Processed: 49/1179
Processed: 50/1179
Processed: 51/1179
Processed: 52/1179
Processed: 53/1179
Pr

### Bigrams

In [32]:
def create_unigrams(tagged_corpus):
    splitted = [ l for l in tagged_corpus.split('\n') if l != '']
    unigrams = []
    for i in range(0, len(splitted), 2):
        if splitted[i].startswith('\t') or not splitted[i+1].startswith('\t'):
            raise Exception("Wrong assumption")
            
        tagging =  splitted[i+1].split('\t')
        tags = tagging[2].split(':')
        unigrams.append(f'{tagging[1].lower()}:{tags[0]}')
    
    return unigrams
    
unigrams = create_unigrams(tagged_response)

In [33]:
def create_bigrams(unigrams):
    return [(unigrams[i], unigrams[i+1]) for i in range(len(unigrams)-1)]
    
bigrams = create_bigrams(unigrams)

### LLR

In [34]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(ngrams, filter_fn=None):
    ctrs = [Counter(doc) for doc in ngrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    probs = { k: global_counter[k]/l for k in global_counter 
            if filter_fn == None or filter_fn(k, global_counter[k])}
    global_counter = { k: global_counter[k] for k in global_counter
            if filter_fn == None or filter_fn(k, global_counter[k]) }
    return probs, global_counter

def probabilities_flat(counter_ngrams, map_key_fn):
    ctr = Counter()
    for k in counter_ngrams:
        mk = map_key_fn(k)
        ctr[mk] += counter_ngrams[k]
    l = len(ctr)
    probs = {
        k: ctr[k]/l for k in ctr
    }
    return probs, ctr

def pmi_df_bigram(bill_docs_bigrams, filter_fn=None):
    pm, pmctr = probabilities(bill_docs_bigrams, filter_fn)
    pa, pactr = probabilities_flat(pmctr, lambda x: x[0])
    pb, pbctr = probabilities_flat(pmctr, lambda x: x[1])

    df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
    df["p(a)"] = df["bigram"].apply(lambda x: pa[x[0]])
    df["#a"] = df["bigram"].apply(lambda x: pactr[x[0]])
    df["p(b)"] = df["bigram"].apply(lambda x: pb[x[1]])
    df["#b"] = df["bigram"].apply(lambda x: pbctr[x[1]])
    df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
    df["#ab"] = df["bigram"].apply(lambda x: pmctr[x])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df#.sort_values(["pmi"], ascending=[0])

def H(a):
    a = np.array(a)
    N = a.sum()
    return np.sum(a/N * np.log(a/N + (a==0)))
                                  
def llr(pmi_df):
    def apply_llr(k):
        return 2*np.sum(k)*(H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))
        
    df = pmi_df.copy()
    sum_all = pmi_df["#ab"].sum()
    df["k11"] = pmi_df["#ab"]
    df["k12"] = pmi_df["#a"] - df["k11"]
    df["k21"] = pmi_df["#b"] - df["k11"] 
    df["k22"] = sum_all - (pmi_df["#a"] + pmi_df["#b"] - pmi_df["#ab"])
    df["k"] = df[["k11", "k12","k21", "k22"]].values.tolist()
    df["k"] = df["k"].apply(lambda k: np.array(k).reshape((2,2)))
    df["llr"] = df["k"].apply(apply_llr)
    pmi_df["llr"] = df["llr"]
    return pmi_df.sort_values(["llr"], ascending=[0])

In [35]:
bigram_regex = regex.compile('\p{L}+:\p{L}+')
llrdf = llr(pmi_df_bigram([bigrams], lambda k,_: bigram_regex.match(k[0]) and bigram_regex.match(k[1])))

In [36]:
llrdf.head(30)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi,llr
963,"(który:adj, mowa:subst)",0.259129,809,0.15527,495,0.021599,490,-0.622076,4176.194744
964,"(mowa:subst, w:prep)",0.158552,495,0.634253,2022,0.021599,490,-1.53812,3022.20493
962,"(o:prep, który:adj)",0.419923,1311,0.196361,626,0.021599,490,-1.339616,2934.404715
2336,"(otrzymywać:fin, brzmienie:subst)",0.069827,218,0.103513,330,0.009477,215,0.270925,2230.331855
93,"(rzeczpospolita:subst, polski:adj)",0.058937,184,0.07591,242,0.007846,178,0.561785,1960.915092
27,"(w:prep, artykuł:brev)",0.942985,2944,0.223965,714,0.021423,486,-2.288324,1777.929802
92,"(terytorium:subst, rzeczpospolita:subst)",0.044523,139,0.057716,184,0.005907,134,0.832305,1544.38126
60,"(świadczenie:subst, rodzinny:adj)",0.085842,268,0.074028,236,0.007097,161,0.11046,1451.456891
28,"(w:prep, ustęp:brev)",0.942985,2944,0.151819,484,0.015208,345,-2.24219,1297.638341
2280,"(i:conj, numer:brev)",0.318706,995,0.053952,172,0.007538,171,-0.824699,1291.627326


## Partition

In [37]:
def syntactic_category(bigram):
    return (bigram[0].split(':')[1],bigram[1].split(':')[1])

def syntactic_partition(llrdf):
    partitions = {}
    for bigram, llr in llrdf[["bigram", "llr"]].values:
        cat = syntactic_category(bigram)
        if not cat in partitions:
            partitions[cat] = [(bigram, llr)]
        else:
            partitions[cat].append((bigram, llr))
    return partitions    

In [38]:
partitions = syntactic_partition(llrdf)

In [39]:
pdf = pd.DataFrame(data=[(k, len(partitions[k])) for k in partitions.keys()], columns = ["category", "#"])
pdf = pdf.sort_values(["#"], ascending=[0])
pdf = pdf[:10]
pdf["best 5 llr"] = pdf["category"].apply(lambda x: sorted(partitions[x], key=lambda d: -d[1])[:5] )

In [40]:
from pandas import option_context

with option_context('display.max_colwidth', 400):
    display(pdf.head())

Unnamed: 0,category,#,best 5 llr
6,"(subst, subst)",1541,"[((terytorium:subst, rzeczpospolita:subst), 1544.3812601484437), ((status:subst, uchodźca:subst), 1175.0877837785877), ((dokument:subst, podróż:subst), 767.1722817387937), ((droga:subst, rozporządzenie:subst), 613.0513910534687), ((podstawa:subst, przepis:subst), 342.18168165385345)]"
4,"(subst, adj)",1007,"[((rzeczpospolita:subst, polski:adj), 1960.9150920966129), ((świadczenie:subst, rodzinny:adj), 1451.4568909773425), ((minister:subst, właściwy:adj), 871.6988031002379), ((wypłata:subst, transferowy:adj), 755.0125516581635), ((instytucja:subst, finansowy:adj), 719.3489125339626)]"
0,"(adj, subst)",983,"[((który:adj, mowa:subst), 4176.194743892795), ((następujący:adj, zmiana:subst), 372.35825296576496), ((indywidualny:adj, konto:subst), 302.3365321875082), ((mieszkaniowy:adj, zasób:subst), 282.5808150482702), ((niniejszy:adj, ustawa:subst), 264.0097696649647)]"
9,"(prep, subst)",971,"[((na:prep, podstawa:subst), 958.9465847264664), ((do:prep, sprawa:subst), 588.8339459006428), ((na:prep, terytorium:subst), 558.9563395730723), ((z:prep, dzień:subst), 548.7827148778163), ((na:prep, czas:subst), 500.2977538840747)]"
1,"(subst, prep)",712,"[((mowa:subst, w:prep), 3022.2049302736445), ((ustawa:subst, z:prep), 534.2000893071788), ((przepis:subst, o:prep), 529.2354205308364), ((zezwolenie:subst, na:prep), 509.160041073768), ((prawo:subst, do:prep), 465.97152364458344)]"


### Questions

##### What types of bigrams have been found?
Mostly pairs of type (noun,noun) and many of them are case governments (związek rządu). The phrases appear more often right because of lematization and all different forms are counted together.

##### Which of the category-pairs indicate valuable multiword expressions? Do they have anything in common?
The ones with bigger counts?

##### Which signal: LLR score or syntactic category is more useful for determining genuine multiword expressions?
It definitely depends on the problem that we solve and what we want to find. They will we applicable for different problems. They both works well together as the last task have shown.

##### Can you describe a different use-case where the morphosyntactic category is useful for resolving a real-world problem?
By knowing the rules what follows the words usually, we are able to for examoke build sentences that are answers for questions.