In [1]:
from nlp_common.acts_reader import ActsReader
import regex
import requests
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
reader = ActsReader('../ustawy')
bills = [ act[2] for act in reader.random_n_acts() ]

In [3]:
new_line_re = regex.compile(r'\n+|\s+|\t+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tagging

In [4]:
tagged_response = requests.post('http://localhost:9200', ' '.join(bills).encode(encoding='utf-8'))
tagged_response

<Response [200]>

### Bigrams

In [5]:
def create_unigrams(tagged_corpus):
    splitted = [ l for l in tagged_corpus.split('\n') if l != '']
    unigrams = []
    for i in range(0, len(splitted), 2):
        if splitted[i].startswith('\t') or not splitted[i+1].startswith('\t'):
            raise Exception("Wrong assumption")
            
        tagging =  splitted[i+1].split('\t')
        tags = tagging[2].split(':')
        unigrams.append(f'{tagging[1].lower()}:{tags[0]}')
    
    return unigrams
    
unigrams = create_unigrams(tagged_response.text)

In [6]:
def create_bigrams(unigrams):
    return [(unigrams[i], unigrams[i+1]) for i in range(len(unigrams)-1)]
    
bigrams = create_bigrams(unigrams)

### LLR

In [8]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(ngrams, filter_fn=None):
    ctrs = [Counter(doc) for doc in ngrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    probs = { k: global_counter[k]/l for k in global_counter 
            if filter_fn == None or filter_fn(k, global_counter[k])}
    global_counter = { k: global_counter[k] for k in global_counter
            if filter_fn == None or filter_fn(k, global_counter[k]) }
    return probs, global_counter

def probabilities_flat(counter_ngrams, map_key_fn):
    ctr = Counter()
    for k in counter_ngrams:
        mk = map_key_fn(k)
        ctr[mk] += counter_ngrams[k]
    l = len(ctr)
    probs = {
        k: ctr[k]/l for k in ctr
    }
    return probs, ctr

def pmi_df_bigram(bill_docs_bigrams, filter_fn=None):
    pm, pmctr = probabilities(bill_docs_bigrams, filter_fn)
    pa, pactr = probabilities_flat(pmctr, lambda x: x[0])
    pb, pbctr = probabilities_flat(pmctr, lambda x: x[1])

    df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
    df["p(a)"] = df["bigram"].apply(lambda x: pa[x[0]])
    df["#a"] = df["bigram"].apply(lambda x: pactr[x[0]])
    df["p(b)"] = df["bigram"].apply(lambda x: pb[x[1]])
    df["#b"] = df["bigram"].apply(lambda x: pbctr[x[1]])
    df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
    df["#ab"] = df["bigram"].apply(lambda x: pmctr[x])
    df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))
    return df#.sort_values(["pmi"], ascending=[0])

def H(a):
    a = np.array(a)
    N = a.sum()
    return np.sum(a/N * np.log(a/N + (a==0)))
                                  
def llr(pmi_df):
    def apply_llr(k):
        return 2*np.sum(k)*(H(k) - H(k.sum(axis=0)) - H(k.sum(axis=1)))
        
    df = pmi_df.copy()
    sum_all = pmi_df["#ab"].sum()
    df["k11"] = pmi_df["#ab"]
    df["k12"] = pmi_df["#a"] - df["k11"]
    df["k21"] = pmi_df["#b"] - df["k11"] 
    df["k22"] = sum_all - (pmi_df["#a"] + pmi_df["#b"] - pmi_df["#ab"])
    df["k"] = df[["k11", "k12","k21", "k22"]].values.tolist()
    df["k"] = df["k"].apply(lambda k: np.array(k).reshape((2,2)))
    df["llr"] = df["k"].apply(apply_llr)
    pmi_df["llr"] = df["llr"]
    return pmi_df.sort_values(["llr"], ascending=[0])

In [9]:
bigram_regex = regex.compile('\p{L}+:\p{L}+')
llrdf = llr(pmi_df_bigram([bigrams], lambda k,_: bigram_regex.match(k[0]) and bigram_regex.match(k[1])))

In [10]:
llrdf.head(30)

Unnamed: 0,bigram,p(a),#a,p(b),#b,"p(a,b)",#ab,pmi,llr
283,"(który:adj, mowa:subst)",0.19571,593,0.091027,281,0.012894,280,-0.323291,2415.200872
282,"(o:prep, który:adj)",0.240924,730,0.129576,400,0.013032,283,-0.873594,1822.215773
284,"(mowa:subst, w:prep)",0.092739,281,0.493035,1522,0.012894,280,-1.265877,1782.414334
285,"(w:prep, artykuł:brev)",0.70165,2126,0.158082,488,0.014551,316,-2.031102,1187.886174
10797,"(opieka:subst, zdrowotny:adj)",0.044554,135,0.048267,149,0.004927,107,0.829076,1132.874065
11472,"(dodawać:fin, się:qub)",0.033993,103,0.205701,635,0.004605,100,-0.417715,786.620863
5119,"(w:prep, ustęp:brev)",0.70165,2126,0.106576,329,0.009624,209,-2.050253,763.476072
19,"(stosować:fin, się:qub)",0.031023,94,0.205701,635,0.004237,92,-0.409663,727.999143
2052,"(i:conj, numer:brev)",0.262706,796,0.028831,89,0.004052,88,-0.62543,660.810657
11478,"(otrzymywać:fin, brzmienie:subst)",0.020462,62,0.04438,137,0.002717,59,1.09589,657.395126


## Partition

In [11]:
def syntactic_category(bigram):
    return (bigram[0].split(':')[1],bigram[1].split(':')[1])

def syntactic_partition(llrdf):
    partitions = {}
    for bigram, llr in llrdf[["bigram", "llr"]].values:
        cat = syntactic_category(bigram)
        if not cat in partitions:
            partitions[cat] = [(bigram, llr)]
        else:
            partitions[cat].append((bigram, llr))
    return partitions    

In [12]:
partitions = syntactic_partition(llrdf)

In [13]:
pdf = pd.DataFrame(data=[(k, len(partitions[k])) for k in partitions.keys()], columns = ["category", "#"])
pdf = pdf.sort_values(["#"], ascending=[0])
pdf = pdf[:10]
pdf["best 5 llr"] = pdf["category"].apply(lambda x: sorted(partitions[x], key=lambda d: -d[1])[:5] )

In [14]:
from pandas import option_context

with option_context('display.max_colwidth', 400):
    display(pdf.head())

Unnamed: 0,category,#,best 5 llr
11,"(subst, subst)",1442,"[((droga:subst, rozporządzenie:subst), 546.7513389034491), ((płatnik:subst, składka:subst), 537.7940288058142), ((związek:subst, gmina:subst), 387.92512883716205), ((skarb:subst, państwo:subst), 330.3736377333196), ((zakład:subst, opieka:subst), 317.0532384562326)]"
0,"(adj, subst)",1019,"[((który:adj, mowa:subst), 2415.200871823244), ((grupowy:adj, praktyka:subst), 514.1174400936494), ((podstawowy:adj, opieka:subst), 458.31065956436), ((polski:adj, przynależność:subst), 418.42632058236785), ((okręgowy:adj, rada:subst), 323.5757882586543)]"
4,"(subst, adj)",999,"[((opieka:subst, zdrowotny:adj), 1132.874065352449), ((świadectwo:subst, rekompensacyjny:adj), 594.3206319425778), ((awaria:subst, wspólny:adj), 524.5963065982914), ((rzeczpospolita:subst, polski:adj), 518.1912698446247), ((praktyka:subst, lekarski:adj), 505.00428904515)]"
9,"(prep, subst)",993,"[((od:prep, dzień:subst), 562.4036099169556), ((na:prep, podstawa:subst), 501.4163523170286), ((z:prep, dzień:subst), 372.7101003968491), ((z:prep, tytuł:subst), 327.1681528967661), ((za:prep, szkoda:subst), 315.5400836580972)]"
2,"(subst, prep)",640,"[((mowa:subst, w:prep), 1782.4143338423585), ((ustawa:subst, z:prep), 399.9912023183502), ((konwencja:subst, o:prep), 272.0342996204063), ((wpis:subst, do:prep), 183.23443003455202), ((miesiąc:subst, od:prep), 156.79306743018887)]"
