In [95]:
import spacy
from nlp_common.acts_reader import ActsReader
import regex
from spacy.tokenizer import Tokenizer
import numpy as np
from collections import Counter
import math
import pandas as pd

In [85]:
acts_reader = ActsReader('../ustawy')
bills = [ text for _, _, text in acts_reader.all_acts()]

In [86]:
new_line_re = regex.compile(r'\n+|\s+')
bills = [new_line_re.sub(' ', bill) for bill in bills]

### Tokenization

In [87]:
nlp = spacy.load("pl_core_news_sm")
prefix_re = regex.compile(r'''^[\[\("'']+''')
suffix_re = regex.compile(r'''[\]\)"']+$''')
nlp.tokenizer = Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search)

In [88]:
bill_docs_tokenized = list(nlp.tokenizer.pipe(bills))

### Bigrams

In [120]:
def single_tokens(tokens):
    return [ str(token).lower() for token in tokens ]

def bigrams(single_tokens):
    return [(single_tokens[i], single_tokens[i+1]) for i in range(len(single_tokens) - 1)]

bill_docs_single_tokens = [single_tokens(doc_tokens) for doc_tokens in bill_docs_tokenized]
bill_docs_bigrams = [bigrams(doc_tokens) for doc_tokens in bill_docs_single_tokens]

In [121]:
bill_docs_bigrams[0][:10]

[(' ', 'tekst'),
 ('tekst', 'ustawy'),
 ('ustawy', 'ustalony'),
 ('ustalony', 'ostatecznie'),
 ('ostatecznie', 'po'),
 ('po', 'rozpatrzeniu'),
 ('rozpatrzeniu', 'poprawek'),
 ('poprawek', 'senatu'),
 ('senatu', ' '),
 (' ', 'ustawa')]

In [122]:
filter_bigrams_regex = regex.compile(r'^\p{L}+$')
def filter_bigrams(doc_bigrams):
    return [b for b in doc_bigrams 
            if filter_bigrams_regex.match(b[0]) 
            and filter_bigrams_regex.match(b[1])]

bill_docs_bigrams = [filter_bigrams(b) for b in bill_docs_bigrams]

In [123]:
bill_docs_bigrams[0][:10]

[('tekst', 'ustawy'),
 ('ustawy', 'ustalony'),
 ('ustalony', 'ostatecznie'),
 ('ostatecznie', 'po'),
 ('po', 'rozpatrzeniu'),
 ('rozpatrzeniu', 'poprawek'),
 ('poprawek', 'senatu'),
 ('ustawa', 'z'),
 ('z', 'dnia'),
 ('o', 'zmianie')]

### PMI

In [124]:
def compute_global_counter(ctrs): 
    global_counter = Counter()
    for ctr in ctrs:
        global_counter += ctr
    return global_counter

def probabilities(bigrams):
    ctrs = [Counter(doc) for doc in bigrams]
    global_counter = compute_global_counter(ctrs)
    l = len(global_counter)
    return { k: global_counter[k]/l for k in global_counter}, global_counter

p, pctr = probabilities(bill_docs_single_tokens)
pm, pmctr = probabilities(bill_docs_bigrams)

In [128]:
    #return { b, math.log(pm[b]/(p[b[0]]*p[b[1]]))) for b in pm }
df = pd.DataFrame(data=[(k,) for k in pm.keys()], columns = ["bigram"])
df["p(a)"] = df["bigram"].apply(lambda x: p[x[0]])
df["p(b)"] = df["bigram"].apply(lambda x: p[x[1]])
df["p(a,b)"] = df["bigram"].apply(lambda x: pm[x])
df["pmi"] = np.log(df["p(a,b)"]/ (df["p(a)"]*df["p(b)"]))

In [135]:
df = df.sort_values(["pmi"], ascending=[0])

In [136]:
df

Unnamed: 0,bigram,p(a),p(b),"p(a,b)",pmi
555,"(mowa, w)",0.232942,1.627879,0.061650,-1.816598
554,"(których, mowa)",0.144970,0.232942,0.029295,-0.142140
553,"(o, których)",0.521999,0.144970,0.029137,-0.954430
8,"(z, dnia)",0.666524,0.142751,0.020688,-1.525888
616,"(którym, mowa)",0.095286,0.232942,0.019570,-0.125938
...,...,...,...,...,...
438174,"(liczba, stałych)",0.002511,0.000850,0.000002,0.028105
269028,"(stosując, dotychczasowe)",0.000931,0.002292,0.000002,0.028259
190720,"(zasadach, przejrzystości)",0.014644,0.000146,0.000002,0.028289
298564,"(wszelkie, przedmioty)",0.001831,0.001166,0.000002,0.028289
