# Multiword expressions identification and extraction

In [1]:
import spacy
import utils

2022-11-25 10:51:31.349018: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Use SpaCy tokenizer API to tokenize the text from the law corpus

In [2]:
nlp = spacy.load("pl_core_news_sm")
tokenizer = nlp.tokenizer

In [3]:
documents = utils.read_normalized_documents()
text = ' '.join(documents.values())

In [4]:
tokens = [t.orth_ for t in tokenizer(text)]

# Compute bigram counts of downcased tokens

In [12]:
from collections import Counter, namedtuple

In [6]:
NGram = namedtuple("NGram", ["tokens", "freq", "pmi"], defaults=(None, None, None))

In [25]:
def create_bi_grams(tokens):
    return [NGram(*tup) for tup in Counter([ngram for ngram in zip(tokens[:-1], tokens[1:])]).items()]


def create_tri_grams(tokens):
    return [NGram(*tup) for tup in Counter([ngram for ngram in zip(tokens[:-2], tokens[1:-1], tokens[2:])]).items()]

In [26]:
bi_grams = create_bi_grams(tokens)

# Discard bigrams containing characters other than letters

In [27]:
bi_grams_filt = [bi_gram for bi_gram in bi_grams if all(t.isalpha() for t in bi_gram.tokens)]
bi_grams_filt[:5]

[NGram(tokens=('tekst', 'ustawy'), freq=300, pmi=None),
 NGram(tokens=('ustawy', 'ustalony'), freq=82, pmi=None),
 NGram(tokens=('ustalony', 'ostatecznie'), freq=82, pmi=None),
 NGram(tokens=('ostatecznie', 'po'), freq=82, pmi=None),
 NGram(tokens=('po', 'rozpatrzeniu'), freq=115, pmi=None)]

# Use pointwise mutual information to compute the measure for all pairs of words

In [31]:
import math

In [32]:
TOKENS_FREQ = {
    t: cnt
    for t, cnt in Counter(tokens).items()
}

In [33]:
def calculate_pmi(n_gram, tokens_freq):
    tokens, freq, _ = n_gram
    word_freqs = [tokens_freq[t] / len(tokens_freq) for t in tokens]
    pmi = math.log2((freq / len(tokens_freq)) / math.prod(word_freqs))
    return NGram(tokens, freq, pmi)

In [34]:
bi_grams_pmi = [calculate_pmi(bi_gram, TOKENS_FREQ) for bi_gram in bi_grams_filt]
bi_grams_pmi[:5]

[NGram(tokens=('tekst', 'ustawy'), freq=300, pmi=2.1676788753673106),
 NGram(tokens=('ustawy', 'ustalony'), freq=82, pmi=0.8835689036235815),
 NGram(tokens=('ustalony', 'ostatecznie'), freq=82, pmi=7.874237434156478),
 NGram(tokens=('ostatecznie', 'po'), freq=82, pmi=1.923445213338051),
 NGram(tokens=('po', 'rozpatrzeniu'), freq=115, pmi=1.8404959441549096)]

# Determine top 10 entries

In [35]:
bi_grams_sort = sorted(bi_grams_pmi, key=lambda x: x.pmi, reverse=True)
bi_grams_sort[:10]

[NGram(tokens=('rezerwuaru', 'pierwiastka'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('wydobywane', 'lokalnie'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('objaśnieniem', 'figur'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('wkładzie', 'wnoszonym'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('doktorem', 'habilitowanym'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('przedemery', 'talne'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('organa', 'uchwałodawcze'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('kropki', 'wstawić'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('królowa', 'luiza'), freq=1, pmi=15.977973016322787),
 NGram(tokens=('protoplast', 'mieszańcowy'), freq=1, pmi=15.977973016322787)]

# Filter bigrams with number of occurrences lower than 5

In [36]:
bi_grams_sort_filt = [bi_gram for bi_gram in bi_grams_sort if bi_gram.freq > 4]
bi_grams_sort_filt[:10]

[NGram(tokens=('zaszkodzić', 'wynikom'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('świeckie', 'przygotowujące'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('grzegorz', 'schetyna'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('stajnią', 'wyścigową'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('teryto', 'rialnego'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('ręcznego', 'miotacza'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('młyny', 'kulowe'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('młynki', 'młotkowe'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('najnowszych', 'zdobyczy'), freq=5, pmi=13.656044921435424),
 NGram(tokens=('otworami', 'wiertniczymi'), freq=5, pmi=13.656044921435424)]

# Use KRNNT or Clarin-PL to tag and lemmatize the corpus

In [43]:
import requests
import time

In [44]:
CLARIN_URL = "https://ws.clarin-pl.eu/nlprest2/base/"
INPUT_FILE = "ustawy.zip"
OUTPUT_FILE = "analysis.zip"
OUTPUT_PATH = "ustawy_analyzed"

### Compress the corpus

In [None]:
!zip -q ustawy.zip ustawy/*

### Upload the corpus

In [57]:
with open(INPUT_FILE, "rb") as f:
    data = f.read()
res1 = requests.post(f"{CLARIN_URL}/upload",
                     data=data,
                     headers={'Content-Type': "binary/octet-stream"})
res1.raise_for_status()
res1.text

'/users/default/bfb560bc-87ad-4c0e-b904-dc74bb5b4fd5'

In [58]:
file_id = res1.text

### Start analysis

In [73]:
res2 = requests.post(f"{CLARIN_URL}/startTask",
                     json={
                         "lpmn": f'filezip({file_id})|any2txt|wcrft2({{"guesser":false, "morfeusz2":true}})|dir|makezip',
                         "user": "adamksiezyk@student.agh.edu.pl"
                     })
res2.raise_for_status()
res2.text

'cd6b1523-1cc1-4444-abe7-e00e66b95cf6'

In [74]:
task_id = res2.text

### Check status

In [78]:
while True:
    res3 = requests.get(f"{CLARIN_URL}/getStatus/{task_id}")
    res3.raise_for_status()
    res3_json = res3.json()
    print(res3_json)
    if res3_json['status'] == "ERROR":
        raise RuntimeError(res3_json)
    if res3_json['status'] == "DONE":
        break
    time.sleep(5)
    

{'status': 'PROCESSING', 'value': 0.7620865139949109}
{'status': 'PROCESSING', 'value': 0.8748939779474131}
{'status': 'PROCESSING', 'value': 0.9584393553859203}
{'status': 'PROCESSING', 'value': 0.9991518235793045}
{'status': 'PROCESSING', 'value': 0.9991518235793045}
{'status': 'PROCESSING', 'value': 0.9991518235793045}
{'status': 'PROCESSING', 'value': 0.9991518235793045}
{'status': 'DONE', 'value': [{'name': 'dane', 'fileID': '/requests/makezip/97d36baf-01d3-4990-a550-5ec3479e1c47'}]}


In [79]:
file_id = res3_json['value'][0]['fileID']

### Download result

In [80]:
with requests.get(f"{CLARIN_URL}/download{file_id}") as r, open(OUTPUT_FILE, 'wb') as f:
    r.raise_for_status()
    for chunk in r.iter_content(1024):
        f.write(chunk)

In [20]:
!mkdir ustawy_analyzed
!unzip -q analysis -d ustawy_analyzed

### Create corpus

In [38]:
from xml.etree import ElementTree as ET

In [45]:
documents_analyzed = utils.read_documents(OUTPUT_PATH)

In [66]:
def parse_analysis(xml_str):
    tags = []
    root = ET.fromstring(xml_str)
    for lex in root.findall("./chunk/sentence/tok"):
        base = lex.find("lex/base").text
        ctag = lex.find("lex/ctag").text.split(':')[0]
        tags.append(f"{base}:{ctag}")
    return tags

In [67]:
tags = []
for doc in documents_analyzed.values():
    tags.extend(parse_analysis(doc))

tags[:10]

['Dzieje_(Apostolskie):brev',
 '.:interp',
 'u:prep',
 '.:interp',
 'z:prep',
 '2000:num',
 'r:ign',
 '.:interp',
 'nr:subst',
 '48:num']

# Using the tagged corpus compute bigram statistic

In [68]:
tags_bi_grams = create_bi_grams(tags)

In [69]:
tags_bi_grams_filt = [bi_gram for bi_gram in tags_bi_grams if all(t[::-1].replace(':', '', 1).isalpha() for t in bi_gram.tokens)]
tags_bi_grams_filt[:5]

[NGram(tokens=('ustawa:subst', 'z:prep'), freq=8625, pmi=None),
 NGram(tokens=('z:prep', 'dzień:subst'), freq=11360, pmi=None),
 NGram(tokens=('o:prep', 'zmiana:subst'), freq=1409, pmi=None),
 NGram(tokens=('zmiana:subst', 'ustawa:subst'), freq=908, pmi=None),
 NGram(tokens=('ustawa:subst', 'o:prep'), freq=1668, pmi=None)]

# Compute the same statistics as for the non-lemmatized words (i.e. PMI) and print top-10 entries with at least 5 occurrences

In [70]:
TAGS_FREQ = {
    t: cnt
    for t, cnt in Counter(tags).items()
}

In [71]:
tags_bi_grams_pmi = [calculate_pmi(bi_gram, TAGS_FREQ) for bi_gram in tags_bi_grams_filt if bi_gram.freq > 4]
tags_bi_grams_sort = sorted(tags_bi_grams_pmi, key=lambda x: x.pmi, reverse=True)
tags_bi_grams_sort[:10]

[NGram(tokens=('Grzegorz:subst', 'Schetyna:ign'), freq=5, pmi=12.501439145158873),
 NGram(tokens=('młynek:subst', 'młotkowy:adj'), freq=5, pmi=12.501439145158873),
 NGram(tokens=('teryto:ign', 'rialnego:ign'), freq=5, pmi=12.501439145158873),
 NGram(tokens=('odpowiedzieć:fin', 'dzialności:ign'), freq=5, pmi=12.238404739325079),
 NGram(tokens=('pasta:subst', 'emulsyjny:adj'), freq=5, pmi=12.238404739325079),
 NGram(tokens=('Adam:subst', 'Mickiewicz:subst'), freq=6, pmi=12.238404739325079),
 NGram(tokens=('chrom:subst', 'sześciowartościowy:adj'), freq=5, pmi=12.238404739325079),
 NGram(tokens=('łańcuchowa:subst', 'rozszczepienie:subst'), freq=5, pmi=12.238404739325079),
 NGram(tokens=('młyn:subst', 'kulowy:adj'), freq=5, pmi=12.016012317988631),
 NGram(tokens=('Piotrek:subst', 'trybunalski:adj'), freq=7, pmi=12.016012317988631)]

# Compute trigram counts for both corpora and perform the same filtering

### Trigrams for tokenized corpus

In [72]:
tokens_tri_grams = create_tri_grams(tokens)
tokens_tri_grams_filt = [tri_gram for tri_gram in tokens_tri_grams if all(t.isalpha() for t in tri_gram.tokens)]
tokens_tri_grams_filt[:5]

[NGram(tokens=('tekst', 'ustawy', 'ustalony'), freq=82, pmi=None),
 NGram(tokens=('ustawy', 'ustalony', 'ostatecznie'), freq=82, pmi=None),
 NGram(tokens=('ustalony', 'ostatecznie', 'po'), freq=82, pmi=None),
 NGram(tokens=('ostatecznie', 'po', 'rozpatrzeniu'), freq=81, pmi=None),
 NGram(tokens=('po', 'rozpatrzeniu', 'poprawek'), freq=81, pmi=None)]

### Trigrams for tagged corpus

In [73]:
tags_tri_grams = create_tri_grams(tags)
tags_tri_grams_filt = [tri_gram for tri_gram in tags_tri_grams if all(t[::-1].replace(':', '', 1).isalpha() for t in tri_gram.tokens)]
tags_tri_grams_filt[:5]

[NGram(tokens=('ustawa:subst', 'z:prep', 'dzień:subst'), freq=8589, pmi=None),
 NGram(tokens=('o:prep', 'zmiana:subst', 'ustawa:subst'), freq=866, pmi=None),
 NGram(tokens=('zmiana:subst', 'ustawa:subst', 'o:prep'), freq=647, pmi=None),
 NGram(tokens=('ustawa:subst', 'o:prep', 'utworzyć:ger'), freq=12, pmi=None),
 NGram(tokens=('o:prep', 'utworzyć:ger', 'agencja:subst'), freq=34, pmi=None)]

# Use PMI (with 5 occurrence threshold) to compute top 10 results for the trigrams

### PMI for tokenized trigrams

In [59]:
tokens_tri_grams_pmi = [calculate_pmi(tri_gram, TOKENS_FREQ) for tri_gram in tokens_tri_grams_filt if tri_gram.freq > 4]
tokens_tri_grams_sort = sorted(tokens_tri_grams_pmi, key=lambda x: x.pmi, reverse=True)
tokens_tri_grams_sort[:10]

[NGram(tokens=('finałowego', 'turnieju', 'mistrzostw'), freq=10, pmi=24.796074695867183),
 NGram(tokens=('profilem', 'zaufanym', 'epuap'), freq=13, pmi=24.55506659636339),
 NGram(tokens=('cienką', 'sierścią', 'zwierzęcą'), freq=11, pmi=24.496514414008274),
 NGram(tokens=('przedwczesnego', 'wyrębu', 'drzewostanu'), freq=7, pmi=24.370983531924416),
 NGram(tokens=('centralnemu', 'biuru', 'antykorupcyjnemu'), freq=5, pmi=24.104196991229514),
 NGram(tokens=('turnieju', 'mistrzostw', 'europy'), freq=10, pmi=24.030539949504206),
 NGram(tokens=('potwierdzonym', 'profilem', 'zaufanym'), freq=13, pmi=24.007578801060895),
 NGram(tokens=('szybkiemu', 'postępowi', 'technicznemu'), freq=9, pmi=23.879130435594742),
 NGram(tokens=('piłce', 'nożnej', 'uefa'), freq=10, pmi=23.86002161264704),
 NGram(tokens=('wypalonym', 'paliwem', 'jądrowym'), freq=8, pmi=23.656738014258295)]

### PMI for tagged trigrams

In [74]:
tags_tri_grams_pmi = [calculate_pmi(tri_gram, TAGS_FREQ) for tri_gram in tags_tri_grams_filt if tri_gram.freq > 4]
tags_tri_grams_sort = sorted(tags_tri_grams_pmi, key=lambda x: x.pmi, reverse=True)
tags_tri_grams_sort[:10]

[NGram(tokens=('porcelanowy:adj', 'młyn:subst', 'kulowy:adj'), freq=5, pmi=23.517451463147502),
 NGram(tokens=('wymiennik:subst', 'przeponowy:adj', 'rurowy:adj'), freq=7, pmi=22.64673448009247),
 NGram(tokens=('reakcja:subst', 'łańcuchowa:subst', 'rozszczepienie:subst'), freq=5, pmi=21.81384446592773),
 NGram(tokens=('piłka:subst', 'nożny:adj', 'UEFA:subst'), freq=10, pmi=20.887845047371506),
 NGram(tokens=('Stany:subst', 'zjednoczyć:ppas', 'Ameryka:subst'), freq=6, pmi=20.64673448009247),
 NGram(tokens=('finałowy:adj', 'turniej:subst', 'mistrzostwa:subst'), freq=10, pmi=20.605075328455257),
 NGram(tokens=('przedwczesny:adj', 'wyrąb:subst', 'drzewostan:subst'), freq=9, pmi=20.432415359291703),
 NGram(tokens=('kurtka:subst', 'anorak:subst', 'etc:ign'), freq=8, pmi=20.289182475474387),
 NGram(tokens=('mecz:subst', 'piłka:subst', 'nożny:adj'), freq=5, pmi=20.1508794532053),
 NGram(tokens=('profil:subst', 'zaufany:adj', 'ePUAP:ign'), freq=13, pmi=20.063651712589536)]

# Create a table comparing the results for copora without and with tagging and lemmatization

In [61]:
import pandas as pd

### Bigrams comparison

In [77]:
index = pd.MultiIndex.from_tuples([
    ('tokens', 't1'),
    ('tokens', 't2'),
    ('tokens', 'freq'),
    ('tokens', 'pmi'),
    ('tags', 't1'),
    ('tags', 't2'),
    ('tags', 'freq'),
    ('tags', 'pmi'),
], names=["corpus", "field"])
df = pd.DataFrame([(*big_tokens.tokens, big_tokens.freq, big_tokens.pmi, *big_tagas.tokens, big_tagas.freq, big_tagas.pmi)
    for big_tokens, big_tagas in zip(bi_grams_sort_filt[:10], tags_bi_grams_sort[:10])], columns=index)
df

corpus,tokens,tokens,tokens,tokens,tags,tags,tags,tags
field,t1,t2,freq,pmi,t1,t2,freq,pmi
0,zaszkodzić,wynikom,5,13.656045,Grzegorz:subst,Schetyna:ign,5,12.501439
1,świeckie,przygotowujące,5,13.656045,młynek:subst,młotkowy:adj,5,12.501439
2,grzegorz,schetyna,5,13.656045,teryto:ign,rialnego:ign,5,12.501439
3,stajnią,wyścigową,5,13.656045,odpowiedzieć:fin,dzialności:ign,5,12.238405
4,teryto,rialnego,5,13.656045,pasta:subst,emulsyjny:adj,5,12.238405
5,ręcznego,miotacza,5,13.656045,Adam:subst,Mickiewicz:subst,6,12.238405
6,młyny,kulowe,5,13.656045,chrom:subst,sześciowartościowy:adj,5,12.238405
7,młynki,młotkowe,5,13.656045,łańcuchowa:subst,rozszczepienie:subst,5,12.238405
8,najnowszych,zdobyczy,5,13.656045,młyn:subst,kulowy:adj,5,12.016012
9,otworami,wiertniczymi,5,13.656045,Piotrek:subst,trybunalski:adj,7,12.016012


### Trigrams comparison

In [75]:
index = pd.MultiIndex.from_tuples([
    ('tokens', 't1'),
    ('tokens', 't2'),
    ('tokens', 't3'),
    ('tokens', 'freq'),
    ('tokens', 'pmi'),
    ('tags', 't1'),
    ('tags', 't2'),
    ('tags', 't3'),
    ('tags', 'freq'),
    ('tags', 'pmi'),
], names=["corpus", "field"])
df = pd.DataFrame([(*trig_tokens.tokens, trig_tokens.freq, trig_tokens.pmi, *trig_tagas.tokens, trig_tagas.freq, trig_tagas.pmi)
    for trig_tokens, trig_tagas in zip(tokens_tri_grams_sort[:10], tags_tri_grams_sort[:10])], columns=index)
df

corpus,tokens,tokens,tokens,tokens,tokens,tags,tags,tags,tags,tags
field,t1,t2,t3,freq,pmi,t1,t2,t3,freq,pmi
0,finałowego,turnieju,mistrzostw,10,24.796075,porcelanowy:adj,młyn:subst,kulowy:adj,5,23.517451
1,profilem,zaufanym,epuap,13,24.555067,wymiennik:subst,przeponowy:adj,rurowy:adj,7,22.646734
2,cienką,sierścią,zwierzęcą,11,24.496514,reakcja:subst,łańcuchowa:subst,rozszczepienie:subst,5,21.813844
3,przedwczesnego,wyrębu,drzewostanu,7,24.370984,piłka:subst,nożny:adj,UEFA:subst,10,20.887845
4,centralnemu,biuru,antykorupcyjnemu,5,24.104197,Stany:subst,zjednoczyć:ppas,Ameryka:subst,6,20.646734
5,turnieju,mistrzostw,europy,10,24.03054,finałowy:adj,turniej:subst,mistrzostwa:subst,10,20.605075
6,potwierdzonym,profilem,zaufanym,13,24.007579,przedwczesny:adj,wyrąb:subst,drzewostan:subst,9,20.432415
7,szybkiemu,postępowi,technicznemu,9,23.87913,kurtka:subst,anorak:subst,etc:ign,8,20.289182
8,piłce,nożnej,uefa,10,23.860022,mecz:subst,piłka:subst,nożny:adj,5,20.150879
9,wypalonym,paliwem,jądrowym,8,23.656738,profil:subst,zaufany:adj,ePUAP:ign,13,20.063652
