# Laboratory work #2 (n-grams)

In [2]:
import os


import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from nltk.util import ngrams
from nltk.corpus import stopwords
from collections import Counter

import math
from nltk.probability import FreqDist
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

[nltk_data] Downloading package punkt to /Users/aleksei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def read_files(root_dir, n=None):
    file_paths = []
    
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.tsv'):
                file_paths.append(os.path.join(subdir, file))

    data = []
    for i, file_path in enumerate(file_paths):
        if n is not None and i >= n:
            break
        d = pd.read_csv(file_path, sep='\t', header=0)
        data.append(d.dropna())
    return data

In [4]:
train = read_files("../assets/annotated-corpus/train", 1000)
val = read_files("../assets/annotated-corpus/val", 100)
test = read_files("../assets/annotated-corpus/test", 100)

In [5]:
train[0].head()

Unnamed: 0,Token,Stem,Lemma
0,WASHINGTON,washington,WASHINGTON
1,Reuters,reuter,Reuters
2,The,the,The
3,head,head,head
4,of,of,of


In [6]:
def get_n_gram_freq(df_list, language='english'):
    stop_words = set(stopwords.words(language))
    
    n_grams = []
    for df in df_list:
        stems = df['Stem'].apply(lambda x: re.sub(r"[^\w\s]", "", x)).str.lower()
        stems = [stem for stem in stems if stem not in stop_words and stem.strip() != '']
        trigrams = list(ngrams(stems, 3))
        n_grams.extend(trigrams)

    n_gram_freq = Counter(n_grams)
    return n_gram_freq

In [7]:
train_freq = get_n_gram_freq(train)

In [8]:
train_freq

Counter({('presid', 'donald', 'trump'): 265,
         ('presid', 'barack', 'obama'): 141,
         ('u', 'presid', 'donald'): 128,
         ('washington', 'reuter', 'u'): 103,
         ('respond', 'request', 'comment'): 55,
         ('reuter', 'u', 'presid'): 54,
         ('south', 'china', 'sea'): 48,
         ('white', 'hous', 'said'): 44,
         ('presid', 'vladimir', 'putin'): 41,
         ('new', 'york', 'reuter'): 38,
         ('elect', 'donald', 'trump'): 37,
         ('u', 'secretari', 'state'): 36,
         ('speaker', 'paul', 'ryan'): 36,
         ('presid', 'elect', 'donald'): 35,
         ('secretari', 'state', 'rex'): 34,
         ('immedi', 'respond', 'request'): 33,
         ('state', 'rex', 'tillerson'): 33,
         ('washington', 'reuter', 'presid'): 32,
         ('georg', 'w', 'bush'): 32,
         ('chancellor', 'angela', 'merkel'): 32,
         ('u', 'hous', 'repres'): 31,
         ('republican', 'presidenti', 'candid'): 30,
         ('nov', '8', 'elect'): 30,
  

In [20]:
def calculate_MI(n_grams, total_words, word_freq):
    mi_scores = {}
    for n_gram in n_grams:
        p_n_gram = n_grams[n_gram] / total_words
        p_w1 = word_freq[n_gram[0]] / total_words
        p_w2 = word_freq[n_gram[1]] / total_words

        mi_score = math.log2(p_n_gram / (p_w1 * p_w2))
        mi_scores[n_gram] = mi_score
    return mi_scores


def get_mi_scores(freq):
    total_words = sum(freq.values())
    word_freq = FreqDist(word for trigram in freq for word in trigram)
    mi_scores = calculate_MI(freq, total_words, word_freq)
    return mi_scores


def get_mi_scores_nltk(df_list, language='english'):
    stop_words = set(stopwords.words(language))
    full_text = ''
    
    for df in df_list:
        words = df['Stem'].apply(lambda x: re.sub(r"[^\w\s]", "", x)).str.lower()
        words = [word for word in words if word not in stop_words and word.strip() != '']
        full_text += ' '.join(words)
        
    tokens = nltk.word_tokenize(full_text, language, True)
    text = nltk.Text(tokens)
        
    trigram_measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(text)
    nltk_mi_scores = finder.score_ngrams(trigram_measures.pmi)
    return nltk_mi_scores

In [21]:
train_mi_scores = get_mi_scores(train_freq)

In [22]:
train_mi_scores_nltk = get_mi_scores_nltk(train)

In [23]:
n = 30
sorted_mi_scores = sorted(train_mi_scores.items(), key=lambda x: x[1], reverse=True)[:n]
print(f'Top {n} trigrams MI:')
for trigram, score in sorted_mi_scores:
    print(f"{trigram}: {score}")

Top 30 trigrams MI:
('abdel', 'fattah', 'al'): 16.259859708173046
('zalmay', 'khalilzad', 'former'): 15.67489720745189
('bobbi', 'jindal', 'former'): 15.67489720745189
('lesli', 'rutledg', 'republican'): 15.67489720745189
('cathi', 'mcmorri', 'rodger'): 15.67489720745189
('mcmorri', 'rodger', 'u'): 15.67489720745189
('bharatiya', 'janata', 'parti'): 15.259859708173046
('khmer', 'roug', 'command'): 15.259859708173046
('kyaw', 'soe', 'oo'): 15.259859708173046
('thom', 'tilli', 'democrat'): 15.259859708173046
('kuala', 'lumpur', 'reuter'): 15.166750303781566
('taro', 'kano', 'said'): 14.937931613285684
('ophthalmologist', 'salomon', 'melgen'): 14.844822208894202
('lone', 'kyaw', 'soe'): 14.800428089535748
('katrina', 'pierson', 'former'): 14.67489720745189
('cambridg', 'massaschusett', 'base'): 14.67489720745189
('musk', 'iger', 'quit'): 14.67489720745189
('erwan', 'monier', 'lead'): 14.67489720745189
('amber', 'rudd', 'wrote'): 14.67489720745189
('butter', 'curtain', 'bed'): 14.674897207

In [24]:
print(f'Top {n} trigrams MI with nltk:')
for trigram, score in train_mi_scores_nltk[:n]:
    print(f"{trigram}: {score}")

Top 30 trigrams MI with nltk:
('abdolrasul', 'dori', 'esfahani'): 35.70272949945396
('adolf', 'hitler', 'benito'): 35.70272949945396
('amen', 'awfa', 'nejm'): 35.70272949945396
('areva', 'toshiba', 'westinghous'): 35.70272949945396
('ballast', 'nedam', 'canari'): 35.70272949945396
('beatriz', 'ruiz', 'zuleima'): 35.70272949945396
('bir', 'kumar', 'yadav'): 35.70272949945396
('bosnian', 'serb', 'gen'): 35.70272949945396
('brigid', 'callahan', 'harrison'): 35.70272949945396
('buana', 'cahaya', 'suks'): 35.70272949945396
('canari', 'wharf', 'songbird'): 35.70272949945396
('catherin', 'cortez', 'masto'): 35.70272949945396
('cheryl', 'boon', 'isaac'): 35.70272949945396
('cognac', 'parma', 'ham'): 35.70272949945396
('colourless', 'unambiti', 'princel'): 35.70272949945396
('cornish', 'pasti', 'roquefort'): 35.70272949945396
('cp', 'cms', 'pageid'): 35.70272949945396
('darrel', 'issa', 'vista'): 35.70272949945396
('den', 'bahariya', 'oasi'): 35.70272949945396
('egan', 'elliot', 'schrage'): 35.