In [20]:
import pandas as pd 
import pickle
import nltk
import numpy as np
from nltk.collocations import *
import spacy
from modules.preprocess_articles import *
from tqdm import tqdm

In [2]:
# Load the large German spaCy pipeline, switching off the components listed below
disabled_components = ['ner', 'parser', 'tagger']
nlp = spacy.load("de_core_news_lg", disable=disabled_components)


In [3]:
# Read the pickled Factiva export from disk.
# NOTE(review): pickle.load can execute arbitrary code; only use it on a trusted file.
factiva_path = r"data/factiva_export.pkl"
with open(factiva_path, "rb") as pickle_file:
    factiva = pickle.load(pickle_file)

In [4]:
# Demo mode: keep only the first N_DEMO_ARTICLES entries of every top-level
# element of `factiva` (the slice is 50, not 10 as an earlier comment claimed).
# presumably each element holds one field per article -- TODO confirm against the export format
N_DEMO_ARTICLES = 50

for i, item in enumerate(factiva):
    # Overwrite in place so that `factiva` keeps its container type
    factiva[i] = item[:N_DEMO_ARTICLES]

In [5]:
# Preprocessing pipeline: build the preprocessor, then run its tokenize and preprocess steps
newspaperPreprocessor = Preprocessor(nlp=nlp, newspaper_data=factiva)
newspaperPreprocessor.tokenize()
newspaperPreprocessor.preprocess()

# Result per article (presumably one token sequence per article -- TODO confirm in Preprocessor)
factiva_preprocessed = newspaperPreprocessor.return_preprocessed()

# All articles flattened into a single sequence
factiva_preprocessed_long = [
    token
    for article in factiva_preprocessed
    for token in article
]

In [7]:
# Load the association-measure helpers for 2- and 3-word collocations.
# Their scoring methods (e.g. likelihood_ratio) are passed to the finders' nbest() calls.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [23]:
# Settings for the per-article bigram extraction
MIN_BIGRAM_FREQ = 2   # bigrams occurring fewer times than this within an article are dropped
TOP_N_BIGRAMS = 10    # number of best-scoring bigrams kept per article

# One list of top bigrams per entry of factiva_preprocessed, in the same order
articles_collocations = []

for doc in tqdm(factiva_preprocessed):
    # Count bigrams over this entry (presumably the token list of one article)
    finder = BigramCollocationFinder.from_words(doc)
    # Only keep bigrams that appear at least MIN_BIGRAM_FREQ times
    finder.apply_freq_filter(MIN_BIGRAM_FREQ)
    # Keep the bigrams with the highest likelihood-ratio score (note: not PMI)
    articles_collocations.append(
        finder.nbest(bigram_measures.likelihood_ratio, TOP_N_BIGRAMS)
    )

# try this: https://stackoverflow.com/questions/41094134/finding-top-bigrams-across-multiple-large-files
# inspo: https://github.com/nicharuc/Collocations/blob/master/Collocations.ipynb
# documentation: https://www.nltk.org/howto/collocations.html


100%|██████████| 50/50 [00:00<00:00, 1813.96it/s]


In [None]:
## Bigrams for a single example article
# NOTE(review): this cell has no execution count and repeats the bigram cell
# that follows it; consider keeping only one of the two.
example_tokens = factiva_preprocessed[33]
finder = BigramCollocationFinder.from_words(example_tokens)
# only keep bigrams that appear at least 2 times
finder.apply_freq_filter(2)
# print the 10 bigrams with the highest likelihood-ratio score (not PMI)
print(finder.nbest(bigram_measures.likelihood_ratio, 10))

In [12]:
## Bigrams for a single example article
example_tokens = factiva_preprocessed[33]
finder = BigramCollocationFinder.from_words(example_tokens)
# only keep bigrams that appear at least 2 times
finder.apply_freq_filter(2)
# print the 10 bigrams with the highest likelihood-ratio score (not PMI);
# an empty list means no bigram passed the frequency filter
print(finder.nbest(bigram_measures.likelihood_ratio, 10))

[]


In [16]:
## Trigrams for a single example article
example_tokens = factiva_preprocessed[33]
finder = TrigramCollocationFinder.from_words(example_tokens)
# only keep trigrams that appear at least 2 times
finder.apply_freq_filter(2)
# print the 10 trigrams with the highest likelihood-ratio score (not PMI);
# an empty list means no trigram passed the frequency filter
print(finder.nbest(trigram_measures.likelihood_ratio, 10))

[]
