In [1]:
import pandas as pd 
import pickle
import nltk
import numpy as np
from nltk.collocations import *
import spacy
from modules.preprocess_articles import *
from modules.collocation_articles import *
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# use json instead of pickle for persistence
import json
# to load the corpus
with open("data/factiva_data.json", 'r') as f:
    factiva_corpus = json.load(f)

In [3]:
# Load model
spacy_mod = spacy.load("de_core_news_lg",
                 disable=['ner', 'parser', 'tagger'])


In [6]:
# select first 100 articles for demo - but perhaps better to select relevant articles instead?
for i, item in enumerate(factiva_corpus):
    factiva_corpus[i] = item[:100]

In [7]:
# Preprocessing pipeline 
newspaper_preprocessor = Preprocessor(newspaper_data = factiva_corpus, nlp = spacy_mod)
newspaper_preprocessor.tokenize()
newspaper_preprocessor.preprocess()

# Individual articles 
factiva_preprocessed = newspaper_preprocessor.return_preprocessed()

# Joint articles
factiva_preprocessed_joint = [j for i in factiva_preprocessed for j in i]

In [8]:
#print(factiva_preprocessed) # list of pre-proc tokens?
print(factiva_preprocessed_joint)

['physisch', 'Gewalt', 'psychisch', 'Gewalt', 'digital', 'Gewalt', 'wirken', 'Staat', 'Waffe', 'einsetzen', 'erklären', 'Samstag', 'Berlin', 'Dissident', 'Journalist', 'Opfer', 'Spähsoftware', 'Pegasus', 'säße', 'Gehirn', 'Anwalt', 'Mazen', 'Masri', 'Privatsphäre', 'zerstören', 'beschreiben', 'mexikanisch', 'Journalistin', 'Carmen', 'Aristegui', 'Edward', 'Snowden', 'per', 'Video', 'zuschalten', 'warnen', 'eindringlich', 'drohend', 'Erosion', 'Demokratie', 'Aristegui', 'Kollege', 'minderjährig', 'Sohn', 'Pegasus', 'staatlich', 'Organ', 'auf', 'Handy', 'spielen', 'Software', 'erlauben', 'unbemerkt', 'sämtlicher', 'Datum', 'Opfer', 'abgreifen', 'gesamt', 'Kommunikation', 'verfolgen', 'Kamera', 'Mikrofon', 'Gerät', 'überwachen', 'sogar', 'Name', 'Anruf', 'tätig', 'Nachricht', 'verschicken', 'links', 'Netzwerk', 'betroffener', 'infizieren', 'Pegasus', 'israelisch', 'Softwarefirma', 'NSO', 'Group', 'entwickeln', 'Staat', 'Mexiko', 'vereinigt', 'arabisch', 'Emirat', 'Spanien', 'verkaufen', '

In [9]:
# Collocations inside individual articles
articles_collocations = []

for doc in tqdm(factiva_preprocessed): 
    collocations = CollocationArticles(doc, nltk.collocations)
    articles_collocations.append(collocations.BigramCollocations(doc_type = False))

# Collocations across documents 
doc_finder = CollocationArticles(factiva_preprocessed, nltk.collocations)
docs_collocations = doc_finder.BigramCollocations(doc_type = True)

# Collocations in joint articles 
joint_finder = CollocationArticles(factiva_preprocessed_joint, nltk.collocations)
articles_collocations_joint = joint_finder.BigramCollocations(doc_type = False)

# try this: https://stackoverflow.com/questions/41094134/finding-top-bigrams-across-multiple-large-files
# inspo: https://github.com/nicharuc/Collocations/blob/master/Collocations.ipynb
# documentation: https://www.nltk.org/howto/collocations.html


100%|██████████| 100/100 [00:00<00:00, 1176.84it/s]


In [None]:
# let's first make sure the collocations are actually helpful for german news