In [63]:
# check working directory
import os
os.getcwd() # if directory is subfolder, change to home
os.chdir('/home/sukayna/Documents/github/newspaper')

In [64]:
# import usual packages
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
from modules.preprocess_articles import *
import spacy
from tqdm import tqdm
import pprint

### Load dataset + check frequencies

In [65]:
# always use json instead of pickle for persistence
# to load the corpus
with open('data/factiva_data.json', 'r') as f:
    factiva_corpus = json.load(f)

In [66]:
# define function to get frequencies from corpus
def Get_Frequency(mylist):
    count = {}
    for i in mylist:
        count[i] = count.get(i, 0) + 1
        count = {key: value for key, value in sorted(
            count.items(), key = lambda item: item[1], reverse=True)}
    return count

In [None]:
# remember corpus structure: title, body, date, newspaper

# frequency of dates and newspapers in factiva corpus
Get_Frequency(factiva_corpus[2]) # for dates
Get_Frequency(factiva_corpus[3]) # for newspapers

### Format dates correctly
Note: this can also be done in the dataframe cleaning step before preparing the corpus

In [68]:
# before pre-processing, change date format
locale.setlocale(locale.LC_ALL, 'de_DE.utf8')

'de_DE.utf8'

In [69]:
# first, convert string dates to datetime format
#factiva_corpus[2] # string date stored in a list

# convert strings to datetime using `datetime` library
factiva_corpus[2] = [dt.datetime.strptime(d, "%d %B %Y") for d in factiva_corpus[2]]

# Year-Month-Day HH:MM:SS is the default output for string dates
# keep date only from datetime object
factiva_corpus[2] = [d.date() for d in factiva_corpus[2]]

In [70]:
# print one element from list to check conversion
print(factiva_corpus[2][1].isoformat()) 
# this is the correct format: %Y-%b-%d

2021-07-05


### Collocation analysis for major newspapers over time
- preprocess corpus - remove punct + symobols + stop words only
- find collocations across documents AND across years
- calculate scores (rawfreq + PMI + chisq?) for comparison
- plot comparisons over time: association strength (dot chart) / netowrk graphs / biplots (using semantic similarity)
- check for collocation strength
- significance testing 

In [71]:
# preprocessing corpus

In [72]:
# Load model
spacy_mod = spacy.load("de_core_news_lg",
                 disable=['ner', 'parser', 'tagger'])

In [None]:
# may be necessary to examine subset only if process is slow
# for i, item in enumerate(factiva_corpus):
#    factiva_corpus[i] = item[:100]

In [74]:
# lowercase corpus text
# you should include this step in 'preprocess_articles.py' 
factiva_corpus[1] = [x.lower() for x in factiva_corpus[1]]


In [75]:
### this section to be replaced by updated functions ###

# run preprocessor on corpus
for doc in tqdm(factiva_corpus):
    newspaper_preprocessor = Preprocessor(newspaper_data = factiva_corpus, nlp = spacy_mod)
    newspaper_preprocessor.tokenize()
    newspaper_preprocessor.preprocess()
   

100%|██████████████████████████████████████████████████████████████████████████| 4/4 [03:53<00:00, 58.43s/it]


In [76]:
# save cleaned corpus
factiva_cleaned = newspaper_preprocessor.return_preprocessed()

In [96]:
print(factiva_cleaned[5][0:5], len(factiva_cleaned), type(factiva_cleaned), 
      sep='\n')
# tokens stored in list of docs 
# no metadata yet

['stark', 'blutend', 'Schnittverletzung', 'alt', 'Mann']
2564
<class 'list'>


### Collocations across all documents

In [124]:
# lowercase corpus text (again)
# the class preprocesser breaks lowercasing :/
# you should include this step in 'preprocess_articles.py' 

factiva_cleaned = [[word.lower() for word in doc] for doc in factiva_cleaned]
print(factiva_cleaned[5][0:5])

['stark', 'blutend', 'schnittverletzung', 'alt', 'mann']


In [None]:
# create bigrams for all documents

In [140]:
finder = nltk.collocations.BigramCollocationFinder.from_documents(factiva_cleaned)
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [141]:
# store bigram measures in dict for easy access
factiva_bigrams = dict(finder.score_ngrams(bigram_measures.raw_freq))

In [173]:
# unfortunately the class Preprocessor does not remove all punct
# so here we will filter it out of the dictionary
filtered_factiva_bigrams = {k:v for (k,v) in factiva_bigrams.items() 
                            if k[0] != '--' and v > 0.0001}

# sort dictionary by value
filtered_factiva_bigrams = dict(sorted(filtered_factiva_bigrams.items(), 
                                       key=lambda item: item[1], reverse=True))

In [177]:
# pretty print sorted dictionary 
# and write to file
pp = pprint.PrettyPrinter(indent=2, stream=open("outputs/factiva_bigrams.txt",'w'), sort_dicts=False)
pp.pprint(filtered_factiva_bigrams)


In [218]:
# extract top 100 bigrams by bmi
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 100)

# write to file
with open("outputs/factiva_bigram_pmi", 'w', encoding='utf-8') as f:
    for item in list(finder.nbest(bigram_measures.pmi, 100)):
        f.write(f'{item}\n')

In [None]:
# create trigrams for all documents 

In [185]:
finder_tri = nltk.collocations.TrigramCollocationFinder.from_documents(factiva_cleaned)
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [186]:
# store trigrams measures in dict for easy access
factiva_trigrams = dict(finder_tri.score_ngrams(trigram_measures.raw_freq))

# filter trigrams to remove punct
filtered_factiva_trigrams = {k:v for (k,v) in factiva_trigrams.items() 
                            if k[0] != '--' and v > 0.0001}
# sort dictionary by value
filtered_factiva_trigrams = dict(sorted(filtered_factiva_trigrams.items(), 
                                       key=lambda item: item[1], reverse=True))

In [187]:
# pretty print sorted dictionary 
# and write to file
pp = pprint.PrettyPrinter(indent=2, stream=open("outputs/factiva_trigrams.txt",'w'), sort_dicts=False)
pp.pprint(filtered_factiva_trigrams)


In [222]:
# extract top 100 trigrams by bmi
finder_tri.apply_freq_filter(3)
finder_tri.nbest(trigram_measures.pmi, 100)

# write to file
with open("outputs/factiva_trigram_pmi", 'w', encoding='utf-8') as f:
    for item in list(finder_tri.nbest(trigram_measures.pmi, 100)):
        f.write(f'{item}\n')

In [None]:
# we can also follow up with quadgrams and so on 
# but ideally remove irrelevant entity names & proper nouns