In [3]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [4]:
# check working directory
import os
os.getcwd() # if directory is subfolder, change to home
os.chdir('/home/sukayna/Documents/github/newspaper')

In [5]:
# import usual packages
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd

In [6]:
# always use json to load the corpus
with open('data/factiva_data.json', 'r') as f:
    factiva_corpus = json.load(f)

# Preprocessing 

### Main steps:

- load (german) spacy model 
- convert corpus to spacy language object
- run preprocessing function to clean corpus

- review stopwords to keep causal language/verbs/pronouns

- create two versions of corpus for comparison: 
    - fully cleaned w/o entities/stopwords/proper nouns
    - partially cleaned w/o entities/proper nouns but w/causal stopwords


In [7]:
# Load model
spacy_mod = spacy.load("de_core_news_lg",
                 disable=['ner', 'parser', 'tagger'])

In [8]:
# create smaller set of acceptable stopwords
# remove verbs, pronouns and connectors with causal meaning from stopwords
spacy_mod.Defaults.stop_words -= {'kam', 'sollte', 'dich', 'achte', 'daraus', 'dir', 'werdet', 'seid', 'unser', 'macht', 'deswegen',
                                  'außerdem', 'damit', 'habe', 'können', 'könnt', 'hatte', 'werde', 'andere', 'deiner', 'meines',
                                  'niemandem', 'achten', 'dürft', 'rechten', 'machte', 'dahinter', 'sah', 'seinen', 'dementsprechend',
                                  'kann', 'muß', 'wäre', 'geworden', 'wegen', 'machen', 'waren', 'dürfen', 'dein', 'mögen', 'würde',
                                  'musst', 'magst', 'ihren', 'aber', 'möchte', 'ihr', 'wir', 'allerdings', 'jedem', 'nicht', 'ihres',
                                  'kommt', 'gibt', 'infolgedessen', 'mögt', 'doch', 'sollten', 'seine', 'keine', 'wollte', 'ich',
                                  'müssen', 'wollten', 'warum', 'ist', 'ihnen', 'mein', 'mochte', 'geht', 'trotzdem', 'gab', 'durfte',
                                  'dagegen', 'sie', 'sind', 'wart', 'wer', 'haben', 'du', 'werden', 'eigene', 'ihn', 'seien', 'eigen',
                                  'meinen', 'seiner', 'hatten', 'müsst', 'wollen', 'indem', 'wollt', 'gehabt', 'deine',  'denn',
                                  'nachdem', 'konnte', 'ihrer', 'seinem', 'gemusst', 'bin', 'währenddem', 'dank', 'willst', 'würden',
                                  'gemocht',  'hätten', 'demzufolge', 'seines', 'mussten', 'nahm', 'daher', 'darauf', 'ging', 'mochten',
                                  'meinem', 'darum', 'gedurft', 'wurden', 'bist', 'ihrem', 'gehen', 'sein', 'kannst', 'gewollt',
                                  'könnte',  'heisst', 'neben', 'meiner', 'euch', 'darfst', 'deshalb', 'konnten', 'ausserdem', 'ihm',
                                  'tun', 'gekannt',  'worden', 'habt', 'darf', 'demgegenüber', 'gewesen', 'sollen', 'soll', 'kommen',
                                  'tat', 'jahre'}

In [9]:
# convert corpus to language object
factiva_spacy = []
for doc in tqdm(factiva_corpus):
    factiva_spacy.append(spacy_mod(doc['body']))

100%|███████████████████████████████████████████████████████████████████| 2564/2564 [00:56<00:00, 45.22it/s]


## Preprocessing the corpus

- Cleaned the corpus: 
    - factiva_cleaned (removes edited stopwords and entities)

In [10]:
# Function for preprocessing
def preprocess(doc: str, remove_ent=False):
    """_summary_

    Args:
        doc (str): String text
        remove_ent (bool, optional): If True, removes entities using spaCy. Defaults to False.
        remove_stop (bool, optional): If True, removes stopwords using adapted spaCy stopword list. Defaults to False.

    Returns:
        doc_preprocessed (list): Preprocessed lower-case corpus with punctuation, non-alphanumeric characters, spaCy stopwords and proper nouns removed.
    """
    
    if remove_ent == True:
        doc_no_ent = []
        ents = [e.text for e in doc.ents]
        for item in doc:
            if item.text in ents:
                pass
            else:
                doc_no_ent.append(item)

        doc_preprocessed = [token.lower_ for token in doc_no_ent if
                            # token is not punctuation
                            token.is_punct == False and
                            # token is alphanumeric character
                            token.is_alpha == True and
                            # token is not stop word
                            token.is_stop == False and
                            # token is not proper noun
                            token.pos_ != "PROPN"]
        
    else: # do not remove entities
        doc_preprocessed = [token.lower_ for token in doc if
                            # token is not punctuation
                            token.is_punct == False and
                            # token is alphanumeric character
                            token.is_alpha == True and
                            # token is not stop word
                            token.is_stop == False and
                            # token is not proper noun
                            token.pos_ != "PROPN"]

    return doc_preprocessed

In [11]:
# preprocess:remove stopwords
factiva_cleaned = []
for doc in tqdm(factiva_spacy): 
    factiva_cleaned.append(preprocess(doc, remove_ent=True))

100%|█████████████████████████████████████████████████████████████████| 2564/2564 [00:00<00:00, 3035.13it/s]


In [12]:
# compare dirty and cleaned corpus
print(factiva_spacy[5][0:30], len(factiva_spacy[5]), sep='\n')
print(factiva_cleaned[5][0:10], len(factiva_cleaned[5]), sep='\n')

['mit stark blutenden stich- und schnittverletzungen ist ein 29 jahre alter mann auf einem gehweg in berlin-schöneberg gefunden worden. ein passant rief am dienstagabend nach 23.00 uhr
145
['stark', 'blutenden', 'schnittverletzungen', 'ist', 'jahre', 'alter', 'mann', 'gehweg', 'gefunden', 'worden']
64


# Collocation Analysis

## Main steps:

- Create bi/tri/quad- ngrams
- Rank order ngrams by raw frequency (print to txt files)

- Examine association measures for top 50 ngrams
- Analyse differences in ngrams across documents
- Check for collocation strength/significance testing

- plot comparisons: association strength (dot chart) / network graphs / biplots (using semantic similarity)

## 1) Creating & Ranking Ngrams

- Bigrams from cleaned and unprocessed corpus

In [59]:
# rank bigrams by given metric 
from nltk.collocations import BigramCollocationFinder
from nltk.metrics.association import BigramAssocMeasures

def rank_bigrams(corpus, metric, from_words=False, threshold=int, path=None):
    """
    Find and rank ngrams from the supplied corpus using the given
    association metric. Write the trigrams out to the given path if
    supplied otherwise return the list in memory.
    
    Args:
    from_words (bool, optional): If True, unlist all documents and create ngrams from words.
    threshold (int): ignore all ngrams which occur less than n times in the corpus
    path (str): path to save ngram list as .txt file
    
    """
    
    if from_words == True:
        #turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from corpus
        finder = BigramCollocationFinder.from_words(unlist_corpus)
        # Apply frequency filter = at least 3 times
        finder.apply_freq_filter(threshold)
        # Rank collocations by an association metric
        scored = finder.score_ngrams(metric)
            
    else: 
        finder = BigramCollocationFinder.from_documents(corpus)
        finder.apply_freq_filter(threshold)
        scored = finder.score_ngrams(metric)

    if path:
    # Write to disk as tab-delimited file
        with open(path, 'w') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored

In [65]:
# raw_freq + minimum 3 occurences + from_words
rank_bigrams(factiva_cleaned, BigramAssocMeasures.raw_freq, from_words=True, threshold=3, path='outputs/factiva_bigrams_from_words.txt')

In [None]:
# NOT RUN - rank_* function can be used to output all available measures for ngram comparison

# rank_bigrams(factiva_cleaned, BigramAssocMeasures.pmi,
#              'outputs/factiva_bigrams_pmi.txt')
# rank_bigrams(factiva_cleaned, BigramAssocMeasures.chi_sq,
#              'outputs/factiva_bigrams_chisq.txt')
# rank_bigrams(factiva_cleaned, BigramAssocMeasures.student_t,
#              'outputs/factiva_bigrams_ttest.txt')
# rank_bigrams(factiva_cleaned, BigramAssocMeasures.likelihood_ratio,
#              'outputs/factiva_bigrams_likelihood.txt')

- Trigrams from cleaned and unprocessed corpus

In [68]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics.association import TrigramAssocMeasures

def rank_trigrams(corpus, metric, from_words=False, threshold=int, path=None):
    """
    Find and rank ngrams from the supplied corpus using the given
    association metric. Write the trigrams out to the given path if
    supplied otherwise return the list in memory.
    
    Args:
    from_words (bool, optional): If True, unlist all documents and create ngrams from words.
    threshold (int): ignore all ngrams which occur less than n times in the corpus
    path (str): path to save ngram list as .txt file
    
    """
    
    if from_words == True:
        #turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from corpus
        finder = TrigramCollocationFinder.from_words(unlist_corpus)
        # Apply frequency filter = at least 3 times
        finder.apply_freq_filter(threshold)
        # Rank collocations by an association metric
        scored = finder.score_ngrams(metric)
            
    else: 
        finder = TrigramCollocationFinder.from_documents(corpus)
        finder.apply_freq_filter(threshold)
        scored = finder.score_ngrams(metric)

    if path:
    # Write to disk as tab-delimited file
        with open(path, 'w') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored
    

In [69]:
# for cleaned factiva corpus
rank_trigrams(factiva_cleaned, TrigramAssocMeasures.raw_freq, from_words=True, threshold=3, path='outputs/factiva_trigrams_from_words.txt')

- Quadgrams from cleaned and unprocessed corpus

In [74]:
# rank quadgrams by given metric 
from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures

def rank_quadgrams(corpus, metric, from_words=False, threshold=int, path=None):
    """
    Find and rank ngrams from the supplied corpus using the given
    association metric. Write the trigrams out to the given path if
    supplied otherwise return the list in memory.
    
    Args:
    from_words (bool, optional): If True, unlist all documents and create ngrams from words.
    threshold (int): ignore all ngrams which occur less than n times in the corpus
    path (str): path to save ngram list as .txt file
    
    """
    
    if from_words == True:
        #turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from corpus
        finder = QuadgramCollocationFinder.from_words(unlist_corpus)
        # Apply frequency filter = at least 3 times
        finder.apply_freq_filter(threshold)
        # Rank collocations by an association metric
        scored = finder.score_ngrams(metric)
            
    else: 
        finder = QuadgramCollocationFinder.from_documents(corpus)
        finder.apply_freq_filter(threshold)
        scored = finder.score_ngrams(metric)

    if path:
    # Write to disk as tab-delimited file
        with open(path, 'w') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored
    

In [75]:
# for cleaned factiva corpus
rank_quadgrams(factiva_cleaned, QuadgramAssocMeasures.raw_freq, from_words=True, threshold=3, path='outputs/factiva_quadgrams_from_docs.txt')

### NOTES

**Observations in the trigrams**

Overall general observations:
- Among the 42 most frequent trigrams, 20 contain the bigram “häusliche[r] gewalt”, which shows the importance of this bigram in the sub-corpus construction by Factiva.
    => results in a majority of articles that talk more generally about the phenomenon and less frequently about actual cases
- A lot of trigrams contain “gewalt”, which underlines the strong importance of this term for subcorpus construction
    => suggests that there might be a lack of recall for articles without “Gewalt”

General observations:
- “Fälle [von]” = providing context
- “insgesamt” or “Zahl [der Fälle]” = Statistik
- Plural (“Väter | Ehemänner | etc.] = providing context above individual cases (note: “partner” = singular and plural)
- Verbs in Singular (“starb”, “stirbt”) are a strong marker of individual cases
- Description of age (“-jährige”, “Jahre alte”) can be markers of individual cases
- “Prügel” or “verprügelt” seem to be strong markers of individual cases (not used as much in description of statistics or general situations)

Interesting trigrams:
- “sechsmonatigen kontaktverbot verurteilt” => very concrete situation
- “beamten sprachen annäherungsverbot” =>
- “blauen flecken gesicht” => case description
- “grün blau geschlagen” => case description
- “getrennt lebende Frau” => singular case
- “mann	vorläufig festgenommen” => police reporting
- “mutmaßliche täter festgenommen” => police reporting
- “psychisch druck gesetzt” => may be general or specific
- “erlag verletzungen krankenhaus” => police reporting
- “mutter erlag verletzungen” => police reporting
- “faust gesicht geschlagen” => police or justice reporting in a singular case
- “gemeinsame wohnung verlassen” => descriptives of a single case story
- “antrag opfers angeordnet” => indicates agency of the victim

Some relevant bigrams in the list:
- “gewaltbetroffen[xx] [Person[xx] | Frau[xx]” <= adjective for victim of violence 
- vs “gewalttätig[xx] [Mann[xx] | Partner | Vater etc.] <= adjective perpetrator of violence
- “seine [Ehe]Frau” “seine Partnerin” “seine Lebensgefährtin” + violence terms = hint for reporting on individual cases, with clear reference to the perpetrator being intimately linked
    => NOTE: “seine” is not included in bi- and trigrams because it is removed as frequent word, but important marker for individual case description
- “einstweilige verfügung” => singular case, police reporting
- “krankenhaus gebracht” => rather singluar case
- “name geändert” => annonymising victim or perpetrator
- “polizei mitteilte” => police reporting
- “ehefrau geschlagen” => violence in the marriage
- “Johnny Depp” “Amber Heard”

Interesting unigrams:
- “Gewaltopfer” vs “Gewalttäter” => german words for victim and perpetrator of violence
- “Beziehungsgewalt” => not domestic but relationship violence
- “Partnerschaft” vs “Beziehung” => check for these synonyms
- “mutmaßlich[xx]” => police reporting

Observations of what is missing in the trigrams:
- The elimination of the frequent words erases typical trigrams such as “Gewalt gegen Frauen”
    => question to what degree stopword removal deletes relevant markers

Other observations:
- There are certain ways of formulating general statements vs. formulating descriptions of single cases (e.g. temporal description-verb-subject: vs. subject-verb-temporal description)
- “Fast an jedem dritten Tag im Jahr stirbt eine Frau durch Ex-Partner oder Partner…” (general reporting)
    vs.
- “Das Opfer starb am nächsten Tag.” (invented, case reporting)
    vs.
- “Die Opfer sterben häufig…” (invented, general reporting)

### Comparison of association measures for ngrams

- Create bigrams, trigrams and quadgram association measures: top frequency, pmi, t-test, chi-square, likelihood ratio test

In [81]:
def ngram_comparison(corpus, ngram=['bigram', 'trigram', 'quadgram'], threshold=int, top=int, path=None):
    
    """
    Create comparison table for given ngram-type for 
    NLTK association measures based on supplied corpus.
    
    Specify how many top ranking ngrams should be compared. 
    
    Write the ngram table out to the given path if 
    supplied otherwise return the table in memory.
    
    Args:
    corpus (str): corpus that you want to examine
    ngram (str): type of ngram (only one option per function run)
    threshold (int): ignore all ngrams which occur less than n times in the corpus
    top (int): number of top ngrams to be displayed
    path (str): path to save ngram list as .txt file
    
    """
    
    unlist_corpus = [item for items in corpus for item in items]    
    
    if ngram == 'bigram':
        Finder = nltk.collocations.BigramCollocationFinder.from_words(unlist_corpus)
        metric = nltk.collocations.BigramAssocMeasures()
        Finder.apply_freq_filter(threshold)
    
    if ngram == 'trigram':
        Finder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_corpus)
        metric = nltk.collocations.TrigramAssocMeasures()
        Finder.apply_freq_filter(threshold)
        
    if ngram == 'quadgram':
        Finder = nltk.collocations.QuadgramCollocationFinder.from_words(unlist_corpus)
        metric = nltk.collocations.QuadgramAssocMeasures()
        Finder.apply_freq_filter(threshold)

    try:        
        freq_top = pd.DataFrame(list(Finder.score_ngrams(metric.raw_freq)), 
                              columns=['ngram','freq']).sort_values(by='freq', ascending=False)[:top].ngram.values
        
        pmi_top = pd.DataFrame(list(Finder.score_ngrams(metric.pmi)), 
                             columns=['ngram','pmi']).sort_values(by='pmi', ascending=False)[:top].ngram.values
       
        ttest_top = pd.DataFrame(list(Finder.score_ngrams(metric.student_t)), 
                             columns=['ngram','t-test']).sort_values(by='t-test', ascending=False)[:top].ngram.values
       
        chisq_top = pd.DataFrame(list(Finder.score_ngrams(metric.chi_sq)), 
                             columns=['ngram','chisq']).sort_values(by='chisq', ascending=False)[:top].ngram.values
       
        lrt_top = pd.DataFrame(list(Finder.score_ngrams(metric.likelihood_ratio)), 
                             columns=['ngram','lrt']).sort_values(by='lrt', ascending=False)[:top].ngram.values
        
    finally: 
        ngram_comparison = pd.DataFrame([freq_top, pmi_top, ttest_top, chisq_top, lrt_top]).T
        ngram_comparison.columns = ['Frequency', 'PMI', 'T-Test', 'Chi-Square', 'Likelihood Ratio']
    
    if path:
        with open(path, 'w') as f:
            ngram_comparison.to_csv(f, sep='\t', index = False)
            
    else: 
        return ngram_comparison

In [88]:
# compare assoc measures for bigrams
ngram_comparison(factiva_cleaned, ngram = 'bigram', threshold=3, top=50, path='outputs/cleaned_bigrams_assoc_measures.csv')

In [91]:
# compare assoc measures for trigrams
ngram_comparison(factiva_cleaned, ngram = 'trigram', threshold=3, top = 50, path='outputs/cleaned_trigrams_assoc_measures.csv')

In [92]:
# compare assoc measures for quadgrams
ngram_comparison(factiva_cleaned, ngram = 'quadgram', threshold=3, top = 50, path='outputs/cleaned_quadgrams_assoc_measures.csv')

In [98]:
# visualise comparison tabel for quadgrams 
ngram_comparison(factiva_cleaned, ngram = 'quadgram', threshold=8, top = 20)

Unnamed: 0,Frequency,PMI,T-Test,Chi-Square,Likelihood Ratio
0,"(verhütung, bekämpfung, gewalt, frauen)","(schichten, nationen, familienverhältnissen, b...","(verhütung, bekämpfung, gewalt, frauen)","(schichten, nationen, familienverhältnissen, b...","(häuslicher, gewalt, frauen, kinder)"
1,"(gewalt, frauen, häuslicher, gewalt)","(deutsche, fassung, enw, bda)","(gewalt, frauen, häuslicher, gewalt)","(autor, deutsche, fassung, enw)","(fälle, häuslicher, gewalt, frauen)"
2,"(bekämpfung, gewalt, frauen, häuslicher)","(autor, deutsche, fassung, enw)","(bekämpfung, gewalt, frauen, häuslicher)","(deutsche, fassung, enw, bda)","(häuslicher, gewalt, gewalt, frauen)"
3,"(verein, frauen, helfen, frauen)","(veröffentlichung, bestimmt, kontakte, autor)","(verein, frauen, helfen, frauen)","(veröffentlichung, bestimmt, kontakte, autor)","(gewalt, frauen, häuslicher, gewalt)"
4,"(europarats, verhütung, bekämpfung, gewalt)","(deutsche, fassung, redaktion, enw)","(europarats, verhütung, bekämpfung, gewalt)","(veröffentlichung, bestimmt, kontakte, autorin)","(opfer, häuslicher, gewalt, geworden)"
5,"(folgenden, informationen, sind, nicht)","(besetzt, landeskriminalamt, kriminalpräventio...","(nicht, veröffentlichung, bestimmt, kontakte)","(kontakte, autor, deutsche, fassung)","(zahl, opfer, häuslicher, gewalt)"
6,"(informationen, sind, nicht, veröffentlichung)","(kontakte, autor, deutsche, fassung)","(sind, nicht, veröffentlichung, bestimmt)","(deutsche, fassung, redaktion, enw)","(prozent, opfer, häuslicher, gewalt)"
7,"(nicht, veröffentlichung, bestimmt, kontakte)","(veröffentlichung, bestimmt, kontakte, autorin)","(folgenden, informationen, sind, nicht)","(übereinkommen, europarats, verhütung, bekämpf...","(männliche, opfer, häuslicher, gewalt)"
8,"(sind, nicht, veröffentlichung, bestimmt)","(suchstichwörter, text, folgenden, informationen)","(informationen, sind, nicht, veröffentlichung)","(bestimmt, kontakte, autor, deutsche)","(opfer, häuslicher, gewalt, schützen)"
9,"(häuslicher, gewalt, betroffen, sind)","(landeskriminalamt, kriminalprävention, bietet...","(häuslicher, gewalt, betroffen, sind)","(landeskriminalamt, kriminalprävention, bietet...","(schutz, opfer, häuslicher, gewalt)"


#### END OF CODE 