### Imports

In [1]:
# import usual packages
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd
from ast import literal_eval
import os

### Read Data

In [3]:
# connect with google drive (the google.colab package is only available inside Colab)
from google.colab import drive
# mount the Drive at /content/drive so its files can be reached through local paths
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#change cwd
%cd drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [None]:
# import spacy package
#!python -m spacy download de_core_news_lg

#### Read Data

In [10]:
# load sample.csv, resolved relative to the current working directory
df=pd.read_csv("sample.csv")

In [11]:
# sanity check: (rows, columns) of the loaded frame
df.shape

(50, 9)

# Preprocessing 

### Main steps:

- load (german) spacy model 
- convert corpus to spacy language object
- run preprocessing function to clean corpus

- review stopwords to keep causal language/verbs/pronouns

- create two versions of corpus for comparison: 
    - fully cleaned w/o entities/stopwords/proper nouns
    - partially cleaned w/o entities/proper nouns but w/causal stopwords


In [12]:
# custom module
import preprocessing

In [15]:
# Load model
# The de_core_news_lg pipeline is loaded with the 'ner', 'parser' and 'tagger'
# components switched off.
# NOTE(review): preprocessing.preprocess is later called with remove_ent=True —
# confirm that its entity removal does not rely on the disabled 'ner' component.
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [16]:
# read custom stopwords

# open list of custom stopwords (whitespace-separated words).
# The context manager closes the file handle again, and the explicit UTF-8
# encoding keeps decoding of the word list independent of the platform locale.
with open("custom_stopwords.txt", encoding="utf-8") as stopword_file:
    custom_stop_words = stopword_file.read().split()

# add custom stopwords to model
for word in custom_stop_words:
    spacy_mod.Defaults.stop_words.add(word)

In [17]:
# keep at most the first 10,000 rows (all columns); df.shape above reports 50 rows,
# so for this sample nothing is cut off
smaller_df=df.iloc[:10000,:]

In [18]:
# convert corpus to language object
# Language.pipe streams the texts through the model in batches instead of
# calling the model once per DataFrame row; the former "".join(text) was a
# no-op on a string and is dropped. Passing `total` lets tqdm show real progress.
spacy_lang = list(
    tqdm(spacy_mod.pipe(smaller_df['text']), total=len(smaller_df))
)

50it [00:03, 14.39it/s]


In [19]:
# preprocess: run the custom cleaning step (stopword removal, remove_ent=True)
# over every parsed document and collect the results in order
spacy_cleaned = [
    preprocessing.preprocess(parsed_doc, remove_ent=True)
    for parsed_doc in tqdm(spacy_lang)
]

100%|██████████| 50/50 [00:00<00:00, 1197.05it/s]


In [20]:
# compare dirty and cleaned corpus: for the first document of each version,
# show a leading slice followed by the total length
for docs, preview_len in ((spacy_lang, 30), (spacy_cleaned, 10)):
    first_doc = docs[0]
    print(first_doc[0:preview_len], len(first_doc), sep='\n')

['(lnw). Eine 36-jährige Frau, die am Dienstag in Krefeld niedergestochen wurde, ist am selben Abend im Krankenhaus gestorben. Das teilte die Polizei
94
['lnw', 'frau', 'dienstag', 'niedergestochen', 'selben', 'abend', 'krankenhaus', 'gestorben', 'teilte', 'polizei']
31


In [21]:
# rebind the name to an empty list so the parsed documents are no longer referenced by it
# NOTE(review): cells above that read spacy_lang will see an empty list if re-run after this one
spacy_lang=[]

# Collocation Analysis

## Main steps:

- Create bigrams, trigrams and quadgrams
- Rank order ngrams by raw frequency (print to txt files)

- Examine association measures for top 10 ngrams
- Analyse differences in ngrams across documents
- Check for collocation strength/significance testing

- plot comparisons: association strength (dot chart) / network graphs / biplots (using semantic similarity)

## Methods:
- rank_bigrams: ranks bigrams by given metric 
- rank_trigrams: ranks trigrams by given metric 
- rank_quadgrams: ranks quadgrams by given metric 
- ngram_comparison: compares ngram by different metrics

In [22]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics.association import BigramAssocMeasures

from nltk.collocations import TrigramCollocationFinder
from nltk.metrics.association import TrigramAssocMeasures

from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics.association import QuadgramAssocMeasures


In [23]:
# rank bigrams by given metric 
def rank_bigrams(corpus, metric, from_words=False, threshold=1, path=None):
    """
    Find and rank bigrams from the supplied corpus using the given
    association metric. Write the bigrams out to the given path if
    supplied, otherwise return the list in memory.
    
    Args:
    corpus (iterable): documents, each an iterable of tokens.
    metric (callable): NLTK bigram association measure, e.g. BigramAssocMeasures.raw_freq.
    from_words (bool, optional): If True, unlist all documents and create bigrams
        from the single resulting token list. If False, bigrams are built
        with BigramCollocationFinder.from_documents.
    threshold (int, optional): ignore all bigrams which occur less than n times
        in the corpus. Defaults to 1, which filters nothing out.
    path (str, optional): path to save the bigram list as a tab-delimited .txt file
    
    Returns:
    The list of (bigram, score) pairs from score_ngrams when no path is
    given; None when the ranking was written to path.
    
    """
    
    if from_words:
        # turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from the flattened corpus
        finder = BigramCollocationFinder.from_words(unlist_corpus)
    else:
        finder = BigramCollocationFinder.from_documents(corpus)

    # Drop bigrams that occur fewer than `threshold` times
    finder.apply_freq_filter(threshold)
    # Rank collocations by the association metric
    scored = finder.score_ngrams(metric)

    if path:
        # Write to disk as tab-delimited file; UTF-8 so umlauts in the
        # tokens are written the same way on every platform
        with open(path, 'w', encoding='utf-8') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored

In [24]:
#ranks trigrams by given metric
def rank_trigrams(corpus, metric, from_words=False, threshold=1, path=None):
    """
    Find and rank trigrams from the supplied corpus using the given
    association metric. Write the trigrams out to the given path if
    supplied, otherwise return the list in memory.
    
    Args:
    corpus (iterable): documents, each an iterable of tokens.
    metric (callable): NLTK trigram association measure, e.g. TrigramAssocMeasures.raw_freq.
    from_words (bool, optional): If True, unlist all documents and create trigrams
        from the single resulting token list. If False, trigrams are built
        with TrigramCollocationFinder.from_documents.
    threshold (int, optional): ignore all trigrams which occur less than n times
        in the corpus. Defaults to 1, which filters nothing out.
    path (str, optional): path to save the trigram list as a tab-delimited .txt file
    
    Returns:
    The list of (trigram, score) pairs from score_ngrams when no path is
    given; None when the ranking was written to path.
    
    """
    
    if from_words:
        # turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from the flattened corpus
        finder = TrigramCollocationFinder.from_words(unlist_corpus)
    else:
        finder = TrigramCollocationFinder.from_documents(corpus)

    # Drop trigrams that occur fewer than `threshold` times
    finder.apply_freq_filter(threshold)
    # Rank collocations by the association metric
    scored = finder.score_ngrams(metric)

    if path:
        # Write to disk as tab-delimited file; UTF-8 so umlauts in the
        # tokens are written the same way on every platform
        with open(path, 'w', encoding='utf-8') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored
    

In [25]:
# rank quadgrams by given metric 
def rank_quadgrams(corpus, metric, from_words=False, threshold=1, path=None):
    """
    Find and rank quadgrams from the supplied corpus using the given
    association metric. Write the quadgrams out to the given path if
    supplied, otherwise return the list in memory.
    
    Args:
    corpus (iterable): documents, each an iterable of tokens.
    metric (callable): NLTK quadgram association measure, e.g. QuadgramAssocMeasures.raw_freq.
    from_words (bool, optional): If True, unlist all documents and create quadgrams
        from the single resulting token list. If False, quadgrams are built
        with QuadgramCollocationFinder.from_documents.
    threshold (int, optional): ignore all quadgrams which occur less than n times
        in the corpus. Defaults to 1, which filters nothing out.
    path (str, optional): path to save the quadgram list as a tab-delimited .txt file
    
    Returns:
    The list of (quadgram, score) pairs from score_ngrams when no path is
    given; None when the ranking was written to path.
    
    """
    
    if from_words:
        # turn all doc tokens into one single list
        unlist_corpus = [item for items in corpus for item in items]
        # Create a collocation ranking utility from the flattened corpus
        finder = QuadgramCollocationFinder.from_words(unlist_corpus)
    else:
        finder = QuadgramCollocationFinder.from_documents(corpus)

    # Drop quadgrams that occur fewer than `threshold` times
    finder.apply_freq_filter(threshold)
    # Rank collocations by the association metric
    scored = finder.score_ngrams(metric)

    if path:
        # Write to disk as tab-delimited file; UTF-8 so umlauts in the
        # tokens are written the same way on every platform
        with open(path, 'w', encoding='utf-8') as f:
            f.write("Collocation\tScore ({})\n".format(metric.__name__))
            for ngram, score in scored:
                f.write("{}\t{}\n".format(repr(ngram), score))
    else:
        return scored
    

In [26]:
def ngram_comparison(corpus, ngram='bigram', threshold=1, top=10, path=None):
    
    """
    Create comparison table for given ngram-type for 
    NLTK association measures based on supplied corpus.
    
    Each column lists the `top` highest-scoring ngrams under one measure:
    raw frequency, PMI, Student's t, chi-square and likelihood ratio.
    
    Write the ngram table out to the given path if 
    supplied otherwise return the table in memory.
    
    Args:
    corpus (iterable): documents, each an iterable of tokens; all documents
        are unlisted into one token list before the ngrams are built
    ngram (str): type of ngram, one of 'bigram', 'trigram' or 'quadgram'
        (only one option per function run). Defaults to 'bigram'.
    threshold (int): ignore all ngrams which occur less than n times in the corpus.
        Defaults to 1, which filters nothing out.
    top (int): number of top ngrams to be displayed. Defaults to 10.
    path (str): path to save the ngram table as a tab-delimited file
    
    Returns:
    pandas.DataFrame with the columns 'Frequency', 'PMI', 'T-Test',
    'Chi-Square' and 'Likelihood Ratio' when no path is given; None when
    the table was written to path.
    
    Raises:
    ValueError: if `ngram` is not one of the three supported types.
    
    """
    
    # Resolve the finder / measure pair first, so an unsupported `ngram`
    # fails with a clear message instead of an UnboundLocalError further down.
    if ngram == 'bigram':
        finder_cls = nltk.collocations.BigramCollocationFinder
        measures = nltk.collocations.BigramAssocMeasures()
    elif ngram == 'trigram':
        finder_cls = nltk.collocations.TrigramCollocationFinder
        measures = nltk.collocations.TrigramAssocMeasures()
    elif ngram == 'quadgram':
        finder_cls = nltk.collocations.QuadgramCollocationFinder
        measures = nltk.collocations.QuadgramAssocMeasures()
    else:
        raise ValueError(
            "ngram must be one of 'bigram', 'trigram' or 'quadgram', got {!r}".format(ngram)
        )

    # turn all doc tokens into one single list
    unlist_corpus = [item for items in corpus for item in items]
    finder = finder_cls.from_words(unlist_corpus)
    finder.apply_freq_filter(threshold)

    # (column label, scoring function) in the order the columns are displayed
    ranked_by = (
        ('Frequency', measures.raw_freq),
        ('PMI', measures.pmi),
        ('T-Test', measures.student_t),
        ('Chi-Square', measures.chi_sq),
        ('Likelihood Ratio', measures.likelihood_ratio),
    )

    # One array of the `top` best ngrams per measure
    top_ngrams = []
    for _, score_fn in ranked_by:
        scored = pd.DataFrame(list(finder.score_ngrams(score_fn)),
                              columns=['ngram', 'score'])
        best = scored.sort_values(by='score', ascending=False)[:top]
        top_ngrams.append(best.ngram.values)

    # Each array is a row here; transpose so every measure becomes a column
    comparison = pd.DataFrame(top_ngrams).T
    comparison.columns = [label for label, _ in ranked_by]
    
    if path:
        # tab-delimited, UTF-8 so umlauts in the tokens survive on every platform
        comparison.to_csv(path, sep='\t', index=False, encoding='utf-8')
            
    else: 
        return comparison

## 1) Creating & Ranking Ngrams

- Bigrams from cleaned and unprocessed corpus

In [33]:
# raw_freq + minimum 5 occurrences + from_words
rank_bigrams(spacy_cleaned, BigramAssocMeasures.raw_freq, from_words=True, threshold=5)

[(('häuslicher', 'gewalt'), 0.006795979045731275),
 (('gewalt', 'frauen'), 0.0032564066260795696),
 (('häusliche', 'gewalt'), 0.002831657935721365),
 (('frauen', 'opfer'), 0.0014158289678606825),
 (('opfer', 'häuslicher'), 0.0012742460710746142),
 (('weißen', 'rings'), 0.001132663174288546),
 (('fälle', 'häuslicher'), 0.0009910802775024777),
 (('männer', 'frauen'), 0.0009910802775024777),
 (('weißen', 'ring'), 0.0009910802775024777),
 (('frauen', 'kinder'), 0.0008494973807164094),
 (('mord', 'totschlag'), 0.0008494973807164094),
 (('frauen', 'helfen'), 0.0007079144839303412),
 (('frauen', 'männer'), 0.0007079144839303412),
 (('gefährlicher', 'körperverletzung'), 0.0007079144839303412),
 (('weiße', 'ring'), 0.0007079144839303412)]

- Trigrams from cleaned and unprocessed corpus

In [34]:
# for cleaned factiva corpus: raw_freq + minimum 3 occurrences + from_words
rank_trigrams(spacy_cleaned, TrigramAssocMeasures.raw_freq, from_words=True, threshold=3,)

[(('opfer', 'häuslicher', 'gewalt'), 0.0012742460710746142),
 (('fälle', 'häuslicher', 'gewalt'), 0.0009910802775024777),
 (('frauen', 'helfen', 'frauen'), 0.000566331587144273),
 (('schutz', 'häuslicher', 'gewalt'), 0.000566331587144273),
 (('häuslicher', 'gewalt', 'frau'), 0.0004247486903582047)]

- Quadgrams from cleaned and unprocessed corpus

In [37]:
# for cleaned factiva corpus: raw_freq + minimum 3 occurrences + from_words
# (the empty result below means no quadgram reaches that count in this sample)
rank_quadgrams(spacy_cleaned, QuadgramAssocMeasures.raw_freq, from_words=True, threshold=3,)

[]

## Comparison of association measures for ngrams

- For bigrams, trigrams and quadgrams, compare the top-ranked n-grams under five association measures: raw frequency, PMI, Student's t-test, chi-square and likelihood ratio

In [30]:
# compare assoc measures for bigrams: top 10 per measure, minimum 3 occurrences
# (the commented-out `path` argument at the end would write the table to disk instead)
ngram_comparison(spacy_cleaned, ngram = 'bigram', threshold=3, top=10,)# path='outputs/cleaned_bigrams_assoc_measures.csv')

Unnamed: 0,Frequency,PMI,T-Test,Chi-Square,Likelihood Ratio
0,"(häuslicher, gewalt)","(entsprechenden, antrag)","(häuslicher, gewalt)","(forensische, ambulanz)","(häuslicher, gewalt)"
1,"(gewalt, frauen)","(forensische, ambulanz)","(häusliche, gewalt)","(mord, totschlag)","(häusliche, gewalt)"
2,"(häusliche, gewalt)","(bedrohung, nötigung)","(gewalt, frauen)","(entsprechenden, antrag)","(weißen, rings)"
3,"(frauen, opfer)","(mord, totschlag)","(opfer, häuslicher)","(bedrohung, nötigung)","(mord, totschlag)"
4,"(opfer, häuslicher)","(sexuellen, übergriffen)","(weißen, rings)","(sexuellen, übergriffen)","(gewalt, frauen)"
5,"(weißen, rings)","(anzeige, erstatten)","(frauen, opfer)","(weißen, rings)","(weißen, ring)"
6,"(fälle, häuslicher)","(flucht, alten)","(weißen, ring)","(gefährlicher, körperverletzung)","(forensische, ambulanz)"
7,"(männer, frauen)","(stier, heblich)","(fälle, häuslicher)","(anzeige, erstatten)","(gefährlicher, körperverletzung)"
8,"(weißen, ring)","(gefährlicher, körperverletzung)","(mord, totschlag)","(weiße, ring)","(weiße, ring)"
9,"(mord, totschlag)","(genios, style)","(männer, frauen)","(häuslicher, gewalt)","(anzeige, erstatten)"


In [31]:
# compare assoc measures for trigrams: top 10 per measure, minimum 2 occurrences
# (the commented-out `path` argument at the end would write the table to disk instead)
ngram_comparison(spacy_cleaned, ngram = 'trigram', threshold=2, top = 10,)# path='outputs/cleaned_trigrams_assoc_measures.csv')

Unnamed: 0,Frequency,PMI,T-Test,Chi-Square,Likelihood Ratio
0,"(opfer, häuslicher, gewalt)","(rassistisch, motivierte, aggression)","(opfer, häuslicher, gewalt)","(rassistisch, motivierte, aggression)","(häuslicher, gewalt, frauen)"
1,"(fälle, häuslicher, gewalt)","(zweckgemeinschaft, unterm, strich)","(fälle, häuslicher, gewalt)","(zweckgemeinschaft, unterm, strich)","(fälle, häuslicher, gewalt)"
2,"(frauen, helfen, frauen)","(unterm, strich, tragisch)","(schutz, häuslicher, gewalt)","(unterm, strich, tragisch)","(opfer, häuslicher, gewalt)"
3,"(schutz, häuslicher, gewalt)","(angriff, offenheit, hochschulen)","(frauen, helfen, frauen)","(angriff, offenheit, hochschulen)","(schutz, häuslicher, gewalt)"
4,"(häuslicher, gewalt, frau)","(job, freunde, vereine)","(häuslicher, gewalt, frau)","(job, freunde, vereine)","(anzeigen, häuslicher, gewalt)"
5,"(männer, opfer, häuslicher)","(versuchte, vollendete, delikte)","(rassistisch, motivierte, aggression)","(versuchte, vollendete, delikte)","(fall, häuslicher, gewalt)"
6,"(quasi, letzte, instanz)","(neuer, job, freunde)","(unterm, strich, tragisch)","(neuer, job, freunde)","(häuslicher, gewalt, landkreis)"
7,"(opfer, gefährlicher, körperverletzung)","(raus, wichtigste, regel)","(zweckgemeinschaft, unterm, strich)","(raus, wichtigste, regel)","(häuslicher, gewalt, frau)"
8,"(neues, soziales, umfeld)","(kinderschutzeinrichtung, vereins, wegweiser)","(angriff, offenheit, hochschulen)","(kinderschutzeinrichtung, vereins, wegweiser)","(häuslicher, gewalt, prozent)"
9,"(neuer, job, freunde)","(quasi, letzte, instanz)","(job, freunde, vereine)","(quasi, letzte, instanz)","(häusliche, gewalt, erlitten)"


In [32]:
# compare assoc measures for quadgrams: top 10 per measure, minimum 2 occurrences
# (the commented-out `path` argument at the end would write the table to disk instead)
ngram_comparison(spacy_cleaned, ngram = 'quadgram', threshold=2, top = 10,)# path='outputs/cleaned_quadgrams_assoc_measures.csv')

Unnamed: 0,Frequency,PMI,T-Test,Chi-Square,Likelihood Ratio
0,"(absolut, kontakt, alte, umfeld)","(zweckgemeinschaft, unterm, strich, tragisch)","(zweckgemeinschaft, unterm, strich, tragisch)","(zweckgemeinschaft, unterm, strich, tragisch)","(männer, opfer, häuslicher, gewalt)"
1,"(neues, soziales, umfeld, neuer)","(neuer, job, freunde, vereine)","(neuer, job, freunde, vereine)","(neuer, job, freunde, vereine)","(schutz, häuslicher, gewalt, landkreis)"
2,"(wichtigste, regel, lautet, absolut)","(wichtigste, regel, lautet, absolut)","(wichtigste, regel, lautet, absolut)","(wichtigste, regel, lautet, absolut)","(zweckgemeinschaft, unterm, strich, tragisch)"
3,"(verein, frauen, helfen, frauen)","(raus, wichtigste, regel, lautet)","(raus, wichtigste, regel, lautet)","(raus, wichtigste, regel, lautet)","(neuer, job, freunde, vereine)"
4,"(umfeld, neuer, job, freunde)","(bemächtigt, all, daten, passwörter)","(bemächtigt, all, daten, passwörter)","(bemächtigt, all, daten, passwörter)","(alternative, flucht, alten, leben)"
5,"(soziales, umfeld, neuer, job)","(soziales, umfeld, neuer, job)","(soziales, umfeld, neuer, job)","(soziales, umfeld, neuer, job)","(wichtigste, regel, lautet, absolut)"
6,"(schutz, häuslicher, gewalt, landkreis)","(regel, lautet, absolut, kontakt)","(regel, lautet, absolut, kontakt)","(regel, lautet, absolut, kontakt)","(bemächtigt, all, daten, passwörter)"
7,"(schutz, frauen, mädchen, geschlechtsspezifisc...","(umfeld, neuer, job, freunde)","(umfeld, neuer, job, freunde)","(umfeld, neuer, job, freunde)","(raus, wichtigste, regel, lautet)"
8,"(regel, lautet, absolut, kontakt)","(neues, soziales, umfeld, neuer)","(neues, soziales, umfeld, neuer)","(neues, soziales, umfeld, neuer)","(soziales, umfeld, neuer, job)"
9,"(raus, wichtigste, regel, lautet)","(lautet, absolut, kontakt, alte)","(lautet, absolut, kontakt, alte)","(lautet, absolut, kontakt, alte)","(regel, lautet, absolut, kontakt)"


## Observations
- Bigrams:
  - gewalt + frauen: most reporting concerns cases involving women
  - weißen + rings: mentions of help for victims
  - mord + totschlag: reporting on extreme cases
  - gefährlicher + körperverletzung: reporting on physical violence
- Trigrams:
  - fälle + häuslicher + gewalt: general reporting on domestic violence
  - schutz + häuslicher + gewalt: mentions of help for victims / campaigns against domestic violence


#### END OF CODE 