In [2]:
# check working directory
import os
os.getcwd() # if directory is subfolder, change to home
os.chdir('/home/sukayna/Documents/github/newspaper')

In [4]:
# import usual packages
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd

In [23]:
# download POS tagger from NLTK
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sukayna/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [5]:
# always use json to load the corpus
with open('data/factiva_data.json', 'r') as f:
    factiva_corpus = json.load(f)

In [24]:
# Load model (with pos tagger component activated)
spacy_mod = spacy.load("de_core_news_lg",
                 disable=['ner', 'parser'])

In [25]:
# create smaller set of acceptable stopwords
# remove verbs, pronouns and connectors with causal meaning from stopwords
spacy_mod.Defaults.stop_words -= {'kam', 'sollte', 'dich', 'achte', 'daraus', 'dir', 'werdet', 'seid', 'unser', 'macht', 'deswegen',
                                  'außerdem', 'damit', 'habe', 'können', 'könnt', 'hatte', 'werde', 'andere', 'deiner', 'meines',
                                  'niemandem', 'achten', 'dürft', 'rechten', 'machte', 'dahinter', 'sah', 'seinen', 'dementsprechend',
                                  'kann', 'muß', 'wäre', 'geworden', 'wegen', 'machen', 'waren', 'dürfen', 'dein', 'mögen', 'würde',
                                  'musst', 'magst', 'ihren', 'aber', 'möchte', 'ihr', 'wir', 'allerdings', 'jedem', 'nicht', 'ihres',
                                  'kommt', 'gibt', 'infolgedessen', 'mögt', 'doch', 'sollten', 'seine', 'keine', 'wollte', 'ich',
                                  'müssen', 'wollten', 'warum', 'ist', 'ihnen', 'mein', 'mochte', 'geht', 'trotzdem', 'gab', 'durfte',
                                  'dagegen', 'sie', 'sind', 'wart', 'wer', 'haben', 'du', 'werden', 'eigene', 'ihn', 'seien', 'eigen',
                                  'meinen', 'seiner', 'hatten', 'müsst', 'wollen', 'indem', 'wollt', 'gehabt', 'deine',  'denn',
                                  'nachdem', 'konnte', 'ihrer', 'seinem', 'gemusst', 'bin', 'währenddem', 'dank', 'willst', 'würden',
                                  'gemocht',  'hätten', 'demzufolge', 'seines', 'mussten', 'nahm', 'daher', 'darauf', 'ging', 'mochten',
                                  'meinem', 'darum', 'gedurft', 'wurden', 'bist', 'ihrem', 'gehen', 'sein', 'kannst', 'gewollt',
                                  'könnte',  'heisst', 'neben', 'meiner', 'euch', 'darfst', 'deshalb', 'konnten', 'ausserdem', 'ihm',
                                  'tun', 'gekannt',  'worden', 'habt', 'darf', 'demgegenüber', 'gewesen', 'sollen', 'soll', 'kommen',
                                  'tat', 'jahre'}

In [26]:
# convert corpus to language object
factiva_spacy = []
for doc in tqdm(factiva_corpus):
    factiva_spacy.append(spacy_mod(doc['body']))

100%|████████████████████████████████████████████████████████████| 2564/2564 [00:59<00:00, 42.89it/s]


In [27]:
# Function for preprocessing (does not remove stopwords)
def preprocess(doc: str, remove_ent=False):
    """_summary_

    Args:
        doc (str): String text
        remove_ent (bool, optional): If True, removes entities using spaCy. Defaults to False.

    Returns:
        doc_preprocessed (list): Preprocessed lower-case corpus with punctuation, non-alphanumeric characters, spaCy stopwords and proper nouns removed.
    """
    
    if remove_ent == True:
        doc_no_ent = []
        ents = [e.text for e in doc.ents]
        for item in doc:
            if item.text in ents:
                pass
            else:
                doc_no_ent.append(item)

        doc_preprocessed = [token.lower_ for token in doc_no_ent if
                            # token is not punctuation
                            token.is_punct == False and
                            # token is alphanumeric character
                            token.is_alpha == True and
                            # token is not proper noun
                            token.pos_ != "PROPN"]
        
    else: # do not remove entities
        doc_preprocessed = [token.lower_ for token in doc if
                            # token is not punctuation
                            token.is_punct == False and
                            # token is alphanumeric character
                            token.is_alpha == True and
                            # token is not proper noun
                            token.pos_ != "PROPN"]

    return doc_preprocessed

In [28]:
# preprocess (only minimal cleaning!)
factiva_cleaned = []
for doc in tqdm(factiva_spacy): 
    factiva_cleaned.append(preprocess(doc, remove_ent=True))

100%|██████████████████████████████████████████████████████████| 2564/2564 [00:01<00:00, 2486.81it/s]


In [29]:
# unlist corpus for extracting ngrams from words instead of docs
unlisted = [item for items in factiva_cleaned for item in items]
len(unlisted) # total tokens in corpus

1032054

In [30]:
# initialise bigram finder and assoc measures
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlisted)
bigrams = nltk.collocations.BigramAssocMeasures()

In [31]:
# extract frequency
bigram_freq = bigramFinder.ngram_fd.items()

In [32]:
# create bigram freq table
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [33]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(in, der)",4278
1,"(häuslicher, gewalt)",3375
2,"(in, den)",2002
3,"(häusliche, gewalt)",1886
4,"(für, die)",1531


### Use POS tags to filter bigrams 

Note: needs to be improved/fixed

In [34]:
# get english stopwords
de_stopwords = set(spacy_mod.Defaults.stop_words)

In [35]:
# function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    if '-pron-' in ngram or '' in ngram or ' 'in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in de_stopwords:
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False

In [40]:
# filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [41]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
1779,"(häuslicher, gewalt)",3375
3046,"(häusliche, gewalt)",1886
1778,"(wegen, häuslicher)",524
2137,"(opfer, häuslicher)",395
4810,"(häuslichen, gewalt)",292
3617,"(sie, nicht)",281
7242,"(seine, frau)",274
11031,"(aber, nicht)",274
20420,"(wir, haben)",274
4425,"(ich, habe)",259
