In [1]:
import nltk

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.cistem import Cistem
import itertools
from datetime import datetime
import pandas as pd
import os
from pandas import read_csv
import csv

In [9]:
import dict_zurich
from dict_zurich import emotional_dictionary, positive_data_list_stemmed, negative_data_list_stemmed,positive_data_list_cleaned
from dict_zurich import negative_data_list_cleaned
from dict_zurich import dict_neg_words, dict_pos_words, dict_neg_words_stemmed, dict_pos_words_stemmed

# Dictionary Frequencies

The following piece of research did not make it into the official publication. However, it was still interesting to see what kind of emotionally charged vocabulary the sources use depending on their political affiliation. At the bottom of this notebook, you can find the insights from the frequency dictionaries that we obtained after cleaning the output and making sense of it.

In [3]:
# All stemmed keywords from the imported word lists, flattened into one list.
list_of_pos_neg_values_stemmed = list(itertools.chain(
    negative_data_list_stemmed,
    positive_data_list_stemmed,
    emotional_dictionary[1],
    dict_neg_words_stemmed,
    dict_pos_words_stemmed,
))

# The unstemmed counterparts of the keyword lists.
# NOTE(review): the last two lists are appended as (pos, neg) here but as
# (neg, pos) in the stemmed list above -- confirm the two lists are not
# expected to be index-aligned.
list_of_pos_neg_values_unstemmed = list(itertools.chain(
    negative_data_list_cleaned,
    positive_data_list_cleaned,
    emotional_dictionary[0],
    dict_pos_words,
    dict_neg_words,
))

# One zero per stemmed keyword.
list_of_empty_values = [0 for _ in list_of_pos_neg_values_stemmed]

In [10]:
def dict_of_freq(direct):
    """Build a keyword frequency dictionary for the corpus stored in ``direct``.

    Every ``.txt`` file directly inside ``direct`` is read as UTF-8, split into
    runs of word characters and stemmed with Cistem. A token is counted when
    its stem is one of ``list_of_pos_neg_values_stemmed``; the unstemmed forms
    that produced each stem are remembered so the result shows readable words
    instead of stems.

    Parameters
    ----------
    direct : str
        Path of the directory holding the corpus. A trailing separator is
        accepted but not required.

    Returns
    -------
    dict
        Maps a frequency (int) to the list of unstemmed word forms whose stem
        occurred that many times, ordered from the highest frequency to the
        lowest. Stems that share the same frequency contribute to the same
        list, so no keyword is dropped from the result.
    """
    # Only membership is needed, so a set is enough (and does not depend on a
    # separately maintained list of placeholder values).
    keyword_stems = set(list_of_pos_neg_values_stemmed)

    # Both helpers are reusable across files, so create them once.
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = Cistem()

    stem_counts = {}
    stem_to_unstemmed = {}

    # Sort the listing so the order of collected word forms is reproducible.
    for file in sorted(os.listdir(direct)):
        if not file.endswith(".txt"):
            continue

        # The context manager closes the file even if tokenizing fails.
        with open(os.path.join(direct, file), encoding="utf8") as handle:
            tokens = [token for line in handle for token in tokenizer.tokenize(line)]

        for unstemmed_word in tokens:
            stemmed_word = stemmer.stem(unstemmed_word)
            if stemmed_word not in keyword_stems:
                continue
            stem_counts[stemmed_word] = stem_counts.get(stemmed_word, 0) + 1
            seen_forms = stem_to_unstemmed.setdefault(stemmed_word, [])
            if unstemmed_word not in seen_forms:
                seen_forms.append(unstemmed_word)

    # Group by frequency. Several stems can share a frequency, so their word
    # forms are merged rather than letting the later stem overwrite the earlier.
    dict_merged = {}
    for stemmed_word, count in stem_counts.items():
        dict_merged.setdefault(count, []).extend(stem_to_unstemmed[stemmed_word])

    return dict(sorted(dict_merged.items(), key=lambda item: item[0], reverse=True))

In [11]:
# Build one frequency dictionary per news source, in the order stern, taz, dk.
# NOTE(review): taz is read from './texts/' while the other two sources use a
# sub-directory of it -- confirm this is the intended corpus location.
stern_freq_dic, taz_freq_dic, dk_freq_dic = map(
    dict_of_freq,
    ('./texts/stern/', './texts/', './texts/deutschland kurier/'),
)

In [84]:
# Lay the taz frequencies out with one column per frequency value (the word
# forms run down the rows), then export the table to an Excel file.
taz = pd.DataFrame.from_dict(taz_freq_dic, orient='index').T
taz.to_excel(excel_writer='taz_dict_freq.xlsx')

### Below are some insights into the frequencies in our corpora

Taz | Deutschland Kurier | Stern
---|---|---
Hilfe (145) | illegal (98) | retten (89)
Anzeige (83) | Abschiebung (91) | arm (62)
Haus (74) | fordern (76)| Kampf (55)
Abschiebung (73) | links (72) | Recht (53)
Problem (71) | Gesetz (66) | Gesetz (50)
Gesetz (70) | Bürger (63) | Kritik (40)
Schutz (59)| Recht (60) | Lösung (33)
Recht (58)| Straftat (51) | Tod (32)
Gefahr (54)| erhalten (47) | Solidarität (24)
offen (52)| Gewalt (40) | Hilfe (23)

The following are just speculations, yet they are quite interesting.

#### Taz
For the liberal source, Taz, we see that the discourse about refugees revolves around *help* (Hilfe) and *complaints* (Anzeige). Housing is a big topic, as are refugees' *rights* and the *legislation* regarding their presence in the country. Overall, the liberal source seems to be more compassionate towards refugees and to defend their rights.

#### Deutschland Kurier
The conservative source, Deutschland Kurier, is pretty straightforward about the association between refugees and illegal activities. Abschiebung (deportation, i.e. sending them home) comes a close second. There are many mentions of crime and violence as well as appeals to law and human rights.

#### Stern
The social-democratic Stern shows compassion towards refugees (arm - poor) and discusses their rescue a lot (retten). The vocabulary is also decidedly more negative than that of the other sources. It contains words such as fight or struggle (Kampf) and death (Tod).