#### Imports

In [34]:
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd
from ast import literal_eval
import os
import gensim
import gensim.corpora as corpora
from pprint import pprint


In [10]:
#!python -m spacy download de_core_news_lg

#### Read Data

In [22]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#change cwd
%cd drive/MyDrive/Work/Frontline/data

In [20]:
# paths
FILTERED_PATH="filtered_4_26"

In [40]:
dfs = []

# loop through files 
for filename in os.listdir(FILTERED_PATH):
    # if csv file, load and add to dfs  
    if filename.endswith(".csv"):
        file_path = os.path.join(FILTERED_PATH, filename)
        df = pd.read_csv(file_path, index_col=0, converters={"text":literal_eval})
        dfs.append(df)

# combine files in df
df = pd.concat(dfs, ignore_index=True)

### Topic Modelling


#### Prepare Data


In [41]:
# custom module
import preprocessing

In [42]:
# Load model
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [43]:
# read custom stopwords

# open list of custom stopwords
custom_stop_words= open("custom_stopwords.txt").read().split()

# add custom stopwords to model
for word in custom_stop_words:
  spacy_mod.Defaults.stop_words.add(word)

In [45]:
smaller_df=df.iloc[:10000,:]

In [46]:
# convert corpus to language object
spacy_lang = []
for i, doc in tqdm(smaller_df.iterrows()): 
  spacy_lang.append(spacy_mod("".join(doc['text'])))

10000it [05:41, 29.29it/s]


In [47]:
# preprocess: remove stopwords
spacy_cleaned = []
for doc in tqdm(spacy_lang): 
    spacy_cleaned.append(preprocessing.preprocess(doc, remove_ent=True))

100%|██████████| 10000/10000 [00:05<00:00, 1721.74it/s]


#### Topic Analysis

In [48]:
# Create Dictionary
id2word = corpora.Dictionary(spacy_cleaned)

# Create Corpus
texts = spacy_cleaned

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [49]:
# number of topics
num_topics = 5
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=id2word, num_topics=num_topics)
# Print the key words for each topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]



[(0,
  '0.033*"telefon" + 0.012*"gewalt" + 0.011*"bereitschaftsdienst" + '
  '0.009*"frauen" + 0.005*"häuslicher" + 0.005*"menschen" + 0.004*"beratung" + '
  '0.004*"prozent" + 0.004*"kinder" + 0.003*"sa"'),
 (1,
  '0.018*"telefon" + 0.006*"polizei" + 0.005*"gewalt" + 0.005*"menschen" + '
  '0.005*"frauen" + 0.005*"frau" + 0.004*"mann" + 0.004*"beratung" + '
  '0.003*"leben" + 0.003*"kinder"'),
 (2,
  '0.053*"bereitschaftsdienst" + 0.033*"do" + 0.030*"mi" + 0.026*"telefon" + '
  '0.026*"gewalt" + 0.021*"häuslicher" + 0.018*"jugendtelefon" + '
  '0.018*"entstörungsdienst" + 0.017*"interventionsstelle" + 0.017*"kummer"'),
 (3,
  '0.016*"gewalt" + 0.013*"frauen" + 0.007*"polizei" + 0.006*"frau" + '
  '0.005*"kinder" + 0.005*"opfer" + 0.005*"prozent" + 0.005*"häuslicher" + '
  '0.005*"menschen" + 0.004*"mann"'),
 (4,
  '0.066*"telefon" + 0.016*"bereitschaftsdienst" + 0.011*"mi" + '
  '0.008*"beratung" + 0.007*"gewalt" + 0.006*"montag" + 0.006*"jugendtelefon" '
  '+ 0.006*"sprechstunde" + 0

In [50]:
## VISUALiZATION NOT WORKING ##

# import pyLDAvis
# import pyLDAvis.gensim
# import pickle 
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

END OF CODE