#### Imports

In [1]:
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd
from ast import literal_eval
import os
import gensim
import gensim.corpora as corpora
from pprint import pprint


In [None]:
# Fetch the large German spaCy pipeline that is loaded further down via spacy.load("de_core_news_lg")
!python -m spacy download de_core_news_lg

In [None]:
# NOTE: the default pyLDAvis version does not work in Colab, so an older release is pinned explicitly
!pip install pyLDAvis==2.1.2

#### Read Data

In [4]:
# Connect to Google Drive: mount it into the Colab filesystem at /content/drive
# so that the data files below can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
#change cwd
%cd drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [6]:
# Paths used by the notebook
FILTERED_PATH = "filtered_4_26"

In [7]:
# For now only the small sample data set is loaded (read from the current working directory).
df = pd.read_csv("sample.csv")

### Topic Modelling


#### Prepare Data


In [8]:
# custom module
import preprocessing

In [9]:
# Load the large German spaCy pipeline with the components listed in `disable` switched off.
# NOTE(review): 'ner' is disabled here, while the preprocessing step further down calls
# preprocessing.preprocess(..., remove_ent=True) — confirm that helper does not rely on
# entity annotations, otherwise no entities would ever be found.
spacy_mod = spacy.load(
    "de_core_news_lg",
    disable=["ner", "parser", "tagger"],
)

In [10]:
# Read the custom stop-word list (whitespace-separated tokens).
# The context manager closes the file handle deterministically instead of leaving
# it to the garbage collector; the explicit encoding keeps German umlauts intact
# regardless of the platform default (assumes the file is UTF-8 — TODO confirm).
with open("custom_stopwords.txt", encoding="utf-8") as stopword_file:
    custom_stop_words = stopword_file.read().split()

# Add the custom stop words to the stop-word set of the loaded pipeline's language defaults.
spacy_mod.Defaults.stop_words.update(custom_stop_words)

In [11]:
# Run the text of every row through the spaCy pipeline and collect the resulting
# language objects, one per row of `df`, in row order.
spacy_lang = [
    spacy_mod("".join(row['text']))
    for _, row in tqdm(df.iterrows())
]

50it [00:02, 22.20it/s]


In [12]:
# Preprocess every parsed document with the project's helper module
# (stop-word removal according to the original note; remove_ent=True is passed through unchanged).
spacy_cleaned = [
    preprocessing.preprocess(parsed_doc, remove_ent=True)
    for parsed_doc in tqdm(spacy_lang)
]

100%|██████████| 50/50 [00:00<00:00, 1732.73it/s]


#### Topic Analysis

In [13]:
import pyLDAvis
import pyLDAvis.gensim
import pickle 

#### Creating the model

In [30]:
# Preprocessed documents that feed both the dictionary and the corpus
texts = spacy_cleaned

# Dictionary: maps every token seen in the documents to an integer id
id2word = corpora.Dictionary(texts)

# Term-document frequencies: one bag-of-words vector per document
corpus = list(map(id2word.doc2bow, texts))

  and should_run_async(code)


In [29]:
# Number of topics the model should extract
num_topics = 5
# Fixed seed: LDA is randomly initialised, so without it every run yields
# different topics and the printed output below cannot be reproduced.
random_seed = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=random_seed,
)
# Print the key words for each topic
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (topic distribution per document)
doc_lda = lda_model[corpus]

  and should_run_async(code)


[(0,
  '0.009*"frauen" + 0.009*"frau" + 0.007*"gewalt" + 0.004*"häuslicher" + '
  '0.004*"opfer" + 0.004*"mann" + 0.003*"polizei" + 0.003*"männer" + '
  '0.002*"kinder" + 0.002*"menschen"'),
 (1,
  '0.010*"frauen" + 0.008*"gewalt" + 0.006*"opfer" + 0.006*"frau" + '
  '0.005*"mann" + 0.004*"häuslicher" + 0.003*"männer" + 0.003*"leben" + '
  '0.003*"landkreis" + 0.002*"menschen"'),
 (2,
  '0.011*"frau" + 0.009*"gewalt" + 0.007*"mann" + 0.007*"frauen" + '
  '0.006*"opfer" + 0.005*"polizei" + 0.003*"häuslicher" + 0.003*"männer" + '
  '0.003*"leben" + 0.002*"fälle"'),
 (3,
  '0.016*"gewalt" + 0.011*"frauen" + 0.008*"opfer" + 0.006*"frau" + '
  '0.005*"häuslicher" + 0.004*"mann" + 0.003*"männer" + 0.003*"prozent" + '
  '0.003*"polizei" + 0.003*"leben"'),
 (4,
  '0.019*"gewalt" + 0.018*"frauen" + 0.007*"opfer" + 0.007*"frau" + '
  '0.005*"häuslicher" + 0.005*"mann" + 0.004*"männer" + 0.004*"polizei" + '
  '0.004*"prozent" + 0.003*"fälle"')]


In [34]:
# Saving the model.
# Create the target directory first: saving fails if "models/" does not exist yet
# (e.g. on a fresh checkout of the data folder).
os.makedirs("models", exist_ok=True)
lda_model.save("models/tmodel_sample")

  and should_run_async(code)


#### Loading a previously saved model

In [36]:
# Restore the model stored under models/tmodel_sample (mmap="r" is passed through to gensim's loader)
# and show its topics as the cell output.
lda_model = gensim.models.LdaMulticore.load(
    "models/tmodel_sample",
    mmap="r",
)
lda_model.print_topics()

  and should_run_async(code)


[(0,
  '0.009*"frauen" + 0.009*"frau" + 0.007*"gewalt" + 0.004*"häuslicher" + 0.004*"opfer" + 0.004*"mann" + 0.003*"polizei" + 0.003*"männer" + 0.002*"kinder" + 0.002*"menschen"'),
 (1,
  '0.010*"frauen" + 0.008*"gewalt" + 0.006*"opfer" + 0.006*"frau" + 0.005*"mann" + 0.004*"häuslicher" + 0.003*"männer" + 0.003*"leben" + 0.003*"landkreis" + 0.002*"menschen"'),
 (2,
  '0.011*"frau" + 0.009*"gewalt" + 0.007*"mann" + 0.007*"frauen" + 0.006*"opfer" + 0.005*"polizei" + 0.003*"häuslicher" + 0.003*"männer" + 0.003*"leben" + 0.002*"fälle"'),
 (3,
  '0.016*"gewalt" + 0.011*"frauen" + 0.008*"opfer" + 0.006*"frau" + 0.005*"häuslicher" + 0.004*"mann" + 0.003*"männer" + 0.003*"prozent" + 0.003*"polizei" + 0.003*"leben"'),
 (4,
  '0.019*"gewalt" + 0.018*"frauen" + 0.007*"opfer" + 0.007*"frau" + 0.005*"häuslicher" + 0.005*"mann" + 0.004*"männer" + 0.004*"polizei" + 0.004*"prozent" + 0.003*"fälle"')]

In [40]:
# Take the dictionary from the loaded model FIRST, so that the corpus below is
# encoded with the same token ids the model uses. (Previously the dictionary was
# assigned only after the corpus had been built, which raises a NameError on a
# fresh kernel or silently uses a stale dictionary from an earlier cell.)
id2word = lda_model.id2word

# Create Corpus
texts = spacy_cleaned

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

  and should_run_async(code)


### Visualizing models

In [41]:
# Enable pyLDAvis notebook output, then prepare the visualisation
# from the model, its corpus and its dictionary.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
lda_vis  # last expression of the cell -> displayed as its output

  and should_run_async(code)
  head(R).drop('saliency', 1)


END OF CODE