<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/scripts/topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Imports

In [1]:
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd
from ast import literal_eval
import os
import gensim
import gensim.corpora as corpora
from pprint import pprint
from tqdm import tqdm 


In [None]:
# Download the large German spaCy pipeline "de_core_news_lg" into this runtime;
# it is the model that spacy.load(...) loads further down.
!python -m spacy download de_core_news_lg

#### Read Data

In [3]:
# Connect with Google Drive: mount it at /content/drive so that the data
# directory under /content/drive/MyDrive is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Change the working directory to the data folder on the mounted drive.
# Relative paths used below (the filtered CSV directory, custom_stopwords.txt)
# are resolved against this directory.
# Alternative location, kept for reference:
#%cd drive/MyDrive/Work/Frontline/data
%cd /content/drive/MyDrive/data/

/content/drive/MyDrive/data


In [5]:
# Directory (relative to the current working directory) holding the filtered CSV files
FILTERED_PATH = "filtered_4_26"

In [6]:
# Load every CSV file in FILTERED_PATH and combine them into one DataFrame `df`.
# The "text" column is stored as a Python literal in the CSV, so it is parsed
# back into a Python object with literal_eval while reading.

# os.listdir returns entries in arbitrary order; sort the names so the row
# order of `df` (and everything derived from it) is the same on every run.
# Filtering before the loop also makes the progress bar count CSV files only.
csv_names = sorted(
    name for name in os.listdir(FILTERED_PATH) if name.endswith(".csv")
)

# read each file into its own frame
dfs = [
    pd.read_csv(
        os.path.join(FILTERED_PATH, name),
        index_col=0,
        converters={"text": literal_eval},
    )
    for name in tqdm(csv_names)
]

# combine files in df (pd.concat raises ValueError if no CSV file was found)
df = pd.concat(dfs, ignore_index=True)

100%|██████████| 207/207 [00:17<00:00, 11.68it/s]


### Topic Modelling


#### Prepare Data


In [7]:
# custom module
import preprocessing

In [8]:
# Load the large German pipeline with the NER, parser and tagger components disabled.
# NOTE(review): 'ner' is disabled here, while the preprocessing step below is
# called with remove_ent=True — confirm that preprocessing.preprocess does not
# rely on entity annotations.
spacy_mod = spacy.load(
    "de_core_news_lg",
    disable=['ner', 'parser', 'tagger'],
)

In [9]:
# Read the custom stopwords (whitespace-separated tokens) and register them
# on the spaCy model's default stop word set.

# Open the list inside a context manager so the file handle is closed again,
# and decode explicitly instead of relying on the platform default encoding.
# assumes custom_stopwords.txt is UTF-8 encoded — TODO confirm
with open("custom_stopwords.txt", encoding="utf-8") as stopword_file:
    custom_stop_words = stopword_file.read().split()

# add custom stopwords to model
spacy_mod.Defaults.stop_words.update(custom_stop_words)

In [18]:
# The first 10,000 rows of the combined frame as a smaller sample.
# Note: the cells below operate on the full `df`, not on `smaller_df`.
smaller_df = df.iloc[:10000]

In [10]:
# Convert the corpus to spaCy Doc objects (one Doc per row of `df`).
# Each row's 'text' value is concatenated into a single string first.
# NOTE(review): the parts are joined without a separator; if the parts do not
# end in whitespace, words at the boundaries get fused — confirm "" vs " ".
joined_texts = ("".join(parts) for parts in df['text'])

# Language.pipe processes the texts in batches, which is considerably faster
# than calling the model once per row; it yields the Docs in input order.
# Passing `total` lets tqdm show a real progress bar instead of a bare counter.
spacy_lang = list(tqdm(spacy_mod.pipe(joined_texts), total=len(df)))

63359it [39:45, 26.57it/s]


In [11]:
# Run the custom preprocessing (stopword removal, called with remove_ent=True)
# over every spaCy document and collect the results in order.
spacy_cleaned = [
    preprocessing.preprocess(parsed_doc, remove_ent=True)
    for parsed_doc in tqdm(spacy_lang)
]

100%|██████████| 63359/63359 [00:40<00:00, 1550.59it/s]


#### Topic Analysis

In [12]:
# The preprocessed documents are the input texts for the topic model
texts = spacy_cleaned

# Dictionary built from the preprocessed documents (token <-> integer id mapping)
id2word = corpora.Dictionary(texts)

# Term-document frequency: convert every document to its bag-of-words form
corpus = list(map(id2word.doc2bow, texts))

In [13]:
# number of topics
num_topics = 5
# LDA training is stochastic: without a fixed seed the topics differ on every
# run. Fixing random_state makes runs repeatable as far as the multicore
# trainer allows.
random_seed = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=random_seed,
)
# Print the key words for each topic
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (per-document topic distributions)
doc_lda = lda_model[corpus]

[(0,
  '0.138*"telefon" + 0.017*"beratung" + 0.016*"infos" + 0.016*"kontakt" + '
  '0.014*"menschen" + 0.013*"montag" + 0.010*"monat" + 0.010*"treffen" + '
  '0.010*"donnerstag" + 0.010*"freitag"'),
 (1,
  '0.008*"kinder" + 0.006*"frauen" + 0.005*"gewalt" + 0.005*"menschen" + '
  '0.005*"b" + 0.004*"häuslicher" + 0.003*"eltern" + 0.003*"telefon" + '
  '0.003*"euro" + 0.003*"i"'),
 (2,
  '0.008*"polizei" + 0.008*"frau" + 0.007*"mann" + 0.006*"gewalt" + '
  '0.005*"frauen" + 0.004*"menschen" + 0.004*"opfer" + 0.003*"täter" + '
  '0.003*"kinder" + 0.003*"leben"'),
 (3,
  '0.023*"telefon" + 0.018*"style" + 0.017*"straße" + 0.016*"gewalt" + '
  '0.016*"do" + 0.012*"bereitschaftsdienst" + 0.012*"frauen" + 0.012*"mi" + '
  '0.011*"häuslicher" + 0.011*"beratung"'),
 (4,
  '0.024*"gewalt" + 0.023*"frauen" + 0.006*"style" + 0.006*"häuslicher" + '
  '0.006*"kinder" + 0.006*"prozent" + 0.005*"häusliche" + 0.004*"opfer" + '
  '0.004*"menschen" + 0.004*"polizei"')]


In [None]:
## VISUALIZATION NOT WORKING ##
# The pyLDAvis visualisation of the LDA model is kept commented out because it
# did not run.
# NOTE(review): newer pyLDAvis releases expose the gensim helper as
# `pyLDAvis.gensim_models` rather than `pyLDAvis.gensim`, which may be the
# cause — TODO confirm against the installed version.

# import pyLDAvis
# import pyLDAvis.gensim
# import pickle 
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

END OF CODE