In [107]:
import os
import json
import spacy
from spacy.lang.de import German
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora


In [108]:
# Relative path of the directory whose files iteratefiles() is called on.
folder = os.path.join(*('data', 'REFINED', 'ndr'))

In [109]:
def iteratefiles(filepath, preprocess=None):
    """Load every JSON file in ``filepath`` and turn each one into a token list.

    Each file is parsed as a JSON object with a ``'content'`` mapping. Element
    ``[1]`` of every value in that mapping is concatenated (in mapping order,
    without a separator) into one string per file, which is then handed to
    ``preprocess``.

    Parameters
    ----------
    filepath : str
        Directory to read. Every entry returned by ``os.listdir`` is opened
        as a UTF-8 encoded JSON file.
    preprocess : callable, optional
        Function mapping the concatenated text of one file to its tokens.
        Defaults to the notebook's ``prepare_text_for_lda``, looked up when
        this function is called.

    Returns
    -------
    list
        One ``preprocess`` result per directory entry, in ``os.listdir`` order.
    """
    if preprocess is None:
        # Resolved at call time, so this definition does not require the
        # preprocessing cell to have been executed beforehand.
        preprocess = prepare_text_for_lda

    data = []
    for item in os.listdir(filepath):
        # Join against the directory that was passed in rather than the
        # notebook-level `folder` global, so other directories work too.
        with open(os.path.join(filepath, item), "r", encoding='utf-8') as f:
            document = json.load(f)
        # `document` replaces the former local named `dict`, which shadowed
        # the builtin; join avoids repeated string concatenation.
        content = ''.join(entry[1] for entry in document['content'].values())
        data.append(preprocess(content))
    return data

# NOTE(review): iteratefiles() calls prepare_text_for_lda, which is only defined
# in a later cell, so on a fresh kernel this line raises NameError. The same
# call is made again further down to build `data`, and `text` is not used in
# the visible cells -- TODO confirm, then drop or move this call.
text = iteratefiles(folder)

In [110]:
# spaCy German language object, used by tokenize() to split raw text.
parser = German()


def tokenize(text):
    """Split ``text`` into a list of normalised token strings.

    Tokens consisting only of whitespace are dropped, tokens that spaCy flags
    as URL-like are replaced by the placeholder ``'URL'``, and every other
    token is returned in its lower-cased form.
    """
    result = []
    for tok in parser(text):
        if tok.orth_.isspace():
            continue
        result.append('URL' if tok.like_url else tok.lower_)
    return result


def lemmatize(word):
    """Return the WordNet base form of ``word``, or ``word`` itself if none is found.

    NOTE(review): ``wn.morphy`` works on NLTK's WordNet, which is
    English-oriented, while the stop words in this notebook are German --
    confirm that this lookup is intended for the German tokens.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
def return_lemma(word):
    """Lemmatize ``word`` with a freshly created NLTK ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# Fetch the NLTK stop-word corpus, then keep the German list as a set so that
# the membership test in prepare_text_for_lda is a hash lookup.
nltk.download('stopwords')
stop = {*nltk.corpus.stopwords.words('german')}

def prepare_text_for_lda(text):
    """Turn raw ``text`` into the token list used for topic modelling.

    The text is tokenized, tokens of four characters or fewer and tokens
    contained in ``stop`` are discarded, and each remaining token is passed
    through ``lemmatize``.
    """
    return [
        lemmatize(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in stop
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\teres\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [111]:
# One token list per file found in `folder`.
data = iteratefiles(filepath=folder)

In [116]:
# Map every distinct token to an integer id and convert each document (a list
# of tokens) into its bag-of-words representation.
dictionary = corpora.Dictionary(data)
corpus = [dictionary.doc2bow(document) for document in data]

# NOTE(review): the topics printed in the following cell include several with
# ~0.000 weights on the same generic words, which may indicate that 100 topics
# is too many for this corpus -- TODO evaluate a smaller value.
num_topics = 100

# LDA training is stochastic; a fixed seed makes a re-run of this cell
# reproduce the same model.
lda_random_state = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=lda_random_state,
)
ldamodel.save('model5.gensim')

In [117]:
# Show the ten highest-weighted words of the topics returned by print_topics,
# one (topic_id, description) tuple per line.
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(str(topic))

(38, '0.000*"virus" + 0.000*"schon" + 0.000*"sagen" + 0.000*"beispiel" + 0.000*"immer" + 0.000*"natürlich" + 0.000*"prozent" + 0.000*"vielleicht" + 0.000*"einfach" + 0.000*"genau"')
(11, '0.000*"schon" + 0.000*"sagen" + 0.000*"immer" + 0.000*"natürlich" + 0.000*"beispiel" + 0.000*"virus" + 0.000*"einfach" + 0.000*"vielleicht" + 0.000*"genau" + 0.000*"studie"')
(26, '0.000*"virus" + 0.000*"sagen" + 0.000*"immer" + 0.000*"schon" + 0.000*"natürlich" + 0.000*"eigentlich" + 0.000*"studie" + 0.000*"beispiel" + 0.000*"einfach" + 0.000*"patienten"')
(89, '0.000*"schon" + 0.000*"virus" + 0.000*"immer" + 0.000*"beispiel" + 0.000*"natürlich" + 0.000*"sagen" + 0.000*"einfach" + 0.000*"studie" + 0.000*"vielleicht" + 0.000*"patienten"')
(34, '0.000*"schon" + 0.000*"sagen" + 0.000*"virus" + 0.000*"beispiel" + 0.000*"einfach" + 0.000*"vielleicht" + 0.000*"patienten" + 0.000*"prozent" + 0.000*"wirklich" + 0.000*"immer"')
(45, '0.012*"türklinke" + 0.010*"eingeübt" + 0.008*"einspielen" + 0.008*"hinkommt"