In [1]:
from os.path import join
import pandas as pd

import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaModel, LdaMulticore
from gensim.models.phrases import Phrases, Phraser

import pyLDAvis as ldavis
import pyLDAvis.gensim
ldavis.enable_notebook()

import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from constants import FULL_PATH, ETL_PATH, NLP_PATH, SMPL_PATH, POS, NOUN, PROPN, TOKEN, HASH, SENT_IDX

def docs_to_lists(token_series):
    return tuple(token_series.tolist())

In [4]:
dataset = ['OnlineParticipation', 'FAZ'][1]
df = pd.read_pickle(join(SMPL_PATH, dataset + '_simple.pickle'))
df = df[df[POS].isin({NOUN, PROPN, 'NER', 'NPHRASE'})]
df = df.groupby([HASH])[TOKEN].agg(docs_to_lists)

In [8]:
# loads a list of random document ids for a balanced dataset (~2000 documents per category)
# for the FAZ dataset
faz_sample = join(ETL_PATH, 'FAZ_document_sample.pickle')
faz_sample = pd.read_pickle(faz_sample)
faz_sample_ids = faz_sample.index
faz_sample_ids

Int64Index([ 4301134201484101152, -5696035671390904143, -6533682718526575818,
            -3346664675579691204, -8358676581287387714, -2785236834914555528,
             4539470495046700910, -6291733065589777869,  3008594611884955597,
             2667660060079641753,
            ...
            -9201876886379869587,  -682332148408733217, -1477403614105270264,
            -6046850756068127617, -8426549411078832312, -4079889429213597579,
            -2812039998456727519, -4001184660305417764,   903203255141181056,
            -3327279407656915164],
           dtype='int64', name='hash', length=13643)

In [9]:
import numpy as np
intersect = np.intersect1d(faz_sample.index.values, df.index.values)
print(intersect)
print(intersect.shape)
union = np.union1d(faz_sample.index.values, df.index.values)
print(union)
print(union.shape)

[-6541785991313574912 -4282370157978565120 -3678630174048332800
 -3595003702759523840 -2779766014965026304 -2307713969445782016
 -2191663350948864256 -2180314715888122624 -2011831270334207232
 -1771118729572549120 -1703077179928423936 -1467507433706368256
 -1465005113624309248 -1122323515148345088  -852078218234603136
  -306863025342409024  -277170277859988320  -157035938325186656
   -89604267562993744   -77822968137134912   -48992859930099920
   -42678520111220440   -33351537468997156   -31115158608833216
   -27818198464873360   -10506512567653280    -8670420270801952
    -8301321570435398    -5791255954927385    -4511041435369750
    -1369466082829884     -294469541870598      263848088864845
      374369825057928     1443054829291776     4040132589990543
     4824689116034861     5286315956173707     7720799316789147
     9270184643551342    14930834443638534    15812556456997382
    16300360208839788    16919702910232478    27226271131462520
    27349755401085380    306790357997635

In [5]:
df

hash
-9222966757875651584    (Bereich, Feingeist, Grobstoffel, Deutschland,...
-9222617066393794560    (Name, Hurrikan, Irma, Harvey, Harvey, Irma, J...
-9222111851211850752    (Serie, Arbeit, Kaffeeküche, Karriere, Hort, K...
-9221822663289604096    (Juppé, Ersatzkandidat, Konservativer, Fillons...
-9221773540373386240    (Facebook-Datenskandal, Peter, Thiels, Firma, ...
-9221136108316593152    (Sex-Skandal, Oxfam, Fall, Sex-Skandal, Oxfam,...
-9220846605603615744    (Arbeitsrecht, VW, Islamist, Vw-Mitarbeiter, A...
-9220723349603270656    (Bundeskanzlerin, Tag, Nachdenken, Angela_Merk...
-9220026754916126720    (Nobelpreisträger-Tagung_Lindau, Intelligenz, ...
-9219891685417034752    (Börsenkorrektur, Besitzer, Aktienfonds, Börse...
-9219492558743169024    (Ausstellung, Adsorbens_Haag, Mondrian, Kunst,...
-9219211593729905664    (Gerichtsbeschluss, terrorverdächtig_Tunesier,...
-9219200106309165056    (Norm, Benziner, Porsche, Abgasnebel, stuttgar...
-9218322072988174336    (Gipfelfi

In [53]:
df_balanced = df[df.index.isin(faz_sample_ids)]
df_balanced.size

74

In [30]:
texts = df.values
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# filter some noice (e.g. special characters)
bad_tokens = ['–']
bad_token_ids = [dictionary.token2id[token] for token in bad_tokens]
dictionary.filter_tokens(bad_ids=bad_token_ids, good_ids=None)

bow_corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

In [31]:
corpus = [bow_corpus, tfidf_corpus][0]
MmCorpus.serialize('../data/{}.mm'.format(dataset), corpus)
corpus_fake = MmCorpus('../data/{}.mm'.format(dataset))

In [19]:
alphas=[0.001, 0.1, 1, 10]
etas=[0.001, 0.1, 1, 10]#[0]

In [20]:
%%time

ldamodel_alphas = []
for alpha in alphas:
    ldamodel_etas = []
    for eta in etas:
        #ldamodel = LdaModel(
        ldamodel = LdaMulticore(
            random_state=42,
            # corpus=bow_corpus, 
            corpus=tfidf_corpus,
            num_topics=10, 
            id2word=dictionary, 
            alpha=alpha,  # 'auto',
            eta=eta,
            # gamma_threshold
            # minimum_phi_value
            # offset
            # decay
            # update_every=1, # not in multicore implementation 
            chunksize=10000, 
            passes=1,
        )
        ldamodel_etas.append(ldamodel)
    ldamodel_alphas.append(ldamodel_etas)
    
#prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)

CPU times: user 11min 38s, sys: 26.4 s, total: 12min 4s
Wall time: 11min 57s


In [23]:
for a in ldamodel_alphas:
    for e in ldamodel_etas:
        for topic in e.print_topics(num_topics=10, num_words=10):
            print(topic)
        print()
    print()

(0, '0.002*"Prozent" + 0.002*"Trump" + 0.002*"Mann" + 0.002*"Präsident" + 0.001*"Euro" + 0.001*"Mensch" + 0.001*"Frau" + 0.001*"Deutschland" + 0.001*"Kind" + 0.001*"Land"')
(1, '0.002*"Trump" + 0.002*"Prozent" + 0.002*"Präsident" + 0.002*"Unternehmen" + 0.001*"–" + 0.001*"Mensch" + 0.001*"Euro" + 0.001*"Kind" + 0.001*"Million" + 0.001*"Frau"')
(2, '0.002*"Trump" + 0.002*"Euro" + 0.002*"Prozent" + 0.001*"Präsident" + 0.001*"Frau" + 0.001*"Deutschland" + 0.001*"Polizei" + 0.001*"Unternehmen" + 0.001*"–" + 0.001*"Mensch"')
(3, '0.002*"Trump" + 0.002*"Prozent" + 0.001*"Deutschland" + 0.001*"Unternehmen" + 0.001*"Land" + 0.001*"Präsident" + 0.001*"Dollar" + 0.001*"Euro" + 0.001*"Kind" + 0.001*"Polizei"')
(4, '0.002*"Trump" + 0.002*"Prozent" + 0.002*"Deutschland" + 0.002*"Frau" + 0.002*"Präsident" + 0.001*"Euro" + 0.001*"Mensch" + 0.001*"Kind" + 0.001*"Land" + 0.001*"Milliarde"')
(5, '0.003*"Trump" + 0.002*"Prozent" + 0.002*"Euro" + 0.001*"Mann" + 0.001*"–" + 0.001*"Deutschland" + 0.001*"Prä

In [32]:
alpha=0.001
eta=0.001

ldamodel = LdaMulticore(
    random_state=42,
    # corpus=bow_corpus, 
    corpus=tfidf_corpus,
    num_topics=10, 
    id2word=dictionary, 
    alpha=alpha,  # 'auto',
    eta=eta,
    # gamma_threshold
    # minimum_phi_value
    # offset
    # decay
    # update_every=1, # not in multicore implementation 
    chunksize=10000, 
    passes=1,
)

for topic in e.print_topics(num_topics=10, num_words=10):
    print(topic)

prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data

In [33]:
alpha=0.1
eta=0.001

ldamodel = LdaMulticore(
    random_state=42,
    # corpus=bow_corpus, 
    corpus=tfidf_corpus,
    num_topics=10, 
    id2word=dictionary, 
    alpha=alpha,  # 'auto',
    eta=eta,
    # gamma_threshold
    # minimum_phi_value
    # offset
    # decay
    # update_every=1, # not in multicore implementation 
    chunksize=10000, 
    passes=1,
)

for topic in e.print_topics(num_topics=10, num_words=10):
    print(topic)

prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data

(0, '0.000*"Prozent" + 0.000*"Trump" + 0.000*"Mann" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Kind" + 0.000*"Land"')
(1, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Million" + 0.000*"Frau"')
(2, '0.000*"Trump" + 0.000*"Euro" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Polizei" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch"')
(3, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Unternehmen" + 0.000*"Land" + 0.000*"Präsident" + 0.000*"Dollar" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Polizei"')
(4, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Frau" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Kind" + 0.000*"Land" + 0.000*"Milliarde"')
(5, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Euro" + 0.000*"Mann" + 0.000*"–" + 0.000*"Deutschland" + 0.000*"Prä

In [34]:
alpha=1
eta=0.001

ldamodel = LdaMulticore(
    random_state=42,
    # corpus=bow_corpus, 
    corpus=tfidf_corpus,
    num_topics=10, 
    id2word=dictionary, 
    alpha=alpha,  # 'auto',
    eta=eta,
    # gamma_threshold
    # minimum_phi_value
    # offset
    # decay
    # update_every=1, # not in multicore implementation 
    chunksize=10000, 
    passes=1,
)

for topic in e.print_topics(num_topics=10, num_words=10):
    print(topic)

prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data

(0, '0.000*"Prozent" + 0.000*"Trump" + 0.000*"Mann" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Kind" + 0.000*"Land"')
(1, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Million" + 0.000*"Frau"')
(2, '0.000*"Trump" + 0.000*"Euro" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Polizei" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch"')
(3, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Unternehmen" + 0.000*"Land" + 0.000*"Präsident" + 0.000*"Dollar" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Polizei"')
(4, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Frau" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Kind" + 0.000*"Land" + 0.000*"Milliarde"')
(5, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Euro" + 0.000*"Mann" + 0.000*"–" + 0.000*"Deutschland" + 0.000*"Prä

In [35]:
alpha=1
eta=0.00001

ldamodel = LdaMulticore(
    random_state=42,
    # corpus=bow_corpus, 
    corpus=tfidf_corpus,
    num_topics=10, 
    id2word=dictionary, 
    alpha=alpha,  # 'auto',
    eta=eta,
    # gamma_threshold
    # minimum_phi_value
    # offset
    # decay
    # update_every=1, # not in multicore implementation 
    chunksize=10000, 
    passes=1,
)

for topic in e.print_topics(num_topics=10, num_words=10):
    print(topic)

prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data

(0, '0.000*"Prozent" + 0.000*"Trump" + 0.000*"Mann" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Kind" + 0.000*"Land"')
(1, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Million" + 0.000*"Frau"')
(2, '0.000*"Trump" + 0.000*"Euro" + 0.000*"Prozent" + 0.000*"Präsident" + 0.000*"Frau" + 0.000*"Deutschland" + 0.000*"Polizei" + 0.000*"Unternehmen" + 0.000*"–" + 0.000*"Mensch"')
(3, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Unternehmen" + 0.000*"Land" + 0.000*"Präsident" + 0.000*"Dollar" + 0.000*"Euro" + 0.000*"Kind" + 0.000*"Polizei"')
(4, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Deutschland" + 0.000*"Frau" + 0.000*"Präsident" + 0.000*"Euro" + 0.000*"Mensch" + 0.000*"Kind" + 0.000*"Land" + 0.000*"Milliarde"')
(5, '0.000*"Trump" + 0.000*"Prozent" + 0.000*"Euro" + 0.000*"Mann" + 0.000*"–" + 0.000*"Deutschland" + 0.000*"Prä

### Evaluation