In [1]:
import matplotlib.pyplot as plt
import gensim
import pandas as pd

from tmtoolkit import corpus as c
from tmtoolkit.topicmod import tm_gensim
from tmtoolkit.utils import pickle_data, enable_logging
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

In [2]:
# Turn on tmtoolkit's logging so that the following cells report their progress
# (these are the "...:INFO:tmtoolkit:..." lines that appear in their output).
enable_logging()

In [3]:
# Read the pickled sample and report how many rows it holds.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
print('loading data...')
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))

# One label per row, formed as "<sitzung>_<sequence>"; used as document keys later on.
doc_labels = [
    '_'.join((str(session), str(seq)))
    for session, seq in zip(bt18.sitzung, bt18.sequence)
]

loading data...
loaded 1000 documents


In [4]:
print('loading and tokenizing documents')

# Minimal pipeline: no additional features are requested (load_features=[]).
# NOTE(review): max_workers=1.0 presumably means "use all available CPU cores"
# (the log output reports 12 workers) - confirm against the tmtoolkit documentation.
bt18corp = c.Corpus(
    {label: text for label, text in zip(doc_labels, bt18.text)},
    language='de',
    load_features=[],
    max_workers=1.0,
)

# The raw table is dropped here; re-running this cell alone therefore needs the
# data loading cell to be run again first.
del bt18

c.print_summary(bt18corp)

loading and tokenizing documents


2023-03-26 22:17:32,060:INFO:tmtoolkit:creating Corpus instance with 1000 documents
2023-03-26 22:17:32,061:INFO:tmtoolkit:using parallel processing with 12 workers
2023-03-26 22:17:32,062:INFO:tmtoolkit:running NLP pipeline on 1000 documents
2023-03-26 22:18:53,309:INFO:tmtoolkit:generating document texts


Corpus with 1000 documents in German
> 101_6369 (854 tokens): Herr Präsident ! Liebe Kolleginnen und Kollegen ! ...
> 101_6387 (2230 tokens): Frau Präsidentin ! Liebe Kolleginnen und Kollegen ...
> 100_6325 (584 tokens): Sehr geehrte Frau Präsidentin ! Liebe Kolleginnen ...
> 100_6278 (635 tokens): Sehr geehrte Frau Präsidentin ! Meine sehr geehrte...
> 102_6453 (881 tokens): Vielen Dank . - Frau Präsidentin ! Meine Damen und...
> 101_6392 (1973 tokens): Sehr verehrte Frau Präsidentin ! Sehr verehrte Kol...
> 102_6455 (906 tokens): Frau Präsidentin ! Liebe Kolleginnen und Kollegen ...
> 101_6376 (1225 tokens): Herr Präsident ! Meine Damen und Herren ! Herr Zim...
> 102_6429 (72 tokens): Herr Bundesminister Gabriel , Sie haben vorhin in ...
> 100_6319 (656 tokens): Sehr geehrter Herr Präsident ! Liebe Gäste auf den...
(and 990 more documents)
total number of tokens: 925947 / vocabulary size: 48728


In [5]:
print('preprocessing data...')

# Both calls below discard their return value and are used for their effect on
# `bt18corp` itself (the summary printed afterwards reflects the changes).
# NOTE(review): this presumably makes the cell non-idempotent - running it twice
# would process the already processed corpus again; confirm before re-running.

# Reduce the tokens to their stems.
c.stem(bt18corp)
# Remove unwanted tokens. Which kinds of tokens are dropped depends on the
# defaults of filter_clean_tokens - TODO confirm against the tmtoolkit docs.
# The log output shows the token count going from 925947 to 321112.
c.filter_clean_tokens(bt18corp)

# Print the corpus summary again to inspect the effect of the preprocessing.
c.print_summary(bt18corp)

preprocessing data...


2023-03-26 22:19:02,995:INFO:tmtoolkit:replacing 43993 token hashes
2023-03-26 22:19:04,338:INFO:tmtoolkit:filtered tokens by mask: num. tokens was 925947 and is now 321112
2023-03-26 22:19:04,825:INFO:tmtoolkit:generating document texts


Corpus with 1000 documents in German
> 101_6369 (295 tokens): herr prasident lieb kolleginn kolleg beitrag frag ...
> 101_6387 (776 tokens): frau prasidentin lieb kolleginn kolleg dam herr no...
> 100_6325 (225 tokens): geehrt frau prasidentin lieb kolleginn kolleg deut...
> 100_6278 (229 tokens): geehrt frau prasidentin geehrt dam herr energiekon...
> 102_6453 (339 tokens): frau prasidentin dam herr herr kolleg strobel fakt...
> 101_6392 (676 tokens): verehrt frau prasidentin verehrt kolleginn geehrt ...
> 102_6455 (326 tokens): frau prasidentin lieb kolleginn kolleg froh heutig...
> 101_6376 (421 tokens): herr prasident dam herr herr zimm wort begriff kre...
> 102_6429 (28 tokens): herr bundesminist gabriel vorhin bericht ausgefuhr...
> 100_6319 (261 tokens): geehrt herr prasident lieb gast tribun wert kolleg...
(and 990 more documents)
total number of tokens: 321112 / vocabulary size: 33418


In [6]:
print('creating gensim corpus...')

# Token sequences of all documents, in the order the corpus yields them.
texts = [*c.doc_tokens(bt18corp).values()]

# gensim dictionary built from all token sequences.
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)

# Every document converted with the dictionary's doc2bow, same order as `texts`.
gnsm_corpus = list(map(gnsm_dict.doc2bow, texts))

# The gensim steps only need `texts`, `gnsm_dict` and `gnsm_corpus`, so the
# tmtoolkit corpus is released here.
del bt18corp

creating gensim corpus...


In [7]:
# evaluate topic models with different parameters

# LDA fitting is stochastic. A fixed seed is passed to every model so that the
# evaluation metrics, the pickled results and the saved plot are reproducible
# across "Restart Kernel -> Run All" instead of changing from run to run.
RANDOM_SEED = 42

# Parameters shared by all candidate models.
const_params = dict(update_every=0, passes=10, random_state=RANDOM_SEED)

# Candidate numbers of topics: 10, 20, ..., 130 followed by 140, 160, 180.
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))

# One parameter set per k; alpha is scaled inversely with the number of topics.
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print(f'evaluating {len(varying_params)} topic models')
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus), varying_params, const_params,
                                               coherence_gensim_texts=texts)   # necessary for coherence C_V metric

# save the results as pickle
print('saving results')
pickle_data(eval_results, 'data/gensim_evaluation_results.pickle')

# plot the results
print('plotting evaluation results')
plt.style.use('ggplot')
# Re-arrange the results so that they are keyed by the number of topics.
results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(results_by_n_topics, xaxislabel='num. topics k',
                  title='Evaluation results', figsize=(8, 6))
# Save before plt.show(): the figure must still be the current figure when it is written.
plt.savefig('data/gensim_evaluation_plot.png')
plt.show()

2023-03-26 22:19:17,852:INFO:tmtoolkit:init with 12 workers


evaluating 16 topic models


2023-03-26 22:20:38,280:INFO:tmtoolkit:multiproc models: starting with 16 parameter sets on 1 datasets (= 16 tasks) and 12 processes
