In [1]:
from os.path import join
import pandas as pd

import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaModel, LdaMulticore
from gensim.models.phrases import Phrases, Phraser

import pyLDAvis as ldavis
import pyLDAvis.gensim
ldavis.enable_notebook()

import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from constants import FULL_PATH, ETL_PATH, NLP_PATH, SMPL_PATH, POS, NOUN, PROPN, TOKEN, HASH, SENT_IDX, PUNCT
pd.options.display.max_rows = 2001

def docs_to_lists(token_series):
    """Collapse a pandas Series of tokens into one immutable sequence.

    Used as a groupby aggregation function. Despite the name, the result
    is a tuple holding the Series' values in their original order.

    :param token_series: Series whose values are the tokens of one group.
    :return: tuple of the Series' values.
    """
    tokens = token_series.tolist()
    return tuple(tokens)

In [17]:
# Corpora this notebook can be run on; the trailing index picks the active one.
dataset = (
    'Europarl',
    'FAZ_combined',
    'FOCUS_cleansed',
    'OnlineParticipation',
    'PoliticalSpeeches',
)[4]

# Load the "<dataset>_simple.pickle" frame for that corpus from SMPL_PATH.
# NOTE: unpickling executes code stored in the file - only load trusted pickles.
simple_pickle_path = join(SMPL_PATH, f'{dataset}_simple.pickle')
df = pd.read_pickle(simple_pickle_path)

In [None]:
# Optional step, meant for specific datasets only: restrict the frame to the
# rows whose index value is contained in `faz_sample_ids`.
# NOTE(review): `faz_sample_ids` is not defined in any visible cell - confirm
# where it comes from before running this cell on a fresh kernel.
balanced_mask = df.index.isin(faz_sample_ids)
df_balanced = df[balanced_mask]
# .size is the total number of elements in the selection.
df_balanced.size

In [18]:
# Repair POS tagging: these symbols are forced to the punctuation tag.
mistagged_symbols = ['[', ']', '<', '>', '/', '–', '%']
df.loc[df.token.isin(mistagged_symbols), POS] = PUNCT

# Keep only tokens carrying one of the selected POS tags ...
kept_pos_tags = {NOUN, PROPN, 'NER', 'NPHRASE'}
df = df[df[POS].isin(kept_pos_tags)]
# ... and collapse them into one token sequence per HASH group.
# After this line `df` is a Series indexed by HASH, no longer a DataFrame.
df = df.groupby([HASH])[TOKEN].agg(docs_to_lists)

In [19]:
# Per-dataset tokens that are treated as noise and removed from the dictionary.
# Every selectable dataset has exactly one entry: a key repeated inside a dict
# literal silently replaces the earlier value, which previously discarded the
# curated 'OnlineParticipation' list, and a missing key ('FAZ_combined') made
# the lookup by dataset name fail.
bad_tokens = {
    'Europarl': [],
    'FAZ_combined': [],
    'FOCUS_cleansed': [],
    'OnlineParticipation': [
        'Re', '@#1', '@#2', '@#3', '@#4', '@#5', '@#6', '@#7', '@#8', '@#9', '@#1.1', 'Für', 'Muss', 'etc', 'sorry', 'Ggf', 'u.a.',
        'B.', 'stimmt', ';-)', 'lieber', 'o.', 'Ja', 'Desweiteren',
    ],
    'PoliticalSpeeches': [],
}

In [20]:
# Build the vocabulary from the per-document token sequences and drop tokens
# that occur in fewer than 5 documents or in more than 50% of all documents.
texts = df.values
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# filter some noise (e.g. special characters)
# Only tokens still present in the dictionary are looked up: a listed token may
# never have occurred or may already have been pruned by filter_extremes, and
# indexing token2id with it would raise KeyError. A dataset without an entry in
# bad_tokens simply has nothing extra to remove.
bad_token_ids = [
    dictionary.token2id[token]
    for token in bad_tokens.get(dataset, [])
    if token in dictionary.token2id
]
dictionary.filter_tokens(bad_ids=bad_token_ids, good_ids=None)

# Bag-of-words vectors per document and their TF-IDF re-weighted counterpart.
bow_corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]
#corpus = [bow_corpus, tfidf_corpus][0]

In [21]:
# model name: x42_100
# LDA with 100 topics, trained on the TF-IDF weighted corpus with a fixed seed.
# (alternative input: corpus=bow_corpus)
nbtopics = 100
ldamodel = LdaModel(
    corpus=tfidf_corpus,
    id2word=dictionary,
    num_topics=nbtopics,
    # let the model learn both priors from the data
    alpha='auto',
    eta='auto',
    chunksize=2000,
    passes=20,
    iterations=1000,
    eval_every=10,
    random_state=42,
)

  diff = np.log(self.expElogbeta)


In [25]:
# model name: x42_50
# Same training setup with 50 topics. This rebinds `nbtopics` and `ldamodel`,
# so the cells below operate on this model.
# (alternative input: corpus=bow_corpus)
nbtopics = 50
ldamodel = LdaModel(
    corpus=tfidf_corpus,
    id2word=dictionary,
    num_topics=nbtopics,
    # let the model learn both priors from the data
    alpha='auto',
    eta='auto',
    chunksize=2000,
    passes=20,
    iterations=1000,
    eval_every=10,
    random_state=42,
)

In [27]:
# One row per topic: the dataset name followed by the topic's top terms,
# translated from term ids back to strings via the dictionary.
# The ten term columns rely on get_topic_terms returning ten terms per topic.
term_columns = [f'term{rank}' for rank in range(10)]
topics = [
    [dataset, *(dictionary[term_id] for term_id, _weight in ldamodel.get_topic_terms(topic_id))]
    for topic_id in range(nbtopics)
]
df_topics = pd.DataFrame(topics, columns=['dataset'] + term_columns)
df_topics

Unnamed: 0,dataset,term0,term1,term2,term3,term4,term5,term6,term7,term8,term9
0,PoliticalSpeeches,Wiedereröffnung,Martyrium,AJC,American_Jewish_Committee,Start-Vertrag,Albert_Einstein,Verklärung,Unterhalt,Freiwilligenagentur,Bittsteller
1,PoliticalSpeeches,Neuhardenberg,Begrüßungsansprache,Zorn,Konzertreise,Klimagipfel,Handelsrecht,Festschrift,Franc,Lettland,Verräter
2,PoliticalSpeeches,Museumsbau,Mitfühlen,Karre,Panelist,Fleckchen,Stadtschloss,Abraham,Wirtschaftsfachmann,Fragment,Isaak
3,PoliticalSpeeches,Selbstbindung,Schwesig,Liberté,Weimars,Nordafrikas,Bamberg,Schiffbrüchiger,Lehrmeister,Stadt_Weimar,Ostasiens
4,PoliticalSpeeches,Baustoff,Betrug,Winterspiele,Lebenslage,Kritikpunkt,Zertifikat,Umweltveränderung,Sportlerin,Ernüchterung,Julian
5,PoliticalSpeeches,UNESCO,Umsiedlung,Amos,deutsch_Unesco-Kommission,Ausbilder,Unrechtsregime,Aufrechnen,Fußballspiel,Exemplar,silbern_Lorbeerblatt
6,PoliticalSpeeches,Nichtdeutsch,Code,Bundesrechnungshof,Opferverband,Pult,Rechnungshof,Weltbund,Awacs-Einsatz,Slawe,Reinhard_Mohn
7,PoliticalSpeeches,Schweizer,Nichtverbreitung,Vertretung,Gewerkschafterin,Pomp,Verdächtigungen,Hof,Haben,Helmut_Kohl,Repräsentation
8,PoliticalSpeeches,Chart,Lein,Bundesverdienstorden,Baukultur,Friedensstifter,Unamid,Burda,Andrássy-Universität,OAE,deutsch_Computerspielpreis
9,PoliticalSpeeches,Reformation,Staatsoberhaupt,Ebert,Friedrich_Ebert,Arbeiterschaft,Bundesministerin,Kardinal,Kasper,Großherzog,Kuppel


In [28]:
# Persist the topic/term table; the file name encodes dataset and topic count.
topics_csv_path = f'../data/preprocessed/LDAmodel/{dataset}_topics_x42_{nbtopics}.csv'
df_topics.to_csv(topics_csv_path)

In [29]:
# Persist the trained model next to its topic table.
# NOTE(review): "ldamdodel" looks like a typo for "ldamodel"; the spelling is
# kept unchanged because other code may load the model by this exact name.
ldamodel_path = f'../data/preprocessed/LDAmodel/{dataset}_ldamdodel_x42_{nbtopics}'
ldamodel.save(ldamodel_path)

### Evaluation

In [None]:
# Write the corpus to disk in Matrix Market format and read it back as a
# streamed MmCorpus, which is then handed to pyLDAvis together with the model.
# The name `corpus` was never bound (its only assignment is commented out),
# so this cell raised NameError; that commented-out assignment selected
# bow_corpus, which is therefore used here explicitly.
mm_path = '../data/{}.mm'.format(dataset)
MmCorpus.serialize(mm_path, bow_corpus)
corpus_fake = MmCorpus(mm_path)
prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data