In [1]:
# coding: utf-8
from os import listdir, makedirs
from os.path import join, isfile, isdir, exists
import pandas as pd
import gc
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import CoherenceModel, TfidfModel, LdaModel, LdaMulticore
from gensim.models.hdpmodel import HdpModel, HdpTopicFormatter
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric
from itertools import chain, islice
from constants import (
    FULL_PATH, ETL_PATH, NLP_PATH, SMPL_PATH, POS, NOUN, PROPN, TOKEN, HASH, SENT_IDX, PUNCT
)
import logging
import json
import numpy as np

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#pd.options.display.max_rows = 2001

In [2]:
def docs_to_lists(token_series):
    """Collapse a pandas Series of tokens into one immutable tuple.

    Used as the aggregation function of the per-document ``groupby`` so that
    every document becomes a single tuple holding its tokens in Series order.
    """
    tokens = token_series.tolist()
    return tuple(tokens)

def docs2corpora(documents, tfidf=True, stopwords=None, filter_below=5, filter_above=0.5,
                 split=False, max_test_size_rel=0.1, max_test_size_abs=5000):
    """Build a gensim dictionary and bag-of-words corpora from tokenized documents.

    Args:
        documents: sequence of documents, each being a sequence of tokens.
        tfidf: if True, the bag-of-words corpora are TF-IDF weighted. The TF-IDF model is
            fitted on the training corpus only and then applied to every available split.
        stopwords: optional iterable of tokens to remove from the dictionary. Tokens which
            are not (or no longer) part of the dictionary are ignored.
        filter_below: passed as ``no_below`` to ``Dictionary.filter_extremes``.
        filter_above: passed as ``no_above`` to ``Dictionary.filter_extremes``.
        split: if True, the documents are split (in their given order) into a training,
            a holdout and a test part.
        max_test_size_rel: relative size of the holdout and of the test part.
        max_test_size_abs: absolute upper bound for the size of the holdout/test part.

    Returns:
        Tuple ``(corpora, dictionary)``. ``corpora`` is a dict with the keys
        ``'training_corpus'``, ``'holdout_corpus'`` and ``'test_corpus'``; the latter two
        are ``None`` if ``split`` is False.
    """
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(no_below=filter_below, no_above=filter_above)

    # filter some noise (e.g. special characters)
    if stopwords:
        # a stopword may already have been dropped by filter_extremes (or never occurred),
        # so only look up the ids of tokens which are still in the dictionary
        token2id = dictionary.token2id
        stopword_ids = [token2id[token] for token in stopwords if token in token2id]
        if stopword_ids:
            dictionary.filter_tokens(bad_ids=stopword_ids, good_ids=None)

    length = len(documents)
    if split:
        if length * max_test_size_rel < max_test_size_abs:
            # small dataset: holdout and test set get a relative share each
            split1 = int(length * (1 - (2 * max_test_size_rel)))
            split2 = int(length * (1 - max_test_size_rel))
        else:
            # large dataset: cap holdout and test set at the absolute size
            split1 = length - (2 * max_test_size_abs)
            split2 = length - max_test_size_abs
        texts = {
            'training_corpus': documents[:split1],
            'holdout_corpus': documents[split1:split2],
            'test_corpus': documents[split2:],
        }
        print(
            'split dataset. size of:',
            f"train_set={len(texts['training_corpus'])},",
            f"val_set={len(texts['holdout_corpus'])},",
            f"test_set={len(texts['test_corpus'])},"
        )
    else:
        texts = {
            'training_corpus': documents,
            'holdout_corpus': None,
            'test_corpus': None,
        }

    corpora = {
        key: None if docs is None else [dictionary.doc2bow(text) for text in docs]
        for key, docs in texts.items()
    }

    if tfidf:
        # Fit the weighting once on the training data and reuse it for the other splits.
        # Splits which do not exist (split=False) stay None instead of crashing.
        tfidf_model = TfidfModel(corpora['training_corpus'])
        for key, bow_corpus in corpora.items():
            if bow_corpus is not None:
                corpora[key] = tfidf_model[bow_corpus]
    return corpora, dictionary

In [8]:
# short key -> dataset name (the name is also the file-name prefix of the corpus files)
datasets = dict(
    E='Europarl',
    FA='FAZ_combined',
    FO='FOCUS_cleansed',
    O='OnlineParticipation',
    P='PoliticalSpeeches',
    dewi='dewiki',
    dewa='dewac',
)

# dataset name -> pickle holding the ids of the documents to keep.
# The ids were filtered via some fixed rules and a similarity measure on the
# character distribution.
goodids = dict(
    dewac=join(ETL_PATH, 'dewac_good_ids.pickle'),
    dewiki=join(ETL_PATH, 'dewiki_good_ids.pickle'),
)
# Disabled: the samples contain only a small subset of all articles, because the
# samples are roughly equal in size per category.
# goodids['FAZ_combined'] = join(ETL_PATH, 'FAZ_document_sample3.pickle')
# goodids['FOCUS_cleansed'] = join(ETL_PATH, 'FOCUS_document_sample3.pickle')
# dataset name -> tokens which are noise for that dataset and must not end up in a topic
bad_tokens = {
    'Europarl': [
        'E.', 'Kerr', 'The', 'la', 'ia', 'For', 'Ieke', 'the',
    ],
    'FAZ_combined': [
        'S.', 'j.reinecke@faz.de', 'B.',
    ],
    'FOCUS_cleansed': [],
    'OnlineParticipation': [
        'Re', '@#1', '@#2', '@#3', '@#4', '@#5', '@#6', '@#7', '@#8', '@#9', '@#1.1', 'Für', 'Muss',
        # the comma after 'z.B.' matters: without it Python concatenates the adjacent
        # literals to 'z.B.B.' and neither 'z.B.' nor this dataset's 'B.' is listed
        'etc', 'sorry', 'Ggf', 'u.a.', 'z.B.',
        'B.', 'stimmt', ';-)', 'lieber', 'o.', 'Ja', 'Desweiteren',
    ],
    'PoliticalSpeeches': [],
    'dewiki': [],
    'dewac': [],
}
# union over all datasets; this set is what the preprocessing filters against
all_bad_tokens = set(chain.from_iterable(bad_tokens.values()))

In [24]:
dataset = datasets['P']
print('dataset:', dataset)
print('-' * 5)
dir_path = join(SMPL_PATH, 'wiki_phrases')

# Collect all entries belonging to the dataset. Directories are expanded in place: their
# matching children are appended to the end of the list and are visited later by the same
# loop (so nested directories are expanded, too). An explicit index is used because the
# list grows while it is traversed.
files = sorted(f for f in listdir(dir_path) if f.startswith(dataset))
idx = 0
while idx < len(files):
    name = files[idx]
    full_path = join(dir_path, name)
    if isdir(full_path):
        files.extend(sorted(join(name, f) for f in listdir(full_path) if f.startswith(dataset)))
    idx += 1

keepids = None
if dataset in goodids:
    keepids = pd.read_pickle(goodids[dataset])

# Using only certain POS tags. Defined before the loop because `stats` below needs it even
# if not a single file is read.
reduction_pos_set = {NOUN, PROPN, 'NER', 'NPHRASE'}
# Tokens with bad POS tagging: they are punctuation regardless of the tag they carry.
punct_tokens = list('[]<>/–%')

nb_words = 0
documents = []
for name in files:
    gc.collect()
    full_path = join(dir_path, name)
    if not isfile(full_path):
        continue

    print('reading', name)
    df = pd.read_pickle(full_path)
    print('    initial number of words:', len(df))
    if keepids is not None:
        # some datasets have already been filtered so you may not see a difference in any case
        df = df[df[HASH].isin(keepids.index)]

    # Keep the wanted POS tags and drop the mis-tagged punctuation in one step. This equals
    # re-tagging those tokens as PUNCT first (PUNCT is assumed not to be one of the kept
    # tags) but avoids assigning into a filtered slice of the frame.
    keep_mask = df[POS].isin(reduction_pos_set) & ~df[TOKEN].isin(punct_tokens)
    df = df[keep_mask]
    # `assign` returns a new frame, so no chained assignment on a slice takes place
    df = df.assign(**{TOKEN: df[TOKEN].str.strip('-/')})
    df = df[df[TOKEN].str.len() > 1]
    df = df[~df[TOKEN].isin(all_bad_tokens)]
    nb_words += len(df)
    print('    remaining number of words:', len(df))
    # groupby sorts the documents by hash-id which is equal to shuffling the dataset before
    # building the model
    docs_per_hash = df.groupby([HASH])[TOKEN].agg(docs_to_lists)
    print('    number of documents:', len(docs_per_hash))
    documents.extend(docs_per_hash.tolist())

nb_docs = len(documents)
print('-' * 5)
print('total number of documents:', nb_docs)
print('total number of words:', nb_words)
stats = dict(dataset=dataset, pos_set=sorted(reduction_pos_set), nb_docs=nb_docs, nb_words=nb_words)
del keepids, files
gc.collect();

dataset: PoliticalSpeeches
-----
reading PoliticalSpeeches_simple_wiki_phrases.pickle
    initial number of words: 11297096
    remaining number of words: 2294565
    number of documents: 6037
-----
total number of documents: 6037
total number of words: 2294565


In [25]:
# Display the summary (dataset, POS set, document and word counts) collected while loading.
stats

{'dataset': 'PoliticalSpeeches',
 'pos_set': ['NER', 'NOUN', 'NPHRASE', 'PROPN'],
 'nb_docs': 6037,
 'nb_words': 2294565}

In [20]:
# Build plain bag-of-words corpora (no TF-IDF weighting) with a train/holdout/test split.
# No `stopwords` are passed (formerly: stopwords=bad_tokens[dataset]) because stopword
# removal has been moved to the pandas preprocessing pipeline.
corpora, dictionary = docs2corpora(
    documents,
    tfidf=False,
    filter_below=5,
    filter_above=0.5,
    split=True,
)
training_corpus, holdout_corpus, test_corpus = (
    corpora[key] for key in ('training_corpus', 'holdout_corpus', 'test_corpus')
)

def init_callbacks(viz_env=None, title_suffix=''):
    """Create the training callbacks which report their values to Visdom.

    The metrics read the notebook-level ``holdout_corpus``, ``test_corpus``,
    ``training_corpus`` and ``documents``.

    Args:
        viz_env: Visdom environment the plots are sent to.
        title_suffix: text appended to every plot title.

    Returns:
        List of callbacks in the order: perplexity (holdout), perplexity (test),
        coherence (u_mass), coherence (c_v), diff (Kullback-Leibler), convergence (Jaccard).
    """
    # settings shared by every metric
    visdom = dict(logger="visdom", viz_env=viz_env)

    def titled(name):
        return name + title_suffix

    # perplexity on the two held-back corpora
    perplexity_holdout = PerplexityMetric(
        corpus=holdout_corpus, title=titled("Perplexity (hold_out)"), **visdom
    )
    perplexity_test = PerplexityMetric(
        corpus=test_corpus, title=titled("Perplexity (test)"), **visdom
    )
    # topic coherence on the training data
    coherence_umass = CoherenceMetric(
        corpus=training_corpus, coherence="u_mass", topn=10, title=titled("Coherence (u_mass)"), **visdom
    )
    coherence_cv = CoherenceMetric(
        corpus=training_corpus, texts=documents, coherence="c_v", topn=10,
        title=titled("Coherence (c_v)"), **visdom
    )
    # change of the topics between epochs
    diff_kl = DiffMetric(
        distance="kullback_leibler", title=titled("Diff (kullback_leibler)"), **visdom
    )
    convergence_jaccard = ConvergenceMetric(
        distance="jaccard", title=titled("Convergence (jaccard)"), **visdom
    )
    return [
        perplexity_holdout,
        perplexity_test,
        coherence_umass,
        coherence_cv,
        diff_kl,
        convergence_jaccard,
    ]

In [27]:
def get_parameterset(corpus, dictionary, callbacks=None, nbtopics=100, parametrization='a42', eval_every=None):
    """Return the keyword arguments for one named LDA parametrization.

    Args:
        corpus: training corpus handed to the model.
        dictionary: id-to-word mapping handed to the model as ``id2word``.
        callbacks: optional list of training callbacks.
        nbtopics: number of topics.
        parametrization: name of the preset ('a42' ... 'e42').
        eval_every: forwarded unchanged as ``eval_every``.

    Returns:
        Dict with the preset-specific settings followed by the shared settings.

    Raises:
        KeyError: if ``parametrization`` is not a known preset.
    """
    print(f'building LDA model "{parametrization}" with {nbtopics} number of topics')
    presets = {
        'a42': {'passes': 10},
        'b42': {'passes': 10, 'iterations': 200},
        'c42': {'passes': 10, 'iterations': 1_000},
        'd42': {'passes': 10, 'iterations': 200, 'alpha': 0.1, 'eta': 0.01},
        'e42': {'passes': 20, 'iterations': 200, 'alpha': 'auto', 'eta': 'auto'},
    }
    # settings which are identical for every preset
    shared = {
        'random_state': 42,
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': nbtopics,
        'eval_every': eval_every,
        'callbacks': callbacks,
        'chunksize': 20_000,
    }
    selected = presets[parametrization]
    selected.update(shared)
    return selected

Install Visdom via ```pip install visdom``` and run ```python -m visdom.server``` to start the server.
You will be able to view online log stats @ http://localhost:8097/

Attention: gensim is currently not fully compatible with visdom. See https://github.com/RaRe-Technologies/gensim/issues/2155 for details.

To fix this issue, change the following call in `gensim.models.callbacks.Callback.on_epoch_end`
```
self.viz.updateTrace(
    Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i]
)
```
to
```
self.viz.line(
    Y=np.array([value]), X=np.array([epoch]), env=metric.viz_env, win=self.windows[i], update='append'
)
```
You might need to re-import gensim (and probably even restart this notebook).

In [None]:
params_list = ['a42', 'b42', 'c42', 'd42', 'e42']

# LdaMallet is not imported at the top of the notebook. It lives in gensim's optional
# `wrappers` package, so it is resolved defensively here instead of breaking the cell.
# NOTE(review): LdaMallet presumably needs a `mallet_path` and does not take all of the
# LdaModel keyword arguments built below — verify before selecting it via `choice`.
try:
    from gensim.models.wrappers import LdaMallet
except ImportError:
    LdaMallet = None

implementations = [
    ('LDAmodel', LdaModel),
    ('LDAmulticore', LdaMulticore),
    ('LDAmallet', LdaMallet),
]
choice = 0
model_name, UsedModel = implementations[choice]
if UsedModel is None:
    raise ImportError(f'{model_name} is not available in the installed gensim version')

save = True
nb_top_terms = 10  # number of top terms stored per topic
metrics = []
# the Visdom environment depends neither on the parametrization nor on the number of topics
env_id = f"{dataset}-{model_name}"
#params = params_list[3]
for params in params_list:
    for nbtopics in [10, 25, 50, 100]: #range(10, 101, 10):
        # Choose α from [0.05, 0.1, 0.5, 1, 5, 10]
        # Choose β from [0.05, 0.1, 0.5, 1, 5, 10]
        callbacks = init_callbacks(viz_env=env_id, title_suffix=f", {params}, {nbtopics}tpx")
        kwargs = get_parameterset(
            training_corpus, dictionary, callbacks=callbacks, nbtopics=nbtopics, parametrization=params
        )
        if 'multicore' in model_name:
            kwargs['workers'] = 3
            # callbacks are not passed to the multicore implementation
            kwargs.pop('callbacks', None)
        ldamodel = UsedModel(**kwargs)

        # one row per topic: the dataset name followed by the top terms of the topic
        topics = [
            [dataset] + [dictionary[term_id] for term_id, _ in ldamodel.get_topic_terms(i, topn=nb_top_terms)]
            for i in range(nbtopics)
        ]
        df_lda = pd.DataFrame(topics, columns=['dataset'] + ['term' + str(i) for i in range(nb_top_terms)])

        # calculate (average) UMass score
        #top_topics = ldamodel.top_topics(training_corpus, topn=10, processes=8)
        # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
        #avg_topic_coherence = sum([t[1] for t in top_topics]) / nbtopics
        #print('Average topic coherence: %.4f.' % avg_topic_coherence)

        # the model only carries `metrics` if it was trained with callbacks
        current_metrics = getattr(ldamodel, 'metrics', {})
        #print(current_metrics)
        metrics.append((env_id, current_metrics))

        if save:
            out_dir = join(ETL_PATH, model_name, params)
            makedirs(out_dir, exist_ok=True)
            out = join(out_dir, f'{dataset}_{model_name}_{params}_{nbtopics}')
            print('saving to', out)
            df_lda.to_csv(out + '.csv')
            ldamodel.save(out)
            with open(out + '_stats.json', 'w') as fp:
                json.dump(stats, fp)
            # numpy arrays and numpy scalars are not JSON serializable: convert per value,
            # which also copes with a metric that has no values yet
            serializable_metrics = {
                title: [x.tolist() if isinstance(x, np.ndarray) else float(x) for x in values]
                for title, values in current_metrics.items()
            }
            with open(out + '_metrics.json', 'w') as fp:
                json.dump(serializable_metrics, fp)

building LDA model "a42" with 10 number of topics
defaultdict(<class 'list'>, {'Perplexity (hold_out), a42, 10tpx': [1066.155265168939, 901.876008128025, 797.4424582247051, 733.1676245094116, 691.4658523940104, 662.7885078407593, 642.1881337355279, 626.8340722586177, 615.0329775094036, 605.6659115124453], 'Perplexity (test), a42, 10tpx': [1093.6731735910587, 923.5706863366727, 815.396356327316, 748.8072740549223, 705.6650463613357, 676.2241597059231, 655.1527062496039, 639.4516023691201, 627.3619900381618, 617.7092384370222], 'Coherence (u_mass), a42, 10tpx': [-0.8009364948961004, -0.8505636932065463, -0.9168081988202612, -0.9246697286939506, -0.9541508264880445, -0.9877845429089639, -0.9904310569697727, -1.0415385629991323, -1.0629786456423533, -1.0387191547545904], 'Coherence (c_v), a42, 10tpx': [0.34773001982711826, 0.3894817127484244, 0.4114107529483988, 0.4349594104650773, 0.43606033286290813, 0.44292783648967193, 0.4508647236857083, 0.454207548786903, 0.4583841411203733, 0.465267

building LDA model "a42" with 100 number of topics


  diff = np.log(self.expElogbeta)


defaultdict(<class 'list'>, {'Perplexity (hold_out), a42, 100tpx': [41142282.0956554, 3689346.1077639647, 660497.5784135207, 199219.29221598982, 85909.32240539364, 47195.69441218002, 30568.38910512667, 22146.244018759837, 17319.997445052908, 14292.638932258396], 'Perplexity (test), a42, 100tpx': [46265999.175433405, 4083318.1894045565, 718867.0715484209, 214183.85760885032, 91597.36380682101, 50059.651392757376, 32296.626532919458, 23320.86838656372, 18199.292976798923, 14991.567802886351], 'Coherence (u_mass), a42, 100tpx': [-0.8756740220792083, -1.0842662424751337, -1.4610074895202836, -1.7088468470436913, -1.930029637582416, -1.9720707577348437, -1.9949264860468643, -2.0430253562232426, -2.117298434889901, -2.1659832604372142], 'Coherence (c_v), a42, 100tpx': [0.36003991664061047, 0.3917628599471059, 0.4153940992482941, 0.4339135730527215, 0.4404085893491743, 0.4502822630790256, 0.45612355758209233, 0.4609707093772296, 0.4645624218608981, 0.46965765684184047], 'Diff (kullback_leible

building LDA model "b42" with 10 number of topics
defaultdict(<class 'list'>, {'Perplexity (hold_out), b42, 10tpx': [929.3716848469205, 824.2020917597533, 755.0293483628097, 709.2679776817753, 677.4583168961032, 654.4598910176161, 637.0587766897183, 623.5421933587983, 612.7328587919367, 603.8667649062044], 'Perplexity (test), b42, 10tpx': [951.4530605299179, 842.8210051078066, 771.1738218722844, 723.7852919383769, 691.0036934870843, 667.2407185465048, 649.3732156789273, 635.4258556487023, 624.2480252268443, 615.0907554283142], 'Coherence (u_mass), b42, 10tpx': [-0.8006704484880821, -0.8919342545868714, -0.9108707854176978, -0.9028821347313019, -0.9619114515275171, -0.9683594189488675, -0.9946588633239368, -0.9897991075120542, -1.0665462838291673, -1.0785267438747799], 'Coherence (c_v), b42, 10tpx': [0.3668208063632113, 0.40771515881785597, 0.41868526769099057, 0.431614343924573, 0.4398747936201651, 0.44189871556303845, 0.451653013642279, 0.4503038005585607, 0.4571127885711547, 0.453796

building LDA model "b42" with 100 number of topics
defaultdict(<class 'list'>, {'Perplexity (hold_out), b42, 100tpx': [800950.5282080374, 330926.40555204, 158490.48375493468, 88646.08907105314, 56367.133391812014, 39592.82639975678, 29980.50240807381, 24019.45577581638, 20051.006917930208, 17282.82304384438], 'Perplexity (test), b42, 100tpx': [870639.9917979507, 356489.56032974104, 169506.90584741835, 94266.18793895989, 59603.98866536935, 41702.7335763019, 31497.677922745614, 25176.213254684975, 20997.26256449445, 18066.31533668009], 'Coherence (u_mass), b42, 100tpx': [-1.0391263801751318, -1.2686369172114578, -1.4673436079586428, -1.5860187331856124, -1.6483059210694646, -1.6389210048030742, -1.736709297468357, -1.750343590579265, -1.7980908987693536, -1.8343826326407084], 'Coherence (c_v), b42, 100tpx': [0.3705748637304614, 0.4013067025521291, 0.4188804025492223, 0.4360726817762062, 0.4472570144130138, 0.4605393880068156, 0.47138774216724466, 0.4783258246585886, 0.4822180455563185, 0

building LDA model "c42" with 10 number of topics
defaultdict(<class 'list'>, {'Perplexity (hold_out), c42, 10tpx': [905.282550366012, 816.3063932332427, 753.0072512712123, 709.581023568407, 678.8476915145582, 656.1211783288195, 638.7892290602887, 625.0679864501591, 613.9618370201885, 604.785083802946], 'Perplexity (test), c42, 10tpx': [926.3666527583206, 834.4878994122703, 769.2898394879502, 724.5355907313177, 692.7789344321532, 669.3164202572639, 651.3937470084687, 637.2353002353744, 625.799236391531, 616.322959454991], 'Coherence (u_mass), c42, 10tpx': [-0.8537228061734871, -0.8555281101063074, -0.9179561790770663, -0.9094243888503506, -1.0023934622101933, -0.9878325454576034, -0.9128639250275047, -0.961040903319876, -0.996524435271073, -1.0348017598478942], 'Coherence (c_v), c42, 10tpx': [0.3784376750731543, 0.41133961056656176, 0.42296438060656294, 0.42364462392413954, 0.42874106070299833, 0.4341424183400523, 0.4508124309341907, 0.4569291863568218, 0.4628794637698258, 0.4658973527

-----

In [None]:
from pprint import pprint

# `corpus` is not defined anywhere in this notebook; the topics are scored on the corpus the
# model was trained on (as in the commented-out call in the training cell).
top_topics = ldamodel.top_topics(training_corpus, topn=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(t[1] for t in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [None]:
# Fit an HDP model on the training split. The previously used name `corpus` is not defined
# anywhere in this notebook, so the cell failed on a fresh kernel.
hdpmodel = HdpModel(corpus=training_corpus, id2word=dictionary, random_state=42)

In [None]:
# Tabulate the top-10 terms of the first `nbtopics` HDP topics, one row per topic.
# NOTE(review): `nbtopics` is whatever value the LDA training loop left behind — confirm
# that this is the intended number of HDP topics to show.
htf = HdpTopicFormatter(dictionary, topic_data=hdpmodel.get_topics())
term_columns = [f'term{rank}' for rank in range(10)]
topics = [
    [dataset] + [term[0] for term in htf.show_topic(topic_no, topn=10, formatted=False)]
    for topic_no in range(nbtopics)
]
df_hdp = pd.DataFrame(topics, columns=['dataset'] + term_columns)
df_hdp

In [None]:
# Inspect the callback metrics stored on the most recently trained LDA model.
ldamodel.metrics

In [None]:
def topic_prob_extractor(gensim_hdp):
    """Sum up the weights of the shown terms for every topic of a model.

    Args:
        gensim_hdp: model offering ``show_topics(num_topics=-1, formatted=False)`` which
            returns ``(topic_id, [(word, weight), ...])`` pairs.

    Returns:
        DataFrame with the columns ``topic_id`` and ``weight`` (one row per topic).
    """
    shown_topics = gensim_hdp.show_topics(num_topics=-1, formatted=False)
    # Each entry already pairs the topic id with its terms, so the weights are summed from
    # the entry itself instead of using the topic id as a list position (which is only
    # correct if the ids happen to be 0..n-1 in order).
    topic_ids = [topic_id for topic_id, _ in shown_topics]
    weights = [sum(weight for _, weight in terms) for _, terms in shown_topics]

    return pd.DataFrame({'topic_id': topic_ids, 'weight': weights})

# Show the summed term weights per topic for the HDP model.
topic_prob_extractor(hdpmodel)

In [None]:
# Word lists (weights dropped) of 10 HDP topics; the first two are displayed.
topics = [
    [word for word, _ in topic_terms]
    for _, topic_terms in hdpmodel.show_topics(num_topics=10, formatted=False)
]
topics[:2]

In [None]:
# u_mass coherence of every single topic. The topics are scored on the training split:
# the previously used name `corpus` is not defined anywhere in this notebook.
coherences = []
for topic in topics:
    cm = CoherenceModel(topics=[topic], corpus=training_corpus, dictionary=dictionary, coherence='u_mass')
    coherences.append(cm.get_coherence())
coherences

In [None]:
# Mean of the per-topic coherence values held in `coherences`.
sum(coherences) / len(coherences)

In [None]:
# Compare the u_mass coherence of the LDA and the HDP model on the training split
# (the previously used name `corpus` is not defined anywhere in this notebook).
cmlda = CoherenceModel(model=ldamodel, corpus=training_corpus, coherence='u_mass')
cmhdp = CoherenceModel(model=hdpmodel, corpus=training_corpus, coherence='u_mass')
# A cell only displays its last expression, so both values are returned together;
# before, the LDA value was computed and silently dropped.
cmlda.get_coherence(), cmhdp.get_coherence()

In [None]:
# Build a single u_mass CoherenceModel from the topic word lists in `topics`.
# NOTE(review): `corpus` is not defined in any cell of this notebook — presumably
# `training_corpus` is meant; confirm.
# NOTE(review): `for_topics` presumably expects one list of topics per model, i.e. `[topics]`
# rather than the flat list — verify against the gensim CoherenceModel documentation.
cm = CoherenceModel.for_topics(topics_as_topn_terms=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')

In [None]:
# Per-topic coherence of `cm`, requested together with standard deviation and support.
# NOTE(review): the raw word lists are passed as `segmented_topics`; presumably the output
# of the model's segmentation step is expected here — confirm.
cm.get_coherence_per_topic(segmented_topics=topics, with_std=True, with_support=True)


### Evaluation

In [None]:
# Write the corpus to disk in Matrix Market format, read it back and prepare the LDA
# visualisation data from the re-loaded corpus.
# NOTE(review): neither `corpus` nor `ldavis` is defined or imported in this notebook —
# presumably `training_corpus` and pyLDAvis are meant; confirm before running.
MmCorpus.serialize('../data/{}.mm'.format(dataset), corpus)
corpus_fake = MmCorpus('../data/{}.mm'.format(dataset))
prepared_data = ldavis.gensim.prepare(ldamodel, corpus_fake, dictionary)
prepared_data

In [None]:
# NOTE(review): this cell repeats the HDP topic/coherence cells further up, but reads the
# names `hm` and `corpus`, which are not defined in any cell of this notebook — presumably
# `hdpmodel` and `training_corpus`; confirm or remove the cell.
# To get the topic words from the model
topics = []
for topic_id, topic in hm.show_topics(num_topics=10, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)
# not displayed: only the last expression of a cell is shown
topics[:2]

# Initialize CoherenceModel using `topics` parameter
coherences = []
for topic in topics:
    cm = CoherenceModel(topics=[topic], corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherences.append(cm.get_coherence())
# not displayed: only the last expression of a cell is shown
coherences

# mean u_mass coherence over the collected topics
sum(coherences) / len(coherences)

### Start c_v coherence measure
This is expected to take much more time since `c_v` uses a sliding window to perform probability estimation and uses the cosine similarity indirect confirmation measure.

In [None]:
%%time

# c_v coherence per topic, computed from the tokenized texts.
# NOTE(review): `usable_topics` and `texts` are not defined in any cell of this notebook —
# presumably left over from the tutorial this cell was taken from; confirm.
cm = CoherenceModel(topics=usable_topics, texts=texts, dictionary=dictionary, coherence='c_v')
c_v = cm.get_coherence_per_topic()
print("Calculated c_v coherence for %d topics" % len(c_v))

### Start c_uci and c_npmi coherence measures
c_v and c_uci and c_npmi all use the boolean sliding window approach of estimating probabilities. Since the `CoherenceModel` caches the accumulated statistics, calculation of c_uci and c_npmi are practically free after calculating c_v coherence. These two methods are simpler and were shown to correlate less with human judgements than c_v but more so than u_mass.

In [None]:
%%time

# Switch the existing model `cm` to the c_uci measure and score every topic again.
cm.coherence = 'c_uci'
c_uci = cm.get_coherence_per_topic()
print("Calculated c_uci coherence for %d topics" % len(c_uci))

In [None]:
%%time

# Switch the existing model `cm` to the c_npmi measure and score every topic again.
cm.coherence = 'c_npmi'
c_npmi = cm.get_coherence_per_topic()
print("Calculated c_npmi coherence for %d topics" % len(c_npmi))

In [None]:
# Keep the human scores of all topics whose index is not listed as invalid.
# NOTE(review): `human_scores` and `invalid_topic_indices` are not defined in any cell of
# this notebook — confirm where they are supposed to come from.
final_scores = [
    score for i, score in enumerate(human_scores)
    if i not in invalid_topic_indices
]
len(final_scores)

The [values in the paper](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) were:

__`u_mass` correlation__ : 0.093

__`c_v` correlation__    : 0.548

__`c_uci` correlation__  : 0.473

__`c_npmi` correlation__ : 0.438

Our values are very similar to the ones reported in the paper. This validates the correctness of our pipeline, as the remaining differences can reasonably be attributed to differences in preprocessing.

In [None]:
# Print the Pearson correlation of each coherence measure with the human scores.
# NOTE(review): `pearsonr` (presumably scipy.stats.pearsonr) is not imported and `u_mass` is
# not defined in any cell of this notebook — confirm.
for our_scores in (u_mass, c_v, c_uci, c_npmi):
    print(pearsonr(our_scores, final_scores)[0])

In [None]:
# Mean score per coherence measure. The divisor is the actual number of scores instead of a
# hard-coded 99, so the result stays a mean when the number of topics changes.
# NOTE(review): `u_mass` is not defined in any cell of this notebook — confirm its origin.
tuple(sum(scores) / len(scores) for scores in (u_mass, c_v, c_uci, c_npmi))

In [None]:
# Persist the coherence model to a file in the current working directory.
# NOTE(review): the file name refers to movies although no dataset of this notebook is
# called that — presumably a tutorial leftover; confirm the intended name.
cm.save('movies_coherence_model')

In [None]:
from gensim import utils


def report_on_oov_terms(cm, topic_models):
    """Print how many top topic terms are out-of-vocabulary (OOV) for a coherence model.

    Reads the notebook-level names ``dictionary`` and ``trained_models``.

    Args:
        cm: CoherenceModel whose accumulator is asked for the unknown words.
        topic_models: iterable of topic models, in the same order as the keys of
            ``trained_models``.
    """
    # `CoherenceModel` is imported directly at the top of the notebook; the former
    # `models.CoherenceModel` referenced a name that is never imported.
    topics_as_topn_terms = [
        CoherenceModel.top_topics_as_word_lists(model, dictionary)
        for model in topic_models
    ]

    # NOTE(review): `_accumulator` is a private attribute of the coherence model —
    # presumably only populated once a coherence has been computed; confirm.
    oov_words = cm._accumulator.not_in_vocab(topics_as_topn_terms)
    print('number of oov words: %d' % len(oov_words))

    for num_topics, words in zip(trained_models.keys(), topics_as_topn_terms):
        oov_words = cm._accumulator.not_in_vocab(words)
        print('number of oov words for num_topics=%d: %d' % (num_topics, len(oov_words)))

# NOTE(review): `trained_models` is not defined in any cell of this notebook — presumably a
# mapping from number of topics to trained model; confirm before running.
report_on_oov_terms(cm, trained_models.values())