In [6]:
import json
from itertools import chain, islice
from os.path import join
#from time import time

#import numpy as np
import pandas as pd
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import CoherenceModel, LdaModel
#from pandas.core.common import SettingWithCopyWarning

from constants import (
    ETL_PATH
)
from topic_coherence_experiments import TopicsLoader

#import warnings
#warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
#from utils import tprint

# Notebook-wide pandas display settings: up to 2001 rows, 3 digits of precision.
pd.options.display.max_rows = 2001
pd.options.display.precision = 3
#np.set_printoptions(precision=3, threshold=None, edgeitems=None, linewidth=800, suppress=None)

# Short keys for the available dataset names.
datasets = {
    'E': 'Europarl',
    'FA': 'FAZ_combined',
    'FO': 'FOCUS_cleansed',
    'O': 'OnlineParticipation',
    'P': 'PoliticalSpeeches',
    'dewi': 'dewiki',
    'dewa': 'dewac',
}
# Pick the dataset, parameter set and topic count of the model to work with.
# Change the key / list indices here to select a different model.
dataset=datasets['O']
params_list = ['a42', 'b42', 'c42', 'd42']
nbs_topics = [10, 25, 50, 100]
param_id = params_list[0]
nb_topics = nbs_topics[0]

# TopicsLoader is a project helper (topic_coherence_experiments); it is asked
# for exactly one parameter set and one topic count.
# NOTE(review): presumably it loads the dictionary, corpus, texts and the
# stored model from disk — confirm against its implementation.
topicsloader = TopicsLoader(dataset=dataset, param_ids=[param_id], nbs_topics=[nb_topics])
# First entry of the loader's model collection. Later cells index this object
# again with [-1]; its exact type is not visible from this notebook.
ldamodel = topicsloader.ldamodels[0]
corpus = topicsloader.corpus

loading dictionary from ../data/preprocessed/LDAmodel/OnlineParticipation_fullset_nouns_bow.dict
loading corpus from ../data/preprocessed/LDAmodel/OnlineParticipation_fullset_nouns_bow.mm
loading texts from ../data/preprocessed/LDAmodel/OnlineParticipation_fullset_nouns_texts.json
Loading model from ../data/preprocessed/LDAmodel/a42/OnlineParticipation_LDAmodel_a42_10


In [11]:
def split_corpus(corpus, max_test_size_rel=0.1, max_test_size_abs=5000):
    """Cut ``corpus`` into three consecutive slices: training, holdout, test.

    The holdout and test slices are taken from the end of the corpus. Their
    size is governed by ``max_test_size_rel`` (a fraction of the corpus
    length) as long as ``len(corpus) * max_test_size_rel`` stays below
    ``max_test_size_abs``; otherwise each of them is exactly
    ``max_test_size_abs`` documents long.

    :param corpus: sliceable sequence of documents that supports ``len()``.
    :param max_test_size_rel: relative size of the holdout and of the test slice.
    :param max_test_size_abs: absolute cap for the holdout and for the test slice.
    :return: dict with the keys ``'training_corpus'``, ``'holdout_corpus'``
        and ``'test_corpus'`` (in that order).

    The sizes of the three slices are printed as a side effect.
    """
    n_docs = len(corpus)

    # Determine the two cut positions: [0, train_end) is training,
    # [train_end, holdout_end) is holdout, the remainder is test.
    if n_docs*max_test_size_rel < max_test_size_abs:
        # Relative sizing; int() truncates towards zero.
        train_end = int(n_docs*(1-(2*max_test_size_rel)))
        holdout_end = int(n_docs*(1-max_test_size_rel))
    else:
        # Absolute cap reached: fixed-size holdout and test slices.
        train_end = n_docs-(2*max_test_size_abs)
        holdout_end = n_docs-max_test_size_abs

    n_train = train_end
    n_val = holdout_end - train_end
    n_test = n_docs - holdout_end
    print(
        'split dataset. size of:',
        f'train_set={n_train},',
        f'val_set={n_val},',
        f'test_set={n_test},'
    )

    return {
        'training_corpus': corpus[:train_end],
        'holdout_corpus': corpus[train_end:holdout_end],
        'test_corpus': corpus[holdout_end:],
    }

# Stats file of the selected model; it lives in the per-parameter-set folder
# <ETL_PATH>/LDAmodel/<param_id>/ and is named after dataset, parameter set
# and number of topics.
statsfile = f'{dataset}_LDAmodel_{param_id}_{nb_topics}_stats.json'
statspath = join(ETL_PATH, 'LDAmodel', param_id, statsfile)
with open(statspath, 'r') as fp:
    # Printed only after the file was opened successfully.
    # NOTE(review): the message reads 'Loadin' (sic) — probably meant 'Loading'.
    print('Loadin', statspath)
    stats = json.load(fp)
    
# Split the full corpus into training / holdout / test slices using the
# default size limits of split_corpus; the sizes are printed by the function.
corpora = split_corpus(corpus)

Loadin ../data/preprocessed/LDAmodel/a42/OnlineParticipation_LDAmodel_a42_10_stats.json
split dataset. size of: train_set=20531, val_set=2566, test_set=2567,


In [13]:
# Value passed as `iterations` to update(), keyed by parameter set.
# NOTE(review): for 'a42' None is passed through; presumably the model then
# falls back to its own configured iteration count — confirm.
iterations = {
    'a42': None,
    'b42': 200,
    'c42': 1_000,
    'd42': 200,
}
# Continue training the loaded model on the training slice only (holdout and
# test slices are not passed in).
# NOTE(review): `ldamodel[-1]` is assumed to be a gensim LdaModel — the type
# is not visible in this notebook. This cell mutates the model in place, so
# re-running it calls update() again on the already updated model.
ldamodel[-1].update(
    corpus=corpora['training_corpus'], 
    chunksize=20_000,
    passes=10, 
    iterations=iterations[param_id],
)

In [21]:
import numpy as np  # needed here: the notebook-level numpy import is commented out

# Metrics stored on the model object that was trained further in the cell above.
# NOTE(review): assumes `.metrics` maps a metric name to a list of recorded
# values (floats or numpy arrays) — confirm against the model/callback setup.
metrics = ldamodel[-1].metrics

# Same folder layout as the stats file: <ETL_PATH>/LDAmodel/<param_id>/
out_dir = join(ETL_PATH, 'LDAmodel', param_id)
out = join(out_dir, f'{dataset}_LDAmodel_{param_id}_{nb_topics}')

# Convert to plain Python types BEFORE opening the target file: opening with
# 'w' truncates it, so a conversion error must not be able to leave an empty
# or half-written JSON file behind.
serializable_metrics = {}
for k, v in metrics.items():
    # Decide per element instead of looking at v[0] only: this also handles
    # metrics without any recorded value (empty list) and mixed lists.
    # Arrays become (nested) lists, everything else is coerced to float.
    serializable_metrics[k] = [
        x.tolist() if isinstance(x, np.ndarray) else float(x)
        for x in v
    ]

with open(out + '_resume_metrics.json', 'w') as fp:
    json.dump(serializable_metrics, fp)
