In [1]:
import json
from collections import Counter
from pathlib import Path
import sys
import itertools

import numpy as np
import pandas as pd
import yaml

from calculate_coherence import (
    make_runs, gen_measure_name, SLURM_HEADER, save_json, save_text, load_json
)

## Calculate a bunch of coherence metrics for the selected models

In [3]:
results_dir = "./results/full-mindf_power_law-maxdf_0.9"
output_dir = "./outputs/full-mindf_power_law-maxdf_0.9"
coherence_measure = "c_npmi_10_full"
overlapping_word_threshold = 5

In [7]:
# selected with collect-runs.ipynb
mallet = load_json(Path(results_dir, f"mallet-topics-best-{coherence_measure}.json"))
dvae = load_json(Path(results_dir, f"dvae-topics-best-{coherence_measure}.json"))
etm = load_json(Path(results_dir, f"etm-topics-best-{coherence_measure}.json"))

model_results = {"mallet": mallet, "dvae": dvae, "etm": etm}

class DummyArgs:
    input_dir = None
    start_at = None
    eval_every_n = None
    eval_last_only = True
    coherence_measure = None
    reference_corpus = None
    top_n = 10
    window_size = None
    python_path = "/workspace/.conda/envs/gensim/bin/python"
    update_existing = False

args = DummyArgs()

In [42]:
def gen_commands(references, metrics, model_results):
    commands = []
    for ref, metric in itertools.product(references, metrics):
        for model in model_results:
            for data in model_results[model]:
                args.coherence_measure = metric
                args.input_dir = model_results[model][data]["path"]
                args.reference_corpus = "full" if data in ref else ref
                args.window_size = 10 if metric == "c_npmi" else None
                command = make_runs(args, save=False)
                if command:
                    commands += command
    return commands

npmis_and_c_v = gen_commands(["wikitext_full", "nytimes_full", "train", "val", "test"], ["c_npmi", "c_v"], model_results)
other_metrics = gen_commands(["full"], ["u_mass", "c_uci"], model_results)
commands = npmis_and_c_v + other_metrics

In [43]:
slurm_log_dir = Path(output_dir, "_run-logs/coherence/slurm-logs")
slurm_header = SLURM_HEADER.format(n_jobs=len(commands)-1, log_dir=slurm_log_dir)
commands = [slurm_header] + [
    f"test ${{SLURM_ARRAY_TASK_ID}} -eq {run_id} && sleep {run_id}s && {run_command}"
    for run_id, run_command in enumerate(commands)
]
slurm_sbatch_script = "\n".join(commands)
print(f"found {len(commands)} runs")
save_text(slurm_sbatch_script, "./coherence-best-model-runs.sh")

found 55 runs


## Collect results

In [28]:
datasets = ["wikitext", "nytimes"]
coherences = {}
for data in datasets:
    coherences[data] = {}
    for model in model_results:
        coherences[data][model] = {}
        coherences[data][model]["metrics"] = {}
        coherences[data][model]["topics"] = [t[:20] for t in model_results[model][data]["topics"]]

        path = model_results[model][data]["path"]
        coh_data = load_json(Path(path, "coherences.json"))
        for metric in sorted(coh_data):
            coh_by_epoch = list(coh_data[metric].values())
            assert(len(coh_by_epoch) == 1)
            final_coh = coh_by_epoch[-1]["by_topic"]
            coherences[data][model]["metrics"][metric] = final_coh

In [29]:
save_json(coherences, Path(results_dir, "coherences-for-selected-models.json"))

In [30]:
sorted(coh_data.keys())

['c_npmi_10_full',
 'c_npmi_10_test',
 'c_npmi_10_train',
 'c_npmi_10_val',
 'c_npmi_10_wikitext_full',
 'c_uci_full',
 'c_v_full',
 'c_v_test',
 'c_v_train',
 'c_v_val',
 'c_v_wikitext_full',
 'u_mass_full']

'c_npmi_10_full',cool
'c_npmi_10_test',cool
'c_npmi_10_train',cool
'c_npmi_10_val',cool
'c_npmi_10_wikitext_full',cool
'c_uci_full',cool
'c_v_full',cool
'c_v_test',cool
'c_v_train',cool
'c_v_val',cool
'c_v_wikitext_full',cool
'u_mass_full'

* c_npmi_10_full' : NPMI with 10-word window on full (train+val+test) data, about ~1.82m docs for NYT, ~4.65m for wiki, following Lau 2014. This is used for model selection.
* c_npmi_10_test: NPMI with 10-word window on test data, about . Basically here for completeness since the sizes are very similar to "full"
* c_npmi_10_train: NPMI with 10-word window on train data. 28k docs for wiki, 
* c_npmi_10_val: 4200 for wiki
* c_npmi_10_wikitext/nytimes_full:
* c_uci_full:
* c_v_full:
* c_v_test:
* c_v_train:
* c_v_val:
* c_v_wikitext_full:
* u_mass_full: 