In [2]:
import json
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import yaml

sys.path.append("/workspace/topic-preprocessing/soup_nuts/models/dvae/")
from utils import compute_to, compute_tu

In [3]:
def load_json(path):
    """Read the JSON document stored at `path` and return the parsed object.

    The file is decoded explicitly as UTF-8 rather than with whatever
    encoding the platform locale defaults to, so non-ASCII content loads the
    same way on every machine.

    Args:
        path: String or `Path` pointing at a JSON file.

    Returns:
        The deserialized Python object (dict, list, str, number, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as infile:
        return json.load(infile)

def load_yaml(path):
    """Parse the YAML file at `path` and hand back the resulting object.

    NOTE(review): parsing uses `yaml.FullLoader`; only point this at config
    files you trust.
    """
    with open(path) as handle:
        parsed = yaml.load(handle, Loader=yaml.FullLoader)
    return parsed

def load_text(path):
    """Return one token list per line of the file at `path`.

    Each line has its surrounding whitespace stripped and is then split on
    single space characters, so a blank line yields `[""]` and consecutive
    spaces yield empty-string tokens.
    """
    token_rows = []
    with open(path) as infile:
        for line in infile:
            token_rows.append(line.strip().split(" "))
    return token_rows

def save_json(obj, path):
    """Serialize `obj` as JSON and write it to `path`.

    NumPy scalars (e.g. the result of `np.sum`) and NumPy arrays are
    converted to their plain-Python equivalents, so records assembled from
    NumPy/pandas computations can be written without a `TypeError`.

    The document is serialized completely before the file is opened, so a
    serialization failure does not truncate an existing file.

    Args:
        obj: The object to serialize.
        path: String or `Path` of the file to (over)write.

    Returns:
        None.

    Raises:
        TypeError: If `obj` contains a value that is neither JSON-encodable
            nor a NumPy scalar/array.
        OSError: If the file cannot be opened for writing.
    """
    def _to_builtin(value):
        # Only called by `json` for objects it cannot encode natively.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    payload = json.dumps(obj, default=_to_builtin)
    with open(path, 'w') as outfile:
        outfile.write(payload)

## Coherences computed externally with `calculate-coherence.py`

In [16]:
# Root directory of the run to analyse. It is globbed for coherences.json
# files, and its final path component names both the processed-data
# directories and the results folder.
run_dir = "./outputs/full-mindf_power_law-maxdf_0.9"
# Alternative run; swap with the line above to analyse it instead.
#run_dir = "./outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9"
# Key looked up in each coherences.json; also used as the name of the result
# column and in the output file names.
coherence_measure = "c_npmi_10_full" # npmi test, npmi full yield same results

# Passed to compute_to as `n`, and used as the threshold when counting
# entries of the returned overlaps; runs with a non-zero count are excluded
# when picking the best run.
max_overlapping_words = 5

In [12]:
# Collect every coherences.json under the run directory, one list per model.
# glob() yields matches in filesystem order, which is not guaranteed to be
# stable; sorting pins the order so that the row indices of the frames built
# from these lists are reproducible across machines and re-runs.
mallet_paths = sorted(Path(run_dir).glob("**/mallet/**/coherences.json"))
dvae_paths = sorted(Path(run_dir).glob("**/dvae/**/coherences.json"))

In [13]:
# The last path component of the run directory also names the processed-data
# directory of each corpus.
run_name = Path(run_dir).name

# Maps the `input_dir` recorded in a run's config to a short dataset label.
input_dir_map = {}
input_dir_map[f"/workspace/topic-preprocessing/data/nytimes/processed/{run_name}"] = "nytimes"
input_dir_map[f"/workspace/topic-preprocessing/data/wikitext/processed/{run_name}"] = "wikitext"
input_dir_map[f"/workspace/topic-preprocessing/data/bbc/processed/{run_name}"] = "bbc"

In [17]:
# Config fields carried over as columns for every MALLET run.
mallet_config_keys = ['alpha', 'beta', 'input_dir', 'run_seeds', 'iterations']

# Build one record per MALLET run: coherence statistics, the values returned
# by compute_tu / compute_to (presumably topic uniqueness / topic overlap --
# confirm in utils), the selected config fields and the topics themselves.
mallet_results = []
for p in mallet_paths:
    coherences = load_json(p)
    # config.yml sits next to coherences.json; keep only the fields of interest.
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in mallet_config_keys}
    # Replace the absolute input path with its short dataset label.
    config["input_dir"] = input_dir_map[config["input_dir"]]
    # TODO: select best if desired
    # The last entry stored under the measure is treated as the final one.
    final_coherence = list(coherences[coherence_measure].values())[-1]
    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=max_overlapping_words, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=10))

    # Sanitize the aggregate once and reuse it below, so an infinite
    # aggregate cannot leak into the combined coherence/tu column while the
    # coherence column itself reports NaN.
    aggregate = final_coherence["aggregate"]
    coherence = np.nan if np.isinf(aggregate) else aggregate

    mallet_results.append({
        coherence_measure: coherence,
        f"{coherence_measure}_sd": np.std(final_coherence["by_topic"]),
        "tu": tu,
        "to": to,
        f"mean_{coherence_measure}_tu": np.mean([tu, coherence]),
        # Number of entries in `overlaps` at or above the threshold.
        "overlaps": np.sum(overlaps >= max_overlapping_words),
        **config,
        "topics": topics,
        "path": str(p.parent),
    })
mallet_results = pd.DataFrame(mallet_results)

In [18]:
# Best MALLET run per dataset: drop runs with a non-zero overlap count, order
# by dataset and coherence (both descending) and keep the first row of each
# dataset.
mallet_top_coh = mallet_results.loc[mallet_results.overlaps == 0]
mallet_top_coh = mallet_top_coh.sort_values(["input_dir", coherence_measure], ascending=False)
mallet_top_coh = mallet_top_coh.groupby("input_dir").head(1)

# dataset label -> record (plain dict) of that dataset's best run.
mallet_top_topics = dict(
    (dataset_name, best_rows.to_dict('records')[0])
    for dataset_name, best_rows in mallet_top_coh.groupby("input_dir")
)

In [19]:
# Config fields carried over as columns for every DVAE run.
dvae_config_keys = [
    "input_dir",
    "alpha_prior",
    "learning_rate",
    "encoder_hidden_dim",
    "topic_word_regularization",
    "num_epochs",
    "epochs_to_anneal_bn",
    "epochs_to_anneal_kl",
    "run_seeds"
]

# Build one record per DVAE run: coherence statistics, the values returned by
# compute_tu / compute_to (presumably topic uniqueness / topic overlap --
# confirm in utils), the selected config fields and the topics themselves.
dvae_results = []
for p in dvae_paths:
    coherences = load_json(p)
    # config.yml sits next to coherences.json; keep only the fields of interest.
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in dvae_config_keys}
    # Replace the absolute input path with its short dataset label.
    config["input_dir"] = input_dir_map[config["input_dir"]]
    # The last entry stored under the measure is treated as the final one.
    final_coherence = list(coherences[coherence_measure].values())[-1]
    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=max_overlapping_words, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=10))

    # Per-topic scores that are NaN or +inf count as 0.
    # NOTE(review): `neginf` is left at nan_to_num's default (a very large
    # negative number) -- confirm whether -inf can occur and should be 0 too.
    coh_values = np.nan_to_num(final_coherence["by_topic"], nan=0, posinf=0)
    # Use the sanitized mean everywhere below, so the combined coherence/tu
    # column agrees with the coherence column even when the stored aggregate
    # is NaN or infinite.
    coherence = np.mean(coh_values)

    dvae_results.append({
        coherence_measure: coherence,
        f"{coherence_measure}_sd": np.std(coh_values),
        "tu": tu,
        "to": to,
        f"mean_{coherence_measure}_tu": np.mean([tu, coherence]),
        # Number of entries in `overlaps` at or above the threshold.
        "overlaps": np.sum(overlaps >= max_overlapping_words),
        **config,
        "topics": topics,
        "path": str(p.parent),
    })
dvae_results = pd.DataFrame(dvae_results)

In [20]:
# Best DVAE run per dataset: drop runs with a non-zero overlap count, order
# by dataset and coherence (both descending) and keep the first row of each
# dataset.
dvae_top_coh = dvae_results.loc[dvae_results.overlaps == 0]
dvae_top_coh = dvae_top_coh.sort_values(["input_dir", coherence_measure], ascending=False)
dvae_top_coh = dvae_top_coh.groupby("input_dir").head(1)

# dataset label -> record (plain dict) of that dataset's best run.
dvae_top_topics = dict(
    (dataset_name, best_rows.to_dict('records')[0])
    for dataset_name, best_rows in dvae_top_coh.groupby("input_dir")
)

In [21]:
# Results for this run are written to ./results/<run directory name>/.
out_path = Path("./results", Path(run_dir).name)
# parents=True also creates ./results itself, so the cell works on a fresh
# checkout instead of failing with FileNotFoundError.
out_path.mkdir(parents=True, exist_ok=True)
save_json(mallet_top_topics, out_path / f"mallet-topics-best-{coherence_measure}.json")
save_json(dvae_top_topics, out_path / f"dvae-topics-best-{coherence_measure}.json")

In [22]:
# Number of leading words written out for each topic.
n = 10

# One CSV row per topic of each model's best run: model, dataset, and the
# topic's first `n` words joined with ", ".
topic_rows = []
for model, model_topics in (('mallet', mallet_top_topics), ('dvae', dvae_top_topics)):
    for dataset, dataset_topics in model_topics.items():
        for topic in dataset_topics['topics']:
            topic_rows.append([model, dataset, ", ".join(topic[:n])])

pd.DataFrame(topic_rows).to_csv(out_path / f"topics-best-{coherence_measure}.csv", index=False)

In [23]:
# Top DVAE run per dataset among runs with a zero overlap count, shown as a table.
dvae_without_overlap = dvae_results.loc[dvae_results.overlaps == 0]
dvae_ranked = dvae_without_overlap.sort_values(["input_dir", coherence_measure], ascending=False)
sorted_dvae = dvae_ranked.groupby("input_dir").head(1)
sorted_dvae

Unnamed: 0,c_npmi_10_test,c_npmi_10_test_sd,tu,to,mean_c_npmi_10_test_tu,overlaps,alpha_prior,encoder_hidden_dim,epochs_to_anneal_bn,epochs_to_anneal_kl,input_dir,learning_rate,num_epochs,run_seeds,topic_word_regularization,topics,path
74,0.215076,0.081543,0.92,0.055102,0.567538,0,0.01,0,1,100,wikitext,0.01,500,42,0.0,"[[certifications, mtv_news, australian_recordi...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
130,0.25288,0.132479,0.948,0.046939,0.60044,0,0.01,0,200,200,nytimes,0.01,500,5591,0.1,"[[bridegroom, officiated, laude, bride, cum, m...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [24]:
# For each selected DVAE run, print a header (dataset, row index, name of the
# run path's parent directory) followed by the first five words of every topic.
for idx, row in sorted_dvae.iterrows():
    run_label = Path(row.path).parent.name
    print(f"\n\n===={row.input_dir} ({idx})====\n{run_label}")
    for topic in row.topics:
        print(*topic[:5], sep="  ")



====wikitext (74)====
alpha_0.01-lr_0.01-h2dim_0-reg_0.0-epochs_500-anneal_bn_1-anneal_kl_100
certifications  mtv_news  australian_recording_industry_association  chart  sal_cinquemani
stonework  nave  castle  vaulted  architectural
house_of_commons  church_of_england  protestant  highness  queen_victoria
juveniles  females  iucn  males  species
manhattan_project  los_alamos_laboratory  robert_oppenheimer  enrico_fermi  physicist
epidemiology  symptoms  clinical  diagnosis  therapy
album  certifications  chart  billboard  recording_industry_association_of_america
supreme_court  constitutional  courts  statutory  statute
hindu  inscriptions  dynasty  deity  temple
nhl  national_hockey_league  playoffs  american_hockey_league  hockey
film  filmography  screenplay  roger_ebert  times_of_india
composer  composers  orchestral  opera  soloists
theory  philosopher  empirical  philosophers  thinkers
demography  parish  constituency  councillors  domesday
painting  paintings  painter  literar

In [25]:
# Top MALLET run per dataset among runs with a zero overlap count, shown as a table.
mallet_without_overlap = mallet_results.loc[mallet_results.overlaps == 0]
mallet_ranked = mallet_without_overlap.sort_values(["input_dir", coherence_measure], ascending=False)
sorted_mallet = mallet_ranked.groupby("input_dir").head(1)
sorted_mallet

Unnamed: 0,c_npmi_10_test,c_npmi_10_test_sd,tu,to,mean_c_npmi_10_test_tu,overlaps,alpha,beta,input_dir,iterations,run_seeds,topics,path
63,0.133184,0.049758,0.742,0.247959,0.437592,0,1.0,0.05,wikitext,2000,42,"[[water, area, river, park, miles, years, feet...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
134,0.158064,0.079289,0.768,0.197449,0.463032,0,0.25,0.1,nytimes,1000,11235,"[[oil, water, plant, environmental, gas, power...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [26]:
# For each selected MALLET run, print a header (dataset, row index, name of
# the run path's parent directory) followed by the first five words of every topic.
for idx, row in sorted_mallet.iterrows():
    run_label = Path(row.path).parent.name
    print(f"\n\n===={row.input_dir} ({idx})====\n{run_label}")
    for topic in row.topics:
        print(*topic[:5], sep="  ")



====wikitext (63)====
alpha_1.0-beta_0.05-iter_2000-opt_500
water  area  river  park  miles
species  birds  males  females  bird
match  championship  team  event  title
world  won  race  games  time
club  season  team  cup  match
horses  horse  breed  coins  silver
family  life  years  time  father
episode  season  series  episodes  character
route  highway  road  state  north
election  president  state  government  party
building  built  century  house  site
chinese  china  century  government  world
arab  muslim  israel  egypt  jewish
game  team  season  yards  yard
india  temple  indian  century  king
million  company  year  announced  business
album  band  music  song  released
race  stage  lap  team  time
forces  war  attack  division  troops
disease  cells  blood  cell  risk
station  line  bridge  railway  trains
american  war  united_states  washington  new_york
music  musical  opera  works  composer
police  people  found  death  prison
school  students  university  college  y

Diagnosing NaN and infinite per-topic coherence values:

In [34]:
from collections import Counter

# Tally the top-10 words of every DVAE topic whose coherence is NaN or
# infinite, and count how many such topics exist across all runs.
bad_words = Counter()
n_bad_topics = 0
for p in dvae_paths:
    coherences = load_json(p)
    # Fall back to the full-corpus measure when the configured one is absent.
    measure_missing = False
    try:
        per_step = coherences[coherence_measure]
    except KeyError:
        per_step = coherences['c_npmi_10_full']
        measure_missing = True
    final_coherence = list(per_step.values())[-1]
    if measure_missing:
        print(f"Missing coherence for {p.parent}, {final_coherence['aggregate']:0.3f}")

    topics = load_text(final_coherence["path"])
    # A score is "bad" when it is not finite, i.e. NaN or +/-inf.
    bad_coherence = ~np.isfinite(np.asarray(final_coherence["by_topic"]))
    if not bad_coherence.any():
        continue

    bad_topics = [topic for position, topic in enumerate(topics) if bad_coherence[position]]
    print(f"\n\n{p}")
    for bad_topic in bad_topics:
        top_words = bad_topic[:10]
        print(" ".join(top_words))
        bad_words.update(top_words)
    n_bad_topics += len(bad_topics)

In [31]:
# Number of non-finite-coherence topics found, and the ten words that occur
# most often among those topics' top-10 words.
n_bad_topics, bad_words.most_common(10)

(25,
 [('paula_vitaris', 22),
  ('dana_scully', 18),
  ('fox_mulder', 18),
  ('gillian_anderson', 17),
  ('lars_pearson', 15),
  ('robert_shearman', 14),
  ('mulder', 14),
  ('scully', 14),
  ('frank_spotnitz', 12),
  ('david_duchovny', 10)])