In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import yaml

In [1]:
def load_json(path):
    with open(path) as infile:
        return json.load(infile)

def load_yaml(path):
    with open(path) as infile:
        return yaml.load(infile, Loader=yaml.FullLoader)

def load_text(path):
    with open(path) as infile:
        return [text.strip().split(" ") for text in infile]

def save_json(obj, path):
    with open(path, 'w') as outfile:
        return json.dump(obj, outfile)

## Collect results from runs

In [90]:

# run_dir = "./outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9"
run_dir = "./outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9"

mallet_paths = [p for p in Path(run_dir).glob("**/mallet/**/metrics.json") if 'with_unk' not in str(p)]
dvae_paths = [p for p in Path(run_dir).glob("**/dvae/**/results.csv") if 'with_unk' not in str(p)]

In [77]:
def config_to_cols(path_col, config_keys):
    return path_col.apply(
        lambda x: pd.Series({
            k: v for k, v in load_yaml(x / 'config.yml').items()
            if k in config_keys
        })
    )

In [78]:
mallet_results = pd.DataFrame(
    {
        k: v for k, v in list(load_json(p).items())
        if k in ['npmi', 'tu_mean', 'to', 'entire_overlaps', 'path']
    }
    for p in mallet_paths
)

mallet_results['model'] = 'mallet'
mallet_results['path'] = [p.parent for p in mallet_paths]

In [79]:
mallet_config_keys = ['alpha', 'beta', 'input_dir', 'run_seeds', 'iterations']
mallet_results = pd.concat(
    [mallet_results, config_to_cols(mallet_results.path, mallet_config_keys)], axis=1
)

In [80]:
dvae_results = pd.concat([
    pd.read_csv(p.parent / "run_results.csv", index_col=0) for p in dvae_paths
], ignore_index=True)
dvae_results['model'] = 'dvae'
dvae_results['path'] = [p.parent for p in dvae_paths]

In [81]:
dvae_config_keys = [
    "input_dir",
    "alpha_prior",
    "learning_rate",
    "encoder_hidden_dim",
    "topic_word_regularization",
    "num_epochs",
    "epochs_to_anneal_bn",
    "epochs_to_anneal_kl",
    "run_seeds"
]
dvae_results = pd.concat(
    [dvae_results, config_to_cols(dvae_results.path, dvae_config_keys)], axis=1
)

In [82]:
run_name = Path(run_dir).name
input_dir_map = {
    f"/workspace/topic-preprocessing/data/nytimes/processed/{run_name}": "nytimes",
    f"/workspace/topic-preprocessing/data/wikitext/processed/{run_name}": "wikitext",
    f"/workspace/topic-preprocessing/data/bbc/processed/{run_name}": "bbc",
}
mallet_results['input_dir'] = mallet_results.input_dir.replace(input_dir_map)
dvae_results['input_dir'] = dvae_results.input_dir.replace(input_dir_map)

## Retrieve best-performing models

In [83]:
import numpy as np
dvae_results['mean_npmi_to'] = np.array(
    [dvae_results.best_npmi, 1 - dvae_results.best_to_at_best_npmi]
).mean(axis=0)
dvae_results.iloc[dvae_results.groupby("input_dir")["mean_npmi_to"].idxmax()]

Unnamed: 0,seed,best_npmi,best_npmi_epoch,best_tu_at_best_npmi,best_to_at_best_npmi,overlaps_at_best_npmi,model,path,alpha_prior,encoder_hidden_dim,epochs_to_anneal_bn,epochs_to_anneal_kl,input_dir,learning_rate,num_epochs,run_seeds,topic_word_regularization,mean_npmi_to
91,42,0.405203,137,0.98,0.01345,0,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0,100,100,bbc,0.01,200,42,0.01,0.695876
45,42,0.526151,91,0.99,0.008187,0,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.001,0,1,200,nytimes,0.01,500,42,0.0,0.758982
161,5591,0.557972,151,1.0,0.0,0,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0,100,200,wikitext,0.001,500,5591,0.0,0.778986


In [84]:
import numpy as np
mallet_results['mean_npmi_to'] = np.array(
    [mallet_results.npmi, 1 - mallet_results.to]
).mean(axis=0)
mallet_results.iloc[mallet_results.groupby("input_dir")["mean_npmi_to"].idxmax()]

Unnamed: 0,npmi,tu_mean,to,entire_overlaps,model,path,alpha,beta,input_dir,iterations,run_seeds,mean_npmi_to
130,0.191584,0.715,0.177778,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,1.0,0.1,bbc,1000,5591,0.506903
69,0.255506,0.74,0.310234,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0.05,nytimes,1000,5591,0.472636
180,0.24938,0.8,0.162281,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0.1,wikitext,1000,5591,0.54355


In [85]:
dvae_top_npmi = dvae_results.iloc[dvae_results.groupby("input_dir")["best_npmi"].idxmax()]
dvae_top_npmi

Unnamed: 0,seed,best_npmi,best_npmi_epoch,best_tu_at_best_npmi,best_to_at_best_npmi,overlaps_at_best_npmi,model,path,alpha_prior,encoder_hidden_dim,epochs_to_anneal_bn,epochs_to_anneal_kl,input_dir,learning_rate,num_epochs,run_seeds,topic_word_regularization,mean_npmi_to
91,42,0.405203,137,0.98,0.01345,0,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0,100,100,bbc,0.01,200,42,0.01,0.695876
44,11235,0.582163,271,0.06,0.983333,17,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.001,0,200,200,nytimes,0.01,500,11235,0.0,0.299415
161,5591,0.557972,151,1.0,0.0,0,dvae,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0,100,200,wikitext,0.001,500,5591,0.0,0.778986


In [86]:
mallet_top_npmi = mallet_results.iloc[mallet_results.groupby("input_dir")["npmi"].idxmax()]
mallet_top_npmi

Unnamed: 0,npmi,tu_mean,to,entire_overlaps,model,path,alpha,beta,input_dir,iterations,run_seeds,mean_npmi_to
108,0.217124,0.75,0.259942,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0.01,bbc,2000,11235,0.478591
65,0.268696,0.735,0.460819,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.25,0.1,nytimes,2000,11235,0.403939
194,0.251148,0.8,0.164327,0,mallet,outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9/...,0.1,0.1,wikitext,2000,42,0.54341


In [87]:
mallet_top_topics = {
    row.input_dir: {
        'topics': load_text(row.path / "topics.txt"),
        'path': str(row.path),
    }
    for idx, row in mallet_top_npmi.iterrows()
}

dvae_top_topics = {
    row.input_dir: {
        'topics': load_text(row.path / "topics.txt"),
        'path': str(row.path),
    }
    for idx, row in dvae_top_npmi.iterrows()
}

In [88]:
out_path = Path("./results", Path(run_dir).name)
out_path.mkdir(exist_ok=True)
save_json(mallet_top_topics, Path(out_path, "mallet-topics-best-npmi.json"))
save_json(dvae_top_topics, Path(out_path, "dvae-topics-best-npmi.json"))

In [89]:
n = 20
pd.DataFrame(
    [model, dataset, ", ".join(topic[:n])]
    for (model, model_topics) in [('mallet',  mallet_top_topics), ('dvae', dvae_top_topics)]
    for dataset, dataset_topics in model_topics.items()
    for topic in dataset_topics['topics']
).to_csv(out_path / f"topics-best-npmi.csv", index=False)

## Get bad hyperparams

In [15]:
dvae_results['npmi_rank'] = dvae_results.groupby("input_dir")["best_npmi"].rank(pct=True, ascending=False)
dvae_results["kl_bn_anneal"] = (
    "bn_" + dvae_results.epochs_to_anneal_bn.astype(str) +
    "-kl_" + dvae_results.epochs_to_anneal_kl.astype(str)
)
(
    pd.melt(dvae_results, id_vars=['path', 'best_npmi', 'best_to_at_best_npmi', 'npmi_rank'], value_vars=dvae_config_keys + ['kl_bn_anneal'])
      .groupby(["variable", "value"])[["npmi_rank", "best_npmi"]]
      .mean()
      .sort_values("npmi_rank")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,npmi_rank,best_npmi
variable,value,Unnamed: 2_level_1,Unnamed: 3_level_1
learning_rate,0.01,0.339639,0.420558
kl_bn_anneal,bn_1-kl_100,0.345546,0.40594
kl_bn_anneal,bn_100-kl_100,0.389201,0.38757
topic_word_regularization,0.1,0.427566,0.37042
epochs_to_anneal_bn,100,0.446917,0.370799
learning_rate,0.001,0.449209,0.37132
num_epochs,500,0.453567,0.37312
epochs_to_anneal_kl,100,0.46951,0.367679
kl_bn_anneal,bn_0-kl_200,0.472113,0.373419
kl_bn_anneal,bn_100-kl_1,0.474366,0.353205


In [16]:
mallet_results['npmi_rank'] = mallet_results.groupby("input_dir")["npmi"].rank(pct=True, ascending=False)
(
    pd.melt(mallet_results, id_vars=['path', 'npmi', 'to', 'npmi_rank'], value_vars=mallet_config_keys)
      .groupby(["variable", "value"])[["npmi_rank", "npmi"]]
      .mean()
      .sort_values("npmi_rank")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,npmi_rank,npmi
variable,value,Unnamed: 2_level_1,Unnamed: 3_level_1
beta,0.1,0.420436,0.247764
iterations,2000,0.426668,0.248478
alpha,0.1,0.456801,0.248297
beta,0.05,0.457972,0.248563
run_seeds,11235,0.461451,0.244701
beta,0.01,0.46541,0.246997
run_seeds,42,0.492032,0.248104
alpha,10.0,0.496176,0.247158
alpha,0.25,0.497192,0.244788
input_dir,wikitext,0.505155,0.227564


## Other coherence measures

In [17]:
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

from tqdm import tqdm



In [18]:
texts = {
    name: {
        "train_dict": Dictionary(load_text(Path(input_dir, "train.txt"))),
        "val_text": load_text(Path(input_dir, "val.txt")),
    }
    for input_dir, name in input_dir_map.items()
}

In [21]:
n = 10

dvae_coherences = []
for idx, row in tqdm(dvae_results.iterrows(), total=len(dvae_results)):
    cm = CoherenceModel(
        topics=[t[:n] for t in load_text(row.path / "topics.txt")],
        texts=texts[row.input_dir]['val_text'],
        dictionary=texts[row.input_dir]['train_dict'],
        coherence='c_v',
    )
    dvae_coherences.append(cm.get_coherence())

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))
100%|██████████| 174/174 [32:11<00:00, 11.10s/it]


In [22]:
out_path = Path("./results", Path(run_dir).name)
dvae_results['c_v'] = dvae_coherences
dvae_results.to_csv(out_path / "dvae_results_summary.csv")

In [None]:
n = 10

mallet_coherences = []
for idx, row in tqdm(mallet_results.iterrows(), total=len(mallet_results)):
    cm = CoherenceModel(
        topics=[t[:n] for t in load_text(row.path / "topics.txt")],
        texts=texts[row.input_dir]['val_text'],
        dictionary=texts[row.input_dir]['train_dict'],
        coherence='c_v'
    )
    mallet_coherences.append(cm.get_coherence())

In [None]:
out_path = Path("./results", Path(run_dir).name)
mallet_results['c_v'] = mallet_coherences
mallet_results.to_csv(out_path / "mallet_results_summary.csv")

In [None]:
dvae_top_c_v = dvae_results.iloc[dvae_results.groupby("input_dir")["c_v"].idxmax()]
dvae_top_c_v

In [None]:
mallet_top_c_v = mallet_results.iloc[mallet_results.groupby("input_dir")["c_v"].idxmax()]
mallet_top_c_v

## Coherences Computed Outside

In [62]:
import sys
sys.path.append("/workspace/topic-preprocessing/soup_nuts/models/dvae/")
from utils import compute_to, compute_tu

In [79]:
run_dir = "./outputs/full-mindf_power_law-maxdf_0.9"
#run_dir = "./outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9"
coherence_measure = "c_v_full" # npmi test, npmi full yield same results

max_overlapping_words = 5

In [36]:
mallet_paths = [p for p in Path(run_dir).glob("**/mallet/**/coherences.json")]
dvae_paths = [p for p in Path(run_dir).glob("**/dvae/**/coherences.json")]

In [47]:
run_name = Path(run_dir).name
input_dir_map = {
    f"/workspace/topic-preprocessing/data/nytimes/processed/{run_name}": "nytimes",
    f"/workspace/topic-preprocessing/data/wikitext/processed/{run_name}": "wikitext",
    f"/workspace/topic-preprocessing/data/bbc/processed/{run_name}": "bbc",
}

In [80]:
mallet_config_keys = ['alpha', 'beta', 'input_dir', 'run_seeds', 'iterations']
mallet_results = []
for p in mallet_paths:
    coherences = load_json(p)
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in mallet_config_keys}
    config["input_dir"] = input_dir_map[config["input_dir"]]
    # TODO: select best if desired
    final_coherence = list(coherences[coherence_measure].values())[-1]
    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=max_overlapping_words, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=10))

    mallet_results.append({
        coherence_measure: np.nan if np.isinf(final_coherence["aggregate"]) else final_coherence["aggregate"],
        f"{coherence_measure}_sd": np.std(final_coherence["by_topic"]),
        "tu": tu,
        "to": to,
        f"mean_{coherence_measure}_tu": np.mean([tu, final_coherence["aggregate"]]),
        "overlaps": np.sum(overlaps >= max_overlapping_words),
        **config,
        "topics": topics,
        "path": str(p.parent),
    })
mallet_results = pd.DataFrame(mallet_results)

In [81]:
mallet_top_coh = (
    mallet_results.loc[mallet_results.overlaps == 0]
                  .sort_values(["input_dir", coherence_measure], ascending=False)
                  .groupby("input_dir").head(1)
)
mallet_top_topics = {
    input_dir: data.to_dict('records')[0]
    for input_dir, data in mallet_top_coh.groupby("input_dir")
}

In [82]:
dvae_config_keys = [
    "input_dir",
    "alpha_prior",
    "learning_rate",
    "encoder_hidden_dim",
    "topic_word_regularization",
    "num_epochs",
    "epochs_to_anneal_bn",
    "epochs_to_anneal_kl",
    "run_seeds"
]

dvae_results = []
for p in dvae_paths:
    coherences = load_json(p)
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in dvae_config_keys}
    config["input_dir"] = input_dir_map[config["input_dir"]]
    final_coherence = list(coherences[coherence_measure].values())[-1]
    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=max_overlapping_words, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=10))

    coh_values = np.nan_to_num(final_coherence["by_topic"], nan=0, posinf=0)

    dvae_results.append({
        coherence_measure: np.mean(coh_values),
        f"{coherence_measure}_sd": np.std(coh_values),
        "tu": tu,
        "to": to,
        f"mean_{coherence_measure}_tu": np.mean([tu, final_coherence["aggregate"]]),
        "overlaps": np.sum(overlaps >= max_overlapping_words),
        **config,
        "topics": topics,
        "path": str(p.parent),
    })
dvae_results = pd.DataFrame(dvae_results)

In [83]:
dvae_top_coh = (
    dvae_results.loc[dvae_results.overlaps == 0]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1)
)
dvae_top_topics = {
    input_dir: data.to_dict('records')[0]
    for input_dir, data in dvae_top_coh.groupby("input_dir")
}

In [84]:
out_path = Path("./results", Path(run_dir).name)
out_path.mkdir(exist_ok=True)
save_json(mallet_top_topics, Path(out_path, f"mallet-topics-best-{coherence_measure}.json"))
save_json(dvae_top_topics, Path(out_path, f"dvae-topics-best-{coherence_measure}.json"))

In [85]:
n = 10
pd.DataFrame(
    [model, dataset, ", ".join(topic[:n])]
    for (model, model_topics) in [('mallet',  mallet_top_topics), ('dvae', dvae_top_topics)]
    for dataset, dataset_topics in model_topics.items()
    for topic in dataset_topics['topics']
).to_csv(out_path / f"topics-best-{coherence_measure}.csv", index=False)

In [86]:
sorted_dvae = (dvae_results.loc[dvae_results.overlaps == 0]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1))
sorted_dvae

Unnamed: 0,c_v_full,c_v_full_sd,tu,to,mean_c_v_full_tu,overlaps,alpha_prior,encoder_hidden_dim,epochs_to_anneal_bn,epochs_to_anneal_kl,input_dir,learning_rate,num_epochs,run_seeds,topic_word_regularization,topics,path
74,0.841277,0.096112,0.92,0.055102,0.880638,0,0.01,0,1,100,wikitext,0.01,500,42,0.0,"[[certifications, mtv_news, australian_recordi...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
125,0.844652,0.128574,0.948,0.046939,0.896326,0,0.01,0,200,200,nytimes,0.01,500,5591,0.1,"[[bridegroom, officiated, laude, bride, cum, m...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [87]:
for idx, row in sorted_dvae.iterrows():
    print(f"\n\n===={row.input_dir} ({idx})====\n{Path(row.path).parent.name}")
    for topic in row.topics:
        print("  ".join(topic[:5]))



====wikitext (74)====
alpha_0.01-lr_0.01-h2dim_0-reg_0.0-epochs_500-anneal_bn_1-anneal_kl_100
certifications  mtv_news  australian_recording_industry_association  chart  sal_cinquemani
stonework  nave  castle  vaulted  architectural
house_of_commons  church_of_england  protestant  highness  queen_victoria
juveniles  females  iucn  males  species
manhattan_project  los_alamos_laboratory  robert_oppenheimer  enrico_fermi  physicist
epidemiology  symptoms  clinical  diagnosis  therapy
album  certifications  chart  billboard  recording_industry_association_of_america
supreme_court  constitutional  courts  statutory  statute
hindu  inscriptions  dynasty  deity  temple
nhl  national_hockey_league  playoffs  american_hockey_league  hockey
film  filmography  screenplay  roger_ebert  times_of_india
composer  composers  orchestral  opera  soloists
theory  philosopher  empirical  philosophers  thinkers
demography  parish  constituency  councillors  domesday
painting  paintings  painter  literar

In [88]:
sorted_mallet = (mallet_results.loc[mallet_results.overlaps == 0]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1))
sorted_mallet

Unnamed: 0,c_v_full,c_v_full_sd,tu,to,mean_c_v_full_tu,overlaps,alpha,beta,input_dir,iterations,run_seeds,topics,path
36,0.682233,0.097854,0.718,0.291327,0.700117,0,1.0,0.1,wikitext,2000,11235,"[[division, north, battalion, forces, attack, ...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
102,0.696388,0.111192,0.764,0.202551,0.730194,0,0.05,0.01,nytimes,1000,11235,"[[water, miles, town, day, people, island, par...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [89]:
for idx, row in sorted_mallet.iterrows():
    print(f"\n\n===={row.input_dir} ({idx})====\n{Path(row.path).parent.name}")
    for topic in row.topics:
        print("  ".join(topic[:5]))



====wikitext (36)====
alpha_1.0-beta_0.1-iter_2000-opt_0
division  north  battalion  forces  attack
species  found  large  females  long
season  game  games  home  baseball
species  birds  white  bird  black
ship  ships  british  french  island
french  army  war  battle  men
band  album  music  song  rock
station  line  bridge  railway  construction
court  law  case  police  act
school  students  university  college  year
film  films  production  role  million
women  work  god  social  world
chinese  china  government  language  country
match  event  world  team  championship
nuclear  water  gas  metal  high
character  characters  story  series  love
route  highway  road  state  north
book  published  work  story  books
disease  cells  cell  blood  people
game  season  yards  yard  team
episode  series  season  doctor  episodes
king  england  english  royal  scotland
club  season  team  cup  league
episode  season  series  television  viewers
aircraft  flight  air  engine  design
bui

Diagnosing nans:

In [34]:
from collections import Counter

bad_words = Counter()
n_bad_topics = 0
for p in dvae_paths:
    coherences = load_json(p)
    try:
        final_coherence = list(coherences[coherence_measure].values())[-1]
    except KeyError:
        final_coherence = list(coherences['c_npmi_10_full'].values())[-1]
        print(f"Missing coherence for {p.parent}, {final_coherence['aggregate']:0.3f}")
    topics = load_text(final_coherence["path"])
    bad_coherence = np.isnan(final_coherence["by_topic"]) | np.isinf(final_coherence["by_topic"])
    #bad_coherence = np.isnan(final_coherence["by_topic"])
    if bad_coherence.sum() == 0:
        continue
    bad_topics = [topic for i, topic in enumerate(topics) if bad_coherence[i]]
    print(f"\n\n{p}")
    for bad_topic in bad_topics:
        print(" ".join(bad_topic[:10]))
        bad_words.update(bad_topic[:10])
        n_bad_topics += 1

In [31]:
n_bad_topics, bad_words.most_common(10)

(25,
 [('paula_vitaris', 22),
  ('dana_scully', 18),
  ('fox_mulder', 18),
  ('gillian_anderson', 17),
  ('lars_pearson', 15),
  ('robert_shearman', 14),
  ('mulder', 14),
  ('scully', 14),
  ('frank_spotnitz', 12),
  ('david_duchovny', 10)])