In [1]:
import json
from collections import Counter
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import yaml

sys.path.append("/workspace/topic-preprocessing/soup_nuts/models/dvae/")
from utils import compute_to, compute_tu

In [2]:
def load_json(path):
    with open(path) as infile:
        return json.load(infile)

def load_yaml(path):
    with open(path) as infile:
        return yaml.load(infile, Loader=yaml.FullLoader)

def load_text(path):
    with open(path) as infile:
        return [text.strip().split(" ") for text in infile]

def save_json(obj, path):
    with open(path, 'w') as outfile:
        return json.dump(obj, outfile, indent=2)

## Coherences Computed Outside with calculate_coherence.py

In [3]:
run_dir = "./outputs/full-mindf_power_law-maxdf_0.9"
#run_dir = "./outputs/vocab_25k-mindf_0.0001_or_3-maxdf_0.9"
coherence_measure = "c_npmi_10_full" # npmi test, npmi full yield same results

overlapping_word_threshold = 5
diversity_top_n = 5

In [4]:
mallet_paths = [p for p in Path(run_dir).glob("**/mallet/**/coherences.json")]
dvae_paths = [p for p in Path(run_dir).glob("**/dvae/**/coherences.json")]
etm_paths = [p for p in Path(run_dir).glob("**/etm/**/coherences.json")]

In [5]:
run_name = Path(run_dir).name
input_dir_map = {
    f"/workspace/topic-preprocessing/data/nytimes/processed/{run_name}": "nytimes",
    f"/workspace/topic-preprocessing/data/wikitext/processed/{run_name}": "wikitext",
    f"/workspace/topic-preprocessing/data/bbc/processed/{run_name}": "bbc",
}

In [6]:
mallet_config_keys = ['alpha', 'beta', 'input_dir', 'run_seeds', 'iterations']
mallet_results = []
for p in mallet_paths:
    coherences = load_json(p)
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in mallet_config_keys}
    config["input_dir"] = input_dir_map[config["input_dir"]] # dvae, mallet
        
    # select last coherence, TODO: select best among all if desired
    final_coherence = list(coherences[coherence_measure].values())[-1]
    # get coherence for top 5/15 words if available
    # (not used for model selection, just reference)
    missing_result = {None: {"by_topic": -np.inf}}
    coh_5 = list(coherences.get(f"{coherence_measure}_top5", missing_result).values())[-1]
    coh_15 = list(coherences.get(f"{coherence_measure}_top15", missing_result).values())[-1]

    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=diversity_top_n, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=diversity_top_n))

    coh_values = np.nan_to_num(final_coherence["by_topic"], nan=0, posinf=0)
    coh_5_values = np.nan_to_num(coh_5["by_topic"], nan=0, posinf=0, neginf=np.nan)
    coh_15_values = np.nan_to_num(coh_15["by_topic"], nan=0, posinf=0, neginf=np.nan)
    coh_top_n_means = np.mean(np.concatenate([coh_values, coh_5_values, coh_15_values]))
    
    mallet_results.append({
        coherence_measure: np.mean(coh_values),
        f"{coherence_measure}_topn_mean": coh_top_n_means,
        f"{coherence_measure}_sd": np.std(coh_values),
        "tu": tu,
        "to": to,
        f"mean_{coherence_measure}_tu": np.mean([tu, final_coherence["aggregate"]]),
        "overlaps": np.sum(overlaps >= overlapping_word_threshold),
        **config,
        "topics": topics,
        f"{coherence_measure}_all": coh_values.tolist(),
        f"{coherence_measure}_top5_all": coh_5_values.tolist(),
        "path": str(p.parent),
    })
mallet_results = pd.DataFrame(mallet_results)

In [7]:
mallet_top_coh = (
    mallet_results.loc[mallet_results.overlaps == 0]
                  .sort_values(["input_dir", coherence_measure], ascending=False)
                  .groupby("input_dir").head(1)
)
mallet_top_topics = {
    input_dir: data.to_dict('records')[0]
    for input_dir, data in mallet_top_coh.groupby("input_dir")
}

In [8]:
dvae_config_keys = [
    "input_dir",
    "alpha_prior",
    "learning_rate",
    "encoder_hidden_dim",
    "topic_word_regularization",
    "num_epochs",
    "epochs_to_anneal_bn",
    "epochs_to_anneal_kl",
    "run_seeds"
]

dvae_results = []
for p in dvae_paths:
    coherences = load_json(p)
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in mallet_config_keys}
    config["input_dir"] = input_dir_map[config["input_dir"]]
    # select last coherence, TODO: select best among all if desired
    final_coherence = list(coherences[coherence_measure].values())[-1]
    # get coherence for top 5/15 words if available
    # (not used for model selection, just reference)
    missing_result = {None: {"by_topic": -np.inf}}
    coh_5 = list(coherences.get(f"{coherence_measure}_top5", missing_result).values())[-1]
    coh_15 = list(coherences.get(f"{coherence_measure}_top15", missing_result).values())[-1]

    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=diversity_top_n, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=diversity_top_n))

    coh_values = np.nan_to_num(final_coherence["by_topic"], nan=0, posinf=0)
    coh_5_values = np.nan_to_num(coh_5["by_topic"], nan=0, posinf=0, neginf=np.nan)
    coh_15_values = np.nan_to_num(coh_15["by_topic"], nan=0, posinf=0, neginf=np.nan)
    coh_top_n_means = np.mean(np.concatenate([coh_values, coh_5_values, coh_15_values]))
    
    dvae_results.append({
        coherence_measure: np.mean(coh_values),
        f"{coherence_measure}_topn_mean": coh_top_n_means,
        f"{coherence_measure}_sd": np.std(coh_values),
        "tu": tu,
        "to": to,
        "overlaps": np.sum(overlaps >= overlapping_word_threshold),
        **config,
        "topics": topics,
        f"{coherence_measure}_all": coh_values.tolist(),
        f"{coherence_measure}_top5_all": coh_5_values.tolist(),
        "path": str(p.parent),
    })
dvae_results = pd.DataFrame(dvae_results)

In [9]:
dvae_top_coh = (
    dvae_results.loc[dvae_results.overlaps == 0]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1)
)
dvae_top_topics = {
    input_dir: data.to_dict('records')[0]
    for input_dir, data in dvae_top_coh.groupby("input_dir")
}

In [10]:
etm_config_keys = [
    "data_path",
    "lr",
    "anneal_lr",
    "wdecay",
    "epochs",
    "seed",
]

etm_results = []
for p in etm_paths:
    coherences = load_json(p)
    config = {k: v for k, v in load_yaml(p.parent / "config.yml").items() if k in etm_config_keys}
    config["input_dir"] = input_dir_map[str(Path(config["data_path"]).parent)]

    # select last coherence, TODO: select best among all if desired
    final_coherence = list(coherences[coherence_measure].values())[-1]
    
    topics = load_text(final_coherence["path"])
    to, overlaps = compute_to(topics, n=diversity_top_n, return_overlaps=True)
    tu = np.mean(compute_tu(topics, n=diversity_top_n))

    coh_values = np.nan_to_num(final_coherence["by_topic"], nan=0, posinf=0)
    
    etm_results.append({
        coherence_measure: np.mean(coh_values),
        f"{coherence_measure}_sd": np.std(coh_values),
        "tu": tu,
        "to": to,
        "overlaps": np.sum(overlaps >= overlapping_word_threshold),
        **config,
        "topics": topics,
        f"{coherence_measure}_all": coh_values.tolist(),
        "path": str(p.parent),
    })
etm_results = pd.DataFrame(etm_results)

In [11]:
etm_top_coh = (
    etm_results.loc[(etm_results.overlaps == 0) & (etm_results.tu > 0.7)]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1)
)
etm_top_topics = {
    input_dir: data.to_dict('records')[0]
    for input_dir, data in etm_top_coh.groupby("input_dir")
}

In [12]:
out_path = Path("./results", Path(run_dir).name)
out_path.mkdir(exist_ok=True)

In [69]:
#save_json(mallet_top_topics, Path(out_path, f"mallet-topics-best-{coherence_measure}.json"))
#save_json(dvae_top_topics, Path(out_path, f"dvae-topics-best-{coherence_measure}.json"))
#save_json(etm_top_topics, f"etm-topics-best-{coherence_measure}.json"))

In [22]:
etm_top_topics['wikitext'].keys()

dict_keys(['c_npmi_10_full', 'c_npmi_10_full_sd', 'tu', 'to', 'overlaps', 'anneal_lr', 'data_path', 'epochs', 'lr', 'seed', 'wdecay', 'input_dir', 'topics', 'c_npmi_10_full_all', 'path'])

In [56]:
n = 10
pd.DataFrame(
    [model, dataset, ", ".join(topic[:n])]
    for (model, model_topics) in [('mallet',  mallet_top_topics), ('dvae', dvae_top_topics)]
    for dataset, dataset_topics in model_topics.items()
    for topic in dataset_topics['topics']
).to_csv(out_path / f"topics-best-{coherence_measure}.csv", index=False)

```
Indices of top runs
                      dvae        mallet
                      wiki  nyt   wiki    nyt
c_npmi_10_full_top5   45    127   12      112
c_npmi_10_full [10]   74    130   63      134
c_npmi_10_full_top15  66    130   12      100
```

In [15]:
print(np.sum(dvae_results.overlaps == 0))
sorted_dvae = (dvae_results.loc[dvae_results.overlaps == 0]
                .sort_values(["input_dir", f"{coherence_measure}_topn_mean"], ascending=False)
                .groupby("input_dir").head(1))
sorted_dvae

49


Unnamed: 0,c_npmi_10_full,c_npmi_10_full_topn_mean,c_npmi_10_full_sd,tu,to,overlaps,input_dir,run_seeds,topics,c_npmi_10_full_all,c_npmi_10_full_top5_all,path
74,0.223753,0.225674,0.086427,0.94,0.055102,0,wikitext,42,"[[certifications, mtv_news, australian_recordi...","[0.11697274235210336, 0.21964315492752154, 0.0...","[0.05529666017500906, 0.18248189566097472, 0.0...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
130,0.253596,0.261827,0.13167,0.96,0.046939,0,nytimes,5591,"[[bridegroom, officiated, laude, bride, cum, m...","[0.3596531982376284, 0.558160615278727, 0.1824...","[0.4740114149250828, 0.6443805339961176, 0.153...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [58]:
for idx, row in sorted_dvae.iterrows():
    print(f"\n\n===={row.input_dir} ({idx})====\n{Path(row.path).parent.name}")
    for coh, topic in sorted(zip(row[f"{coherence_measure}_all"], row["topics"]), key=lambda kv: -kv[0]):
        print("  ".join(topic[:10]))



====wikitext (74)====
alpha_0.01-lr_0.01-h2dim_0-reg_0.0-epochs_500-anneal_bn_1-anneal_kl_100
tropical  landfall  cyclone  utc  weakening
spore  basidia  spores  mycologist  hyphae
mint  numismatic  obverse  coin  coins
episode  dana_scully  robert_shearman  gillian_anderson  fox_mulder
nhl  national_hockey_league  playoffs  american_hockey_league  hockey
waterline  conning  turrets  boilers  amidships
touchdowns  quarterback  touchdown  yards  offense
landfall  gusts  flooding  hurricane  winds
ringo_starr  george_harrison  beatles  simon_leng  guitarist
republican  democratic  democrat  democratic_party  republicans
gameplay  gamespot  graphics  game  multiplayer
renumbering  intersection  intersects  intersections  national_highway_system
manhattan_project  los_alamos_laboratory  robert_oppenheimer  enrico_fermi  physicist
infantry  flank  casualties  battalion  battalions
aircraft  bombers  bomber  pilots  destroyers
astronomers  orbit  brightest  orbiting  orbital
manga  anime  

In [16]:
print(np.sum(mallet_results.overlaps == 0))

sorted_mallet = (mallet_results.loc[mallet_results.overlaps == 0]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1))
sorted_mallet

156


Unnamed: 0,c_npmi_10_full,c_npmi_10_full_topn_mean,c_npmi_10_full_sd,tu,to,mean_c_npmi_10_full_tu,overlaps,alpha,beta,input_dir,iterations,run_seeds,topics,c_npmi_10_full_all,c_npmi_10_full_top5_all,path
63,0.135579,0.136785,0.051036,0.76,0.247959,0.447789,0,1.0,0.05,wikitext,2000,42,"[[water, area, river, park, miles, years, feet...","[0.09887072076525676, 0.14084096323202053, 0.1...","[0.15049582125247446, 0.24808454580946612, 0.1...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
134,0.15811,0.157084,0.079265,0.816,0.197449,0.487055,0,0.25,0.1,nytimes,1000,11235,"[[oil, water, plant, environmental, gas, power...","[0.09673317404139083, 0.17755340952100082, 0.1...","[0.1439363657411738, 0.17843608064157981, 0.24...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [60]:
for idx, row in sorted_mallet.iterrows():
    print(f"\n\n===={row.input_dir} ({idx})====\n{Path(row.path).parent.name}")
    for coh, topic in sorted(zip(row[f"{coherence_measure}_all"], row["topics"]), key=lambda kv: -kv[0]):
        print("  ".join(topic[:5]))



====wikitext (63)====
alpha_1.0-beta_0.05-iter_2000-opt_500
route  highway  road  state  north
club  season  team  cup  match
album  band  music  song  released
tropical  storm  hurricane  cyclone  depression
song  album  number  video  music
australia  test  match  england  australian
aircraft  air  flight  squadron  war
arab  muslim  israel  egypt  jewish
game  team  season  yards  yard
season  game  team  games  league
storm  damage  hurricane  people  winds
station  line  bridge  railway  trains
episode  homer  season  series  simpsons
school  students  university  college  year
match  championship  team  event  title
book  published  story  work  writing
music  musical  opera  works  composer
building  built  century  house  site
election  president  state  government  party
species  brown  fruit  cap  plants
disease  cells  blood  cell  risk
forces  war  attack  division  troops
british  ship  ships  french  island
episode  series  doctor  mulder  character
species  birds  male

In [17]:
print(np.sum((etm_results.overlaps == 0) & (etm_results.tu > 0.7)))

sorted_etm = (etm_results.loc[(etm_results.overlaps == 0) & (etm_results.tu > 0.7)]
                .sort_values(["input_dir", coherence_measure], ascending=False)
                .groupby("input_dir").head(1))
sorted_etm

143


Unnamed: 0,c_npmi_10_full,c_npmi_10_full_sd,tu,to,overlaps,anneal_lr,data_path,epochs,lr,seed,wdecay,input_dir,topics,c_npmi_10_full_all,path
56,0.113287,0.06795,0.94,0.030612,0,0,/workspace/topic-preprocessing/data/wikitext/p...,1000,0.001,42,1.2e-05,wikitext,"[[new, use, development, world, design, create...","[0.03828346256626948, 0.1038583234675748, 0.01...",outputs/full-mindf_power_law-maxdf_0.9/wikitex...
96,0.113789,0.090883,0.904,0.077041,0,0,/workspace/topic-preprocessing/data/nytimes/pr...,1000,0.02,11235,1e-06,nytimes,"[[campaign, bush, clinton, vote, state, congre...","[0.14624615320916104, 0.1842349463761919, 0.09...",outputs/full-mindf_power_law-maxdf_0.9/nytimes...


In [9]:
for idx, row in sorted_etm.iterrows():
    print(f"\n\n===={row.input_dir} ({idx})====\n{Path(row.path).parent.name}")
    print(row.path)
    for coh, topic in sorted(zip(row[f"{coherence_measure}_all"], row["topics"]), key=lambda kv: -kv[0]):
        print("  ".join(topic[:7]))



====wikitext (1)====
lr_0.01-reg_1.2e-06-epochs_1000-anneal_lr_0
outputs/full-mindf_power_law-maxdf_0.9/../etm_full/wikitext/k-50/etm/lr_0.01-reg_1.2e-06-epochs_1000-anneal_lr_0/5591
storm  tropical  hurricane  mph  winds  depression  cyclone
episode  series  season  episodes  television  viewers  watched
road  route  highway  state  north  bridge  south
album  song  band  music  number  released  songs
ship  aircraft  ships  fleet  squadron  naval  navy
season  team  game  league  games  club  scored
match  defeated  championship  team  event  ring  win
book  published  work  novel  author  works  books
building  built  church  site  tower  stone  buildings
species  genus  animal  specimens  specimen  fish  birds
music  performed  performance  musical  rock  performing  concert
film  films  cast  production  movie  released  role
government  president  state  party  political  minister  election
earth  mass  planet  chemical  solar  planets  sun
school  college  schools  students  y

In [31]:
etm_results['coh_rank'] = etm_results.groupby("input_dir")[coherence_measure].rank(pct=True, ascending=False)
pd.set_option('display.float_format', lambda x: '%.8f' % x)


etm_config_keys = [k for k in etm_config_keys if k != "data_path"]
(
    pd.melt(etm_results, id_vars=['path', coherence_measure, 'to', 'coh_rank'], value_vars=etm_config_keys)
      .groupby(["variable", "value"])[["coh_rank", coherence_measure]]
      .mean()
      .sort_values("coh_rank")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,coh_rank,c_npmi_10_full
variable,value,Unnamed: 2_level_1,Unnamed: 3_level_1
anneal_lr,0.0,0.35447932,0.10523681
lr,0.02,0.39542065,0.10235219
lr,0.01,0.44740815,0.10200579
seed,42.0,0.48056081,0.10176129
epochs,1000.0,0.48056866,0.10095152
seed,5591.0,0.48074573,0.10085187
wdecay,1.2e-06,0.49265374,0.09978961
wdecay,1.2e-07,0.50884879,0.10010662
wdecay,1.2e-05,0.5161476,0.10050472
epochs,500.0,0.53290291,0.09929412


## Synthetic bad topics

First, get topics from every run, then eliminate topics that are too close to ones seen in the top

In [107]:
from tqdm import tqdm
print('', flush=True) # can help a broken tqdm
def jaccard(i, j):
    i, j = set(i), set(j)
    return len(i & j) / len(i | j)

def retain_topic(topic, topics_to_compare, top_n=50, threshold=0.1):
    k = len(topics)
    dists = np.array([
        jaccard(topic[:top_n], topic_j[:top_n])
        for topic_j in topics_to_compare
    ])
    return np.all(dists < threshold)

wikitext_topics = [
    t
    for topics in pd.concat([
        mallet_results.loc[mallet_results.input_dir == "wikitext"].topics,
        dvae_results.loc[dvae_results.input_dir == "wikitext"].topics
    ])
    for t in topics
]

top_wiki_topics = dvae_top_topics['wikitext']['topics'] + mallet_top_topics['wikitext']['topics']
wikitext_topics = [t for t in tqdm(wikitext_topics) if retain_topic(t, top_wiki_topics)]

nytimes_topics = [
    t
    for topics in pd.concat([
        mallet_results.loc[mallet_results.input_dir == "nytimes"].topics,
        dvae_results.loc[dvae_results.input_dir == "nytimes"].topics
    ])
    for t in topics
]
top_nyt_topics = dvae_top_topics['nytimes']['topics'] + mallet_top_topics['nytimes']['topics']
nytimes_topics = [t for t in tqdm(nytimes_topics) if retain_topic(t, top_nyt_topics)]


100%|██████████| 8100/8100 [00:07<00:00, 1050.95it/s]
100%|██████████| 8300/8300 [00:07<00:00, 1099.63it/s]


In [109]:
terms_wikitext = Counter(
    w
    for topic in wikitext_topics
    for w in topic
)
terms_nytimes = Counter(
    w
    for topic in nytimes_topics
    for w in topic
)

In [110]:
pseudoword_data = pd.read_csv("../human_evaluation/pwords.txt", sep="\s+", names=["prob", "order", "pword"])
pseudoword_data = pseudoword_data.loc[pseudoword_data.prob <= np.quantile(pseudoword_data.prob, 0.25)]
pseudowords = pseudoword_data.pword.tolist()

In [111]:
import random
random.seed(42)
n = 10
num_topics = 8

top_wiki_terms = [
    t for t, c in terms_wikitext.most_common(n*num_topics*4) 
    if "_" not in t and len(t) > 3
]
top_nyt_terms = [
    t for t, c in terms_nytimes.most_common(n*num_topics*4)
    if "_" not in t and len(t) > 3
]
bad_topics = {
    "wikitext": {
        "in_vocab": [
            random.sample(top_wiki_terms, n)
            for i in range(num_topics)
        ],
        "pseudo_word": [
            random.sample(random.sample(wikitext_topics[i],n//2) + random.sample(pseudowords,n//2),n)
            for i in range(num_topics)
        ],
    },
    "nytimes": {
        "in_vocab": [
            random.sample(top_nyt_terms, n)
            for i in range(num_topics)
        ],
        "pseudo_word": [
            random.sample(random.sample(nytimes_topics[i*3],n//2) + random.sample(pseudowords,n//2),n)
            for i in range(num_topics)
        ],
    },
}

In [112]:
bad_topics["wikitext"].pop("pseudo_word")
bad_topics["nytimes"].pop("pseudo_word")
save_json(bad_topics, f"{out_path}/bad_topics.json")

## Tables for Paper

### Table 1 (Examples)

In [13]:
dataset = "wikitext"
mallet_top_coh_topics = pd.DataFrame({
    "topics": mallet_top_coh.loc[mallet_top_coh.input_dir == dataset]["topics"].values[0],
    "coherences": mallet_top_coh.loc[mallet_top_coh.input_dir == dataset][f"{coherence_measure}_top5_all"].values[0]
}).sort_values("coherences", ascending=False)

dvae_top_coh_topics = pd.DataFrame({
    "topics": dvae_top_coh.loc[dvae_top_coh.input_dir == dataset]["topics"].values[0],
    "coherences": dvae_top_coh.loc[dvae_top_coh.input_dir == dataset][f"{coherence_measure}_top5_all"].values[0]
}).sort_values("coherences", ascending=False)

In [14]:
for idx, row in mallet_top_coh_topics.head(10).iterrows():
    print(f"{idx:3} | {row['coherences']:0.3f} |", " ".join(row.topics[:5]))

 37 | 0.394 | tropical storm hurricane cyclone depression
 16 | 0.285 | album band music song released
 20 | 0.274 | station line bridge railway trains
  8 | 0.250 | route highway road state north
  1 | 0.248 | species birds males females bird
 12 | 0.247 | arab muslim israel egypt jewish
 32 | 0.244 | season game team games league
 39 | 0.241 | storm damage hurricane people winds
  4 | 0.231 | club season team cup match
 27 | 0.222 | episode homer season series simpsons


In [15]:
for idx, row in dvae_top_coh_topics.head(10).iterrows():
    print(f"{idx:3} | {row['coherences']:0.3f} |", " ".join(row.topics[:5]))

  4 | 0.470 | manhattan_project los_alamos_laboratory robert_oppenheimer enrico_fermi physicist
 41 | 0.456 | spore basidia spores mycologist hyphae
 42 | 0.449 | touchdowns quarterback touchdown yards offense
 33 | 0.446 | tropical landfall cyclone utc weakening
 45 | 0.426 | mint numismatic obverse coin coins
 26 | 0.399 | episode dana_scully robert_shearman gillian_anderson fox_mulder
  9 | 0.386 | nhl national_hockey_league playoffs american_hockey_league hockey
 15 | 0.385 | landfall gusts flooding hurricane winds
 20 | 0.369 | waterline conning turrets boilers amidships
 49 | 0.343 | gameplay gamespot graphics game multiplayer


In [20]:
# collect the 
high_npmi_examples = [
    mallet_top_coh_topics.loc[16],
    mallet_top_coh_topics.loc[37],
    dvae_top_coh_topics.loc[33],
    dvae_top_coh_topics.loc[41],
    dvae_top_coh_topics.loc[4],
]

# row values
npmis = [t.coherences for t in high_npmi_examples]
words = list(zip(*[t.topics[:5] for t in high_npmi_examples]))
# make sure nothing screwy happened
assert(words[0] == ("album", "tropical", "tropical", "spore", "manhattan_project"))

# make the rows
npmi_row = " & ".join(f"{n:0.3f}" for n in npmis) + r" \\"
word_rows = "\\\\ \n ".join(" & ".join([f"{w:21}".replace("_", r"\_") for w in row]) for row in words) + r"\\"

In [21]:
print(word_rows)
print(r"\midrule")
print(npmi_row)

album                 & tropical              & tropical              & spore                 & manhattan\_project    \\ 
 band                  & storm                 & landfall              & basidia               & los\_alamos\_laboratory\\ 
 music                 & hurricane             & cyclone               & spores                & robert\_oppenheimer   \\ 
 song                  & cyclone               & utc                   & mycologist            & enrico\_fermi         \\ 
 released              & depression            & weakening             & hyphae                & physicist            \\
\midrule
0.285 & 0.394 & 0.446 & 0.456 & 0.470 \\


In [257]:
print("mean NPMIs @ 5")
print(f"mallet: {mallet_top_coh_topics.coherences.mean():0.3f} dvae: {dvae_top_coh_topics.coherences.mean():0.3f}")

mean NPMIs @ 5
mallet: 0.156 dvae: 0.256


### Stats on variation

In [96]:
dvae_results[coherence_measure].quantile(0.75) - dvae_results[coherence_measure].quantile(0.25)

0.09071832404845975

## Good example topics

In [88]:
'", "'.join(nytimes_topics[0][:50])

'brushwork", "canvases", "expressionism", "cubism", "museum_of_fine_arts", "cubist", "lifes", "national_gallery_of_art", "sotheby", "curators", "reliefs", "abstract_expressionism", "frank_stella", "whitney_museum_of_american_art", "glueck", "national_gallery", "donald_judd", "sculptural", "impressionists", "jasper_johns", "biomorphic", "paleontologists", "modernism", "impressionism", "curatorial", "sculptures", "portraiture", "antiquities", "expressionist", "gestural", "painterly", "manet", "modernist", "etchings", "paintings", "geometric", "figuration", "motifs", "rohe", "archeologists", "cindy_sherman", "kooning", "bronzes", "surrealist", "printmaking", "inlaid", "degas", "calligraphic", "fossils", "glazes'