Model used = SCHOLAR, SCHOLAR + BAT; K = 50

In [None]:
import os
import pandas as pd
import numpy as np
import json
from run_scholar import print_top_words
import scipy
import torch
import pickle
import file_handling as fh

In [None]:
np.set_printoptions(precision=4)

In [None]:
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=False)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-dataset data directories. Each path is concatenated as-is with
# 'train.vocab.json' and 'dev.npz' in get_npmi_topics, so the trailing
# slash is required.
data_path_20ng = 'data/20ng-prodlda/replicated/dev/'
data_path_wiki = 'data/wikitext/processed/new-dev/'
data_path_imdb = 'data/imdb/processed-dev/'

In [None]:
def print_mean_and_std_npmi(model_path):
    """Print the mean and standard deviation of a model's dev NPMI values.

    Reads ``dev_metrics.csv`` found directly under ``model_path`` (the path is
    concatenated as-is, so it must end with a separator) and prints
    ``<mean> (<std>)`` computed over its ``npmi_value`` column.
    """
    npmi_column = pd.read_csv(model_path + 'dev_metrics.csv')['npmi_value']
    mean_text = str(np.mean(npmi_column))
    std_text = str(np.std(npmi_column))
    print(mean_text + ' (' + std_text + ')')

In [None]:
# Output directories of the baseline (vanilla SCHOLAR) runs. Each directory is
# read for 'dev_metrics.csv' and for per-seed subfolders holding 'beta.npz'
# and 'topics.txt'.
model_path_baseline_20ng = 'output_20NG_vanilla-scholar_k-50/'
model_path_baseline_wiki = 'output_wiki_vanilla-scholar_k-50/'
model_path_baseline_imdb = 'output_imdb_vanilla-scholar_k-50/'

# Output directories of the corresponding "kd" runs, laid out the same way.
model_path_kd_20ng = 'output_20NG_scholar-kd_k-50/'
model_path_kd_wiki = 'output_wiki_scholar-kd_k-50/'
model_path_kd_imdb = 'output_imdb_scholar-kd_k-50/'

In [None]:
print('20NG Baseline')
print_mean_and_std_npmi(model_path_baseline_20ng)
print('')
print('20NG KD')
print_mean_and_std_npmi(model_path_kd_20ng)

In [None]:
print('Wiki Baseline')
print_mean_and_std_npmi(model_path_baseline_wiki)
print('')
print('Wiki KD')
print_mean_and_std_npmi(model_path_kd_wiki)

In [None]:
print('imdb Baseline')
print_mean_and_std_npmi(model_path_baseline_imdb)
print('')
print('imdb KD')
print_mean_and_std_npmi(model_path_kd_imdb)

In [None]:
seeds = ['121958', '131932', '259178', '365838', '671155'] #5 runs

In [None]:
# Dataset key -> directory with that dataset's vocabulary and dev counts.
data_path = {
    '20ng': data_path_20ng,
    'wiki': data_path_wiki,
    'imdb': data_path_imdb,
}

In [None]:
# '<dataset>_<variant>' key -> output directory of that model's runs.
model_path = {
    '20ng_baseline': model_path_baseline_20ng,
    'wiki_baseline': model_path_baseline_wiki,
    'imdb_baseline': model_path_baseline_imdb,
    '20ng_kd': model_path_kd_20ng,
    'wiki_kd': model_path_kd_wiki,
    'imdb_kd': model_path_kd_imdb,
}

In [None]:
# For every dataset/model key, load the 'beta' array stored in each run's
# beta.npz: one array per seed, in the order of `seeds`.
betas = {}
for data_model, run_root in model_path.items():
    betas[data_model] = [
        np.load(os.path.join(run_root, seed, 'beta.npz'))['beta']
        for seed in seeds
    ]

In [None]:
def jsd(p, q, base=np.e):
    '''
        Pairwise Jensen-Shannon divergence, following
        https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence

        `p` and `q` are numpy arrays of unnormalized scores. Each one is
        pushed through a softmax first, and the divergence is computed
        between the two resulting probability vectors. `base` is the
        logarithm base forwarded to `scipy.stats.entropy`.
    '''
    # softmax turns the raw scores into probability distributions
    prob_p = np.array(torch.softmax(torch.from_numpy(p), dim=0))
    prob_q = np.array(torch.softmax(torch.from_numpy(q), dim=0))
    mixture = (prob_p + prob_q)/2
    kl_p_to_mixture = scipy.stats.entropy(prob_p, mixture, base=base)
    kl_q_to_mixture = scipy.stats.entropy(prob_q, mixture, base=base)
    return kl_p_to_mixture/2. + kl_q_to_mixture/2.

In [None]:
def js_divergence(beta1, beta2):
    """Return the matrix of pairwise JS divergences between two topic sets.

    Entry ``[i][j]`` is ``jsd(beta1[i], beta2[j])`` rounded to 4 decimals.
    Both inputs must be 2-D arrays of identical shape, with one row per topic.
    """
    assert beta1.shape==beta2.shape
    n_topics, _ = beta1.shape
    scores = np.zeros((n_topics, n_topics))
    for row, topic_a in enumerate(beta1):
        scores[row, :] = [round(jsd(topic_a, topic_b), 4) for topic_b in beta2]
    return scores

In [None]:
def get_topic_matched_pairs(beta1, beta2):
    """Greedily match the topics of two models by JS divergence.

    The pairwise divergence matrix is computed with ``js_divergence``. The
    pair with the smallest remaining divergence is then picked repeatedly,
    after which its row (topic of ``beta1``) and column (topic of ``beta2``)
    are masked out so that every topic is matched exactly once.

    Parameters
    ----------
    beta1, beta2 : 2-D arrays of identical shape, one row per topic.

    Returns
    -------
    topic_match_tuples : list of ``(i, j)`` index pairs (row of ``beta1``,
        row of ``beta2``), ordered from best to worst match.
    topic_match_scores : list with the JS divergence of each pair, in the
        same order.
    """
    assert beta1.shape==beta2.shape
    js_div_scores = js_divergence(beta1, beta2)
    # Match every topic once instead of assuming exactly 50 topics.
    n_topics = js_div_scores.shape[0]
    topic_match_tuples = []
    topic_match_scores = []
    for _ in range(n_topics):
        # argmin works on the flattened matrix; unravel_index maps the flat
        # position back to (row, column) using the actual matrix shape.
        i, j = np.unravel_index(np.argmin(js_div_scores), js_div_scores.shape)
        topic_match_tuples.append((i, j))
        topic_match_scores.append(js_div_scores[i, j])
        # Mask the matched row and column so neither topic is reused.
        js_div_scores[i, :] = np.inf
        js_div_scores[:, j] = np.inf
    return topic_match_tuples, topic_match_scores

In [None]:
def get_mean_topic_match_scores(data):
    """Average the matched-pair JS divergence curves over all runs.

    For dataset key ``data``, the i-th baseline run is matched against the
    i-th kd run (both taken from the module-level ``betas``). The resulting
    per-run score lists are averaged position by position.
    """
    baseline_runs = betas[data+'_baseline']
    kd_runs = betas[data+'_kd']
    per_run_scores = [
        get_topic_matched_pairs(beta_baseline, beta_kd)[1]
        for beta_baseline, beta_kd in zip(baseline_runs, kd_runs)
    ]
    return np.mean(np.array(per_run_scores), axis=0)

In [None]:
# Mean matched-pair JS divergence curve for every dataset.
topic_matched_scores = {
    dataset: get_mean_topic_match_scores(dataset) for dataset in data_path
}

In [None]:
# Sanity check: every dataset must provide one mean score per matched pair.
for dataset_name in ('20ng', 'wiki', 'imdb'):
    assert len(topic_matched_scores[dataset_name])==50

In [None]:
#create the initial graphs from which the thresholds are judged
x = list(range(1,51))
for data, mean_scores in topic_matched_scores.items():
    plt.plot(x, mean_scores, '-')
    # NOTE(review): no '_' separator between dataset name and suffix here,
    # unlike the threshold-line plots -- confirm whether that is intended.
    plt.savefig(data + 'jsdiv_for_matched_topics.png')
    plt.clf()

In [None]:
#after looking at the graphs: number of best-matched pairs kept per dataset
thresholds = {
    '20ng': 44,
    'wiki': 44,
    'imdb': 44,
}

In [None]:
#draw with a vertical line to confirm thresholds
x = list(range(1,51))
for data in topic_matched_scores:
    plt.plot(x, topic_matched_scores[data], '-')
    plt.axvline(x=thresholds[data], linestyle='--')
    # One file per dataset. The stray quote that used to open this argument
    # made the whole cell a SyntaxError.
    plt.savefig(data + '_jsdiv_for_matched_topics_with_thresh_line.png')
    plt.clf()

In [None]:
#produce the plot with all three datasets in one that goes into the paper
x = list(range(1,51))
# Per-dataset styling: line colour, line style and legend label.
colors = {'20ng': 'r', 'wiki': 'b', 'imdb': 'g'}
ls = {'20ng': 'solid', 'wiki': 'dashdot', 'imdb': 'dashed'}
legend_map = {'20ng': '20NG', 'wiki': 'Wiki', 'imdb': 'IMDb'}
# Text offsets; only consumed by the commented-out annotate call below.
xypos = {'20ng': (-1, 0.01), 'wiki': (-1, 0.015), 'imdb': (0, -0.04)}
for data in topic_matched_scores:
    y = topic_matched_scores[data]
    thresh = thresholds[data]
    # Mark the score at the threshold; x is 1-based, hence y[thresh-1].
    plt.scatter(thresh,y[thresh-1], marker='|', color=colors[data])
    plt.plot(x, y, color=colors[data], linestyle=ls[data], label=legend_map[data])
    offset = xypos[data]
    pos = (thresh + offset[0], y[thresh-1] + offset[1])
    #plt.annotate(str(thresh), (thresh, y[thresh - 1]), ha='center', xytext=pos, color=colors[data])
plt.xlabel('Matched Topic Pair (best to worst match)')
plt.ylabel('JS Divergence Score')
plt.legend()
# The tick and the dotted vertical line at 44 are hard-coded; they coincide
# with the value stored in `thresholds` for all three datasets.
plt.xticks([0,10,20,30,40,44,50])
plt.axvline(x=44, linestyle=(0, (1, 5)), color='black')
plt.savefig('scholar_h2h-topic-pair_jsdiv-scores_final.png')

In [None]:
plt.clf()

In [None]:
def get_npmi_vals_and_topic_words(ref_vocab, ref_counts, topics, n=10, cols_to_skip=0):
    """Score each topic's top words with NPMI against reference counts.

    Parameters
    ----------
    ref_vocab : sequence of words; position k names column k of ``ref_counts``.
    ref_counts : 2-D sparse document-by-word count matrix.
    topics : iterable of whitespace-separated topic strings.
    n : number of leading words of every topic that are scored.
    cols_to_skip : number of leading tokens dropped from each topic string.

    Returns
    -------
    (npmi_values, top_words_strings): for every topic, the mean NPMI over all
    pairs of its top ``n`` words (rounded to 4 decimals) and those words
    joined by single spaces. A pair scores 0.0 when either word is missing
    from the vocabulary or when the two words never share a document.
    """
    word_to_col = {word: col for col, word in enumerate(ref_vocab)}
    n_docs, _ = ref_counts.shape
    npmi_values = []
    top_words_strings = []
    for topic in topics:
        words = topic.strip().split()[cols_to_skip:]
        top_words = words[:n]
        pair_scores = []
        for position, first_word in enumerate(top_words):
            first_col = word_to_col.get(first_word)
            for second_word in words[position+1:n]:
                second_col = word_to_col.get(second_word)
                if first_col is None or second_col is None:
                    pair_scores.append(0.0)
                    continue
                # 0/1 indicator columns: does the word occur in the document?
                has_first = np.array((ref_counts[:, first_col] > 0).todense(), dtype=int)
                has_second = np.array((ref_counts[:, second_col] > 0).todense(), dtype=int)
                joint_df = np.sum(has_first * has_second)
                if joint_df == 0:
                    pair_scores.append(0.0)
                    continue
                first_df = has_first.sum()
                second_df = has_second.sum()
                log_docs = np.log10(n_docs)
                log_joint = np.log10(joint_df)
                pair_scores.append(
                    (log_docs + log_joint - np.log10(first_df) - np.log10(second_df))
                    / (log_docs - log_joint)
                )
        npmi_values.append(round(np.mean(pair_scores), 4))
        top_words_strings.append(' '.join(top_words))
    return npmi_values, top_words_strings

In [None]:
def get_npmi_topics(datapath, modelpath, n=10):
    """Collect (NPMI, top-words) tuples for every seed of one model.

    The reference vocabulary ('train.vocab.json') and dev counts ('dev.npz')
    are read from ``datapath``; each run's topics come from
    ``modelpath + seed + '/topics.txt'`` for the seeds in the module-level
    ``seeds``. Both paths are concatenated as-is.

    Returns a list with one entry per seed; each entry is a list of
    ``(npmi_value, top_words_string)`` tuples, one per topic.
    """
    ref_vocab = fh.read_json(datapath + 'train.vocab.json')
    ref_counts = fh.load_sparse(datapath + 'dev.npz').tocsc()
    per_seed = []
    for seed in seeds:
        # presumably one topic string per entry -- verify against fh.read_text
        topic_lines = fh.read_text(modelpath + seed + '/topics.txt')
        scores, word_strings = get_npmi_vals_and_topic_words(ref_vocab, ref_counts, topic_lines, n)
        per_seed.append(list(zip(scores, word_strings)))
    return per_seed

In [None]:
# NPMI-scored topics of every model, keyed like `model_path`; the dataset
# name is the part of the key before the underscore.
npmi_topics = {}
for model, model_dir in model_path.items():
    data, _ = model.split('_')
    npmi_topics[model] = get_npmi_topics(data_path[data], model_dir)

In [None]:
def compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, top_matched_pairs=10):
    """Count which model has the higher NPMI within matched topic pairs.

    Only the first ``top_matched_pairs`` entries of
    ``topic_pairs_jsdiv_baseline_kd`` are considered. Each entry is a
    ``(baseline_index, kd_index)`` pair indexing into the two lists of
    ``(npmi, top_words)`` tuples. The baseline wins a pair only when its NPMI
    is strictly greater; ties are counted for the kd model.

    Returns ``(baseline_wins, kd_wins)``.
    """
    considered = topic_pairs_jsdiv_baseline_kd[:top_matched_pairs]
    baseline_is_better = [
        baseline_npmi_topics[pair[0]][0] > kd_npmi_topics[pair[1]][0]
        for pair in considered
    ]
    baseline_wins = sum(1 for better in baseline_is_better if better)
    kd_wins = len(baseline_is_better) - baseline_wins
    return baseline_wins, kd_wins

In [None]:
# Per dataset: share (in %) of the thresholded matched pairs won by the kd
# model and by the baseline, averaged over the runs.
mean_kd_baseline_win_percentages = {}
for data in data_path:
    top_pairs_to_consider = thresholds[data]
    all_baseline_wins, all_kd_wins = [], []
    runs = zip(
        betas[data + '_baseline'],
        betas[data + '_kd'],
        npmi_topics[data + '_baseline'],
        npmi_topics[data + '_kd'],
    )
    for beta_baseline, beta_kd, baseline_npmi_topics, kd_npmi_topics in runs:
        topic_pairs_jsdiv_baseline_kd, _ = get_topic_matched_pairs(beta_baseline, beta_kd)
        baseline_wins, kd_wins = compare_baseline_kd_matched_topics(
            baseline_npmi_topics, kd_npmi_topics,
            topic_pairs_jsdiv_baseline_kd, top_pairs_to_consider)
        all_baseline_wins.append(baseline_wins)
        all_kd_wins.append(kd_wins)
    mean_kd_wins = np.mean(all_kd_wins)
    mean_baseline_wins = np.mean(all_baseline_wins)
    kd_percentage = 100*(mean_kd_wins/top_pairs_to_consider)
    baseline_percentage = 100*(mean_baseline_wins/top_pairs_to_consider)
    mean_kd_baseline_win_percentages[data] = (kd_percentage, baseline_percentage)
    print(data + ' DONE')

In [None]:
# Per dataset: mean number of thresholded matched pairs won by the kd model
# and by the baseline, averaged over the runs.
mean_kd_baseline_wins = {}
for data in data_path:
    top_pairs_to_consider = thresholds[data]
    all_baseline_wins, all_kd_wins = [], []
    runs = zip(
        betas[data + '_baseline'],
        betas[data + '_kd'],
        npmi_topics[data + '_baseline'],
        npmi_topics[data + '_kd'],
    )
    for beta_baseline, beta_kd, baseline_npmi_topics, kd_npmi_topics in runs:
        topic_pairs_jsdiv_baseline_kd, _ = get_topic_matched_pairs(beta_baseline, beta_kd)
        baseline_wins, kd_wins = compare_baseline_kd_matched_topics(
            baseline_npmi_topics, kd_npmi_topics,
            topic_pairs_jsdiv_baseline_kd, top_pairs_to_consider)
        all_baseline_wins.append(baseline_wins)
        all_kd_wins.append(kd_wins)
    mean_kd_wins = np.mean(all_kd_wins)
    mean_baseline_wins = np.mean(all_baseline_wins)
    mean_kd_baseline_wins[data] = (mean_kd_wins, mean_baseline_wins)
    print(data + ' DONE')

In [None]:
# Round both percentages of every dataset to whole numbers, in place.
for x, percentages in list(mean_kd_baseline_win_percentages.items()):
    mean_kd_baseline_win_percentages[x] = tuple(round(z) for z in percentages)

In [None]:
plt.clf()

In [None]:
# Grouped bar chart: mean number of matched pairs won by each model.
barWidth = 0.3

# Every value of mean_kd_baseline_wins is a (kd, baseline) tuple.
bars_baseline, bars_kd = [], []
for wins in mean_kd_baseline_wins.values():
    bars_kd.append(wins[0])
    bars_baseline.append(wins[1])

# Baseline bars sit on the integer positions, kd bars right next to them.
r1 = np.arange(len(bars_baseline))
r2 = [left + barWidth for left in r1]

plt.bar(r1, bars_baseline, color='#fdbb84', width=barWidth, edgecolor='black', label='SCHOLAR')
plt.bar(r2, bars_kd, color='#e34a33', width=barWidth, edgecolor='black', label='SCHOLAR + OURMODEL')

plt.xlabel('Dataset', fontweight='bold')
tick_positions = [group + (barWidth/2) for group in range(len(bars_baseline))]
plt.xticks(tick_positions, ['20NG', 'Wiki', 'IMDb'])

plt.ylabel('#Topics Better than Counterpart\nin Matched Topic Pairs ', fontweight='bold')

plt.ylim((0,44))
plt.yticks(list(range(0,45,4)))

plt.legend()
plt.savefig('kd_baseline_wins_bars.png')

In [None]:
# barWidth = 0.3

# bars_baseline = [mean_kd_baseline_win_percentages[x][1] for x in mean_kd_baseline_win_percentages]
# bars_kd = [mean_kd_baseline_win_percentages[x][0] for x in mean_kd_baseline_win_percentages]

# r1 = np.arange(len(bars_baseline))
# r2 = [x + barWidth for x in r1]

# plt.bar(r1, bars_baseline, color='#557f2d', width=barWidth, edgecolor='white', label='SCHOLAR')
# plt.bar(r2, bars_kd, color='#2d7f5e', width=barWidth, edgecolor='white', label='SCHOLAR + OURMODEL')

# plt.xlabel('Dataset', fontweight='bold')
# plt.xticks([r + barWidth for r in range(len(bars_baseline))], ['20NG', 'Wiki', 'IMDb'])

# plt.ylabel('% of Matched Topic Pairs', fontweight='bold')

# plt.legend()
# plt.savefig('percent_kd_baseline_wins_bars.png')

In [None]:
plt.clf()

In [None]:
def print_compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, scores, top_matched_pairs=10):
    """Print matched topic pairs side by side and tabulate them.

    Only the first ``top_matched_pairs`` entries of
    ``topic_pairs_jsdiv_baseline_kd`` are used, zipped with ``scores`` (the
    JS divergence of each pair). For every pair the baseline and kd
    ``(npmi, top_words)`` tuples and the divergence are printed, and the
    model with the higher NPMI gets a win (ties go to the kd model).

    Returns
    -------
    df : DataFrame with columns 'Pair #' (1-based rank),
        'SCHOLAR vs SCHOLAR+BAT' and 'JS Divergence', one row per pair.
    baseline_wins, kd_wins : win counts over the printed pairs.
    """
    kd_wins, baseline_wins = 0, 0
    topic_pairs_jsdiv_baseline_kd = topic_pairs_jsdiv_baseline_kd[:top_matched_pairs]
    b_k, js = [], []
    for x, y in zip(topic_pairs_jsdiv_baseline_kd, scores):
        baseline_entry = baseline_npmi_topics[x[0]]
        kd_entry = kd_npmi_topics[x[1]]
        summary = 'SCHOLAR: ' + str(baseline_entry) + '\nSCHOLAR+BAT: ' + str(kd_entry)
        print(summary)
        b_k.append(summary)
        print('JS Div. Value = ' + str(y))
        js.append(y)
        if baseline_entry[0]>kd_entry[0]:
            baseline_wins+=1
        else:
            kd_wins+=1
        print('---')
    # Number the rows from what was actually collected, not from a
    # module-level variable, so the column lengths always agree.
    ind = list(range(1, len(b_k)+1))
    df = pd.DataFrame(
        {'Pair #': ind, 'SCHOLAR vs SCHOLAR+BAT': b_k, 'JS Divergence': js},
        columns=['Pair #', 'SCHOLAR vs SCHOLAR+BAT', 'JS Divergence'],
    )
    return df, baseline_wins, kd_wins



In [None]:
#20ng
# Print and export the matched topic pairs of a single 20NG run.
data = '20ng'
# Use all 50 matched pairs here rather than the per-dataset threshold.
top_pairs_to_consider = 50#thresholds[data]
all_kd_npmi_topics = npmi_topics[data + '_kd']
all_baseline_npmi_topics = npmi_topics[data + '_baseline']

i = 2 #selecting a single seed

beta_baseline = betas[data + '_baseline'][i]
beta_kd = betas[data + '_kd'][i]
baseline_npmi_topics = all_baseline_npmi_topics[i]
kd_npmi_topics = all_kd_npmi_topics[i]
# Match topics across the two models, then print/tabulate the pairs.
topic_pairs_jsdiv_baseline_kd, scores = get_topic_matched_pairs(beta_baseline, beta_kd)
df_20ng, baseline_wins, kd_wins = print_compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, scores, top_pairs_to_consider)

print('KD Wins = ' + str(kd_wins))
print('Baseline Wins = ' + str(baseline_wins))
df_20ng.to_csv('20ng_topic_pairs.csv')

In [None]:
#wiki
# Print and export the matched topic pairs of a single Wiki run.
data = 'wiki'
# Use all 50 matched pairs here rather than the per-dataset threshold.
top_pairs_to_consider = 50#thresholds[data]
all_kd_npmi_topics = npmi_topics[data + '_kd']
all_baseline_npmi_topics = npmi_topics[data + '_baseline']

i = 2 #selecting a single seed

beta_baseline = betas[data + '_baseline'][i]
beta_kd = betas[data + '_kd'][i]
baseline_npmi_topics = all_baseline_npmi_topics[i]
kd_npmi_topics = all_kd_npmi_topics[i]
# Match topics across the two models, then print/tabulate the pairs.
topic_pairs_jsdiv_baseline_kd, scores = get_topic_matched_pairs(beta_baseline, beta_kd)
df_wiki, baseline_wins, kd_wins = print_compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, scores, top_pairs_to_consider)

print('KD Wins = ' + str(kd_wins))
print('Baseline Wins = ' + str(baseline_wins))
df_wiki.to_csv('wiki_topic_pairs.csv')

In [None]:
#imdb
# Print and export the matched topic pairs of a single IMDb run.
data = 'imdb'
# Use all 50 matched pairs here rather than the per-dataset threshold.
top_pairs_to_consider = 50#thresholds[data]
all_kd_npmi_topics = npmi_topics[data + '_kd']
all_baseline_npmi_topics = npmi_topics[data + '_baseline']

i = 2 #selecting a single seed

beta_baseline = betas[data + '_baseline'][i]
beta_kd = betas[data + '_kd'][i]
baseline_npmi_topics = all_baseline_npmi_topics[i]
kd_npmi_topics = all_kd_npmi_topics[i]
# Match topics across the two models, then print/tabulate the pairs.
topic_pairs_jsdiv_baseline_kd, scores = get_topic_matched_pairs(beta_baseline, beta_kd)
df_imdb, baseline_wins, kd_wins = print_compare_baseline_kd_matched_topics(baseline_npmi_topics, kd_npmi_topics, topic_pairs_jsdiv_baseline_kd, scores, top_pairs_to_consider)

print('KD Wins = ' + str(kd_wins))
print('Baseline Wins = ' + str(baseline_wins))
df_imdb.to_csv('imdb_topic_pairs.csv')

In [None]:
import random

In [None]:
random.randrange(1,11)

In [None]:
random.randrange(1,11)

In [None]:
random.randrange(1,11)

In [None]:
random.randrange(11,21)

In [None]:
random.randrange(11,21)

In [None]:
random.randrange(11,21)

In [None]:
random.randrange(21,31)

In [None]:
random.randrange(21,31)

In [None]:
random.randrange(21,31)

In [None]:
random.randrange(31,41)

In [None]:
random.randrange(31,41)

In [None]:
random.randrange(31,41)

In [None]:
random.randrange(41,51)

In [None]:
random.randrange(41,51)

In [None]:
random.randrange(41,51)