In [1]:
import random
import os

import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from tqdm import tqdm

from evaluations.intrinsic_eval import cherry_words, generic_words
from decomposer import Decomposer, DecomposerConfig

random.seed(42)
torch.manual_seed(42)
sns.set()

DEVICE = 'cpu'
PE = torch.load(
    '../../results/pretrained/init.pt', map_location=DEVICE)['model']
GD = PE.grounding

In [2]:
def load(path):
    stuff = torch.load(path, map_location=DEVICE)['model']
    return stuff.embedding.weight.detach().numpy()

def gather(words):
    word_ids = [PE.word_to_id[w] for w in words]
    freq = [GD[w]['freq'] for w in words]
    skew = [GD[w]['R_ratio'] for w in words]
    maj_deno = [GD[w]['majority_deno'] for w in words]
    return word_ids, freq, skew, maj_deno

def plot(coordinates, words, freq, skew, path):
    fig, ax = plt.subplots(figsize=(15,10))    
    sns.scatterplot(
        coordinates[:,0], coordinates[:,1], 
        hue=skew, palette='coolwarm', # hue_norm=(0, 1), 
        size=freq, sizes=(100, 1000), 
        legend=None, ax=ax)
    for coord, word in zip(coordinates, words):
        ax.annotate(word, coord, fontsize=12)
    with open(path, 'wb') as file:
        fig.savefig(file, dpi=300)
    plt.close(fig)

def plot_categorical(coordinates, words, freq, skew, path):
    fig, ax = plt.subplots(figsize=(20,10))    
    sns.scatterplot(
        coordinates[:,0], coordinates[:,1], 
        hue=skew, palette='muted', hue_norm=(0, 1),
        size=freq, sizes=(100, 1000), 
        legend='brief', ax=ax)
    chartBox = ax.get_position()
    ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
    ax.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), ncol=1)
    for coord, word in zip(coordinates, words):
        ax.annotate(word, coord, fontsize=12)
    with open(path, 'wb') as file:
        fig.savefig(file, dpi=300)
    plt.close(fig)
    
def load_en_masse(in_dir, endswith):
    models = {}
    for dirpath, _, filenames in tqdm(os.walk(in_dir)):
        for file in filenames:
            if file.endswith(endswith):
                path = os.path.join(dirpath, file)
                name = path.lstrip(in_dir).replace('/', ' ')
                models[name] = load(path)
    print(*models.keys(), sep='\n')
    return models
    
def graph_en_masse(
        models,
        out_dir, 
        reduction,  #  'PCA', 'TSNE', or 'both'
        word_ids,  
        words, 
        hues,
        sizes,
        perplexity=None,
        categorical=False):
    os.makedirs(out_dir, exist_ok=True)
    for model_name, embed in tqdm(models.items()):
        space = embed[word_ids]
        if reduction == 'PCA':
            visual = PCA(n_components=2).fit_transform(space)
        elif reduction == 'TSNE':
            assert perplexity is not None
            visual = TSNE(
                perplexity=perplexity, learning_rate=10, 
                n_iter=5000, n_iter_without_progress=1000).fit_transform(space)
        elif reduction == 'both':
            assert perplexity is not None
            space = PCA(n_components=30).fit_transform(space)
            visual = TSNE(
                perplexity=perplexity, learning_rate=10, 
                n_iter=5000, n_iter_without_progress=1000).fit_transform(space)
        else: 
            raise ValueError('unknown dimension reduction method')
        if not categorical:
            plot(visual, words, sizes, hues, 
                 os.path.join(out_dir, f'{model_name}.png'))
        else:
            plot_categorical(visual, words, sizes, hues, 
                 os.path.join(out_dir, f'{model_name}.png'))

In [3]:
ch_ids, ch_freq, ch_skew, ch_deno = gather(cherry_words)
gen_ids, gen_freq, gen_skew, gen_deno = gather(generic_words)

random_words = [w for w in PE.word_to_id.keys() 
                if GD[w]['freq'] > 99]
random_words = random.sample(random_words, 50)
rand_ids, rand_freq, rand_skew, rand_deno = gather(random_words)

In [4]:
R_words = [w for w in PE.word_to_id.keys()
             if GD[w]['freq'] > 99 and GD[w]['R_ratio'] > 0.75]
R_words.remove('federal_debt_stood')  # outliers in clustering graphs
R_words.remove('statements_relating')
R_words.remove('legislative_days_within')
print(len(R_words))
# GOP_words = random.sample(GOP_words, 50)
R_ids, R_freq, R_skew, R_deno = gather(R_words)

51


In [5]:
# D_words = [w for w in PE.word_to_id.keys()
#            if GD[w]['freq'] > 99 and GD[w]['R_ratio'] < 0.25]

D_words = ['war_in_iraq', 'unemployed', 'detainees', 'solar', 
    'wealthiest', 'minorities', 'gun_violence', 
    'amtrak', 'unemployment_benefits', 
    'citizens_united', 'mayors', 'prosecutor', 'working_families', 
    'cpsc', 'sexual_assault',
    'affordable_housing', 'vietnam_veterans', 'drug_companies', 'handguns',
    'hungry', 'college_education', 
    'main_street', 'trauma', 'simon', 'pandemic', 
    'reagan_administration', 'guns', 
    'million_jobs', 'airline_industry', 'mergers', 'blacks', 
    'industrial_base', 'unemployment_insurance',
    'vacancies', 'trade_deficit', 'lost_their_jobs', 'food_safety', 
    'darfur', 'trains', 'deportation', 'credit_cards', 
    'surface_transportation', 'solar_energy', 'ecosystems', 'layoffs', 
    'wall_street', 'steelworkers', 'puerto_rico', 'hunger', 
    'child_support', 'naacp', 'domestic_violence', 'seaports', 
    'hate_crimes', 'underfunded', 'registrants', 'sanctuary', 
    'coastal_zone_management', 'vermonters', 'automakers', 
    'violence_against_women', 'unemployment_rate', 
    'select_committee_on_indian_affairs', 'judicial_nominees', 
    'school_construction', 'clarence_mitchell', 'confidential', 
    'domain_name', 'community_development', 'pell_grant', 'asylum', 'vawa', 
    'somalia', 'african_american', 'traders', 'jersey', 'fdic', 'shameful', 
    'homelessness', 'african_americans', 'payroll_tax',]
#     'retraining', 'unemployed_workers', 'the_disclose_act', 'baltimore', 
#     'assault_weapons', 'credit_card', 'the_patriot_act', 'young_woman', 
#     'trades', 'aye', 'poisoning', 'police_officers', 'mammal', 'toys', 
#     'whistleblowers', 'north_dakota', 'californias', 'computer_crime', 
#     'explosives', 'fast_track', 'bus', 'redlining', 'seclusion', 'gender', 
#     'hawaiian', 'pay_discrimination', 'ledbetter', 'phd', 'supra', 'baggage', 
#     'las_vegas', 'the_voting_rights_act', 'enron', 'richest', 'vra', 'chip', 
#     'tax_break', 'the_usa_patriot_act', 'advance_notice', 'derivatives', 
#     'the_patients_bill_of_rights', 'shelf', 'divestment', 'sa', 
#     'submitted_an_amendment', 'bill_hr', 'first_responders',
#     'unemployment_compensation', 'tax_breaks', 'carbon', 
#     'college_cost_reduction', 'clean_energy', 'waives', 
#     'unregulated', 'taa', 'truman', 'lesbian', 'coupons', 
#     'large_numbers', 'anonymous', 'whites', 'logging']

print(len(D_words))
D_words = random.sample(D_words, 50)
D_ids, D_freq, D_skew, D_deno = gather(D_words)

81


In [6]:
J_words = D_words + R_words
J_ids = D_ids + R_ids
J_freq = D_freq + R_freq
J_skew = D_skew + R_skew
J_deno = D_deno + R_deno
J_cono = [0 if skew < 0.5 else 1 for skew in J_skew]

In [None]:
GD['joliet']

In [None]:
# base_dir = '../../results/only remove deno BS128'
# base_dir = '../../results/cono space remove deno/subset pretrained'
base_dir = '../../results/deno space remove cono/superset pretrained'
models = load_en_masse(base_dir, endswith='epoch100.pt')
models['pretrained superset'] = load('../../results/pretrained/init.pt')
models['pretrained'] = load('../../results/pretrained bill mentions/init.pt')

### Graph by Party Skew (for removing connotation)

In [None]:
graph_en_masse(
    models,
    out_dir=f'{base_dir}/PCA',
    reduction='PCA',
    word_ids=R_ids,
    words=R_words,
    hues=R_skew,
    sizes=R_freq,
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/t-SNE p5',
    reduction='TSNE',
    perplexity=5,
    word_ids=R_ids,
    words=R_words,
    hues=R_skew,
    sizes=R_freq,
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/t-SNE p3',
    reduction='TSNE',
    perplexity=3,
    word_ids=R_ids,
    words=R_words,
    hues=R_skew,
    sizes=R_freq,
)

In [None]:
# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/PCA',
#     reduction='PCA',
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/t-SNE p5',
#     reduction='TSNE',
#     perplexity=5,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/t-SNE p3',
#     reduction='TSNE',
#     perplexity=3,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/t-SNE p25',
    reduction='TSNE',
    perplexity=25,
    word_ids=J_ids,
    words=J_words,
    hues=J_skew,
    sizes=J_freq,
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/t-SNE p50',
    reduction='TSNE',
    perplexity=50,
    word_ids=J_ids,
    words=J_words,
    hues=J_skew,
    sizes=J_freq,
)

### Graph by Topic Denotation (for removing denotation)

In [None]:
graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly GOP/PCA',
    reduction='PCA',
    perplexity=5,
    word_ids=GOP_ids,
    words=GOP_words,
    hues=GOP_deno,
    sizes=GOP_freq,
    categorical=True
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly GOP/t-SNE p5',
    reduction='TSNE',
    perplexity=5,
    word_ids=GOP_ids,
    words=GOP_words,
    hues=GOP_deno,
    sizes=GOP_freq,
    categorical=True
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly GOP/t-SNE p3',
    reduction='TSNE',
    perplexity=3,
    word_ids=GOP_ids,
    words=GOP_words,
    hues=GOP_deno,
    sizes=GOP_freq,
    categorical=True
)

In [None]:
graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly Dem/PCA',
    reduction='PCA',
    perplexity=5,
    word_ids=D_ids,
    words=D_words,
    hues=D_deno,
    sizes=D_freq,
    categorical=True
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly Dem/t-SNE p5',
    reduction='TSNE',
    perplexity=5,
    word_ids=D_ids,
    words=D_words,
    hues=D_deno,
    sizes=D_freq,
    categorical=True
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Highly Dem/t-SNE p3',
    reduction='TSNE',
    perplexity=3,
    word_ids=D_ids,
    words=D_words,
    hues=D_deno,
    sizes=D_freq,
    categorical=True
)

In [None]:
# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/PCA',
#     reduction='PCA',
#     perplexity=5,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_deno,
#     sizes=J_freq,
#     categorical=True
# )

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/t-SNE p5',
    reduction='TSNE',
    perplexity=5,
    word_ids=J_ids,
    words=J_words,
    hues=J_deno,
    sizes=J_freq,
    categorical=True
)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/t-SNE p3',
    reduction='TSNE',
    perplexity=3,
    word_ids=J_ids,
    words=J_words,
    hues=J_deno,
    sizes=J_freq,
    categorical=True
)

## Graph Recomposers
Want to show that...

For deno vectors, topic cluster better than pretrained

For cono vectors, skew cluster better than pretrained

In [8]:
def load_recomposer(path):
    stuff = torch.load(path, map_location=DEVICE)['model']
    D_embed = stuff.deno_decomposer.embedding.weight.detach().numpy()
    C_embed = stuff.cono_decomposer.embedding.weight.detach().numpy()
    return D_embed, C_embed

def load_recomposers_en_masse(in_dir, endswith):
    D_models = {
        'pretrained superset': load('../../results/pretrained/init.pt'),
        'pretrained': load('../../results/pretrained bill mentions/init.pt')}
    C_models = {
        'pretrained superset': load('../../results/pretrained/init.pt'),
        'pretrained': load('../../results/pretrained bill mentions/init.pt')}
    for dirpath, _, filenames in os.walk(in_dir):
        for file in filenames:
            if file.endswith(endswith):
                path = os.path.join(dirpath, file)
                name = path.lstrip(in_dir).replace('/', ' ')
                D_embed, C_embed = load_recomposer(path)
                # Brittle Hack
                name = name.split()
                D_name = ' '.join(name[0:2] + name[4:])
                R_name = ' '.join(name[2:])
                D_models[D_name] = D_embed
                C_models[R_name] = C_embed
                print(name)
    return D_models, C_models 

In [9]:
base_dir = '../../results/recomposer/superset pretrained'
D_sub, C_sub = load_recomposers_en_masse(base_dir, endswith='epoch30.pt')



['Dd0.9', 'Dg-3.5', 'Cd-2.4', 'Cg2.2', 'R1.5', 'epoch30.pt']
['Dd3.3', 'Dg-3.4', 'Cd-2.4', 'Cg2.7', 'R0.9', 'epoch30.pt']
['Dd0.8', 'Dg-4.7', 'Cd-0.7', 'Cg3.0', 'R3.5', 'epoch30.pt']
['Dd0.6', 'Dg-2.5', 'Cd-4.8', 'Cg4.5', 'R1.3', 'epoch30.pt']
['Dd4.8', 'Dg-1.1', 'Cd-0.3', 'Cg4.5', 'R3.0', 'epoch30.pt']
['Dd3.0', 'Dg-4.1', 'Cd-4.7', 'Cg4.7', 'R4.8', 'epoch30.pt']
['Dd4.6', 'Dg-4.6', 'Cd-4.0', 'Cg0.2', 'R1.6', 'epoch30.pt']
['Dd1.9', 'Dg-0.2', 'Cd-1.3', 'Cg3.0', 'R0.8', 'epoch30.pt']
['Dd3.1', 'Dg-4.3', 'Cd-3.5', 'Cg1.8', 'R2.3', 'epoch30.pt']
['Dd4.0', 'Dg-3.5', 'Cd-4.5', 'Cg3.4', 'R2.2', 'epoch30.pt']
['Dd3.9', 'Dg-4.0', 'Cd-2.4', 'Cg3.0', 'R0.2', 'epoch30.pt']
['Dd0.1', 'Dg-0.2', 'Cd-0.8', 'Cg1.1', 'R0.9', 'epoch30.pt']


In [10]:
base_dir = '../../results/recomposer/superset pretrained'
D_super, C_super = load_recomposers_en_masse(base_dir, endswith='epoch30.pt')

['Dd0.9', 'Dg-3.5', 'Cd-2.4', 'Cg2.2', 'R1.5', 'epoch30.pt']
['Dd3.3', 'Dg-3.4', 'Cd-2.4', 'Cg2.7', 'R0.9', 'epoch30.pt']
['Dd0.8', 'Dg-4.7', 'Cd-0.7', 'Cg3.0', 'R3.5', 'epoch30.pt']
['Dd0.6', 'Dg-2.5', 'Cd-4.8', 'Cg4.5', 'R1.3', 'epoch30.pt']
['Dd4.8', 'Dg-1.1', 'Cd-0.3', 'Cg4.5', 'R3.0', 'epoch30.pt']
['Dd3.0', 'Dg-4.1', 'Cd-4.7', 'Cg4.7', 'R4.8', 'epoch30.pt']
['Dd4.6', 'Dg-4.6', 'Cd-4.0', 'Cg0.2', 'R1.6', 'epoch30.pt']
['Dd1.9', 'Dg-0.2', 'Cd-1.3', 'Cg3.0', 'R0.8', 'epoch30.pt']
['Dd3.1', 'Dg-4.3', 'Cd-3.5', 'Cg1.8', 'R2.3', 'epoch30.pt']
['Dd4.0', 'Dg-3.5', 'Cd-4.5', 'Cg3.4', 'R2.2', 'epoch30.pt']
['Dd3.9', 'Dg-4.0', 'Cd-2.4', 'Cg3.0', 'R0.2', 'epoch30.pt']
['Dd0.1', 'Dg-0.2', 'Cd-0.8', 'Cg1.1', 'R0.9', 'epoch30.pt']


In [None]:
 # Evaluating Denotation
models = D_models

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/topic/PCA',
    reduction='PCA',
    word_ids=J_ids,
    words=J_words,
    hues=J_deno,
    sizes=J_freq,
    categorical=True
)

# graph_en_masse(
#     models, out_dir=f'{base_dir}/Joint/topic/t-SNE p5',
#     reduction='TSNE', perplexity=5,
#     word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
#     categorical=True
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/topic/t-SNE p3',
#     reduction='TSNE', perplexity=3,
#     word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
#     categorical=True
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/topic/t-SNE p10',
#     reduction='TSNE', perplexity=10,
#     word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
#     categorical=True
# )

In [None]:
# Evaluating Connotation
models = C_models

graph_en_masse(
    models,
    out_dir=f'{base_dir}/Joint/topic/PCA',
    reduction='PCA',
    word_ids=J_ids,
    words=J_words,
    hues=J_skew,
    sizes=J_freq,
)


# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/party/t-SNE p25',
#     reduction='TSNE', perplexity=25,
#     word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq,
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/party/t-SNE p50',
#     reduction='TSNE', perplexity=50,
#     word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq,
# )

# Clustering + Homogeneity V-Measure

In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_completeness_v_measure

In [None]:
# embed = models['pretrained superset'][J_ids]
# Cono_Space = KMeans(n_clusters=2).fit(embed)
# pred_labels = Cono_Space.predict(embed)
# homogeneity, completeness, v_measure = np.around(homogeneity_completeness_v_measure(
#     J_cono, pred_labels), 4)
# print(homogeneity, completeness, v_measure, sep='\t')

In [24]:
import editdistance

PE.deno_to_id = {val: key for key, val in PE.id_to_deno.items()}

def discretize_cono(skew):
    if skew < 0.5:
        return 0 
    else:
        return 1

def NN_cluster_ids(embed, query_ids, categorical, top_k=5):
    query_ids = torch.tensor(query_ids, device=DEVICE)
    embed = torch.tensor(embed, device=DEVICE)
    embed = nn.Embedding.from_pretrained(embed, freeze=True)

    query_embed = embed(query_ids)
    top_neighbor_ids = [
        nn.functional.cosine_similarity(
            q.view(1, -1), embed.weight).argsort(descending=True)
        for q in query_embed]

    cluster_labels = []
    true_labels = []
    for query_index, sorted_target_indices in enumerate(top_neighbor_ids):
        query_id = query_ids[query_index].item()
        query_word = PE.id_to_word[query_id]
        num_neighbors = 0
#         if categorical:
#             query_label = PE.deno_to_id[GD[query_word]['majority_deno']]
#         else:
#             query_label = discretize_cono(GD[query_word]['R_ratio'])
        query_label = query_index
            
        for sort_rank, target_id in enumerate(sorted_target_indices):
            target_id = target_id.item()
            if num_neighbors == top_k:
                break
            if query_id == target_id:
                continue
            # target_id = target_ids[target_index]  # target is always all embed
            target_word = PE.id_to_word[target_id]
            if editdistance.eval(query_word, target_word) < 3:
                continue
            num_neighbors += 1

            if categorical:
                neighbor_label = PE.deno_to_id[GD[target_word]['majority_deno']]
            else:
                neighbor_label = discretize_cono(GD[target_word]['R_ratio'])            
            cluster_labels.append(query_label)
            true_labels.append(neighbor_label)
    return cluster_labels, true_labels



In [26]:
# Deno space, eval deno, higher is better
for model_name, model in D_super.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=True, top_k=10)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')
#     print(pred_labels)

pretrained superset	0.7042	0.4406	0.5421
pretrained	0.3441	0.2111	0.2617
Dd0.9 Dg-3.5 R1.5 epoch30.pt	0.3407	0.2114	0.2609
Dd3.3 Dg-3.4 R0.9 epoch30.pt	0.3319	0.203	0.2519
Dd0.8 Dg-4.7 R3.5 epoch30.pt	0.3408	0.208	0.2583
Dd0.6 Dg-2.5 R1.3 epoch30.pt	0.3384	0.2087	0.2581
Dd4.8 Dg-1.1 R3.0 epoch30.pt	0.3418	0.2141	0.2633
Dd3.0 Dg-4.1 R4.8 epoch30.pt	0.3382	0.2108	0.2598
Dd4.6 Dg-4.6 R1.6 epoch30.pt	0.3408	0.2094	0.2594
Dd1.9 Dg-0.2 R0.8 epoch30.pt	0.3395	0.2042	0.255
Dd3.1 Dg-4.3 R2.3 epoch30.pt	0.3396	0.2078	0.2578
Dd4.0 Dg-3.5 R2.2 epoch30.pt	0.3362	0.2063	0.2557
Dd3.9 Dg-4.0 R0.2 epoch30.pt	0.3353	0.2048	0.2543
Dd0.1 Dg-0.2 R0.9 epoch30.pt	0.3465	0.216	0.2661


In [21]:
# Deno space, eval cono, lower is better
for model_name, model in D_super.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=True, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')
#     print(pred_labels)

pretrained superset	0.576	0.565	0.5704
pretrained	0.1763	0.1721	0.1742
Dd0.9 Dg-3.5 R1.5 epoch30.pt	0.1851	0.1764	0.1807
Dd3.3 Dg-3.4 R0.9 epoch30.pt	0.1713	0.1635	0.1673
Dd0.8 Dg-4.7 R3.5 epoch30.pt	0.1759	0.1692	0.1725
Dd0.6 Dg-2.5 R1.3 epoch30.pt	0.182	0.1758	0.1789
Dd4.8 Dg-1.1 R3.0 epoch30.pt	0.1758	0.1726	0.1742
Dd3.0 Dg-4.1 R4.8 epoch30.pt	0.1884	0.1848	0.1866
Dd4.6 Dg-4.6 R1.6 epoch30.pt	0.1774	0.1722	0.1747
Dd1.9 Dg-0.2 R0.8 epoch30.pt	0.1739	0.1641	0.1688
Dd3.1 Dg-4.3 R2.3 epoch30.pt	0.1831	0.174	0.1785
Dd4.0 Dg-3.5 R2.2 epoch30.pt	0.1832	0.1761	0.1796
Dd3.9 Dg-4.0 R0.2 epoch30.pt	0.1663	0.1604	0.1633
Dd0.1 Dg-0.2 R0.9 epoch30.pt	0.186	0.1818	0.1839


In [23]:
# Cono space, eval cono, higher is better
for model_name, model in C_super.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=False, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')

pretrained superset	0.3306	0.3225	0.3265
pretrained	0.0025	0.0023	0.0024
Cd-2.4 Cg2.2 R1.5 epoch30.pt	0.0	0.0	0.0
Cd-2.4 Cg2.7 R0.9 epoch30.pt	0.001	0.0009	0.0009
Cd-0.7 Cg3.0 R3.5 epoch30.pt	0.0003	0.0003	0.0003
Cd-4.8 Cg4.5 R1.3 epoch30.pt	0.0002	0.0002	0.0002
Cd-0.3 Cg4.5 R3.0 epoch30.pt	0.001	0.0009	0.001
Cd-4.7 Cg4.7 R4.8 epoch30.pt	0.0001	0.0001	0.0001
Cd-4.0 Cg0.2 R1.6 epoch30.pt	0.0002	0.0002	0.0002
Cd-1.3 Cg3.0 R0.8 epoch30.pt	0.0013	0.0012	0.0012
Cd-3.5 Cg1.8 R2.3 epoch30.pt	0.0017	0.0016	0.0017
Cd-4.5 Cg3.4 R2.2 epoch30.pt	0.0	0.0	0.0
Cd-2.4 Cg3.0 R0.2 epoch30.pt	0.0002	0.0002	0.0002
Cd-0.8 Cg1.1 R0.9 epoch30.pt	0.0022	0.002	0.0021


In [22]:
# Cono space, eval deno, lower is better
for model_name, model in C_super.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=True, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')

pretrained superset	0.576	0.565	0.5704
pretrained	0.1763	0.1721	0.1742
Cd-2.4 Cg2.2 R1.5 epoch30.pt	0.1718	0.1692	0.1705
Cd-2.4 Cg2.7 R0.9 epoch30.pt	0.1785	0.1735	0.1759
Cd-0.7 Cg3.0 R3.5 epoch30.pt	0.1899	0.1796	0.1846
Cd-4.8 Cg4.5 R1.3 epoch30.pt	0.1709	0.1676	0.1692
Cd-0.3 Cg4.5 R3.0 epoch30.pt	0.1726	0.1701	0.1713
Cd-4.7 Cg4.7 R4.8 epoch30.pt	0.1766	0.1702	0.1733
Cd-4.0 Cg0.2 R1.6 epoch30.pt	0.1729	0.1664	0.1696
Cd-1.3 Cg3.0 R0.8 epoch30.pt	0.1692	0.1693	0.1692
Cd-3.5 Cg1.8 R2.3 epoch30.pt	0.1754	0.173	0.1742
Cd-4.5 Cg3.4 R2.2 epoch30.pt	0.182	0.1795	0.1807
Cd-2.4 Cg3.0 R0.2 epoch30.pt	0.1827	0.1799	0.1813
Cd-0.8 Cg1.1 R0.9 epoch30.pt	0.1751	0.1713	0.1732


In [None]:
# homogeneity, completeness, v_measure = np.around(
#     homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
# print(homogeneity, completeness, v_measure, sep='\t')