In [1]:
import random
import os

import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from tqdm import tqdm

from evaluations.intrinsic_eval import cherry_words, generic_words
from decomposer import Decomposer, DecomposerConfig

# Seed every RNG the notebook draws from so a Restart & Run All is repeatable.
# scikit-learn estimators (TSNE, KMeans, PCA) fall back to NumPy's global RNG
# when no random_state is passed, so NumPy has to be seeded too.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
sns.set()

DEVICE = 'cpu'
# The checkpoint is a dict; its 'model' entry is the pretrained model object,
# which exposes `word_to_id` and `grounding` (both used by the cells below).
PE = torch.load(
    '../../results/pretrained/init.pt', map_location=DEVICE)['model']
# Per-word metadata; entries are read with the keys 'freq', 'R_ratio' and
# 'majority_deno' further down.
GD = PE.grounding

In [45]:
def load(path):
    """Return the embedding matrix stored in the checkpoint at ``path``.

    The checkpoint is read onto ``DEVICE``. Its ``'model'`` entry must expose
    ``embedding.weight``, which is detached from the autograd graph and
    returned as a NumPy array.
    """
    checkpoint = torch.load(path, map_location=DEVICE)
    weights = checkpoint['model'].embedding.weight
    return weights.detach().numpy()

def gather(words):
    """Look up ids and grounding statistics for ``words``.

    Returns a 4-tuple of lists, each aligned with ``words``:
    ``(word_ids, freq, skew, maj_deno)`` where the ids come from
    ``PE.word_to_id`` and the rest from the ``'freq'``, ``'R_ratio'`` and
    ``'majority_deno'`` fields of ``GD``. A word missing from either mapping
    raises ``KeyError``.
    """
    def grounding_column(key):
        return [GD[word][key] for word in words]

    word_ids = [PE.word_to_id[word] for word in words]
    return (
        word_ids,
        grounding_column('freq'),
        grounding_column('R_ratio'),
        grounding_column('majority_deno'),
    )

def plot(coordinates, words, freq, skew, path):
    """Save an annotated scatter plot with a continuous colour scale.

    Parameters
    ----------
    coordinates : 2-D array; column 0 is used as x and column 1 as y.
    words : labels, one per row of ``coordinates``, drawn next to each point.
    freq : per-point values mapped to marker size (100 to 1000).
    skew : per-point values mapped to colour with the 'coolwarm' palette.
    path : output file, written at 300 dpi.

    The figure is always closed, even if drawing or saving fails, so that
    batch plotting does not accumulate open figures.
    """
    fig, ax = plt.subplots(figsize=(15,10))
    try:
        # x/y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        sns.scatterplot(
            x=coordinates[:,0], y=coordinates[:,1],
            hue=skew, palette='coolwarm',  # hue_norm=(0, 1),
            size=freq, sizes=(100, 1000),
            legend=None, ax=ax)
        for coord, word in zip(coordinates, words):
            ax.annotate(word, coord, fontsize=12)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        plt.close(fig)

def plot_categorical(coordinates, words, freq, skew, path):
    """Save an annotated scatter plot coloured by category, with a legend.

    Parameters
    ----------
    coordinates : 2-D array; column 0 is used as x and column 1 as y.
    words : labels, one per row of ``coordinates``, drawn next to each point.
    freq : per-point values mapped to marker size (100 to 1000).
    skew : per-point values mapped to colour with the 'muted' palette
        (callers in this notebook pass the majority-topic labels here).
    path : output file, written at 300 dpi.

    The axes are shrunk to 60% of their width so the legend fits to the right
    of the plot. The figure is always closed, even if drawing or saving fails.
    """
    fig, ax = plt.subplots(figsize=(20,10))
    try:
        # x/y are passed by keyword: recent seaborn releases no longer accept
        # them positionally.
        sns.scatterplot(
            x=coordinates[:,0], y=coordinates[:,1],
            hue=skew, palette='muted', hue_norm=(0, 1),
            size=freq, sizes=(100, 1000),
            legend='brief', ax=ax)
        chartBox = ax.get_position()
        ax.set_position(
            [chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
        ax.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), ncol=1)
        for coord, word in zip(coordinates, words):
            ax.annotate(word, coord, fontsize=12)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        plt.close(fig)
    
def load_en_masse(in_dir, endswith):
    """Load every checkpoint under ``in_dir`` whose file name ends with ``endswith``.

    Walks ``in_dir`` recursively and returns a dict mapping a display name to
    the embedding matrix returned by ``load``. The display name is the
    checkpoint's path relative to ``in_dir`` with path separators replaced by
    spaces (e.g. ``'E1 A1 epoch100.pt'``). The collected names are printed,
    one per line.
    """
    models = {}
    for dirpath, _, filenames in tqdm(os.walk(in_dir)):
        for file in filenames:
            if file.endswith(endswith):
                path = os.path.join(dirpath, file)
                # relpath removes the directory prefix. str.lstrip(in_dir)
                # would instead strip any leading characters that occur
                # anywhere in in_dir, corrupting some names.
                relative = os.path.relpath(path, in_dir)
                name = relative.replace(os.sep, ' ')
                models[name] = load(path)
    print(*models.keys(), sep='\n')
    return models
    
def graph_en_masse(
        models,
        out_dir,
        reduction,  #  'PCA', 'TSNE', or 'both'
        word_ids,
        words,
        hues,
        sizes,
        perplexity=None,
        categorical=False,
        random_state=None):
    """Project selected word vectors to 2-D for every model and save one plot each.

    Parameters
    ----------
    models : dict mapping a model name to an embedding matrix that can be
        indexed by ``word_ids``.
    out_dir : directory for the images (created if missing); each plot is
        saved as ``<model name>.png``.
    reduction : 'PCA' (2 components), 'TSNE', or 'both' (PCA to 30 components
        followed by t-SNE).
    word_ids : row indices of the words to plot.
    words : labels annotated on the points, aligned with ``word_ids``.
    hues : per-word values mapped to colour.
    sizes : per-word values mapped to marker size.
    perplexity : t-SNE perplexity; required for 'TSNE' and 'both', ignored
        for 'PCA'.
    categorical : if true, draw with ``plot_categorical`` instead of ``plot``.
    random_state : forwarded to PCA and TSNE; the default ``None`` keeps the
        estimators' own default behaviour.

    Raises
    ------
    ValueError
        If ``reduction`` is not one of the supported names, or if a t-SNE
        based reduction is requested without ``perplexity``. Both are checked
        before any directory is created or any model is processed.
    """
    if reduction not in ('PCA', 'TSNE', 'both'):
        raise ValueError('unknown dimension reduction method')
    if reduction != 'PCA' and perplexity is None:
        raise ValueError(
            f'perplexity is required when reduction is {reduction!r}')

    os.makedirs(out_dir, exist_ok=True)
    draw = plot_categorical if categorical else plot
    for model_name, embed in tqdm(models.items()):
        space = embed[word_ids]
        if reduction == 'PCA':
            visual = PCA(
                n_components=2, random_state=random_state).fit_transform(space)
        else:
            if reduction == 'both':
                # Compress with PCA before t-SNE
                space = PCA(
                    n_components=30,
                    random_state=random_state).fit_transform(space)
            # NOTE(review): newer scikit-learn releases rename `n_iter` to
            # `max_iter` -- confirm against the installed version.
            visual = TSNE(
                perplexity=perplexity, learning_rate=10,
                n_iter=5000, n_iter_without_progress=1000,
                random_state=random_state).fit_transform(space)
        draw(visual, words, sizes, hues,
             os.path.join(out_dir, f'{model_name}.png'))

In [3]:
# Ids, frequencies, party skews and majority topics for the two curated lists
ch_ids, ch_freq, ch_skew, ch_deno = gather(cherry_words)
gen_ids, gen_freq, gen_skew, gen_deno = gather(generic_words)

# 50 words sampled from the vocabulary entries whose grounding 'freq' exceeds 99
random_words = random.sample(
    [w for w in PE.word_to_id.keys() if GD[w]['freq'] > 99],
    50)
rand_ids, rand_freq, rand_skew, rand_deno = gather(random_words)

In [5]:
# Vocabulary words with freq > 99 whose R_ratio is above 0.75
R_words = [w for w in PE.word_to_id.keys()
             if GD[w]['freq'] > 99 and GD[w]['R_ratio'] > 0.75]
# Drop three hand-picked words; list.remove raises ValueError if one is absent
R_words.remove('federal_debt_stood')  # outliers in clustering graphs
R_words.remove('statements_relating')
R_words.remove('legislative_days_within')
print(len(R_words))
# Optional down-sampling to 50 words, currently disabled:
# R_words = random.sample(R_words, 50)
R_ids, R_freq, R_skew, R_deno = gather(R_words)

51


In [6]:
# Automatic selection (mirror of the R_words filter), currently disabled in
# favour of the hand-picked list below:
# D_words = [w for w in PE.word_to_id.keys()
#            if GD[w]['freq'] > 99 and GD[w]['R_ratio'] < 0.25]

# Hand-picked candidate words; every one must be a key of PE.word_to_id and GD,
# otherwise gather() below raises KeyError.
D_words = ['war_in_iraq', 'unemployed', 'detainees', 'solar', 
    'wealthiest', 'minorities', 'gun_violence', 
    'amtrak', 'unemployment_benefits', 
    'citizens_united', 'mayors', 'prosecutor', 'working_families', 
    'cpsc', 'sexual_assault',
    'affordable_housing', 'vietnam_veterans', 'drug_companies', 'handguns',
    'hungry', 'college_education', 
    'main_street', 'trauma', 'simon', 'pandemic', 
    'reagan_administration', 'guns', 
    'million_jobs', 'airline_industry', 'mergers', 'blacks', 
    'industrial_base', 'unemployment_insurance',
    'vacancies', 'trade_deficit', 'lost_their_jobs', 'food_safety', 
    'darfur', 'trains', 'deportation', 'credit_cards', 
    'surface_transportation', 'solar_energy', 'ecosystems', 'layoffs', 
    'wall_street', 'steelworkers', 'puerto_rico', 'hunger', 
    'child_support', 'naacp', 'domestic_violence', 'seaports', 
    'hate_crimes', 'underfunded', 'registrants', 'sanctuary', 
    'coastal_zone_management', 'vermonters', 'automakers', 
    'violence_against_women', 'unemployment_rate', 
    'select_committee_on_indian_affairs', 'judicial_nominees', 
    'school_construction', 'clarence_mitchell', 'confidential', 
    'domain_name', 'community_development', 'pell_grant', 'asylum', 'vawa', 
    'somalia', 'african_american', 'traders', 'jersey', 'fdic', 'shameful', 
    'homelessness', 'african_americans', 'payroll_tax',]
# Further candidates, currently excluded from the list:
#     'retraining', 'unemployed_workers', 'the_disclose_act', 'baltimore', 
#     'assault_weapons', 'credit_card', 'the_patriot_act', 'young_woman', 
#     'trades', 'aye', 'poisoning', 'police_officers', 'mammal', 'toys', 
#     'whistleblowers', 'north_dakota', 'californias', 'computer_crime', 
#     'explosives', 'fast_track', 'bus', 'redlining', 'seclusion', 'gender', 
#     'hawaiian', 'pay_discrimination', 'ledbetter', 'phd', 'supra', 'baggage', 
#     'las_vegas', 'the_voting_rights_act', 'enron', 'richest', 'vra', 'chip', 
#     'tax_break', 'the_usa_patriot_act', 'advance_notice', 'derivatives', 
#     'the_patients_bill_of_rights', 'shelf', 'divestment', 'sa', 
#     'submitted_an_amendment', 'bill_hr', 'first_responders',
#     'unemployment_compensation', 'tax_breaks', 'carbon', 
#     'college_cost_reduction', 'clean_energy', 'waives', 
#     'unregulated', 'taa', 'truman', 'lesbian', 'coupons', 
#     'large_numbers', 'anonymous', 'whites', 'logging']

# Prints the size of the full candidate list (before down-sampling)
print(len(D_words))
# Keep a random subset of 50; the draw depends on the state of the `random`
# module's global RNG, i.e. on the seed and on every earlier random call.
D_words = random.sample(D_words, 50)
D_ids, D_freq, D_skew, D_deno = gather(D_words)

81


In [11]:
# Joint word set: the sampled D words first, followed by the R words.
# All parallel lists keep that same order.
J_words = [*D_words, *R_words]
J_ids = [*D_ids, *R_ids]
J_freq = [*D_freq, *R_freq]
J_skew = [*D_skew, *R_skew]
J_deno = [*D_deno, *R_deno]
# Binary label derived from the skew: 0 below 0.5, otherwise 1
J_cono = [
    0 if ratio < 0.5 else 1
    for ratio in J_skew
]

In [None]:
# Ad-hoc inspection of a single word's grounding entry
# (raises KeyError if 'joliet' is not a key of GD).
GD['joliet']

In [43]:
# Experiment directory to visualise; alternative runs are kept commented out.
# base_dir = '../../results/only remove deno BS128'
# base_dir = '../../results/cono space remove deno/subset pretrained'
base_dir = '../../results/deno space remove cono/superset pretrained'
# Every checkpoint under base_dir whose file name ends with 'epoch100.pt'
models = load_en_masse(base_dir, endswith='epoch100.pt')
# Add the two pretrained embeddings as baselines for comparison
models['pretrained superset'] = load('../../results/pretrained/init.pt')
models['pretrained'] = load('../../results/pretrained bill mentions/init.pt')

5it [00:01,  2.99it/s]


E1 A1 epoch100.pt
E4 A1 epoch100.pt


### Graph by Party Skew (for removing connotation)

In [None]:
# R words coloured by party skew: PCA, then t-SNE at perplexity 5 and 3.
# Each row is (output sub-directory, reduction method, perplexity).
for subdir, method, perp in (
        ('PCA', 'PCA', None),
        ('t-SNE p5', 'TSNE', 5),
        ('t-SNE p3', 'TSNE', 3)):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/{subdir}',
        reduction=method,
        perplexity=perp,
        word_ids=R_ids,
        words=R_words,
        hues=R_skew,
        sizes=R_freq,
    )

In [42]:
# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/PCA',
#     reduction='PCA',
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/t-SNE p5',
#     reduction='TSNE',
#     perplexity=5,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/t-SNE p3',
#     reduction='TSNE',
#     perplexity=3,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_skew,
#     sizes=J_freq,
# )

# Joint word set coloured by party skew, t-SNE at perplexity 25 and then 50
for perp in (25, 50):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Joint/t-SNE p{perp}',
        reduction='TSNE',
        perplexity=perp,
        word_ids=J_ids,
        words=J_words,
        hues=J_skew,
        sizes=J_freq,
    )

100%|██████████| 4/4 [00:07<00:00,  1.89s/it]
100%|██████████| 4/4 [00:07<00:00,  1.94s/it]


### Graph by Topic Denotation (for removing denotation)

In [None]:
# R words coloured by majority topic: PCA, then t-SNE at perplexity 5 and 3.
# This cell previously referenced GOP_ids / GOP_words / GOP_deno / GOP_freq,
# which are never defined in this notebook; the R_* variables built from
# R_words are the ones that exist. Output folder names are kept unchanged.
# Each row is (output sub-directory, reduction method, perplexity);
# perplexity is ignored by the PCA reduction.
for subdir, method, perp in (
        ('PCA', 'PCA', 5),
        ('t-SNE p5', 'TSNE', 5),
        ('t-SNE p3', 'TSNE', 3)):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Highly GOP/{subdir}',
        reduction=method,
        perplexity=perp,
        word_ids=R_ids,
        words=R_words,
        hues=R_deno,
        sizes=R_freq,
        categorical=True
    )

In [None]:
# D words coloured by majority topic: PCA, then t-SNE at perplexity 5 and 3.
# Each row is (output sub-directory, reduction method, perplexity);
# perplexity is ignored by the PCA reduction.
for subdir, method, perp in (
        ('PCA', 'PCA', 5),
        ('t-SNE p5', 'TSNE', 5),
        ('t-SNE p3', 'TSNE', 3)):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Highly Dem/{subdir}',
        reduction=method,
        perplexity=perp,
        word_ids=D_ids,
        words=D_words,
        hues=D_deno,
        sizes=D_freq,
        categorical=True
    )

In [44]:
# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/PCA',
#     reduction='PCA',
#     perplexity=5,
#     word_ids=J_ids,
#     words=J_words,
#     hues=J_deno,
#     sizes=J_freq,
#     categorical=True
# )

# Joint word set coloured by majority topic, t-SNE at perplexity 5 and then 3
for perp in (5, 3):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Joint/t-SNE p{perp}',
        reduction='TSNE',
        perplexity=perp,
        word_ids=J_ids,
        words=J_words,
        hues=J_deno,
        sizes=J_freq,
        categorical=True
    )

100%|██████████| 4/4 [00:07<00:00,  1.84s/it]
100%|██████████| 4/4 [00:11<00:00,  2.82s/it]
100%|██████████| 4/4 [00:11<00:00,  2.78s/it]


## Graph Recomposers
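Goal: show that the recomposer separates the two kinds of information.

- For the denotation (deno) vectors, words should cluster by topic better than they do in the pretrained embeddings.

- For the connotation (cono) vectors, words should cluster by party skew better than they do in the pretrained embeddings.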

In [63]:
def load_recomposer(path):
    """Return ``(D_embed, C_embed)`` from the recomposer checkpoint at ``path``.

    The checkpoint is read onto ``DEVICE``. Its ``'model'`` entry must expose
    ``deno_decomposer`` and ``cono_decomposer``, each with an
    ``embedding.weight``; both weights are detached and returned as NumPy
    arrays, denotation first.
    """
    recomposer = torch.load(path, map_location=DEVICE)['model']
    deno_weight = recomposer.deno_decomposer.embedding.weight
    cono_weight = recomposer.cono_decomposer.embedding.weight
    return deno_weight.detach().numpy(), cono_weight.detach().numpy()

def load_recomposers_en_masse(in_dir, endswith):
    """Load the denotation and connotation embeddings of every recomposer run.

    Walks ``in_dir`` recursively; every file whose name ends with ``endswith``
    is loaded with ``load_recomposer``. Returns ``(D_models, C_models)``, two
    dicts mapping a display name to an embedding matrix. Both dicts start
    with the two pretrained baselines ('pretrained superset', 'pretrained').

    The display names are derived from the checkpoint path relative to
    ``in_dir``, split on whitespace/path separators. For a path such as
    ``'Dd0.9 Dg-3.5 Cd-2.4 Cg2.2 R1.5/epoch100.pt'`` the denotation model is
    named from tokens 0-1 and 4 onward, the connotation model from tokens 2
    onward. The token list of each loaded run is printed.
    """
    # Each baseline checkpoint is read once. The two dicts reference the same
    # arrays, which are only read (indexed) by the plotting code.
    baselines = {
        'pretrained superset': load('../../results/pretrained/init.pt'),
        'pretrained': load('../../results/pretrained bill mentions/init.pt')}
    D_models = dict(baselines)
    C_models = dict(baselines)
    for dirpath, _, filenames in os.walk(in_dir):
        for file in filenames:
            if file.endswith(endswith):
                path = os.path.join(dirpath, file)
                # relpath removes the directory prefix. str.lstrip(in_dir)
                # would instead strip any leading characters that occur
                # anywhere in in_dir, corrupting some names.
                name = os.path.relpath(path, in_dir).replace(os.sep, ' ')
                D_embed, C_embed = load_recomposer(path)
                # Brittle hack: relies on the run directory being named with
                # exactly two denotation tokens followed by two connotation
                # tokens, then the shared tokens.
                name = name.split()
                D_name = ' '.join(name[0:2] + name[4:])
                C_name = ' '.join(name[2:])
                D_models[D_name] = D_embed
                C_models[C_name] = C_embed
                print(name)
    return D_models, C_models

In [64]:
# From here on base_dir points at the recomposer runs (it replaces the value
# set for the earlier decomposer plots).
base_dir = '../../results/recomposer/superset pretrained'
# Denotation and connotation embeddings of every 'epoch100.pt' checkpoint,
# plus the pretrained baselines
D_models, C_models = load_recomposers_en_masse(base_dir, endswith='epoch100.pt')



['Dd0.9', 'Dg-3.5', 'Cd-2.4', 'Cg2.2', 'R1.5', 'epoch100.pt']
['Dd0.8', 'Dg-4.7', 'Cd-0.7', 'Cg3.0', 'R3.5', 'epoch100.pt']
['Dd3.0', 'Dg-4.1', 'Cd-4.7', 'Cg4.7', 'R4.8', 'epoch100.pt']
['Dd1.9', 'Dg-0.2', 'Cd-1.3', 'Cg3.0', 'R0.8', 'epoch100.pt']
['Dd3.1', 'Dg-4.3', 'Cd-3.5', 'Cg1.8', 'R2.3', 'epoch100.pt']
['Dd4.0', 'Dg-3.5', 'Cd-4.5', 'Cg3.4', 'R2.2', 'epoch100.pt']
['Dd3.9', 'Dg-4.0', 'Cd-2.4', 'Cg3.0', 'R0.2', 'epoch100.pt']
['Dd0.1', 'Dg-0.2', 'Cd-0.8', 'Cg1.1', 'R0.9', 'epoch100.pt']


In [65]:
 # Evaluating Denotation
models = D_models

# Joint word set coloured by majority topic in the denotation spaces,
# t-SNE at perplexity 5, 3 and 10 (in that order)
for perp in (5, 3, 10):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Joint/topic/t-SNE p{perp}',
        reduction='TSNE',
        perplexity=perp,
        word_ids=J_ids,
        words=J_words,
        hues=J_deno,
        sizes=J_freq,
        categorical=True
    )


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:02<00:24,  2.71s/it][A
 20%|██        | 2/10 [00:05<00:21,  2.68s/it][A
 30%|███       | 3/10 [00:07<00:18,  2.65s/it][A
 40%|████      | 4/10 [00:10<00:16,  2.69s/it][A
 50%|█████     | 5/10 [00:13<00:12,  2.60s/it][A
 60%|██████    | 6/10 [00:15<00:10,  2.56s/it][A
 70%|███████   | 7/10 [00:17<00:07,  2.50s/it][A
 80%|████████  | 8/10 [00:20<00:04,  2.49s/it][A
 90%|█████████ | 9/10 [00:22<00:02,  2.52s/it][A
100%|██████████| 10/10 [00:25<00:00,  2.54s/it][A

  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:03<00:27,  3.09s/it][A
 20%|██        | 2/10 [00:06<00:24,  3.11s/it][A
 30%|███       | 3/10 [00:09<00:21,  3.07s/it][A
 40%|████      | 4/10 [00:12<00:17,  3.00s/it][A
 50%|█████     | 5/10 [00:15<00:15,  3.06s/it][A
 60%|██████    | 6/10 [00:18<00:11,  3.00s/it][A
 70%|███████   | 7/10 [00:20<00:08,  2.83s/it][A
 80%|████████  | 8/10 [00:23<00:05,  2.79s/it][A
 90%|████████

In [66]:
# Evaluating Connotation
models = C_models

# Joint word set coloured by party skew in the connotation spaces,
# t-SNE at perplexity 25 and then 50
for perp in (25, 50):
    graph_en_masse(
        models,
        out_dir=f'{base_dir}/Joint/party/t-SNE p{perp}',
        reduction='TSNE',
        perplexity=perp,
        word_ids=J_ids,
        words=J_words,
        hues=J_skew,
        sizes=J_freq,
    )


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:02<00:18,  2.10s/it][A
 20%|██        | 2/10 [00:04<00:16,  2.05s/it][A
 30%|███       | 3/10 [00:06<00:14,  2.07s/it][A
 40%|████      | 4/10 [00:08<00:12,  2.05s/it][A
 50%|█████     | 5/10 [00:10<00:10,  2.04s/it][A
 60%|██████    | 6/10 [00:12<00:08,  2.05s/it][A
 70%|███████   | 7/10 [00:14<00:06,  2.19s/it][A
 80%|████████  | 8/10 [00:16<00:04,  2.14s/it][A
 90%|█████████ | 9/10 [00:19<00:02,  2.22s/it][A
100%|██████████| 10/10 [00:21<00:00,  2.12s/it][A

  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:01<00:17,  1.89s/it][A
 20%|██        | 2/10 [00:03<00:15,  1.88s/it][A
 30%|███       | 3/10 [00:05<00:13,  1.88s/it][A
 40%|████      | 4/10 [00:07<00:11,  1.93s/it][A
 50%|█████     | 5/10 [00:09<00:10,  2.00s/it][A
 60%|██████    | 6/10 [00:11<00:07,  1.99s/it][A
 70%|███████   | 7/10 [00:13<00:05,  1.98s/it][A
 80%|████████  | 8/10 [00:16<00:04,  2.09s/it][A
 90%|████████

# Clustering + Homogeneity V-Measure

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_completeness_v_measure

In [20]:
# Which models are currently bound to `models`? The name is reassigned in
# several cells above, so the answer depends on which of them ran last.
models.keys()

dict_keys(['E4 A1 superset epoch200.pt', 'E1 A1 superset epoch200.pt', 'pretrained superset', 'pretrained'])

In [35]:

# For each model: k-means on the joint-word vectors, scored against the
# majority-topic labels. Prints name, homogeneity, completeness and V-measure
# (rounded to 4 decimals, tab-separated).
for model_name, model in models.items():
    embed = model[J_ids]
    # NOTE(review): 41 is presumably the number of distinct topics -- confirm
    Deno_Space = KMeans(n_clusters=41)
    Deno_Space.fit(embed)
    pred_labels = Deno_Space.predict(embed)
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(J_deno, pred_labels),
        4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')


E1 A1 epoch100.pt	0.6428	0.5346	0.5838
pretrained superset	0.7669	0.6505	0.7039
pretrained	0.6125	0.5461	0.5774


In [22]:
# Two-cluster k-means on the 'pretrained superset' vectors of the joint words,
# scored against the binary skew labels in J_cono
embed = models['pretrained superset'][J_ids]
Cono_Space = KMeans(n_clusters=2)
Cono_Space.fit(embed)
pred_labels = Cono_Space.predict(embed)
homogeneity, completeness, v_measure = np.around(
    homogeneity_completeness_v_measure(J_cono, pred_labels),
    4)
print(homogeneity, completeness, v_measure, sep='\t')

0.0829	0.2076	0.1185


In [23]:
# Cluster index assigned to each joint word by the most recent k-means run
pred_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [24]:
# R_ratio of every joint word, D words first and then R words
J_skew

[0.24267782426778242,
 0.21937842778793418,
 0.16216216216216217,
 0.21495327102803738,
 0.21782178217821782,
 0.2392638036809816,
 0.20078740157480315,
 0.19637462235649547,
 0.24217462932454695,
 0.22916666666666666,
 0.24,
 0.17692307692307693,
 0.2231404958677686,
 0.23529411764705882,
 0.18181818181818182,
 0.22321428571428573,
 0.1342281879194631,
 0.1415525114155251,
 0.11382113821138211,
 0.23780487804878048,
 0.19597989949748743,
 0.23921568627450981,
 0.16363636363636364,
 0.23931623931623933,
 0.21666666666666667,
 0.24742268041237114,
 0.18543046357615894,
 0.20909090909090908,
 0.23562152133580705,
 0.192,
 0.23636363636363636,
 0.18333333333333332,
 0.1650485436893204,
 0.18691588785046728,
 0.03225806451612903,
 0.22764227642276422,
 0.13970588235294118,
 0.09,
 0.23076923076923078,
 0.22448979591836735,
 0.1885245901639344,
 0.1888111888111888,
 0.1893939393939394,
 0.21238938053097345,
 0.23148148148148148,
 0.13679245283018868,
 0.22053231939163498,
 0.203703703703703