In [1]:
from pathlib import Path
from typing import Tuple, Union, List, Dict, Iterable, Optional

import torch
import numpy as np
from tqdm.notebook import tqdm

from decomposer import Decomposer, DecomposerConfig
from recomposer import Recomposer, RecomposerConfig
# from evaluations.helpers import GroundedWord, load_recomposers_en_masse
# from evaluations.clustering import graph_en_masse
# from evaluations.euphemism import cherry_words

In [2]:
from dataclasses import dataclass

# Root of the experiment outputs; edit this one constant to run elsewhere.
BASE_DIR = Path.home() / 'Research/congressional_adversary/results'

# Alternative pretrained checkpoints (swap the path below to use one):
#   BASE_DIR / 'SGNS deno/pretrained super large/init.pt'
#   BASE_DIR / 'news/validation/backup PE/init.pt'
#   BASE_DIR / 'news/train/pretrained/init.pt'
# NOTE(review): torch.load unpickles arbitrary Python objects -- only point
# it at checkpoints you produced yourself.
# map_location='cpu' matches load() further down and lets a checkpoint saved
# from a GPU run open on a machine without CUDA.
sup_PE = torch.load(
    BASE_DIR / 'news/validation/pretrained/init.pt',
    map_location='cpu')['model']
# Vocabulary lookups and grounding data are taken from the model object.
WTI = sup_PE.word_to_id
ITW = sup_PE.id_to_word
grounding = sup_PE.cono_grounding

def GD(query):
    """Print one tab-separated line of grounding statistics for `query`.

    The line holds the word, the L1-normalized `grounding` row for that word
    (each value rounded to 4 places), and the raw row values cast to int.
    Looks the row up through the notebook-level `WTI` and `grounding`.
    """
    counts = grounding[WTI[query]]
    shares = torch.nn.functional.normalize(counts, dim=0, p=1)

    share_text = ''.join(f'{round(share, 4)}, ' for share in shares.tolist())
    count_text = ''.join(f'{int(count)}, ' for count in counts.tolist())
    print(f'{query}\t{share_text}\t{count_text}')
    
    

# From here on `sup_PE` is the pretrained embedding matrix as a NumPy array,
# no longer the model object loaded above (WTI, ITW and grounding were read
# from the model before this rebinding).
sup_PE = sup_PE.embedding.weight.detach().cpu().numpy()
print(f'Vocab size = {len(WTI):,}')


# sub_PE = torch.load(BASE_DIR / 'bill topic/pretrained subset/init.pt')['model']
# sub_PE_WID = sub_PE.word_to_id
# sub_PE_GD = sub_PE.grounding
# del sub_PE


@dataclass
class GroundedWord:
    """A vocabulary word together with its row index in the embedding matrices.

    Attributes:
        word: The surface form; must be a key of the notebook-level `WTI`.
        word_id: `WTI[word]`, filled in by `__post_init__`.

    Raises:
        KeyError: on construction, if `word` is not in `WTI`.
    """
    word: str

    def __post_init__(self) -> None:
        # graph_en_masse() indexes the embedding matrices with `w.word_id`,
        # so every instance has to carry it.
        self.word_id: int = WTI[self.word]
        # TODO: `freq`, `R_ratio` and `majority_deno` (read by
        # plot_categorical) used to be copied from `sub_PE_GD`, which is no
        # longer loaded in this notebook:
        #     metadata = sub_PE_GD[self.word]
        #     self.freq: int = metadata['freq']
        #     self.R_ratio: float = metadata['R_ratio']
        #     self.majority_deno: int = metadata['majority_deno']

    def __str__(self) -> str:
        # Show every instance attribute, including those set after __init__.
        return str(vars(self))
    
    
# capitalism: List[GroundedWord] = []
# socialism: List[GroundedWord] = []
# for word in sub_PE_WID.keys():
#     ratio = sub_PE_GD[word]['R_ratio']
#     freq = sub_PE_GD[word]['freq']
#     word = GroundedWord(word)
#     if ratio < 0.2 and freq > 100:  # 0.2:
#         socialism.append(word)
#     elif ratio > 0.8 and freq > 100:  # 0.8:
#         capitalism.append(word)

# print(
#     f'{len(capitalism)} capitalists\n'
#     f'{len(socialism)} socialists')
# polarization = capitalism + socialism

Vocab size = 138,441


In [4]:
from scipy.spatial.distance import cosine as cos_dist
import editdistance

def vec(query: str, embed: np.ndarray) -> np.ndarray:
    """Return the row of `embed` that belongs to `query`.

    Args:
        query: Word to look up in the notebook-level `WTI` mapping.
        embed: Embedding matrix indexed by word id.

    Raises:
        KeyError: if `query` is not in `WTI`.
    """
    try:
        query_id = WTI[query]
    except KeyError:
        # `from None` drops the redundant inner KeyError from the traceback.
        raise KeyError(f'Out of vocabulary: {query}') from None
    return embed[query_id]


def nearest_neighbors(
        query: str,
        embed: np.ndarray,
        top_k: int = 10
        ) -> None:
    """Print the `top_k` words whose vectors are most cosine-similar to `query`.

    Each output line is '<similarity to 4 places>\\t<word>'; a blank line
    follows the list. Words within edit distance 2 of `query` (which includes
    the query itself) are skipped.

    Args:
        query: Word to look up; must be in the notebook-level vocabulary.
        embed: Embedding matrix with one row per word id.
        top_k: Number of neighbors to print.

    Raises:
        KeyError: if `query` is out of vocabulary (raised by `vec`).
    """
    query_vec = vec(query, embed)
    # One vectorized pass over the matrix instead of one scipy call per row,
    # which is very slow for a vocabulary of this size.
    norms = np.linalg.norm(embed, axis=1) * np.linalg.norm(query_vec)
    with np.errstate(divide='ignore', invalid='ignore'):
        similarities = (embed @ query_vec) / norms
    # Most similar first; NaN entries (zero-norm rows) sort to the end.
    neighbor_indices = np.argsort(-similarities)
    num_neighbors = 0
    for neighbor_id in neighbor_indices:
        if num_neighbors == top_k:
            break
        neighbor_word = ITW[neighbor_id]
        # Filters the query itself and near-identical spellings.
        if editdistance.eval(query, neighbor_word) < 3:
            continue
        num_neighbors += 1
        print(f'{similarities[neighbor_id]:.4f}\t{neighbor_word}')
    print()

In [5]:
def get_embed(model: Decomposer) -> np.ndarray:
    """Return `model.embedding.weight` as a detached NumPy array on the CPU."""
    weights = model.embedding.weight
    detached = weights.detach()
    return detached.cpu().numpy()


def load(
        path: Path,
        match_vocab: bool = False,
        device: str = 'cpu'
        ) -> Optional[np.ndarray]:
    """Load a checkpoint and return its embedding matrix.

    Args:
        path: Checkpoint file; the model is read from its 'model' entry.
        match_vocab: If True, a vocabulary that differs from the
            notebook-level `WTI` is an error; otherwise it is skipped.
        device: Passed to `torch.load` as `map_location`.

    Returns:
        The embedding weights as a NumPy array, or None when the vocabulary
        differs from `WTI` and `match_vocab` is False.

    Raises:
        RuntimeError: on a vocabulary mismatch when `match_vocab` is True.
    """
    # NOTE(review): torch.load unpickles arbitrary objects -- only load
    # checkpoints you trust. Newer PyTorch releases may also need
    # `weights_only=False` to unpickle a whole model object; confirm against
    # the installed version before adding it.
    model = torch.load(path, map_location=device)['model']
    # Explicit comparison rather than `assert` inside try/except: asserts are
    # stripped under `python -O`, which would silently accept a mismatch.
    if not (model.word_to_id == WTI):
        print(f'Vocabulary mismatch: {path}')
        print(f'Vocab size = {len(model.word_to_id)}')
        if match_vocab:
            raise RuntimeError(f'Vocabulary mismatch: {path}')
        return None
    return get_embed(model)


def load_decomposers_en_masse(
        in_dirs: Union[Path, List[Path]],
        patterns: Union[str, List[str]]
        ) -> Dict[str, np.ndarray]:
    """Load the embedding matrix of every checkpoint matching the patterns.

    Args:
        in_dirs: One directory or a list/tuple of directories to search.
        patterns: One glob pattern or a list/tuple of patterns, each applied
            to every directory.

    Returns:
        A dict mapping '<parent dir name>/<file name>' to the embedding
        matrix. Checkpoints for which `load` returns None are left out.

    Raises:
        FileNotFoundError: if no file matches any directory/pattern pair.
    """
    if not isinstance(in_dirs, (list, tuple)):
        in_dirs = [in_dirs]
    if not isinstance(patterns, (list, tuple)):
        patterns = [patterns]
    checkpoints: List[Path] = []
    for in_dir in in_dirs:
        for pattern in patterns:
            # Sorted so the result order is the same on every run;
            # Path.glob yields matches in arbitrary order.
            checkpoints.extend(sorted(Path(in_dir).glob(pattern)))
    if not checkpoints:
        raise FileNotFoundError('No model with path pattern found at in_dir?')

    # Optional baselines that used to be seeded here:
    #   'pretrained superset': load(BASE_DIR / 'bill topic/pretrained superset/init.pt')
    #   'pretrained subset': load(BASE_DIR / 'bill topic/pretrained subset/init.pt')
    models: Dict[str, np.ndarray] = {}
    for path in tqdm(checkpoints):
        tqdm.write(f'Loading {path}')
        embed = load(path)
        if embed is None:
            continue
        # NOTE: two input directories holding identically named run folders
        # map to the same key; the later one overwrites the earlier.
        name = path.parent.name + '/' + path.name
        models[name] = embed
    return models

In [6]:
# base_dir = Path('../../results/SGNS deno/sans recomposer')
# deno_space = load_decomposers_en_masse(base_dir, patterns='*/epoch10.pt')

# NOTE(review): this path is relative to the kernel's working directory,
# whereas BASE_DIR is anchored at the home directory -- confirm both point at
# the same results tree.
base_dir = Path('../../results/news/validation')
# One embedding matrix per matching checkpoint, keyed '<run dir>/epoch2.pt'.
deno_space = load_decomposers_en_masse(base_dir, patterns='*/epoch2.pt')
# deno_space.update(load_decomposers_en_masse(base_dir, patterns='*/epoch3.pt'))

print(deno_space.keys())

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

Loading ../../results/news/validation/-0.01d/epoch2.pt
Loading ../../results/news/validation/-3c/epoch2.pt
Loading ../../results/news/validation/1d 0c/epoch2.pt

dict_keys(['-0.01d/epoch2.pt', '-3c/epoch2.pt', '1d 0c/epoch2.pt'])


In [8]:
from utils.word_similarity.all_wordsim import mean_delta, read_plain_text, compare
import math

def convert(
        embed: np.ndarray,
        id_to_word: Optional[Union[Dict[int, str], List[str]]] = None
        ) -> Dict[str, np.ndarray]:
    """Map each word to its (approximately) unit-length embedding vector.

    Args:
        embed: Embedding matrix with one row per word id. It is not modified.
        id_to_word: Lookup from row index to word; defaults to the
            notebook-level `ITW`.

    Returns:
        A dict from word to a new array holding the row divided by
        sqrt(sum of squares + 1e-6).
    """
    if id_to_word is None:
        id_to_word = ITW
    word_vecs: Dict[str, np.ndarray] = {}
    for word_id, vector in enumerate(embed):
        norm = math.sqrt((vector ** 2).sum() + 1e-6)
        # Out-of-place division: `vector` is a view into `embed`, so an
        # in-place `/=` would rescale the caller's matrix as a side effect.
        word_vecs[id_to_word[word_id]] = vector / norm
    return word_vecs

# wd = convert_to_dict(sup_PE)
# compare(wd, wd)
# Word-similarity comparison of the '-3c/epoch2.pt' space with the pretrained
# embedding.
# NOTE(review): presumably the second argument fills the "Reference" column
# of the printed table -- confirm against all_wordsim.compare.
compare(convert(deno_space['-3c/epoch2.pt']), convert(sup_PE))


             Dataset             Rho       Reference      Delta >.02
    EN-MTurk-771.txt         44.79%         52.84%          -8.04%
  EN-RW-STANFORD.txt         32.43%         45.33%         -12.90%
   EN-SIMLEX-999.txt         23.27%         32.92%          -9.65%
    EN-MTurk-287.txt         52.73%         59.11%          -6.38%
       EN-YP-130.txt         40.75%         50.72%          -9.98%
   EN-WS-353-SIM.txt         57.16%         68.51%         -11.35%
    EN-MEN-TR-3k.txt         51.83%         52.49%               𝜀
   EN-WS-353-REL.txt         41.18%         44.70%          -3.52%
     EN-VERB-143.txt         15.90%         29.87%         -13.97%
   EN-WS-353-ALL.txt         47.69%         58.15%         -10.46%
 EN-SimVerb-3500.txt         12.91%         26.25%         -13.34%
Mean Delta = -9.1131%


In [None]:
def cf(query, model_name='-3c/epoch2.pt'):
    """Print grounding statistics and nearest neighbors of `query` in two spaces.

    Args:
        query: Word to inspect.
        model_name: Key into `deno_space` selecting the trained model to
            compare with the pretrained embedding `sup_PE`.
    """
    GD(query)
    print('Pretrained:')
    nearest_neighbors(query, sup_PE)
    print('Our model:')
    nearest_neighbors(query, deno_space[model_name])

In [None]:
# Run the comparison for each term in turn.
for term in ('estate_tax', 'death_tax'):
    cf(term)

In [None]:
# Run the comparison for each term in turn.
for term in ('undocumented_workers', 'illegal_aliens', 'chain_migration'):
    cf(term)

In [None]:
# Run the comparison for each term in turn.
for term in (
        'obamacare',
        'aca',
        'public_option',
        'single_payer',
        'socialized_medicine'):
    cf(term)

In [None]:
# Run the comparison for each term in turn.
for term in ('leftists', 'antifa'):
    cf(term)

In [None]:
# Grounding statistics and nearest neighbors in both spaces for 'guns'.
cf('guns')

In [None]:
# Grounding statistics and nearest neighbors in both spaces for 'trade'.
cf('trade')

In [None]:
# Run the comparison for each term in turn.
for term in ('crooked_hillary', 'crazy_bernie', 'lyin_ted', 'little_marco'):
    cf(term)

In [None]:
# Grounding statistics and nearest neighbors in both spaces for 'lgbt'.
cf('lgbt')

## Clustering

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

sns.set()

def plot(
        coordinates: np.ndarray,
        words: List[GroundedWord],
        path: Path
        ) -> None:
    """Scatter 2-D `coordinates`, label each point with its word, save to `path`.

    Args:
        coordinates: Array whose first two columns are the x and y positions,
            one row per entry of `words`.
        words: Objects exposing a `word` attribute, used as point labels.
        path: Output file; written in binary mode at 300 dpi.
    """
    fig, ax = plt.subplots(figsize=(15, 10))
    try:
        # x/y are passed by keyword: seaborn >= 0.12 no longer accepts the
        # data vectors positionally.
        # TODO: hue by `w.R_ratio` (palette='coolwarm') and size by `w.freq`
        # (sizes=(200, 1000)) once those attributes are populated again.
        sns.scatterplot(
            x=coordinates[:, 0], y=coordinates[:, 1],
            legend=False, ax=ax)
        for coord, w in zip(coordinates, words):
            ax.annotate(w.word, coord, fontsize=20)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


def plot_categorical(
        coordinates: np.ndarray,
        words: List[GroundedWord],
        path: Path,
        fancy: bool = True
        ) -> None:
    """Scatter 2-D `coordinates` with word labels and save the figure to `path`.

    Args:
        coordinates: Array whose first two columns are the x and y positions,
            one row per entry of `words`.
        words: Objects exposing `word`; with `fancy=True` they must also
            expose `majority_deno` (hue) and `freq` (marker size).
        path: Output file; written in binary mode at 300 dpi.
        fancy: If True, color by `majority_deno`, size by `freq`, and place
            a legend to the right of the axes; otherwise draw plain points.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        if fancy:
            categories = [w.majority_deno for w in words]
            freq = [w.freq for w in words]
            # x/y are passed by keyword: seaborn >= 0.12 no longer accepts
            # the data vectors positionally.
            sns.scatterplot(
                x=coordinates[:, 0], y=coordinates[:, 1],
                hue=categories, palette='muted', hue_norm=(0, 1),
                size=freq, sizes=(200, 1000),
                legend='brief',
                ax=ax)
            chart_box = ax.get_position()
            ax.set_position(  # shrink the axes to make room for the legend
                [chart_box.x0, chart_box.y0,
                 chart_box.width * 0.6, chart_box.height])
            ax.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), ncol=1)
        else:
            # The plain variant needs only coordinates and labels, so it no
            # longer reads `w.freq`.
            sns.scatterplot(
                x=coordinates[:, 0], y=coordinates[:, 1], ax=ax)

        for coord, w in zip(coordinates, words):
            ax.annotate(w.word, coord, fontsize=12)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


def graph_en_masse(
        models: Dict[str, np.ndarray],
        out_dir: Path,
        reduction: str,  # 'PCA', 'TSNE', or 'both'
        words: List[GroundedWord],
        perplexity: Optional[int] = None,
        categorical: bool = False,
        random_state: Optional[int] = None
        ) -> None:
    """Project the vectors of `words` to 2-D for each model and save one plot each.

    Args:
        models: Embedding matrices keyed by model name.
        out_dir: Output folder (Path or str); created if missing. Each plot
            is written to `out_dir / '<model name>.png'`.
        reduction: 'PCA', 'TSNE', or 'both' (PCA first, then t-SNE).
        words: Objects exposing `word_id` (row index) and `word` (label).
        perplexity: t-SNE perplexity; required unless `reduction` is 'PCA'.
        categorical: Draw with `plot_categorical` instead of `plot`.
        random_state: Seed passed to PCA and t-SNE; None keeps them unseeded.

    Raises:
        ValueError: if `reduction` is not one of the three known values.
    """
    if reduction not in ('PCA', 'TSNE', 'both'):
        raise ValueError('unknown dimension reduction method')
    if reduction != 'PCA':
        assert perplexity is not None, 'perplexity is required for t-SNE'

    # Accept plain strings as well as Path objects.
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    word_ids = np.array([w.word_id for w in words])
    draw = plot_categorical if categorical else plot
    for model_name, embed in tqdm(models.items()):
        space = embed[word_ids]
        if reduction == 'PCA':
            visual = PCA(
                n_components=2, random_state=random_state).fit_transform(space)
        else:
            if reduction == 'both':
                # PCA cannot keep more components than there are samples or
                # features, so cap the 30 used for larger word lists.
                n_components = min(30, *space.shape)
                space = PCA(
                    n_components=n_components,
                    random_state=random_state).fit_transform(space)
            # NOTE(review): recent scikit-learn releases rename `n_iter` to
            # `max_iter`; confirm against the installed version.
            visual = TSNE(
                perplexity=perplexity, learning_rate=10,
                n_iter=5000, n_iter_without_progress=1000,
                random_state=random_state).fit_transform(space)
        out_path = out_dir / f'{model_name}.png'
        # Model names may contain '/', which makes the target a sub-folder.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        draw(visual, words, out_path)


In [None]:
# Hand-picked terms, each wrapped as a GroundedWord.
cherry_words = [
    GroundedWord(term)
    for term in (
        'government', 'washington',
        'estate_tax', 'death_tax',
        'public_option', 'governmentrun',
        'foreign_trade', 'international_trade',
        'cut_taxes', 'trickledown',
    )
]

In [None]:
models = deno_space
# NOTE(review): in the visible cells `polarization` is assigned only inside
# commented-out code, so this line raises NameError on a fresh kernel unless
# it is defined elsewhere.
stuff = polarization

# Same categorical t-SNE plot at perplexities 5, 3 and 2, one folder each.
graph_en_masse(
    models, out_dir=base_dir / 'cherry/topic/t-SNE p5',
    reduction='TSNE', perplexity=5, words=stuff, categorical=True)
graph_en_masse(
    models, out_dir=base_dir / 'cherry/topic/t-SNE p3',
    reduction='TSNE', perplexity=3, words=stuff, categorical=True)
graph_en_masse(
    models, out_dir=base_dir / 'cherry/topic/t-SNE p2',
    reduction='TSNE', perplexity=2, words=stuff, categorical=True)

In [None]:
# NOTE(review): `cono_space` is not defined in the visible cells -- confirm
# where it is loaded before running this cell.
models = cono_space

# Non-categorical t-SNE plots of the cherry-picked words at perplexities
# 5, 3 and 2, one folder each.
graph_en_masse(
    models, out_dir=base_dir / 'cherry/party/t-SNE p5',
    reduction='TSNE', perplexity=5, words=cherry_words, categorical=False)

graph_en_masse(
    models, out_dir=base_dir / 'cherry/party/t-SNE p3',
    reduction='TSNE', perplexity=3, words=cherry_words, categorical=False)

graph_en_masse(
    models, out_dir=base_dir / 'cherry/party/t-SNE p2',
    reduction='TSNE', perplexity=2, words=cherry_words, categorical=False)

In [None]:
models = deno_space

# NOTE(review): these calls pass `word_ids`, `hues` and `sizes`, which the
# graph_en_masse defined in this notebook does not accept, and the J_* names
# are not defined in the visible cells -- this cell looks stale.
graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed deno/party/t-SNE p25',
    reduction='TSNE', perplexity=25,
    word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed deno/party/t-SNE p50',
    reduction='TSNE', perplexity=50,
    word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq)

In [None]:
# NOTE(review): `cono_space` and the J_* names are not defined in the visible
# cells, and `word_ids`, `hues` and `sizes` are not parameters of the
# graph_en_masse defined in this notebook -- this cell looks stale.
models = cono_space

graph_en_masse(
    models, out_dir=f'{base_dir}/decomposed cono/topic/t-SNE p5',
    reduction='TSNE', perplexity=5,
    word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
    categorical=True)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed cono/topic/t-SNE p3',
    reduction='TSNE', perplexity=3,
    word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
    categorical=True)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed cono/topic/t-SNE p10',
    reduction='TSNE', perplexity=10,
    word_ids=J_ids, words=J_words, hues=J_deno, sizes=J_freq,
    categorical=True)

In [None]:
# NOTE(review): `cono_space` and the J_* names are not defined in the visible
# cells, and `word_ids`, `hues` and `sizes` are not parameters of the
# graph_en_masse defined in this notebook -- this cell looks stale.
models = cono_space

# graph_en_masse(
#     models,
#     out_dir=f'{base_dir}/Joint/topic/PCA',
#     reduction='PCA',
#     word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed cono/party/t-SNE p25',
    reduction='TSNE', perplexity=25,
    word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq)

graph_en_masse(
    models,
    out_dir=f'{base_dir}/decomposed cono/party/t-SNE p50',
    reduction='TSNE', perplexity=50,
    word_ids=J_ids, words=J_words, hues=J_skew, sizes=J_freq)

# Homogeneity, Completeness, and V-Measure

In [None]:
# Deno space, eval deno, higher is better
# NOTE(review): `NN_cluster_ids`, `homogeneity_completeness_v_measure` and
# `J_ids` are not imported or defined in the visible cells; the metric is
# presumably sklearn.metrics' function of the same name -- confirm.
# Prints one tab-separated row per model: name, homogeneity, completeness,
# V-measure, each rounded to 4 places.
for model_name, model in deno_space.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=True, top_k=10)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')
#     print(pred_labels)

In [None]:
# Deno space, eval cono, lower is better
# NOTE(review): `NN_cluster_ids`, `homogeneity_completeness_v_measure` and
# `J_ids` are not imported or defined in the visible cells.
# Prints one tab-separated row per model: name, homogeneity, completeness,
# V-measure, each rounded to 4 places.
for model_name, model in deno_space.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=False, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')
#     print(pred_labels)

In [None]:
# Cono space, eval cono, higher is better
# NOTE(review): `cono_space`, `NN_cluster_ids`,
# `homogeneity_completeness_v_measure` and `J_ids` are not imported or
# defined in the visible cells.
# Prints one tab-separated row per model: name, homogeneity, completeness,
# V-measure, each rounded to 4 places.
for model_name, model in cono_space.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=False, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')

In [None]:
# Cono space, eval deno, lower is better
# NOTE(review): `cono_space`, `NN_cluster_ids`,
# `homogeneity_completeness_v_measure` and `J_ids` are not imported or
# defined in the visible cells.
# Prints one tab-separated row per model: name, homogeneity, completeness,
# V-measure, each rounded to 4 places.
for model_name, model in cono_space.items():
    cluster_labels, true_labels = NN_cluster_ids(
        model, J_ids, categorical=True, top_k=5)    
    homogeneity, completeness, v_measure = np.around(
        homogeneity_completeness_v_measure(true_labels, cluster_labels), 4)
    print(model_name, homogeneity, completeness, v_measure, sep='\t')