In [1]:
from pathlib import Path
from typing import Tuple, Union, List, Dict, Iterable, Optional

import torch
import numpy as np
from tqdm.notebook import tqdm

from decomposer import Decomposer, DecomposerConfig
from recomposer import Recomposer, RecomposerConfig
from data import GroundedWord
from evaluations.helpers import load_en_masse, ground

# from evaluations.helpers import GroundedWord, load_recomposers_en_masse
# from evaluations.clustering import graph_en_masse
# from evaluations.euphemism import cherry_words

Loading vocabulary from /home/webson/Research/congressional_adversary/results/search/pretrained/init.pt
Vocab size = 138,443


  self.R_ratio = self.cono_freq[2] / (self.cono_freq[0] + self.cono_freq[2])


In [2]:
# Root directory of the experiment; each run is expected in its own sub-folder.
base_dir = Path('../../results/pious/')

# Load the checkpoints saved after epochs 1, 3 and 5 of every run.
deno_space, cono_space = load_en_masse(
    base_dir,
    patterns=[f'*/epoch{epoch}.pt' for epoch in (1, 3, 5)],
    recomposer=True)

# Show which models were picked up.
for name in deno_space:
    print(name)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Loading ../../results/pious/true cross/epoch1.pt




Loading ../../results/pious/intraDS/epoch1.pt
Loading ../../results/pious/sans bound/epoch1.pt
Loading ../../results/pious/true cross/epoch3.pt
Loading ../../results/pious/intraDS/epoch3.pt
Loading ../../results/pious/sans bound/epoch3.pt
Loading ../../results/pious/true cross/epoch5.pt
Loading ../../results/pious/intraDS/epoch5.pt
Loading ../../results/pious/sans bound/epoch5.pt

pretrained
true cross epoch1
intraDS epoch1
sans bound epoch1
true cross epoch3
intraDS epoch3
sans bound epoch3
true cross epoch5
intraDS epoch5
sans bound epoch5


## Clustering

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Apply seaborn's default styling to the matplotlib figures created below.
sns.set()

def plot(
        coordinates: np.ndarray,
        words: List[GroundedWord],
        path: Path
        ) -> None:
    """Scatter-plot 2-D word coordinates and save the figure to ``path``.

    Every point is coloured by the word's ``R_ratio`` ('coolwarm' palette,
    normalised over [0, 1]), sized by the word's ``freq`` and annotated with
    the word itself.

    Args:
        coordinates: array read as ``coordinates[:, 0]`` (x) and
            ``coordinates[:, 1]`` (y), with one row per entry of ``words``.
        words: the words to draw, in the same order as the rows of
            ``coordinates``; each must expose ``word``, ``R_ratio`` and
            ``freq``.
        path: file the rendered figure is written to.
    """
    fig, ax = plt.subplots(figsize=(15, 10))
    try:
        skew = [w.R_ratio for w in words]
        freq = [w.freq for w in words]
        # x and y are passed by keyword: seaborn deprecated positional x/y in
        # 0.11 and rejects them from 0.12 on; the keyword form works on both.
        sns.scatterplot(
            x=coordinates[:, 0], y=coordinates[:, 1],
            hue=skew, palette='coolwarm', hue_norm=(0, 1),
            size=freq, sizes=(200, 1000),
            legend=False, ax=ax)
        for coord, w in zip(coordinates, words):
            ax.annotate(w.word, coord, fontsize=12)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # repeated calls in a loop do not accumulate open figures.
        plt.close(fig)


def plot_categorical(
        coordinates: np.ndarray,
        words: List[GroundedWord],
        path: Path,
        fancy: bool = False
        ) -> None:
    """Scatter-plot 2-D word coordinates, optionally coloured by category.

    With ``fancy=True`` every point is coloured by the word's
    ``majority_deno`` ('muted' palette), sized by the word's ``freq``, and a
    legend is placed to the right of the shrunken axes. With ``fancy=False``
    a plain, uniformly styled scatter plot is drawn. In both cases each point
    is annotated with its word and the figure is saved to ``path``.

    Args:
        coordinates: array read as ``coordinates[:, 0]`` (x) and
            ``coordinates[:, 1]`` (y), with one row per entry of ``words``.
        words: the words to draw, in the same order as the rows of
            ``coordinates``; each must expose ``word`` and, when ``fancy`` is
            set, ``majority_deno`` and ``freq``.
        path: file the rendered figure is written to.
        fancy: whether to colour by category, scale by frequency and show a
            legend.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        # x and y are passed by keyword: seaborn deprecated positional x/y in
        # 0.11 and rejects them from 0.12 on; the keyword form works on both.
        if fancy:
            categories = [w.majority_deno for w in words]
            freq = [w.freq for w in words]
            # NOTE(review): hue_norm looks redundant next to a qualitative
            # palette -- confirm before removing it.
            sns.scatterplot(
                x=coordinates[:, 0], y=coordinates[:, 1],
                hue=categories, palette='muted', hue_norm=(0, 1),
                size=freq, sizes=(200, 1000),
                legend='brief',
                ax=ax)
            # Shrink the axes to 60% of their width so the legend anchored
            # outside of them (to the right) has room.
            chart_box = ax.get_position()
            ax.set_position(
                [chart_box.x0, chart_box.y0,
                 chart_box.width * 0.6, chart_box.height])
            ax.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), ncol=1)
        else:
            sns.scatterplot(
                x=coordinates[:, 0], y=coordinates[:, 1], ax=ax)

        for coord, w in zip(coordinates, words):
            ax.annotate(w.word, coord, fontsize=12)
        with open(path, 'wb') as file:
            fig.savefig(file, dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # repeated calls in a loop do not accumulate open figures.
        plt.close(fig)


def _reduce_to_2d(
        space: np.ndarray,
        reduction: str,
        perplexity: Optional[int],
        random_state: Optional[int]
        ) -> np.ndarray:
    """Project ``space`` to 2-D with PCA, t-SNE, or PCA followed by t-SNE."""
    if reduction == 'PCA':
        return PCA(
            n_components=2, random_state=random_state).fit_transform(space)
    if reduction not in ('TSNE', 'both'):
        raise ValueError('unknown dimension reduction method')
    assert perplexity is not None
    if reduction == 'both':
        # PCA cannot keep more components than there are samples or features,
        # so clamp the 30-dimensional intermediate space for small inputs.
        n_components = min(30, *space.shape)
        space = PCA(
            n_components=n_components,
            random_state=random_state).fit_transform(space)
    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
    # `max_iter` -- confirm against the installed version before upgrading.
    return TSNE(
        perplexity=perplexity, learning_rate=10,
        n_iter=5000, n_iter_without_progress=1000,
        random_state=random_state).fit_transform(space)


def graph_en_masse(
        models: Dict[str, np.ndarray],
        out_dir: Path,
        reduction: str,  # 'PCA', 'TSNE', or 'both'
        words: List[GroundedWord],
        # hues: Union[List[float], List[int]],
        # sizes: List[int],
        perplexity: Optional[int] = None,
        categorical: bool = False,
        random_state: Optional[int] = None
        ) -> None:
    """Plot the same words in every model's embedding space.

    For each ``(name, embedding)`` pair in ``models`` the rows selected by the
    ids of ``words`` are reduced to two dimensions and drawn with ``plot`` (or
    ``plot_categorical`` when ``categorical`` is set) into
    ``out_dir / f'{name}.png'``.

    Args:
        models: mapping from model name to an embedding that can be indexed
            with an array of word ids.
        out_dir: directory for the images; created if it does not exist.
        reduction: 'PCA', 'TSNE', or 'both' (PCA to at most 30 dimensions,
            then t-SNE).
        words: the words to plot; each must expose ``id``.
        perplexity: t-SNE perplexity; required unless ``reduction`` is 'PCA'.
        categorical: use ``plot_categorical`` instead of ``plot``.
        random_state: seed forwarded to PCA and t-SNE. The default ``None``
            keeps the previous unseeded behaviour; pass an int to make the
            t-SNE layouts reproducible.

    Raises:
        ValueError: if ``reduction`` is not one of the supported methods.
        AssertionError: if t-SNE is requested without a ``perplexity``.
    """
    # Reject bad arguments before creating directories or starting the loop.
    if reduction not in ('PCA', 'TSNE', 'both'):
        raise ValueError('unknown dimension reduction method')
    if reduction != 'PCA':
        assert perplexity is not None

    out_dir.mkdir(parents=True, exist_ok=True)
    word_ids = np.array([w.id for w in words])
    draw = plot_categorical if categorical else plot
    for model_name, embed in tqdm(models.items()):
        visual = _reduce_to_2d(
            embed[word_ids], reduction, perplexity, random_state)
        draw(visual, words, out_dir / f'{model_name}.png')


In [4]:
# Hand-picked words, each looked up in `ground`.
cherry_words = [
    ground[name]
    for name in (
        'government', 'washington',
        'estate_tax', 'death_tax',
        'public_option', 'government_run',
        'foreign_trade', 'international_trade',
        # 'cut_taxes', 'trickle_down'
    )
]

In [5]:
import random
random.seed(1)  # fixed seed so this cell draws the same sample on every run

test_path = Path('../../data/ellie/partisan_sample.hp.txt')
with open(test_path) as file:
    # One word per line. Blank lines are skipped so an empty string is never
    # looked up in `ground`.
    test_words = [ground[word] for word in map(str.strip, file) if word]
sampled_test = random.sample(test_words, 50)

In [6]:
models = deno_space
grounded_words = sampled_test

# One batch of t-SNE plots per perplexity. A single loop replaces four
# copy-pasted calls so the remaining arguments cannot drift apart.
for perplexity in (2, 3, 5, 25):
    graph_en_masse(
        models, out_dir=base_dir / f'test/deno_space/t-SNE p{perplexity}',
        reduction='TSNE', perplexity=perplexity, words=grounded_words)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




# Connotation Space

In [7]:
models = cono_space
grounded_words = sampled_test

# One batch of t-SNE plots per perplexity, in the original 25, 3, 2 order.
# A single loop replaces three copy-pasted calls so the remaining arguments
# cannot drift apart.
for perplexity in (25, 3, 2):
    graph_en_masse(
        models, out_dir=base_dir / f'test/cono_space/t-SNE p{perplexity}',
        reduction='TSNE', perplexity=perplexity, words=grounded_words)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [8]:
# models = deno_space
# grounded_words = cherry_words

# graph_en_masse(
#     models, out_dir=base_dir / 'test_cherry/deno_space/t-SNE p5',
#     reduction='TSNE', perplexity=5, words=grounded_words)
# graph_en_masse(
#     models, out_dir=base_dir / 'test_cherry/deno_space/t-SNE p3',
#     reduction='TSNE', perplexity=3, words=grounded_words)
# graph_en_masse(
#     models, out_dir=base_dir / 'test_cherry/deno_space/t-SNE p2',
#     reduction='TSNE', perplexity=2, words=grounded_words)