In [2]:
import pickle
import csv
import os
from typing import Set, Tuple, NamedTuple, List, Dict, Counter, Optional

import torch
import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

from decomposer import AdversarialDecomposer, AdversarialConfig

# Fix the global NumPy and PyTorch RNG seeds so that randomized steps in this
# notebook (e.g. the index sampling in Embedding.export_web_projector, which
# uses np.random) draw the same numbers on every fresh run.
np.random.seed(42)
torch.manual_seed(42)

class Embedding():
    """Word vectors plus vocabulary lookups, loadable from several formats.

    After construction an instance exposes:

    * ``embedding``  -- the vectors, indexable by word id;
    * ``word_to_id`` -- mapping from word to row id;
    * ``id_to_word`` -- the inverse lookup.

    Instances loaded from an adversarial checkpoint additionally carry
    ``Dem_frequency`` and ``GOP_frequency`` copied from the saved model.
    """

    def __init__(self, path: str, source: Optional[str] = None):
        """Load an embedding from ``path`` using the loader named by ``source``.

        Args:
            path: Location of the saved embedding. For ``source='skip_gram'``
                this must be a ``(model_path, vocab_path)`` pair instead of a
                single string.
            source: ``None`` or ``'adversarial'`` (default loader),
                ``'skip_gram'``, or ``'plain_text'``.

        Raises:
            ValueError: If ``source`` is none of the values above.
        """
        if source is None or source == 'adversarial':
            self.init_from_adversarial(path)
        elif source == 'skip_gram':
            self.init_from_skip_gram(path)
        elif source == 'plain_text':
            self.init_from_plain_text(path)
        else:
            raise ValueError('Unknown embedding source.')

    def init_from_adversarial(self, path: str, device=torch.device('cpu')):
        """Load vocabulary, party frequencies and vectors from a checkpoint.

        The checkpoint is expected to be a dict whose ``'model'`` entry is the
        trained model object itself.

        Args:
            path: Checkpoint file written with ``torch.save``.
            device: Device the checkpoint is mapped to and the embedding is
                exported on.
        """
        # SECURITY: torch.load unpickles the payload (a whole model object is
        # stored under 'model'), so only open checkpoints you trust.
        # NOTE(review): unpickling presumably needs the model's defining module
        # (the `decomposer` import at the top of the file) to be importable.
        payload = torch.load(path, map_location=device)
        model = payload['model']
        self.word_to_id = model.word_to_id
        self.id_to_word = model.id_to_word
        self.Dem_frequency: Counter[str] = model.Dem_frequency
        self.GOP_frequency: Counter[str] = model.GOP_frequency

        # encoded layer
        self.embedding = model.export_encoded_embedding(device=device)
        # Alternative export, kept for reference:
        # self.embedding = model.export_decomposed_embedding(device=device)

        # Alternative: manually choose which layer to export.
        # all_vocab_ids = torch.arange(
        #     len(self.word_to_id), dtype=torch.long, device=device)
        # with torch.no_grad():
        #     embed = model.embedding(all_vocab_ids)
        #     encoded = model.encoder(embed)
        #     self.cono_logits = model.cono_decoder(encoded)

    # Earlier state_dict-based loader, kept for reference:
    # def init_from_adversarial(self, path: str):
    #     config = DenotationEncoderConfig()
    #     config.input_dir = '../../data/processed/adversarial/44_Obama_1e-5'
    #     data = AdversarialDataset(config)
    #     model = DenotationEncoder(config, data)
    #     model.load_state_dict(torch.load(path))
    #     self.embedding = model.export_decomposed_embedding().to('cpu')
    #     self.word_to_id = model.word_to_id
    #     self.id_to_word = model.id_to_word

    def init_from_skip_gram(self, paths: Tuple[str, str]) -> None:
        """Directly extract the weights of a single layer.

        Args:
            paths: ``(model_path, vocab_path)``. The model file holds a state
                dict with a ``'center_embedding.weight'`` entry; the vocab file
                is a pickled 3-tuple whose first two items are ``word_to_id``
                and ``id_to_word`` (the third item is ignored).
        """
        model_path, vocab_path = paths
        with open(model_path, 'rb') as model_file:
            state_dict = torch.load(model_file, map_location='cpu')
        self.embedding = state_dict['center_embedding.weight'].numpy()
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load vocabulary files you produced yourself.
        with open(vocab_path, 'rb') as vocab_file:
            self.word_to_id, self.id_to_word, _ = pickle.load(vocab_file)

    def init_from_plain_text(self, path: str) -> None:
        """Load vectors from a text file with a ``vocab_size num_dimensions`` header.

        Each following line holds a word and its vector, whitespace separated.
        The word is the first field and the vector is the last
        ``num_dimensions`` fields. Blank lines are skipped. Ids are assigned in
        file order, starting at 0.

        Args:
            path: Text file to read.
        """
        word_to_id: Dict[str, int] = {}
        vectors: List[np.ndarray] = []
        # `with` guarantees the handle is released even if a line fails to parse.
        with open(path) as embedding_file:
            vocab_size, num_dimensions = map(
                int, embedding_file.readline().split())
            print(f'vocab_size = {vocab_size:,}, num_dimensions = {num_dimensions}')
            print(f'Loading embeddings from {path}', flush=True)
            for line in embedding_file:
                fields = line.split()
                if not fields:
                    # A blank line carries no word; skip it instead of
                    # failing on fields[0].
                    continue
                word = fields[0]
                vector = np.array(fields[-num_dimensions:], dtype=np.float64)
                # The next free row index is this word's id.
                word_to_id[word] = len(vectors)
                vectors.append(vector)
        print('Done')
        self.id_to_word = {val: key for key, val in word_to_id.items()}
        self.word_to_id = word_to_id
        self.embedding = np.array(vectors)

    def write_to_tensorboard_projector(self, tb_dir: str) -> None:
        """Write the first 9,999 vectors and their words as a TensorBoard embedding.

        Args:
            tb_dir: Log directory handed to ``SummaryWriter``.
        """
        # Imported lazily so the rest of the class works without tensorboard.
        from torch.utils import tensorboard
        embedding_labels = [
            self.id_to_word[word_id]
            for word_id in range(len(self.word_to_id))]
        tb = tensorboard.SummaryWriter(log_dir=tb_dir)
        try:
            tb.add_embedding(
                self.embedding[:9999],
                embedding_labels[:9999],
                global_step=0)
        finally:
            # Closing flushes pending events to disk; without it the
            # projector data may never be written.
            tb.close()

    def export_web_projector(self, out_dir: str) -> None:
        """Dump a random sample of vectors and labels as two TSV files.

        Writes ``tensorboard.tsv`` (one tab-separated vector per line) and
        ``tensorboard_labels.tsv`` (the matching word per line) into
        ``out_dir``.

        Args:
            out_dir: Existing directory that receives the two files.
        """
        # NOTE(review): randint draws 10,000 indices *with* replacement, so
        # the export can contain duplicate words; confirm this is intended.
        random_indices = np.random.randint(len(self.embedding), size=10000)
        subset_embedding = self.embedding[random_indices].tolist()

        vector_path = os.path.join(out_dir, 'tensorboard.tsv')
        with open(vector_path, 'w') as vector_file:
            for vector in subset_embedding:
                vector_file.write('\t'.join(map(str, vector)) + '\n')

        label_path = os.path.join(out_dir, 'tensorboard_labels.tsv')
        with open(label_path, 'w') as label_file:
            for index in random_indices:
                label_file.write(self.id_to_word[index] + '\n')

    def cosine_similarity(self, query1: str, query2: str) -> float:
        """Return the cosine similarity between the vectors of two words.

        Args:
            query1: First word.
            query2: Second word.

        Returns:
            ``1 - cosine_distance`` of the two vectors.

        Raises:
            KeyError: If either word is out of vocabulary. The offending word
                is printed before the error propagates.
        """
        query_ids = []
        for query in (query1, query2):
            try:
                query_ids.append(self.word_to_id[query])
            except KeyError:
                print(f'Out of vocabulary: {query}')
                raise
        query1_id, query2_id = query_ids
        vectors = self.embedding[(query1_id, query2_id), :]
        similarity = 1 - distance.cosine(vectors[0], vectors[1])
        return similarity

    def nearest_neighbor(self, query: str, top_k: int = 10):
        """Print the ``top_k`` words closest to ``query`` by cosine similarity.

        The query word itself is never listed. If the vocabulary holds fewer
        than ``top_k`` other words, all of them are printed.

        Args:
            query: Word to look up.
            top_k: Maximum number of neighbors to print.

        Raises:
            KeyError: If ``query`` is out of vocabulary.
        """
        try:
            query_id = self.word_to_id[query]
        except KeyError:
            raise KeyError(f'{query} is out of vocabulary. Sorry!')
        query_vec = self.embedding[query_id]

        distances = np.asarray(
            [distance.cosine(query_vec, vec) for vec in self.embedding])
        ranked = np.argsort(distances)
        # Drop the query by id rather than assuming it sorts to rank 0 (a
        # duplicate vector could tie with it), and let the slice clamp top_k
        # to the number of words actually available.
        neighbors = ranked[ranked != query_id][:max(top_k, 0)]
        print(f"{query}'s neareset neighbors:")
        for word_id in neighbors:
            word = self.id_to_word[word_id]
            cosine_similarity = 1 - distances[word_id]
            print(f'{cosine_similarity:.4f}\t{word}')
        print()
        

class PhrasePair(NamedTuple):
    """One labeled (query, neighbor) phrase pair used for evaluation."""
    # The two phrases; correlate_sim_deltas passes them to
    # Embedding.cosine_similarity as vocabulary keys.
    query: str
    neighbor: str
    # Denotation-similarity label (filled from the 'semantic_similarity' or
    # 'median_deno' column by the loaders in this file).
    deno_sim: float
    # Connotation-similarity label (filled from 'cono_similarity', or derived
    # from the two MTurk connotation medians).
    cono_sim: float
        

def load_cherry(path, exclude_hard_examples=True):
    """Load hand-labeled phrase pairs from a CSV or TSV file.

    Rows are kept only when both the ``semantic_similarity`` and
    ``cono_similarity`` columns are non-empty. The number of loaded entries is
    printed.

    Args:
        path: File to read. It is parsed as tab-separated when the path ends
            with ``'tsv'``, otherwise as a default (comma-separated) CSV.
        exclude_hard_examples: When true, skip rows whose ``comment`` column
            contains ``'hard example'`` (case-insensitive). A missing or
            empty comment counts as "not a hard example".

    Returns:
        List of ``PhrasePair(query, neighbor, deno_sim, cono_sim)``.

    Raises:
        KeyError: If a required column is absent from the file.
        ValueError: If a similarity value is not a valid float.
    """
    data = []
    # newline='' is what the csv module expects of text files handed to it.
    with open(path, newline='') as file:
        if path.endswith('tsv'):
            reader = csv.DictReader(file, dialect=csv.excel_tab)
        else:
            reader = csv.DictReader(file)
        for row in reader:
            if not (row['semantic_similarity'] and row['cono_similarity']):
                continue
            if exclude_hard_examples:
                # DictReader fills fields missing from a short row with None,
                # and the column may not exist at all; treat both as an
                # empty comment instead of crashing on .lower().
                comment = row.get('comment') or ''
                if 'hard example' in comment.lower():
                    continue
            data.append(PhrasePair(
                row['query'],
                row['neighbor'],
                float(row['semantic_similarity']),
                float(row['cono_similarity'])))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def load_MTurk_results(path):
    """Load phrase pairs with median MTurk ratings from a CSV or TSV file.

    A row is kept only when ``median_deno``, ``median_query_cono`` and
    ``median_neighbor_cono`` are all non-empty. The number of loaded entries
    is printed.

    Args:
        path: File to read. It is parsed as tab-separated when the path ends
            with ``'tsv'``, otherwise as a default (comma-separated) CSV.

    Returns:
        List of ``PhrasePair(query_words, neighbor_words, deno_sim, cono_sim)``
        where ``deno_sim`` is ``median_deno`` and ``cono_sim`` is
        ``5 - |median_query_cono - median_neighbor_cono|``.

    Raises:
        KeyError: If a required column is absent from the file.
        ValueError: If a rating is not a valid float.
    """
    data = []
    # newline='' is what the csv module expects of text files handed to it.
    with open(path, newline='') as file:
        if path.endswith('tsv'):
            reader = csv.DictReader(file, dialect=csv.excel_tab)
        else:
            reader = csv.DictReader(file)
        for row in reader:
            # All three ratings are converted below, so all three must be
            # present; an empty neighbor rating would otherwise make float()
            # raise instead of the row being skipped like other unlabeled rows.
            ratings = (row['median_deno'],
                       row['median_query_cono'],
                       row['median_neighbor_cono'])
            if not all(ratings):
                continue

            # Turn the connotation gap into a similarity: identical ratings
            # give 5, and larger gaps give smaller values.
            # NOTE(review): 5 is presumably the top of the rating scale --
            # confirm against the MTurk task definition.
            cono_sim = 5 - abs(
                float(row['median_query_cono']) -
                float(row['median_neighbor_cono']))

            data.append(PhrasePair(
                row['query_words'],
                row['neighbor_words'],
                float(row['median_deno']),
                cono_sim))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def correlate_sim_deltas(model, ref_model, phrase_pairs, verbose=False):
    """Compare a model's phrase-pair similarities against a reference model.

    For each pair, the model delta is ``model_sim - ref_sim`` (both cosine
    similarities) and the label delta is ``deno_sim - cono_sim``. Pairs for
    which either model raises ``KeyError`` are skipped.

    Args:
        model: Object with ``cosine_similarity(query, neighbor)``.
        ref_model: Reference object with the same method.
        phrase_pairs: Iterable of items with ``query``, ``neighbor``,
            ``deno_sim`` and ``cono_sim`` attributes.
        verbose: When true, print a header and one line per evaluated pair.

    Returns:
        ``(rho, median, mean, stddev)``: the Spearman correlation between the
        model deltas and the label deltas, followed by the median, mean and
        standard deviation of the model deltas.
    """
    if verbose:
        print('deno_sim\tcono_sim\tref_sim\tmodel_sim')

    observed_shifts = []   # model_sim - ref_sim, one per evaluated pair
    labeled_shifts = []    # deno_sim - cono_sim, aligned with observed_shifts
    for pair in phrase_pairs:
        query, neighbor = pair.query, pair.neighbor
        try:
            sim = model.cosine_similarity(query, neighbor)
            ref_sim = ref_model.cosine_similarity(query, neighbor)
        except KeyError:
            # The phrase is missing from one of the vocabularies.
            continue

        observed_shifts.append(sim - ref_sim)
        labeled_shifts.append(pair.deno_sim - pair.cono_sim)

        if verbose:
            print(f'{pair.deno_sim}  {pair.cono_sim}  {ref_sim:.2%}  {sim:.2%}  '
                  f'{query}  {neighbor}')

    rho, _pvalue = spearmanr(observed_shifts, labeled_shifts)
    return (rho,
            np.median(observed_shifts),
            np.mean(observed_shifts),
            np.std(observed_shifts))

## Cherry Data

In [3]:
# Load the hand-labeled phrase pairs for both parties, skipping rows whose
# comment marks them as a hard example.
Dem_pairs = load_cherry(    
    '../../data/evaluation/cherries/labeled_Dem_samples.tsv',
    exclude_hard_examples=True)
GOP_pairs = load_cherry(
    '../../data/evaluation/cherries/labeled_GOP_samples.tsv',
    exclude_hard_examples=True)
test_data = Dem_pairs + GOP_pairs

# Same entity denotation, different party connotation.
euphemism = [pair for pair in test_data
             if pair.deno_sim > pair.cono_sim]

# Different entity denotation, same party connotation.
# Pairs with deno_sim == cono_sim land in neither list.
party_platform = [pair for pair in test_data
                  if pair.deno_sim < pair.cono_sim]
# Every labeled row of remove_deno.tsv is appended as-is: it is not filtered
# by the deno_sim < cono_sim test above, and hard examples are kept.
party_platform += load_cherry(
    '../../data/evaluation/cherries/remove_deno.tsv',
    exclude_hard_examples=False)

print(f'{len(euphemism)} euphemism')
print(f'{len(party_platform)} party platform')

Loaded 26 labeled entries at ../../data/evaluation/cherries/labeled_Dem_samples.tsv
Loaded 29 labeled entries at ../../data/evaluation/cherries/labeled_GOP_samples.tsv
Loaded 32 labeled entries at ../../data/evaluation/cherries/remove_deno.tsv
45 euphemism
42 party platform


## Pilot Batch

In [None]:
# NOTE(review): this cell has no execution count in the saved notebook
# (In [None]). Running it rebinds test_data, euphemism and party_platform to
# the MTurk data, which replaces the cherry-data lists that the evaluation
# cells below read -- decide which data set is intended before "Run All".
# test_data = load_MTurk_results('../../data/evaluation/qualification_30.csv')
test_data = load_MTurk_results('../../data/evaluation/combined_result.csv')

# Same entity denotation, different party connotation.
euphemism = [pair for pair in test_data
             if pair.deno_sim > pair.cono_sim]

# Different entity denotation, same party connotation.
# Pairs with deno_sim == cono_sim land in neither list.
party_platform = [pair for pair in test_data
                  if pair.deno_sim < pair.cono_sim]

print(f'{len(euphemism)} euphemism (deno_sim > cono_sim)')
print(f'{len(party_platform)} party platform (deno_sim < cono_sim)')

In [4]:
# Preview the euphemism pairs, one tab-separated line each:
# deno_sim, cono_sim, query, neighbor.
for query, neighbor, deno_sim, cono_sim in euphemism:
    print(deno_sim, cono_sim, query, neighbor, sep='\t')

5.0	1.0	tax_breaks	bipartisan_tax_relief
5.0	1.0	star_wars	strategic_defense_initiative
5.0	1.0	star_wars	missile_defense
4.0	1.0	military_spending	federal_spending
4.0	1.0	military_spending	government_spending
4.5	1.0	assault_weapons	firearms
5.0	1.0	assault_weapons	rifles
4.0	2.0	credit_card_companies	creditors
5.0	3.0	military_budget	defense_budget
5.0	3.0	the_recovery_act	the_stimulus_bill
5.0	4.0	the_recovery_act	stimulus_package
3.0	2.0	trickledown	cut_taxes
5.0	3.0	tax_expenditures	spending_programs
4.0	2.0	waterboarding	interrogation
5.0	3.0	assault_weapons_ban	gun_control
5.0	1.0	antichoice	prolife
4.0	2.0	private_insurance_companies	medicare_advantage_program
5.0	1.0	nuclear_option	constitutional_option
5.0	2.0	corporate_profits	earnings
5.0	1.0	death_tax	estate_tax
5.0	2.0	unborn	fetus
5.0	2.0	partialbirth_abortion	lateterm
5.0	1.0	partialbirth_abortion	dx
5.0	1.0	illegals	undocumented_immigrants
5.0	1.0	governmentrun	public_option
4.0	1.0	medical_liability	medical_malpracti

### Load Pretrained Embedding

In [6]:
# Frozen pretrained word2vec vectors, read through the plain-text loader.
# The evaluation cells below use this as their reference embedding.
pretrained = Embedding('../../data/pretrained_word2vec/for_real.txt', 'plain_text')

vocab_size = 111,387, num_dimensions = 300
Loading embeddings from ../../data/pretrained_word2vec/for_real.txt
Done


## Decompose Denotation +d -c models 
similarity should increase for euphemism

In [7]:
# Evaluate against frozen pretrained word2vec
# For every (model, epoch) checkpoint, print one tab-separated row with the
# median / mean / std. dev. of (checkpoint_sim - pretrained_sim) over the
# euphemism pairs, plus the Spearman rho between those deltas and the label
# deltas (deno_sim - cono_sim). See correlate_sim_deltas.
base_dir = '../../results/for_real_NS/' 
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        # No `source` argument, so the adversarial-checkpoint loader is used.
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        # Alternative percent formatting of the same row:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.017	0.016	0.029	0.263
1d -1c/epoch4.pt	0.005	-0.000	0.035	0.171
1d -1c/epoch6.pt	-0.002	-0.006	0.036	0.083
1d -1c/epoch8.pt	-0.005	-0.007	0.043	0.134
1d -1c/epoch10.pt	-0.003	-0.009	0.034	0.085
1d -1c/epoch12.pt	-0.001	-0.011	0.039	-0.026
1d -1c/epoch14.pt	-0.003	-0.004	0.039	0.084
1d -1c/epoch16.pt	-0.002	-0.008	0.036	0.027
1d -1c/epoch20.pt	-0.009	-0.009	0.037	-0.043
1d -1c/epoch24.pt	-0.003	-0.006	0.038	-0.014
1d -1c/epoch28.pt	-0.013	-0.015	0.040	0.002
1d -1c/epoch30.pt	-0.011	-0.013	0.041	-0.058
				
1d -2c/epoch2.pt	0.030	0.029	0.032	0.045
1d -2c/epoch4.pt	0.010	0.011	0.038	0.112
1d -2c/epoch6.pt	0.008	0.001	0.044	0.122
1d -2c/epoch8.pt	0.010	0.001	0.048	0.078
1d -2c/epoch10.pt	0.005	-0.005	0.050	0.129
1d -2c/epoch12.pt	0.004	-0.007	0.051	0.144
1d -2c/epoch14.pt	0.001	-0.011	0.055	0.095
1d -2c/epoch16.pt	-0.001	-0.011	0.049	0.135
1d -2c/epoch20.pt	-0.004	-0.007	0.052	-0.015
1d -2c/epoch24.pt	0.003	-0.010	0.04

In [8]:
# Evaluate against 1d 0c ceteris paribus trained models
# Same table as the pretrained comparison, but the reference is the '1d 0c'
# checkpoint of the *same epoch*, evaluated on the euphemism pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference checkpoint depends only on `epoch`, yet
        # it is reloaded from disk once per (model, epoch) combination.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
        # Alternative percent formatting of the same row:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.011	0.013	0.019	0.185
1d -1c/epoch4.pt	0.002	0.009	0.019	0.163
1d -1c/epoch6.pt	0.008	0.010	0.017	0.025
1d -1c/epoch8.pt	0.008	0.008	0.023	0.237
1d -1c/epoch10.pt	0.004	0.009	0.024	0.018
1d -1c/epoch12.pt	0.008	0.011	0.025	-0.081
1d -1c/epoch14.pt	0.019	0.018	0.024	0.123
1d -1c/epoch16.pt	0.010	0.015	0.027	-0.041
1d -1c/epoch20.pt	0.009	0.015	0.029	-0.097
1d -1c/epoch24.pt	0.012	0.018	0.033	-0.064
1d -1c/epoch28.pt	0.009	0.009	0.031	0.007
1d -1c/epoch30.pt	0.008	0.010	0.031	-0.087
				
1d -2c/epoch2.pt	0.020	0.026	0.022	-0.051
1d -2c/epoch4.pt	0.021	0.019	0.017	0.075
1d -2c/epoch6.pt	0.014	0.016	0.022	0.129
1d -2c/epoch8.pt	0.013	0.016	0.028	0.104
1d -2c/epoch10.pt	0.014	0.013	0.031	0.125
1d -2c/epoch12.pt	0.011	0.014	0.030	0.187
1d -2c/epoch14.pt	0.010	0.011	0.036	0.169
1d -2c/epoch16.pt	0.010	0.011	0.029	0.149
1d -2c/epoch20.pt	0.014	0.017	0.036	-0.081
1d -2c/epoch24.pt	0.012	0.014	0.034	0.122
1d -2c/epoch28.pt	0

## Decompose Denotation +d -c models 
Similarity should decrease for party platform.

In [22]:
# Evaluate against frozen pretrained word2vec
# For every (model, epoch) checkpoint, print one tab-separated row with the
# median / mean / std. dev. of (checkpoint_sim - pretrained_sim) over the
# party-platform pairs, plus the Spearman rho between those deltas and the
# label deltas (deno_sim - cono_sim). See correlate_sim_deltas.
# NOTE(review): this cell's execution count (22) is higher than that of cells
# further down the notebook, so the saved outputs were not produced in
# top-to-bottom order.
base_dir = '../../results/for_real_NS/'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        # No `source` argument, so the adversarial-checkpoint loader is used.
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        # Alternative percent formatting of the same row:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	-0.003	-0.009	0.049	0.079
1d -1c/epoch4.pt	0.005	-0.007	0.048	-0.031
1d -1c/epoch6.pt	-0.005	-0.015	0.041	-0.102
1d -1c/epoch8.pt	-0.005	-0.014	0.039	-0.148
1d -1c/epoch10.pt	-0.005	-0.017	0.045	-0.172
1d -1c/epoch12.pt	-0.010	-0.012	0.042	-0.175
1d -1c/epoch14.pt	0.005	0.006	0.045	-0.240
1d -1c/epoch16.pt	-0.012	-0.009	0.043	-0.204
1d -1c/epoch20.pt	-0.003	-0.013	0.045	-0.180
1d -1c/epoch24.pt	-0.007	-0.011	0.050	-0.267
1d -1c/epoch28.pt	-0.009	-0.012	0.053	-0.308
1d -1c/epoch30.pt	-0.007	-0.011	0.054	-0.254
				
1d -2c/epoch2.pt	0.013	0.006	0.048	0.149
1d -2c/epoch4.pt	0.007	0.001	0.046	0.086
1d -2c/epoch6.pt	0.005	-0.006	0.044	0.004
1d -2c/epoch8.pt	-0.010	-0.013	0.056	-0.043
1d -2c/epoch10.pt	-0.007	-0.013	0.052	-0.033
1d -2c/epoch12.pt	0.008	-0.006	0.052	-0.117
1d -2c/epoch14.pt	-0.006	-0.015	0.051	-0.069
1d -2c/epoch16.pt	-0.010	-0.011	0.052	-0.089
1d -2c/epoch20.pt	0.005	-0.005	0.054	-0.092
1d -2c/epoch24.pt	0

In [23]:
# Evaluate against 1d 0c ceteris paribus trained models
# Same table as the pretrained comparison, but the reference is the '1d 0c'
# checkpoint of the *same epoch*, evaluated on the party-platform pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference checkpoint depends only on `epoch`, yet
        # it is reloaded from disk once per (model, epoch) combination.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
        # Alternative percent formatting of the same row:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.014	0.014	0.025	-0.025
1d -1c/epoch4.pt	0.017	0.017	0.030	0.083
1d -1c/epoch6.pt	0.010	0.008	0.024	0.056
1d -1c/epoch8.pt	0.008	0.009	0.031	-0.113
1d -1c/epoch10.pt	0.001	0.004	0.037	-0.211
1d -1c/epoch12.pt	0.014	0.012	0.038	-0.129
1d -1c/epoch14.pt	0.036	0.033	0.046	-0.143
1d -1c/epoch16.pt	0.022	0.016	0.043	-0.146
1d -1c/epoch20.pt	0.015	0.015	0.043	-0.059
1d -1c/epoch24.pt	0.016	0.020	0.046	-0.188
1d -1c/epoch28.pt	0.013	0.019	0.043	-0.257
1d -1c/epoch30.pt	0.022	0.018	0.044	-0.229
				
1d -2c/epoch2.pt	0.025	0.029	0.036	0.141
1d -2c/epoch4.pt	0.025	0.026	0.045	0.130
1d -2c/epoch6.pt	0.020	0.018	0.041	0.146
1d -2c/epoch8.pt	0.006	0.011	0.048	0.003
1d -2c/epoch10.pt	0.004	0.008	0.046	-0.022
1d -2c/epoch12.pt	0.014	0.018	0.048	-0.039
1d -2c/epoch14.pt	0.013	0.013	0.044	0.067
1d -2c/epoch16.pt	0.014	0.014	0.054	-0.007
1d -2c/epoch20.pt	0.021	0.023	0.060	-0.014
1d -2c/epoch24.pt	0.027	0.029	0.063	-0.041
1d -2c/epoc

## Decompose Connotation -d +c models 
Similarity should decrease for euphemism.

In [11]:
# Evaluate against frozen pretrained word2vec
# For every (model, epoch) checkpoint, print one tab-separated row with the
# median / mean / std. dev. of (checkpoint_sim - pretrained_sim) over the
# euphemism pairs, plus the Spearman rho between those deltas and the label
# deltas (deno_sim - cono_sim). All four values are formatted as percentages.
base_dir = '../../results/for_real_NS/'
# NOTE(review): '-0.05d 1c' is listed before '-0.01d 1c', so the rows are not
# ordered by weight magnitude.
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        # No `source` argument, so the adversarial-checkpoint loader is used.
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.90%	-5.47%	15.20%	6.02%
-0.005d 1c/epoch4.pt	-3.21%	-8.39%	12.53%	8.49%
-0.005d 1c/epoch6.pt	-7.51%	-9.36%	12.49%	26.07%
-0.005d 1c/epoch8.pt	-6.37%	-11.70%	13.74%	14.84%
-0.005d 1c/epoch10.pt	-8.97%	-7.93%	8.32%	27.21%
-0.005d 1c/epoch12.pt	-9.36%	-10.00%	8.83%	27.00%
-0.005d 1c/epoch14.pt	-9.08%	-9.84%	8.01%	24.87%
-0.005d 1c/epoch16.pt	-9.04%	-10.71%	8.59%	23.56%
-0.005d 1c/epoch20.pt	-10.05%	-11.76%	8.47%	7.68%
-0.005d 1c/epoch24.pt	-9.33%	-12.09%	8.60%	19.42%
-0.005d 1c/epoch28.pt	-10.08%	-12.75%	9.10%	32.94%
-0.005d 1c/epoch30.pt	-9.76%	-11.93%	8.53%	42.65%
				
-0.05d 1c/epoch2.pt	23.64%	26.00%	10.35%	2.83%
-0.05d 1c/epoch4.pt	24.48%	26.87%	10.91%	-2.01%
-0.05d 1c/epoch6.pt	14.90%	17.43%	8.71%	-0.78%
-0.05d 1c/epoch8.pt	13.45%	15.22%	8.49%	-0.97%
-0.05d 1c/epoch10.pt	9.07%	11.53%	8.29%	-0.67%
-0.05d 1c/epoch12.pt	7.96%	9.71%	8.34%	-4.44%
-0.05d 1c/epoch14.pt	7.79%	9.14%	8.28%	-2.22%
-0.05d 1c/epoch16.pt

In [12]:
# Evaluate against 0d 1c ceteris paribus trained models
# Same table as the pretrained comparison, but the reference is the '0d 1c'
# checkpoint of the *same epoch*, evaluated on the euphemism pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
# NOTE(review): '-0.05d 1c' is listed before '-0.01d 1c', so the rows are not
# ordered by weight magnitude.
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference checkpoint depends only on `epoch`, yet
        # it is reloaded from disk once per (model, epoch) combination.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.61%	-2.20%	2.33%	-0.35%
-0.005d 1c/epoch4.pt	-0.98%	-1.35%	3.55%	25.43%
-0.005d 1c/epoch6.pt	-2.65%	-3.17%	4.21%	21.51%
-0.005d 1c/epoch8.pt	-4.45%	-5.31%	4.95%	-2.51%
-0.005d 1c/epoch10.pt	-2.90%	-3.46%	3.35%	32.10%
-0.005d 1c/epoch12.pt	-4.65%	-4.68%	3.15%	11.78%
-0.005d 1c/epoch14.pt	-3.06%	-3.83%	3.85%	6.44%
-0.005d 1c/epoch16.pt	-4.76%	-4.80%	4.49%	21.20%
-0.005d 1c/epoch20.pt	-3.52%	-3.32%	3.81%	15.99%
-0.005d 1c/epoch24.pt	-4.05%	-3.59%	3.71%	14.31%
-0.005d 1c/epoch28.pt	-4.94%	-4.58%	4.69%	2.45%
-0.005d 1c/epoch30.pt	-5.20%	-4.66%	4.86%	10.05%
				
-0.05d 1c/epoch2.pt	24.39%	29.27%	19.40%	-3.41%
-0.05d 1c/epoch4.pt	31.64%	33.90%	19.02%	2.73%
-0.05d 1c/epoch6.pt	21.33%	23.62%	15.42%	-14.28%
-0.05d 1c/epoch8.pt	16.72%	21.61%	15.48%	-9.05%
-0.05d 1c/epoch10.pt	12.57%	16.00%	11.33%	-12.83%
-0.05d 1c/epoch12.pt	14.81%	15.03%	11.25%	-21.13%
-0.05d 1c/epoch14.pt	12.27%	15.14%	10.67%	-17.65%
-0.05d 1c/epoch16.

## Decompose Connotation -d +c models 
similarity should increase for party platform.

In [24]:
# Evaluate against frozen pretrained word2vec
# For every (model, epoch) checkpoint, print one tab-separated row with the
# median / mean / std. dev. of (checkpoint_sim - pretrained_sim) over the
# party-platform pairs, plus the Spearman rho between those deltas and the
# label deltas (deno_sim - cono_sim). All four values are formatted as
# percentages.
base_dir = '../../results/for_real_NS/'
# NOTE(review): '-0.05d 1c' is listed before '-0.01d 1c', so the rows are not
# ordered by weight magnitude.
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        # No `source` argument, so the adversarial-checkpoint loader is used.
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	11.64%	15.08%	17.91%	-13.65%
-0.005d 1c/epoch4.pt	5.24%	7.20%	15.53%	-0.80%
-0.005d 1c/epoch6.pt	4.57%	4.06%	14.10%	-7.87%
-0.005d 1c/epoch8.pt	1.94%	0.76%	12.14%	9.47%
-0.005d 1c/epoch10.pt	2.34%	3.39%	11.96%	-17.45%
-0.005d 1c/epoch12.pt	-0.46%	-0.72%	13.68%	-4.07%
-0.005d 1c/epoch14.pt	1.33%	-0.01%	9.17%	-17.68%
-0.005d 1c/epoch16.pt	-1.05%	-1.29%	10.33%	-13.27%
-0.005d 1c/epoch20.pt	-2.92%	-2.61%	10.97%	-8.02%
-0.005d 1c/epoch24.pt	-2.00%	-1.64%	10.62%	-13.50%
-0.005d 1c/epoch28.pt	-2.52%	-2.63%	10.51%	-18.06%
-0.005d 1c/epoch30.pt	-4.18%	-3.29%	11.58%	-20.19%
				
-0.05d 1c/epoch2.pt	64.66%	55.65%	22.69%	-66.20%
-0.05d 1c/epoch4.pt	69.03%	58.55%	24.35%	-66.20%
-0.05d 1c/epoch6.pt	46.91%	42.12%	18.99%	-64.45%
-0.05d 1c/epoch8.pt	42.24%	38.27%	17.66%	-63.76%
-0.05d 1c/epoch10.pt	34.21%	32.34%	16.92%	-59.12%
-0.05d 1c/epoch12.pt	32.49%	30.39%	16.20%	-56.92%
-0.05d 1c/epoch14.pt	31.16%	29.82%	16.22%	-56.16%
-0.0

In [25]:
# Evaluate against 0d 1c ceteris paribus trained models
# Same table as the pretrained comparison, but the reference is the '0d 1c'
# checkpoint of the *same epoch*, evaluated on the party-platform pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
# NOTE(review): '-0.05d 1c' is listed before '-0.01d 1c', so the rows are not
# ordered by weight magnitude.
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference checkpoint depends only on `epoch`, yet
        # it is reloaded from disk once per (model, epoch) combination.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row that visually separates one model's epochs from the next.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.76%	-1.53%	3.77%	8.63%
-0.005d 1c/epoch4.pt	-1.79%	-1.68%	3.85%	-24.90%
-0.005d 1c/epoch6.pt	-2.06%	-1.88%	4.47%	-4.45%
-0.005d 1c/epoch8.pt	-1.96%	-2.03%	4.50%	-13.19%
-0.005d 1c/epoch10.pt	-1.55%	-2.01%	3.77%	-23.46%
-0.005d 1c/epoch12.pt	-2.62%	-1.54%	3.96%	-10.00%
-0.005d 1c/epoch14.pt	-1.36%	-1.43%	3.53%	-18.97%
-0.005d 1c/epoch16.pt	-2.75%	-2.38%	5.04%	-12.74%
-0.005d 1c/epoch20.pt	-1.31%	-2.28%	5.18%	-19.58%
-0.005d 1c/epoch24.pt	-4.39%	-3.94%	6.63%	-12.36%
-0.005d 1c/epoch28.pt	-3.31%	-4.64%	8.21%	-20.42%
-0.005d 1c/epoch30.pt	-3.57%	-3.46%	7.76%	-25.21%
				
-0.05d 1c/epoch2.pt	35.43%	39.03%	26.03%	-59.73%
-0.05d 1c/epoch4.pt	55.39%	49.67%	27.28%	-64.98%
-0.05d 1c/epoch6.pt	34.38%	36.19%	22.82%	-63.38%
-0.05d 1c/epoch8.pt	38.41%	35.48%	21.15%	-60.95%
-0.05d 1c/epoch10.pt	25.61%	26.94%	18.79%	-49.92%
-0.05d 1c/epoch12.pt	25.15%	29.57%	19.46%	-49.62%
-0.05d 1c/epoch14.pt	28.26%	28.40%	17.58%	-51.52%
-0.