In [1]:
import pickle
import csv
import os
from typing import Set, Tuple, NamedTuple, List, Dict, Counter, Optional

import torch
import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

from decomposer import AdversarialDecomposer, AdversarialConfig

np.random.seed(42)
torch.manual_seed(42)

class Embedding():
    """Embedding table with vocabulary lookups and cosine-similarity queries.

    After construction an instance exposes:

    * ``embedding``: the vectors, indexable by word id (a NumPy array for the
      ``skip_gram`` and ``plain_text`` sources; whatever
      ``model.export_encoded_embedding`` returns for ``adversarial``).
    * ``word_to_id`` / ``id_to_word``: the vocabulary mappings.
    * ``Dem_frequency`` / ``GOP_frequency``: only for the ``adversarial``
      source, copied from the loaded model.
    """

    def __init__(self, path: str, source: Optional[str] = None):
        """Load an embedding with the loader selected by ``source``.

        Args:
            path: Location of the embedding. For ``'skip_gram'`` this must be
                a ``(model_path, vocab_path)`` pair instead of a single string.
            source: ``'adversarial'`` (also the default when ``None``),
                ``'skip_gram'`` or ``'plain_text'``.

        Raises:
            ValueError: If ``source`` is not one of the known loaders.
        """
        if source is None or source == 'adversarial':
            self.init_from_adversarial(path)
        elif source == 'skip_gram':
            self.init_from_skip_gram(path)
        elif source == 'plain_text':
            self.init_from_plain_text(path)
        else:
            raise ValueError('Unknown embedding source.')

    def init_from_adversarial(self, path: str, device=torch.device('cpu')):
        """Load vocabulary, frequencies and vectors from a saved model payload.

        The checkpoint is expected to be a dict whose ``'model'`` entry is a
        model object providing ``word_to_id``, ``id_to_word``,
        ``Dem_frequency``, ``GOP_frequency`` and ``export_encoded_embedding``.

        NOTE: ``torch.load`` unpickles arbitrary objects; only load
        checkpoints from a trusted location.
        """
        payload = torch.load(path, map_location=device)
        model = payload['model']
        self.word_to_id = model.word_to_id
        self.id_to_word = model.id_to_word
        self.Dem_frequency: Counter[str] = model.Dem_frequency
        self.GOP_frequency: Counter[str] = model.GOP_frequency

        # Export the encoded layer. To evaluate the other exported layer
        # instead, swap in model.export_decomposed_embedding(device=device).
        self.embedding = model.export_encoded_embedding(device=device)

    def init_from_skip_gram(self, paths: Tuple[str, str]) -> None:
        """Directly extract the weights of a single layer.

        Args:
            paths: ``(model_path, vocab_path)``. The model file is a state
                dict containing ``'center_embedding.weight'``; the vocab file
                is a pickle of a 3-tuple whose first two items are
                ``word_to_id`` and ``id_to_word``.

        NOTE: both ``torch.load`` and ``pickle.load`` can execute arbitrary
        code from the file; only load trusted files.
        """
        model_path, vocab_path = paths
        with open(model_path, 'rb') as model_file:
            state_dict = torch.load(model_file, map_location='cpu')
        self.embedding = state_dict['center_embedding.weight'].numpy()
        with open(vocab_path, 'rb') as vocab_file:
            self.word_to_id, self.id_to_word, _ = pickle.load(vocab_file)

    def init_from_plain_text(self, path: str) -> None:
        """Load vectors from a word2vec-style text file.

        The first line holds ``vocab_size num_dimensions``. Every following
        non-blank line holds a word followed by its vector; the last
        ``num_dimensions`` fields of the line are taken as the vector and the
        first field as the word. Ids are assigned in file order.
        """
        word_to_id: Dict[str, int] = {}
        vectors: List[np.ndarray] = []
        # `with` guarantees the handle is closed even if a line fails to parse.
        with open(path) as embedding_file:
            vocab_size, num_dimensions = map(
                int, embedding_file.readline().split())
            print(f'vocab_size = {vocab_size:,}, num_dimensions = {num_dimensions}')
            print(f'Loading embeddings from {path}', flush=True)
            for line in embedding_file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # The id is the row index of the vector about to be appended,
                # so ids and rows stay aligned.
                word_to_id[fields[0]] = len(vectors)
                vectors.append(
                    np.array(fields[-num_dimensions:], dtype=np.float64))
        print('Done')
        self.id_to_word = {val: key for key, val in word_to_id.items()}
        self.word_to_id = word_to_id
        self.embedding = np.array(vectors)

    def write_to_tensorboard_projector(self, tb_dir: str) -> None:
        """Write the first 9999 vectors and their labels as a TensorBoard embedding."""
        from torch.utils import tensorboard
        embedding_labels = [
            self.id_to_word[word_id]
            for word_id in range(len(self.word_to_id))]
        tb = tensorboard.SummaryWriter(log_dir=tb_dir)
        try:
            tb.add_embedding(
                self.embedding[:9999],
                embedding_labels[:9999],
                global_step=0)
        finally:
            # Close the writer so pending events are flushed to disk.
            tb.close()

    def export_web_projector(self, out_dir: str) -> None:
        """Write 10000 randomly drawn vectors and their labels as TSV files.

        Produces ``tensorboard.tsv`` (one tab-separated vector per line) and
        ``tensorboard_labels.tsv`` (one word per line, same order) in
        ``out_dir``.

        NOTE(review): indices are drawn with ``np.random.randint``, i.e. with
        replacement, so the sample may contain duplicate rows.
        """
        random_indices = np.random.randint(len(self.embedding), size=10000)
        subset_embedding = self.embedding[random_indices].tolist()

        vector_path = os.path.join(out_dir, 'tensorboard.tsv')
        with open(vector_path, 'w') as vector_file:
            for vector in subset_embedding:
                vector_file.write('\t'.join(map(str, vector)) + '\n')

        label_path = os.path.join(out_dir, 'tensorboard_labels.tsv')
        with open(label_path, 'w') as label_file:
            for index in random_indices:
                label_file.write(self.id_to_word[index] + '\n')

    def cosine_similarity(self, query1: str, query2: str) -> float:
        """Return the cosine similarity between the vectors of two words.

        Raises:
            KeyError: If either word is not in the vocabulary. The offending
                word is printed before the error is re-raised.
        """
        query_ids = []
        for query in (query1, query2):
            try:
                query_ids.append(self.word_to_id[query])
            except KeyError:
                print(f'Out of vocabulary: {query}')
                raise
        first_id, second_id = query_ids
        similarity = 1 - distance.cosine(
            self.embedding[first_id], self.embedding[second_id])
        return similarity

    def nearest_neighbor(self, query: str, top_k: int = 10):
        """Print the ``top_k`` words closest to ``query`` by cosine similarity.

        The query itself is excluded by id rather than by assuming it sorts
        first, and fewer than ``top_k`` rows are printed when the vocabulary
        is too small.

        Raises:
            KeyError: If ``query`` is not in the vocabulary.
        """
        try:
            query_id = self.word_to_id[query]
        except KeyError:
            raise KeyError(f'{query} is out of vocabulary. Sorry!')
        query_vec = self.embedding[query_id]

        distances = [distance.cosine(query_vec, vec)
                     for vec in self.embedding]
        # A stable sort keeps ties in id order, so the ranking is deterministic.
        ranked_ids = [word_id
                      for word_id in np.argsort(distances, kind='stable')
                      if word_id != query_id]
        print(f"{query}'s neareset neighbors:")
        for word_id in ranked_ids[:top_k]:
            word = self.id_to_word[word_id]
            cosine_similarity = 1 - distances[word_id]
            print(f'{cosine_similarity:.4f}\t{word}')
        print()
        

class PhrasePair(NamedTuple):
    """A labeled (query, neighbor) phrase pair used for evaluation."""
    query: str
    neighbor: str
    # Denotation similarity label (the 'semantic_similarity' or 'median_deno'
    # column, depending on the loader).
    deno_sim: float
    # Connotation similarity label (the 'cono_similarity' column, or 5 / 1
    # derived from the two median connotation ratings).
    cono_sim: float


def _open_dict_reader(file, path):
    """Return a DictReader over `file`; tab-delimited when `path` ends with 'tsv'."""
    if path.endswith('tsv'):
        return csv.DictReader(file, dialect=csv.excel_tab)
    return csv.DictReader(file)


def load_cherry(path, exclude_hard_examples=True):
    """Load hand-labeled phrase pairs from a CSV/TSV file.

    Rows with an empty 'semantic_similarity' or 'cono_similarity' are skipped.

    Args:
        path: File to read; parsed as tab-delimited if it ends with 'tsv'.
        exclude_hard_examples: When true, skip rows whose 'comment' contains
            'hard example' (case-insensitive). A missing or empty comment
            counts as "not a hard example".

    Returns:
        A list of PhrasePair in file order.
    """
    data = []
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(path, newline='') as file:
        for row in _open_dict_reader(file, path):
            if not (row['semantic_similarity'] and row['cono_similarity']):
                continue
            # The column may be absent, or None for short rows.
            comment = row.get('comment') or ''
            if exclude_hard_examples and 'hard example' in comment.lower():
                continue
            data.append(PhrasePair(
                row['query'],
                row['neighbor'],
                float(row['semantic_similarity']),
                float(row['cono_similarity'])))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def load_MTurk_results(path):
    """Load aggregated MTurk ratings as phrase pairs.

    A row is skipped when either median connotation rating is empty or 0
    ("unable to judge"), or when 'median_deno' is empty. The connotation
    similarity is binarized: 5 when both ratings fall on the same side of the
    midpoint 3 (or both equal 3), otherwise 1.

    Args:
        path: File to read; parsed as tab-delimited if it ends with 'tsv'.

    Returns:
        A list of PhrasePair in file order.
    """
    data = []
    with open(path, newline='') as file:
        for row in _open_dict_reader(file, path):
            qc = row['median_query_cono']
            nc = row['median_neighbor_cono']
            if not (qc and nc):  # empty string or missing field
                continue
            qc = float(qc)
            nc = float(nc)
            if qc == 0 or nc == 0:  # unable to judge
                continue
            deno = row['median_deno']
            if not deno:  # no denotation label to pair with
                continue

            # Alternative graded label: cono_sim = 5 - abs(qc - nc)
            same_side = ((qc > 3 and nc > 3)
                         or (qc < 3 and nc < 3)
                         or (qc == 3 and nc == 3))
            cono_sim = 5 if same_side else 1

            data.append(PhrasePair(
                row['query_words'],
                row['neighbor_words'],
                float(deno),
                cono_sim))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def correlate_sim_deltas(model, ref_model, phrase_pairs, verbose=False):
    """Compare how a model shifts pair similarities relative to a reference.

    For every pair that both embeddings can score, the model delta is
    ``model similarity - reference similarity`` and the label delta is
    ``pair.deno_sim - pair.cono_sim``. Pairs for which either embedding raises
    KeyError (out of vocabulary) are skipped.

    Args:
        model: Object with ``cosine_similarity(query, neighbor)``.
        ref_model: Reference object with the same method.
        phrase_pairs: Iterable of items with ``query``, ``neighbor``,
            ``deno_sim`` and ``cono_sim`` attributes.
        verbose: When true, print one line per scored pair.

    Returns:
        ``(rho, median, mean, stddev)``: Spearman correlation between model
        deltas and label deltas, followed by the median, mean and standard
        deviation of the model deltas. All four are NaN when no pair could be
        scored.
    """
    label_deltas = []
    model_deltas = []
    if verbose:
        print('deno_sim\tcono_sim\tref_sim\tmodel_sim')

    for pair in phrase_pairs:
        try:
            sim = model.cosine_similarity(pair.query, pair.neighbor)
            ref_sim = ref_model.cosine_similarity(pair.query, pair.neighbor)
        except KeyError:
            continue
        model_deltas.append(sim - ref_sim)
        label_deltas.append(pair.deno_sim - pair.cono_sim)

        if verbose:
            print(f'{pair.deno_sim}  {pair.cono_sim}  {ref_sim:.2%}  {sim:.2%}  '
                  f'{pair.query}  {pair.neighbor}')

    if not model_deltas:
        # Nothing was scored: report NaNs explicitly instead of running the
        # statistics on empty lists.
        nan = float('nan')
        return nan, nan, nan, nan

    median = np.median(model_deltas)
    mean = np.mean(model_deltas)
    stddev = np.std(model_deltas)
    rho, _ = spearmanr(model_deltas, label_deltas)
    return rho, median, mean, stddev


def preview(things):
    """Print each 4-item entry as a tab-separated line.

    Every entry unpacks as (query, neighbor, deno_sim, cono_sim); the line is
    written in the order deno_sim, cono_sim, query, neighbor.
    """
    for query, neighbor, deno_sim, cono_sim in things:
        print(deno_sim, cono_sim, query, neighbor, sep='\t')


def same_deno(pair):
    """True when the pair's denotation similarity is at least 3."""
    return 3 <= pair.deno_sim


def same_cono(pair):
    """True when the pair's connotation similarity is at least 3."""
    return 3 <= pair.cono_sim


def is_euphemism(pair) -> bool:
    """Same denotation, different connotation."""
    if not same_deno(pair):
        return False
    return not same_cono(pair)


def is_party_platform(pair) -> bool:
    """Different denotation, same connotation."""
    if same_deno(pair):
        return False
    return same_cono(pair)

## Cherry Data

In [None]:
# Load the hand-labeled Dem and GOP sample files, dropping rows whose comment
# marks them as a hard example, and pool them into one validation set.
Dem_pairs = load_cherry(    
    '../../data/evaluation/cherries/labeled_Dem_samples.tsv',
    exclude_hard_examples=True)
GOP_pairs = load_cherry(
    '../../data/evaluation/cherries/labeled_GOP_samples.tsv',
    exclude_hard_examples=True)
val_data = Dem_pairs + GOP_pairs

# NOTE: `euphemism` and `party_platform` are also assigned by the other
# data-loading cells; the evaluation cells use whichever assignment ran last.
euphemism = list(filter(is_euphemism, val_data))
party_platform = list(filter(is_party_platform, val_data))
# Rows from remove_deno.tsv are appended as-is: they do not pass through the
# is_party_platform filter and hard examples are kept.
party_platform += load_cherry(
    '../../data/evaluation/cherries/remove_deno.tsv',
    exclude_hard_examples=False)

print(f'{len(euphemism)} euphemism')
preview(euphemism)
print(f'\n{len(party_platform)} party platform')
preview(party_platform)

## Qualification Batch

In [None]:
# Load the qualification batch with the cherry loader, keeping hard examples.
# NOTE: this reassigns `test_data`, `euphemism` and `party_platform`, which
# other data-loading cells also assign; the last cell run wins.
test_data = load_cherry(
    '../../data/evaluation/qualification_30.csv', 
    exclude_hard_examples=False)

euphemism = list(filter(is_euphemism, test_data))
party_platform = list(filter(is_party_platform, test_data))

print(f'{len(euphemism)} euphemism')
print(f'{len(party_platform)} party platform')

## Pilot Batch

In [2]:
# Load the combined MTurk results and split them into the two evaluation sets.
# Alternative input (qualification batch in MTurk format):
# test_data = load_MTurk_results('../../data/evaluation/qualification_30.csv')
test_data = load_MTurk_results('../../data/evaluation/combined_result.csv')

# These two lists are what the evaluation cells below read as `euphemism` and
# `party_platform`.
euphemism = list(filter(is_euphemism, test_data))
party_platform = list(filter(is_party_platform, test_data))

print(f'{len(euphemism)} euphemism')
preview(euphemism)
print(f'\n{len(party_platform)} party platform')
preview(party_platform)

Loaded 318 labeled entries at ../../data/evaluation/combined_result.csv
37 euphemism
3.0	1	obamacare	health_care_law
3.0	1	bilingual_ballots	voting_systems
3.0	1	medical_liability_insurance	medical_liability_crisis
3.0	1	the_distinguished_acting_republican_leader	distinguished_minority_leader
3.0	1	trillion_debt	national_debt
4.0	1	recovery_and_reinvestment	stimulus
4.5	1	laidoff_workers	displaced_workers
3.0	1	trillion_in_debt	trillion
3.0	1	republican_congressman	thencongressman
3.0	1	the_classroom_act	smaller_class_size
3.0	1	increase_supply	drill_our_way
3.0	1	lilly_ledbetter	equal_employment_opportunity_commission
3.0	1	megabanks	banks
3.0	1	constitutional_option	change_the_rules
5.0	1	private_accounts	personal_accounts
3.5	1	drill_our_way	energy_independent
5.0	1	death_tax	estate_taxes
3.0	1	gun_dealers	gun
3.0	1	arctic_wildlife_refuge	north_slope
4.0	1	government_option	government_health_care
3.0	1	tax_penalty	marriage_penalty_relief
3.0	1	the_disclose_act	real_campaign_finance


### Load Pretrained Embedding

In [3]:
# Pretrained vectors in plain-text format; later evaluation cells use this
# object as the frozen reference embedding (`reference_embed = pretrained`).
pretrained = Embedding('../../data/pretrained_word2vec/for_real.txt', 'plain_text')

vocab_size = 111,387, num_dimensions = 300
Loading embeddings from ../../data/pretrained_word2vec/for_real.txt
Done


## Decompose Denotation +d -c models 
Similarity should increase for euphemism.

In [4]:
# Evaluate against frozen pretrained word2vec
# For every checkpoint (run directory x epoch), score the euphemism pairs and
# report statistics of (checkpoint similarity - reference similarity).
base_dir = '../../results/for_real_NS/' 
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        # Alternative formatting of the same row as percentages:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.020	0.018	0.035	0.039
1d -1c/epoch4.pt	0.015	0.008	0.039	0.160
1d -1c/epoch6.pt	0.002	-0.001	0.039	0.218
1d -1c/epoch8.pt	0.006	0.001	0.037	0.113
1d -1c/epoch10.pt	0.003	-0.002	0.043	0.146
1d -1c/epoch12.pt	0.004	-0.005	0.041	0.166
1d -1c/epoch14.pt	-0.000	-0.005	0.040	0.257
1d -1c/epoch16.pt	-0.001	-0.007	0.041	0.230
1d -1c/epoch20.pt	0.004	-0.007	0.045	0.219
1d -1c/epoch24.pt	-0.000	-0.009	0.049	0.198
1d -1c/epoch28.pt	-0.007	-0.011	0.048	0.297
1d -1c/epoch30.pt	-0.007	-0.016	0.048	0.365
				
1d -2c/epoch2.pt	0.039	0.029	0.038	0.218
1d -2c/epoch4.pt	0.022	0.016	0.037	0.231
1d -2c/epoch6.pt	0.006	0.006	0.036	0.361
1d -2c/epoch8.pt	0.009	0.001	0.037	0.360
1d -2c/epoch10.pt	-0.002	-0.008	0.037	0.343
1d -2c/epoch12.pt	0.001	-0.005	0.033	0.336
1d -2c/epoch14.pt	-0.004	-0.011	0.040	0.393
1d -2c/epoch16.pt	-0.005	-0.011	0.038	0.268
1d -2c/epoch20.pt	-0.007	-0.010	0.043	0.342
1d -2c/epoch24.pt	-0.009	-0.015	0.041	0.375
1

In [5]:
# Evaluate against 1d 0c ceteris paribus trained models
# Each checkpoint is compared with the `1d 0c` checkpoint of the same epoch,
# on the euphemism pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE: the reference checkpoint is reloaded for every run directory.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
        # Alternative formatting of the same row as percentages:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.014	0.009	0.020	-0.300
1d -1c/epoch4.pt	0.012	0.012	0.020	-0.108
1d -1c/epoch6.pt	0.006	0.009	0.021	-0.084
1d -1c/epoch8.pt	0.008	0.012	0.023	-0.208
1d -1c/epoch10.pt	0.009	0.008	0.025	-0.155
1d -1c/epoch12.pt	0.007	0.006	0.025	-0.200
1d -1c/epoch14.pt	0.007	0.006	0.024	0.018
1d -1c/epoch16.pt	0.005	0.003	0.026	-0.062
1d -1c/epoch20.pt	0.005	0.006	0.030	-0.068
1d -1c/epoch24.pt	0.008	0.006	0.033	0.006
1d -1c/epoch28.pt	0.012	0.006	0.033	0.111
1d -1c/epoch30.pt	0.008	0.000	0.034	0.131
				
1d -2c/epoch2.pt	0.017	0.021	0.025	-0.065
1d -2c/epoch4.pt	0.023	0.020	0.025	0.056
1d -2c/epoch6.pt	0.018	0.016	0.031	-0.053
1d -2c/epoch8.pt	0.012	0.011	0.029	-0.027
1d -2c/epoch10.pt	0.001	0.002	0.027	0.124
1d -2c/epoch12.pt	0.007	0.006	0.024	-0.048
1d -2c/epoch14.pt	-0.000	-0.000	0.030	0.140
1d -2c/epoch16.pt	-0.001	-0.001	0.035	-0.012
1d -2c/epoch20.pt	0.001	0.003	0.038	0.023
1d -2c/epoch24.pt	-0.001	-0.000	0.037	0.102
1d -2c/

## Decompose Denotation +d -c models 
Similarity should decrease for party platform.

In [6]:
# Evaluate against frozen pretrained word2vec
# Same sweep as the euphemism evaluation, but scored on the party platform
# pairs.
base_dir = '../../results/for_real_NS/'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        # Alternative formatting of the same row as percentages:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.018	0.016	0.032	0.044
1d -1c/epoch4.pt	0.007	0.004	0.035	0.137
1d -1c/epoch6.pt	0.000	-0.005	0.039	0.087
1d -1c/epoch8.pt	-0.005	-0.009	0.040	0.126
1d -1c/epoch10.pt	-0.005	-0.010	0.042	0.036
1d -1c/epoch12.pt	-0.004	-0.010	0.041	0.048
1d -1c/epoch14.pt	-0.003	-0.009	0.043	-0.000
1d -1c/epoch16.pt	-0.009	-0.012	0.043	-0.007
1d -1c/epoch20.pt	-0.011	-0.012	0.042	-0.022
1d -1c/epoch24.pt	-0.013	-0.014	0.048	0.006
1d -1c/epoch28.pt	-0.015	-0.016	0.050	-0.064
1d -1c/epoch30.pt	-0.009	-0.014	0.048	-0.040
				
1d -2c/epoch2.pt	0.033	0.028	0.035	-0.052
1d -2c/epoch4.pt	0.017	0.015	0.037	-0.002
1d -2c/epoch6.pt	0.007	0.003	0.038	0.010
1d -2c/epoch8.pt	0.000	-0.005	0.041	0.080
1d -2c/epoch10.pt	0.006	-0.005	0.041	0.081
1d -2c/epoch12.pt	0.002	-0.008	0.043	0.026
1d -2c/epoch14.pt	-0.003	-0.010	0.043	0.090
1d -2c/epoch16.pt	0.000	-0.011	0.041	0.160
1d -2c/epoch20.pt	-0.009	-0.011	0.041	0.027
1d -2c/epoch24.pt	-0.004	-0.013	0.

In [7]:
# Evaluate against 1d 0c ceteris paribus trained models
# Each checkpoint is compared with the `1d 0c` checkpoint of the same epoch,
# on the party platform pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE: the reference checkpoint is reloaded for every run directory.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
        # Alternative formatting of the same row as percentages:
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.008	0.011	0.020	-0.062
1d -1c/epoch4.pt	0.011	0.012	0.024	-0.060
1d -1c/epoch6.pt	0.009	0.010	0.025	-0.050
1d -1c/epoch8.pt	0.010	0.009	0.025	0.006
1d -1c/epoch10.pt	0.004	0.009	0.025	-0.141
1d -1c/epoch12.pt	0.009	0.011	0.026	-0.108
1d -1c/epoch14.pt	0.014	0.014	0.031	-0.143
1d -1c/epoch16.pt	0.010	0.010	0.027	-0.159
1d -1c/epoch20.pt	0.011	0.012	0.032	-0.176
1d -1c/epoch24.pt	0.010	0.012	0.034	-0.147
1d -1c/epoch28.pt	0.010	0.010	0.034	-0.221
1d -1c/epoch30.pt	0.010	0.012	0.034	-0.152
				
1d -2c/epoch2.pt	0.022	0.023	0.028	-0.155
1d -2c/epoch4.pt	0.022	0.023	0.032	-0.147
1d -2c/epoch6.pt	0.016	0.019	0.030	-0.142
1d -2c/epoch8.pt	0.012	0.014	0.028	-0.011
1d -2c/epoch10.pt	0.015	0.014	0.026	-0.018
1d -2c/epoch12.pt	0.012	0.014	0.031	-0.101
1d -2c/epoch14.pt	0.013	0.012	0.031	-0.004
1d -2c/epoch16.pt	0.014	0.011	0.032	0.040
1d -2c/epoch20.pt	0.014	0.013	0.036	-0.040
1d -2c/epoch24.pt	0.015	0.013	0.035	-0.055
1d -2c

## Decompose Connotation -d +c models 
Similarity should decrease for euphemism.

In [8]:
# Evaluate against frozen pretrained word2vec
# For every -d +c checkpoint (run directory x epoch), score the euphemism
# pairs and report statistics of (checkpoint similarity - reference
# similarity), formatted as percentages.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-3.23%	-3.30%	12.66%	-7.91%
-0.005d 1c/epoch4.pt	-3.15%	-4.38%	9.17%	-31.55%
-0.005d 1c/epoch6.pt	-6.44%	-7.02%	7.88%	-15.84%
-0.005d 1c/epoch8.pt	-7.49%	-8.33%	8.87%	-15.81%
-0.005d 1c/epoch10.pt	-6.13%	-7.68%	6.86%	-3.85%
-0.005d 1c/epoch12.pt	-8.69%	-8.85%	6.99%	1.37%
-0.005d 1c/epoch14.pt	-9.09%	-10.10%	8.35%	-3.49%
-0.005d 1c/epoch16.pt	-10.92%	-11.49%	7.98%	-1.49%
-0.005d 1c/epoch20.pt	-11.19%	-13.32%	9.18%	-3.75%
-0.005d 1c/epoch24.pt	-13.06%	-13.10%	8.58%	-7.34%
-0.005d 1c/epoch28.pt	-13.65%	-14.44%	9.16%	11.99%
-0.005d 1c/epoch30.pt	-12.41%	-13.15%	8.81%	-0.61%
				
-0.05d 1c/epoch2.pt	25.88%	26.57%	8.70%	-49.03%
-0.05d 1c/epoch4.pt	26.74%	26.79%	8.82%	-48.35%
-0.05d 1c/epoch6.pt	18.42%	18.40%	8.49%	-41.98%
-0.05d 1c/epoch8.pt	16.91%	16.30%	8.43%	-39.53%
-0.05d 1c/epoch10.pt	13.68%	13.10%	8.65%	-38.30%
-0.05d 1c/epoch12.pt	11.85%	11.77%	8.62%	-33.96%
-0.05d 1c/epoch14.pt	11.34%	10.86%	8.50%	-34.10%
-0.05

In [9]:
# Evaluate against 0d 1c ceteris paribus trained models
# Each checkpoint is compared with the `0d 1c` checkpoint of the same epoch,
# on the euphemism pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE: the reference checkpoint is reloaded for every run directory.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.49%	-2.08%	3.65%	16.30%
-0.005d 1c/epoch4.pt	-1.78%	-1.98%	2.96%	1.82%
-0.005d 1c/epoch6.pt	-3.78%	-3.44%	3.42%	6.87%
-0.005d 1c/epoch8.pt	-5.01%	-4.59%	3.79%	4.96%
-0.005d 1c/epoch10.pt	-4.12%	-4.20%	3.36%	-4.46%
-0.005d 1c/epoch12.pt	-4.12%	-4.50%	3.19%	9.18%
-0.005d 1c/epoch14.pt	-4.22%	-4.23%	4.21%	9.39%
-0.005d 1c/epoch16.pt	-5.62%	-5.11%	4.61%	13.94%
-0.005d 1c/epoch20.pt	-5.32%	-5.04%	5.82%	10.41%
-0.005d 1c/epoch24.pt	-4.97%	-4.92%	5.15%	6.87%
-0.005d 1c/epoch28.pt	-5.66%	-6.42%	4.66%	9.46%
-0.005d 1c/epoch30.pt	-5.67%	-5.59%	6.16%	3.87%
				
-0.05d 1c/epoch2.pt	23.12%	27.79%	15.47%	-14.54%
-0.05d 1c/epoch4.pt	28.99%	29.18%	12.14%	-11.80%
-0.05d 1c/epoch6.pt	20.02%	21.98%	11.22%	-17.84%
-0.05d 1c/epoch8.pt	18.12%	20.05%	10.46%	-24.71%
-0.05d 1c/epoch10.pt	18.20%	16.58%	9.83%	-31.97%
-0.05d 1c/epoch12.pt	14.17%	16.12%	11.46%	-24.71%
-0.05d 1c/epoch14.pt	14.36%	16.73%	12.81%	-19.32%
-0.05d 1c/epoch16.pt	

## Decompose Connotation -d +c models 
Similarity should increase for party platform.

In [10]:
# Evaluate against frozen pretrained word2vec
# Same sweep over the -d +c checkpoints, but scored on the party platform
# pairs.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-2.35%	-5.07%	14.75%	13.70%
-0.005d 1c/epoch4.pt	-3.15%	-5.32%	11.80%	17.44%
-0.005d 1c/epoch6.pt	-5.01%	-7.53%	10.33%	25.93%
-0.005d 1c/epoch8.pt	-6.67%	-7.60%	9.32%	14.64%
-0.005d 1c/epoch10.pt	-6.75%	-7.70%	8.23%	31.50%
-0.005d 1c/epoch12.pt	-8.18%	-9.25%	8.89%	25.94%
-0.005d 1c/epoch14.pt	-6.97%	-9.77%	9.27%	16.92%
-0.005d 1c/epoch16.pt	-9.10%	-10.17%	9.41%	18.06%
-0.005d 1c/epoch20.pt	-11.00%	-11.48%	9.15%	15.94%
-0.005d 1c/epoch24.pt	-12.41%	-12.32%	9.43%	29.29%
-0.005d 1c/epoch28.pt	-11.38%	-12.72%	9.75%	19.38%
-0.005d 1c/epoch30.pt	-12.02%	-13.19%	9.71%	16.11%
				
-0.05d 1c/epoch2.pt	30.32%	30.67%	11.33%	0.95%
-0.05d 1c/epoch4.pt	30.13%	30.80%	12.26%	2.09%
-0.05d 1c/epoch6.pt	19.64%	21.48%	10.39%	6.45%
-0.05d 1c/epoch8.pt	17.01%	18.77%	9.93%	8.22%
-0.05d 1c/epoch10.pt	13.35%	15.61%	9.89%	9.61%
-0.05d 1c/epoch12.pt	12.61%	14.53%	9.86%	10.75%
-0.05d 1c/epoch14.pt	11.62%	13.64%	9.90%	10.93%
-0.05d 1c/epoch1

In [11]:
# Evaluate against 0d 1c ceteris paribus trained models
# Each checkpoint is compared with the `0d 1c` checkpoint of the same epoch,
# on the party platform pairs.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE: the reference checkpoint is reloaded for every run directory.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Row of empty tab-separated cells to visually separate runs.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-2.07%	-2.43%	3.26%	-13.37%
-0.005d 1c/epoch4.pt	-2.13%	-2.70%	4.23%	15.58%
-0.005d 1c/epoch6.pt	-3.39%	-3.53%	4.55%	5.30%
-0.005d 1c/epoch8.pt	-3.67%	-4.26%	3.91%	3.78%
-0.005d 1c/epoch10.pt	-4.11%	-4.57%	4.02%	11.90%
-0.005d 1c/epoch12.pt	-4.55%	-4.57%	4.48%	13.97%
-0.005d 1c/epoch14.pt	-4.21%	-4.94%	4.61%	8.99%
-0.005d 1c/epoch16.pt	-4.09%	-3.81%	4.63%	5.23%
-0.005d 1c/epoch20.pt	-5.24%	-5.40%	4.94%	3.78%
-0.005d 1c/epoch24.pt	-4.80%	-5.37%	4.55%	18.40%
-0.005d 1c/epoch28.pt	-5.03%	-6.06%	6.32%	6.46%
-0.005d 1c/epoch30.pt	-5.45%	-6.10%	6.43%	3.00%
				
-0.05d 1c/epoch2.pt	29.32%	33.31%	19.95%	-8.81%
-0.05d 1c/epoch4.pt	29.19%	33.42%	19.18%	-2.10%
-0.05d 1c/epoch6.pt	23.02%	25.48%	15.04%	-9.55%
-0.05d 1c/epoch8.pt	19.16%	22.11%	13.73%	-2.79%
-0.05d 1c/epoch10.pt	17.03%	18.75%	13.38%	-6.44%
-0.05d 1c/epoch12.pt	17.29%	19.21%	13.97%	-2.64%
-0.05d 1c/epoch14.pt	15.09%	18.47%	13.71%	-2.18%
-0.05d 1c/epoch16.pt	27.7