In [53]:
import pickle
import csv
import os
from typing import Set, Tuple, NamedTuple, List, Dict, Counter, Optional

import torch
import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

from decomposer import AdversarialDecomposer, AdversarialConfig

# Fix the global NumPy and PyTorch RNG seeds.  Embedding.export_web_projector
# below draws its sample through np.random, so it is affected by this seed.
np.random.seed(42)
torch.manual_seed(42)

class Embedding():
    """Word-embedding table with similarity queries and projector exports.

    Every loader sets three attributes:

    * ``embedding``  -- the vectors, indexed by word id along the first axis,
    * ``word_to_id`` -- mapping from word to row index,
    * ``id_to_word`` -- the inverse mapping.

    Instances loaded from an adversarial checkpoint additionally carry
    ``Dem_frequency`` and ``GOP_frequency`` copied from the saved model.
    """

    def __init__(self, path: str, source: Optional[str] = None):
        """Load an embedding from ``path``.

        Args:
            path: Location of the saved embedding.  For ``'skip_gram'`` this
                is actually a ``(model_path, vocab_path)`` pair.
            source: ``None`` or ``'adversarial'``, ``'skip_gram'``, or
                ``'plain_text'``; selects the loader.

        Raises:
            ValueError: If ``source`` is not one of the values above.
        """
        if source is None or source == 'adversarial':
            self.init_from_adversarial(path)
        elif source == 'skip_gram':
            self.init_from_skip_gram(path)
        elif source == 'plain_text':
            self.init_from_plain_text(path)
        else:
            raise ValueError('Unknown embedding source.')

    def init_from_adversarial(self, path: str, device=torch.device('cpu')):
        """Load vocabulary and encoded embedding from a ``torch.save`` checkpoint.

        The checkpoint must be a mapping whose ``'model'`` entry exposes
        ``word_to_id``, ``id_to_word``, ``Dem_frequency``, ``GOP_frequency``
        and ``export_encoded_embedding(device=...)``.

        SECURITY: ``torch.load`` unpickles the file, which can execute
        arbitrary code.  Only load checkpoints you produced yourself.
        """
        payload = torch.load(path, map_location=device)
        model = payload['model']
        self.word_to_id = model.word_to_id
        self.id_to_word = model.id_to_word
        self.Dem_frequency: Counter[str] = model.Dem_frequency
        self.GOP_frequency: Counter[str] = model.GOP_frequency

        # Export the encoded layer.  The model also offers
        # export_decomposed_embedding(device=device) as an alternative.
        self.embedding = model.export_encoded_embedding(device=device)

    def init_from_skip_gram(self, paths: Tuple[str, str]) -> None:
        """Directly extract the weights of a single layer.

        Args:
            paths: ``(model_path, vocab_path)``.  The model file holds a state
                dict with a ``'center_embedding.weight'`` entry; the vocab file
                is a pickle of a 3-tuple whose first two items are
                ``word_to_id`` and ``id_to_word``.

        SECURITY: both files are unpickled (``torch.load`` / ``pickle.load``);
        only use trusted files.
        """
        model_path, vocab_path = paths
        with open(model_path, 'rb') as model_file:
            state_dict = torch.load(model_file, map_location='cpu')
        self.embedding = state_dict['center_embedding.weight'].numpy()
        with open(vocab_path, 'rb') as vocab_file:
            self.word_to_id, self.id_to_word, _ = pickle.load(vocab_file)

    def init_from_plain_text(self, path: str) -> None:
        """Load a text embedding file.

        The first line holds ``vocab_size num_dimensions``.  Every following
        non-blank line holds a word followed by its vector components; the
        first whitespace-separated token is used as the word and the last
        ``num_dimensions`` tokens as the vector.  Word ids follow file order.
        """
        word_to_id: Dict[str, int] = {}
        vectors: List[np.ndarray] = []
        # `with` guarantees the handle is released even if a line is malformed.
        with open(path) as embedding_file:
            vocab_size, num_dimensions = map(
                int, embedding_file.readline().split())
            print(f'vocab_size = {vocab_size:,}, num_dimensions = {num_dimensions}')
            print(f'Loading embeddings from {path}', flush=True)
            for line in embedding_file:
                fields = line.split()
                if not fields:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                vector = np.array(fields[-num_dimensions:], dtype=np.float64)
                # The row index of the vector doubles as the word id.
                word_to_id[fields[0]] = len(vectors)
                vectors.append(vector)
        print('Done')
        self.id_to_word = {val: key for key, val in word_to_id.items()}
        self.word_to_id = word_to_id
        self.embedding = np.array(vectors)

    def write_to_tensorboard_projector(self, tb_dir: str) -> None:
        """Write the first 9999 vectors and their words as a TensorBoard embedding."""
        # Imported lazily so the class works without tensorboard installed.
        from torch.utils import tensorboard
        embedding_labels = [
            self.id_to_word[word_id]
            for word_id in range(len(self.word_to_id))]
        tb = tensorboard.SummaryWriter(log_dir=tb_dir)
        try:
            tb.add_embedding(
                self.embedding[:9999],
                embedding_labels[:9999],
                global_step=0)
        finally:
            # Closing flushes pending events to disk and releases the file.
            tb.close()

    def export_web_projector(self, out_dir: str) -> None:
        """Write 10000 randomly drawn vectors and their labels as TSV files.

        Creates ``tensorboard.tsv`` (one tab-separated vector per line) and
        ``tensorboard_labels.tsv`` (one word per line, same order) inside
        ``out_dir``.
        """
        # NOTE(review): randint draws independently, so the same row can be
        # exported more than once.
        random_indices = np.random.randint(len(self.embedding), size=10000)
        subset_embedding = self.embedding[random_indices].tolist()

        vector_path = os.path.join(out_dir, 'tensorboard.tsv')
        with open(vector_path, 'w') as vector_file:
            for vector in subset_embedding:
                vector_file.write('\t'.join(map(str, vector)) + '\n')

        label_path = os.path.join(out_dir, 'tensorboard_labels.tsv')
        with open(label_path, 'w') as label_file:
            for index in random_indices:
                label_file.write(self.id_to_word[index] + '\n')

    def cosine_similarity(self, query1: str, query2: str) -> float:
        """Return the cosine similarity between the vectors of two words.

        Raises:
            KeyError: If either word is out of vocabulary (the offending word
                is printed before the error is re-raised).
        """
        word_ids = []
        for query in (query1, query2):
            try:
                word_ids.append(self.word_to_id[query])
            except KeyError:
                print(f'Out of vocabulary: {query}')
                raise
        first_id, second_id = word_ids
        similarity = 1 - distance.cosine(
            self.embedding[first_id], self.embedding[second_id])
        return similarity

    def nearest_neighbor(self, query: str, top_k: int = 10):
        """Print the ``top_k`` words closest to ``query`` by cosine similarity.

        The query word itself is never listed.  If the vocabulary holds fewer
        than ``top_k`` other words, all of them are printed.

        Raises:
            KeyError: If ``query`` is out of vocabulary.
        """
        try:
            query_id = self.word_to_id[query]
        except KeyError:
            raise KeyError(f'{query} is out of vocabulary. Sorry!')
        query_vec = self.embedding[query_id]

        distances = [distance.cosine(query_vec, vec)
                     for vec in self.embedding]
        ranked = np.argsort(distances)
        # Drop the query by id rather than assuming it sorts first: a word
        # with an identical vector ties at distance 0 and may precede it.
        ranked = ranked[ranked != query_id]
        print(f"{query}'s neareset neighbors:")
        for word_id in ranked[:max(top_k, 0)]:
            word = self.id_to_word[word_id]
            cosine_similarity = 1 - distances[word_id]
            print(f'{cosine_similarity:.4f}\t{word}')
        print()
        

class PhrasePair(NamedTuple):
    """A labeled (query, neighbor) phrase pair used for evaluation.

    ``deno_sim`` and ``cono_sim`` are the denotation and connotation
    similarity labels; the predicates in this file treat a score of 3 or
    more as "same".
    """
    query: str
    neighbor: str
    deno_sim: float
    cono_sim: float


def _read_labeled_rows(path, required_columns):
    """Yield each data row of a delimited label file as a dict.

    Paths ending in ``'tsv'`` are parsed as tab-separated; everything else
    uses the csv module's default (comma) dialect.

    Raises:
        KeyError: If the header lacks any of ``required_columns``.  The
            message names the file and the missing columns.
    """
    dialect = csv.excel_tab if path.endswith('tsv') else csv.excel
    # newline='' is what the csv module requires so that newlines embedded
    # in quoted fields are handled correctly.
    with open(path, newline='') as file:
        reader = csv.DictReader(file, dialect=dialect)
        header = reader.fieldnames
        if header is not None:  # None means the file is empty: yield nothing
            missing = [name for name in required_columns
                       if name not in header]
            if missing:
                raise KeyError(
                    f'{path} is missing required column(s): {missing}')
        yield from reader


def load_cherry(path, exclude_hard_examples=True):
    """Load hand-labeled phrase pairs.

    Reads the ``query``, ``neighbor``, ``semantic_similarity`` and
    ``cono_similarity`` columns.  Rows where either similarity is empty are
    skipped.  When ``exclude_hard_examples`` is true, rows whose ``comment``
    contains "hard example" (case-insensitive) are skipped as well.

    Returns:
        A list of ``PhrasePair``; also prints how many were loaded.

    Raises:
        KeyError: If a needed column is absent from the file header.
    """
    required = ['query', 'neighbor', 'semantic_similarity', 'cono_similarity']
    if exclude_hard_examples:
        required.append('comment')

    data = []
    for row in _read_labeled_rows(path, required):
        if not (row['semantic_similarity'] and row['cono_similarity']):
            continue
        # DictReader fills fields missing from a short row with None.
        if (exclude_hard_examples
                and 'hard example' in (row['comment'] or '').lower()):
            continue
        data.append(PhrasePair(
            row['query'],
            row['neighbor'],
            float(row['semantic_similarity']),
            float(row['cono_similarity'])))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def load_MTurk_results(path):
    """Load phrase pairs from an aggregated MTurk results file.

    Reads ``query_words``, ``neighbor_words``, ``median_deno``,
    ``median_query_cono`` and ``median_neighbor_cono``.  Rows are skipped when
    any of the three medians is empty or when either connotation median is 0
    ("unable to judge").

    The connotation similarity is binarised: 5.0 when both connotation medians
    are above 3, both below 3, or both exactly 3; otherwise 1.0.

    Returns:
        A list of ``PhrasePair``; also prints how many were loaded.

    Raises:
        KeyError: If a needed column is absent from the file header.
    """
    required = ['query_words', 'neighbor_words', 'median_deno',
                'median_query_cono', 'median_neighbor_cono']

    data = []
    for row in _read_labeled_rows(path, required):
        qc_text = row['median_query_cono']
        nc_text = row['median_neighbor_cono']
        deno_text = row['median_deno']
        if not (qc_text and nc_text and deno_text):  # empty or missing
            continue
        qc = float(qc_text)
        nc = float(nc_text)
        if qc == 0 or nc == 0:  # unable to judge
            continue

        same_side = ((qc > 3 and nc > 3)
                     or (qc < 3 and nc < 3)
                     or (qc == 3 and nc == 3))
        # Floats keep the field consistent with PhrasePair's declared type
        # and with load_cherry.
        cono_sim = 5.0 if same_side else 1.0

        data.append(PhrasePair(
            row['query_words'],
            row['neighbor_words'],
            float(deno_text),
            cono_sim))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def correlate_sim_deltas(model, ref_model, phrase_pairs, verbose=False):
    """Compare how a model shifts pair similarities relative to a reference.

    For every pair that both models can score, the model delta is
    ``model similarity - reference similarity`` and the label delta is
    ``pair.deno_sim - pair.cono_sim``.  Pairs for which either model raises
    ``KeyError`` (out of vocabulary) are skipped.

    Args:
        model: Object with ``cosine_similarity(query, neighbor)``.
        ref_model: Reference object with the same method.
        phrase_pairs: Iterable of objects with ``query``, ``neighbor``,
            ``deno_sim`` and ``cono_sim`` attributes.
        verbose: If true, print one line per scored pair.

    Returns:
        ``(rho, median, mean, stddev)``: Spearman correlation between model
        deltas and label deltas, then the median, mean and standard deviation
        of the model deltas.  All four are NaN when no pair could be scored;
        ``rho`` is NaN when fewer than two pairs were scored.
    """
    label_deltas = []
    model_deltas = []
    if verbose:
        print('deno_sim\tcono_sim\tref_sim\tmodel_sim')

    for pair in phrase_pairs:
        try:
            sim = model.cosine_similarity(pair.query, pair.neighbor)
            ref_sim = ref_model.cosine_similarity(pair.query, pair.neighbor)
        except KeyError:
            continue
        model_deltas.append(sim - ref_sim)
        label_deltas.append(pair.deno_sim - pair.cono_sim)

        if verbose:
            print(f'{pair.deno_sim}  {pair.cono_sim}  {ref_sim:.2%}  {sim:.2%}  '
                  f'{pair.query}  {pair.neighbor}')

    nan = float('nan')
    if not model_deltas:
        # Nothing was in vocabulary: report "no data" instead of running the
        # statistics on empty input.
        return nan, nan, nan, nan

    median = np.median(model_deltas)
    mean = np.mean(model_deltas)
    stddev = np.std(model_deltas)
    if len(model_deltas) < 2:
        rho = nan  # rank correlation is undefined for a single observation
    else:
        rho, _ = spearmanr(model_deltas, label_deltas)
    return rho, median, mean, stddev


def preview(things):
    """Print each (query, neighbor, deno_sim, cono_sim) item on one line.

    The columns are tab-separated and ordered deno_sim, cono_sim, query,
    neighbor.
    """
    for query, neighbor, deno_sim, cono_sim in things:
        print(f'{deno_sim}\t{cono_sim}\t{query}\t{neighbor}')


def same_deno(pair):
    """Return whether the pair's denotation similarity is at least 3."""
    deno_score = pair.deno_sim
    return deno_score >= 3


def same_cono(pair):
    """Return whether the pair's connotation similarity is at least 3."""
    cono_score = pair.cono_sim
    return cono_score >= 3
        
        
def is_euphemism(pair) -> bool:
    """Return True for pairs with the same denotation but different connotation."""
    if not same_deno(pair):
        return False
    return not same_cono(pair)


def is_party_platform(pair) -> bool:
    """Return True for pairs with different denotation but the same connotation."""
    if same_deno(pair):
        return False
    return same_cono(pair)

## Cherry Data

In [55]:
# Hand-labeled "cherry" samples, one file per party; hard examples are dropped.
# NOTE(review): `euphemism` and `party_platform` are reassigned by the
# qualification and pilot cells further down, so the evaluation cells see
# whichever of these cells ran last.  The saved execution counts (In [54] for
# the pilot cell, In [55] for this one) suggest the recorded evaluation
# outputs used these cherry pairs -- confirm before a top-to-bottom run.
Dem_pairs = load_cherry(
    '../../data/evaluation/cherries/labeled_Dem_samples.tsv',
    exclude_hard_examples=True)
GOP_pairs = load_cherry(
    '../../data/evaluation/cherries/labeled_GOP_samples.tsv',
    exclude_hard_examples=True)
val_data = [*Dem_pairs, *GOP_pairs]

euphemism = [pair for pair in val_data if is_euphemism(pair)]
party_platform = [pair for pair in val_data if is_party_platform(pair)]
# Extra pairs from remove_deno.tsv are added to the party-platform set
# without filtering out hard examples.
party_platform.extend(load_cherry(
    '../../data/evaluation/cherries/remove_deno.tsv',
    exclude_hard_examples=False))

print(f'{len(euphemism)} euphemism')
preview(euphemism)
print(f'\n{len(party_platform)} party platform')
preview(party_platform)

Loaded 26 labeled entries at ../../data/evaluation/cherries/labeled_Dem_samples.tsv
Loaded 29 labeled entries at ../../data/evaluation/cherries/labeled_GOP_samples.tsv
Loaded 32 labeled entries at ../../data/evaluation/cherries/remove_deno.tsv
39 euphemism
5.0	1.0	tax_breaks	bipartisan_tax_relief
5.0	1.0	star_wars	strategic_defense_initiative
5.0	1.0	star_wars	missile_defense
4.0	1.0	military_spending	federal_spending
4.0	1.0	military_spending	government_spending
4.5	1.0	assault_weapons	firearms
5.0	1.0	assault_weapons	rifles
4.0	2.0	credit_card_companies	creditors
3.0	2.0	trickledown	cut_taxes
4.0	2.0	waterboarding	interrogation
5.0	1.0	antichoice	prolife
4.0	2.0	private_insurance_companies	medicare_advantage_program
5.0	1.0	nuclear_option	constitutional_option
5.0	2.0	corporate_profits	earnings
5.0	1.0	death_tax	estate_tax
5.0	2.0	unborn	fetus
5.0	2.0	partialbirth_abortion	lateterm
5.0	1.0	partialbirth_abortion	dx
5.0	1.0	illegals	undocumented_immigrants
5.0	1.0	governmentrun	public_

## Qualification Batch

In [33]:
# NOTE(review): the saved output of this cell is `KeyError: 'query'`.
# load_cherry reads row['query'], so this CSV presumably has no `query`
# column -- confirm the file's schema and which loader it needs.  Until
# then this cell stops a top-to-bottom run.
test_data = load_cherry(
    '../../data/evaluation/qualification_30.csv',
    exclude_hard_examples=False)

euphemism = [pair for pair in test_data if is_euphemism(pair)]
party_platform = [pair for pair in test_data if is_party_platform(pair)]

print(f'{len(euphemism)} euphemism')
print(f'{len(party_platform)} party platform')

KeyError: 'query'

## Pilot Batch

In [54]:
# Pilot MTurk batch.  Alternative input left disabled by the author:
# test_data = load_MTurk_results('../../data/evaluation/qualification_30.csv')
# NOTE(review): this cell overwrites `euphemism` and `party_platform`, which
# the cherry data cell also assigns; the evaluation cells use whichever ran
# last.
test_data = load_MTurk_results('../../data/evaluation/combined_result.csv')

euphemism = [pair for pair in test_data if is_euphemism(pair)]
party_platform = [pair for pair in test_data if is_party_platform(pair)]

print(f'{len(euphemism)} euphemism')
preview(euphemism)
print(f'\n{len(party_platform)} party platform')
preview(party_platform)

Loaded 318 labeled entries at ../../data/evaluation/combined_result.csv
37 euphemism
3.0	1	obamacare	health_care_law
3.0	1	bilingual_ballots	voting_systems
3.0	1	medical_liability_insurance	medical_liability_crisis
3.0	1	the_distinguished_acting_republican_leader	distinguished_minority_leader
3.0	1	trillion_debt	national_debt
4.0	1	recovery_and_reinvestment	stimulus
4.5	1	laidoff_workers	displaced_workers
3.0	1	trillion_in_debt	trillion
3.0	1	republican_congressman	thencongressman
3.0	1	the_classroom_act	smaller_class_size
3.0	1	increase_supply	drill_our_way
3.0	1	lilly_ledbetter	equal_employment_opportunity_commission
3.0	1	megabanks	banks
3.0	1	constitutional_option	change_the_rules
5.0	1	private_accounts	personal_accounts
3.5	1	drill_our_way	energy_independent
5.0	1	death_tax	estate_taxes
3.0	1	gun_dealers	gun
3.0	1	arctic_wildlife_refuge	north_slope
4.0	1	government_option	government_health_care
3.0	1	tax_penalty	marriage_penalty_relief
3.0	1	the_disclose_act	real_campaign_finance


### Load Pretrained Embedding

In [56]:
# Frozen pretrained word2vec vectors in plain-text format; used below as the
# reference embedding.
pretrained = Embedding(
    '../../data/pretrained_word2vec/for_real.txt',
    'plain_text')

vocab_size = 111,387, num_dimensions = 300
Loading embeddings from ../../data/pretrained_word2vec/for_real.txt
Done


## Decompose Denotation +d -c models 
Similarity should increase for euphemism.

In [57]:
# Evaluate every '+d -c' checkpoint against the frozen pretrained word2vec
# embedding on the euphemism pairs.
# NOTE(review): `euphemism` is whatever the most recently run data cell
# assigned.
base_dir = '../../results/for_real_NS/'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(os.path.join(base_dir, model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.017	0.016	0.029	0.372
1d -1c/epoch4.pt	0.004	-0.001	0.036	0.268
1d -1c/epoch6.pt	-0.002	-0.007	0.037	0.187
1d -1c/epoch8.pt	-0.005	-0.009	0.043	0.252
1d -1c/epoch10.pt	-0.004	-0.010	0.033	0.215
1d -1c/epoch12.pt	-0.010	-0.013	0.038	0.116
1d -1c/epoch14.pt	-0.004	-0.005	0.040	0.191
1d -1c/epoch16.pt	-0.003	-0.009	0.034	0.128
1d -1c/epoch20.pt	-0.014	-0.011	0.036	0.068
1d -1c/epoch24.pt	-0.006	-0.008	0.037	0.072
1d -1c/epoch28.pt	-0.014	-0.017	0.038	0.111
1d -1c/epoch30.pt	-0.013	-0.016	0.040	0.080
				
1d -2c/epoch2.pt	0.029	0.027	0.033	0.226
1d -2c/epoch4.pt	0.005	0.008	0.039	0.284
1d -2c/epoch6.pt	0.008	-0.002	0.047	0.264
1d -2c/epoch8.pt	0.010	-0.002	0.050	0.228
1d -2c/epoch10.pt	0.003	-0.006	0.054	0.216
1d -2c/epoch12.pt	-0.002	-0.010	0.054	0.268
1d -2c/epoch14.pt	0.001	-0.014	0.058	0.206
1d -2c/epoch16.pt	-0.001	-0.012	0.052	0.139
1d -2c/epoch20.pt	-0.004	-0.009	0.055	0.041
1d -2c/epoch24.pt	0.003	-0.011	0.051	

In [58]:
# Evaluate every '+d -c' checkpoint on the euphemism pairs against the
# '1d 0c' model from the same epoch (ceteris paribus reference).
# NOTE(review): the reference checkpoint depends only on `epoch` but is
# reloaded for every model.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'

        embed = Embedding(os.path.join(base_dir, model_path))
        reference_embed = Embedding(
            os.path.join(base_dir, reference_model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.011	0.014	0.019	0.032
1d -1c/epoch4.pt	0.006	0.010	0.020	0.025
1d -1c/epoch6.pt	0.006	0.010	0.018	0.008
1d -1c/epoch8.pt	0.008	0.009	0.023	0.269
1d -1c/epoch10.pt	0.009	0.011	0.024	-0.092
1d -1c/epoch12.pt	0.010	0.011	0.026	-0.107
1d -1c/epoch14.pt	0.019	0.020	0.024	-0.010
1d -1c/epoch16.pt	0.013	0.017	0.028	-0.184
1d -1c/epoch20.pt	0.009	0.016	0.031	-0.191
1d -1c/epoch24.pt	0.012	0.019	0.035	-0.175
1d -1c/epoch28.pt	0.010	0.010	0.032	-0.065
1d -1c/epoch30.pt	0.008	0.010	0.031	-0.084
				
1d -2c/epoch2.pt	0.020	0.025	0.022	-0.061
1d -2c/epoch4.pt	0.019	0.019	0.017	0.085
1d -2c/epoch6.pt	0.014	0.016	0.021	0.170
1d -2c/epoch8.pt	0.012	0.016	0.029	0.198
1d -2c/epoch10.pt	0.014	0.014	0.031	0.145
1d -2c/epoch12.pt	0.012	0.015	0.032	0.236
1d -2c/epoch14.pt	0.010	0.012	0.038	0.196
1d -2c/epoch16.pt	0.011	0.014	0.030	0.004
1d -2c/epoch20.pt	0.014	0.018	0.038	-0.167
1d -2c/epoch24.pt	0.014	0.016	0.036	-0.019
1d -2c/epoch28.

## Decompose Denotation +d -c models 
Similarity should decrease for party platform.

In [59]:
# Evaluate every '+d -c' checkpoint against the frozen pretrained word2vec
# embedding on the party-platform pairs.
# NOTE(review): `party_platform` is whatever the most recently run data cell
# assigned.
base_dir = '../../results/for_real_NS/'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(os.path.join(base_dir, model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.001	-0.009	0.050	0.177
1d -1c/epoch4.pt	0.008	-0.006	0.048	0.025
1d -1c/epoch6.pt	-0.005	-0.013	0.043	0.167
1d -1c/epoch8.pt	-0.005	-0.011	0.040	0.136
1d -1c/epoch10.pt	0.003	-0.012	0.044	0.106
1d -1c/epoch12.pt	0.002	-0.007	0.042	0.177
1d -1c/epoch14.pt	0.016	0.011	0.045	-0.015
1d -1c/epoch16.pt	-0.009	-0.004	0.045	0.126
1d -1c/epoch20.pt	0.002	-0.008	0.046	0.157
1d -1c/epoch24.pt	0.003	-0.003	0.049	0.157
1d -1c/epoch28.pt	-0.003	-0.003	0.052	0.157
1d -1c/epoch30.pt	0.001	-0.003	0.054	0.187
				
1d -2c/epoch2.pt	-0.002	0.005	0.048	0.258
1d -2c/epoch4.pt	0.006	0.002	0.047	0.227
1d -2c/epoch6.pt	0.010	-0.005	0.047	0.086
1d -2c/epoch8.pt	0.000	-0.010	0.055	0.167
1d -2c/epoch10.pt	-0.005	-0.011	0.050	0.136
1d -2c/epoch12.pt	0.012	-0.002	0.051	0.076
1d -2c/epoch14.pt	0.005	-0.012	0.051	0.187
1d -2c/epoch16.pt	-0.002	-0.007	0.052	0.187
1d -2c/epoch20.pt	0.008	-0.001	0.054	0.136
1d -2c/epoch24.pt	0.008	0.001	0.058	0.126


In [60]:
# Evaluate every '+d -c' checkpoint on the party-platform pairs against the
# '1d 0c' model from the same epoch (ceteris paribus reference).
# NOTE(review): the reference checkpoint depends only on `epoch` but is
# reloaded for every model.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'

        embed = Embedding(os.path.join(base_dir, model_path))
        reference_embed = Embedding(
            os.path.join(base_dir, reference_model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.013	0.014	0.027	-0.136
1d -1c/epoch4.pt	0.016	0.016	0.031	-0.066
1d -1c/epoch6.pt	0.009	0.007	0.025	-0.147
1d -1c/epoch8.pt	0.009	0.010	0.032	-0.136
1d -1c/epoch10.pt	0.002	0.005	0.040	-0.136
1d -1c/epoch12.pt	0.015	0.013	0.039	-0.147
1d -1c/epoch14.pt	0.036	0.034	0.048	-0.248
1d -1c/epoch16.pt	0.023	0.017	0.046	-0.197
1d -1c/epoch20.pt	0.015	0.015	0.046	-0.106
1d -1c/epoch24.pt	0.017	0.023	0.048	-0.126
1d -1c/epoch28.pt	0.015	0.024	0.044	-0.187
1d -1c/epoch30.pt	0.027	0.021	0.045	-0.197
				
1d -2c/epoch2.pt	0.015	0.027	0.038	0.035
1d -2c/epoch4.pt	0.019	0.023	0.048	0.025
1d -2c/epoch6.pt	0.013	0.014	0.043	-0.045
1d -2c/epoch8.pt	0.007	0.010	0.052	-0.005
1d -2c/epoch10.pt	0.005	0.007	0.049	-0.086
1d -2c/epoch12.pt	0.014	0.018	0.052	-0.086
1d -2c/epoch14.pt	0.011	0.012	0.047	-0.045
1d -2c/epoch16.pt	0.013	0.014	0.058	-0.025
1d -2c/epoch20.pt	0.020	0.022	0.065	-0.056
1d -2c/epoch24.pt	0.026	0.028	0.067	-0.106
1d -2c

## Decompose Connotation -d +c models
Similarity should decrease for euphemism.

In [61]:
# Evaluate every '-d +c' checkpoint against the frozen pretrained word2vec
# embedding on the euphemism pairs; values are printed as percentages.
# NOTE(review): `euphemism` is whatever the most recently run data cell
# assigned.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(os.path.join(base_dir, model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.90%	-6.09%	15.90%	13.02%
-0.005d 1c/epoch4.pt	-3.21%	-9.09%	13.04%	15.72%
-0.005d 1c/epoch6.pt	-5.77%	-9.27%	13.25%	30.27%
-0.005d 1c/epoch8.pt	-5.62%	-11.04%	14.08%	7.94%
-0.005d 1c/epoch10.pt	-5.01%	-7.10%	8.57%	12.57%
-0.005d 1c/epoch12.pt	-9.19%	-9.56%	9.25%	22.59%
-0.005d 1c/epoch14.pt	-8.22%	-9.24%	8.08%	19.04%
-0.005d 1c/epoch16.pt	-7.38%	-10.45%	9.09%	17.19%
-0.005d 1c/epoch20.pt	-10.05%	-11.58%	8.84%	1.30%
-0.005d 1c/epoch24.pt	-9.06%	-12.02%	8.90%	18.89%
-0.005d 1c/epoch28.pt	-9.85%	-12.58%	9.64%	25.11%
-0.005d 1c/epoch30.pt	-9.17%	-11.36%	8.71%	34.97%
				
-0.05d 1c/epoch2.pt	23.64%	26.40%	10.85%	0.28%
-0.05d 1c/epoch4.pt	24.48%	27.31%	11.41%	-6.15%
-0.05d 1c/epoch6.pt	16.31%	17.88%	9.01%	-9.14%
-0.05d 1c/epoch8.pt	13.93%	15.64%	8.74%	-10.14%
-0.05d 1c/epoch10.pt	9.89%	11.98%	8.49%	-13.85%
-0.05d 1c/epoch12.pt	8.06%	9.97%	8.60%	-13.91%
-0.05d 1c/epoch14.pt	8.45%	9.55%	8.43%	-12.55%
-0.05d 1c/epoch16

In [62]:
# Evaluate every '-d +c' checkpoint on the euphemism pairs against the
# '0d 1c' model from the same epoch (ceteris paribus reference).
# NOTE(review): the reference checkpoint depends only on `epoch` but is
# reloaded for every model.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'

        embed = Embedding(os.path.join(base_dir, model_path))
        reference_embed = Embedding(
            os.path.join(base_dir, reference_model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.99%	-2.26%	2.23%	8.96%
-0.005d 1c/epoch4.pt	-0.86%	-1.34%	3.71%	26.24%
-0.005d 1c/epoch6.pt	-2.65%	-3.21%	4.17%	29.86%
-0.005d 1c/epoch8.pt	-4.20%	-5.29%	5.16%	-8.08%
-0.005d 1c/epoch10.pt	-2.51%	-3.03%	3.11%	24.46%
-0.005d 1c/epoch12.pt	-4.44%	-4.50%	3.07%	4.93%
-0.005d 1c/epoch14.pt	-2.71%	-3.49%	3.60%	-8.37%
-0.005d 1c/epoch16.pt	-4.68%	-4.48%	4.64%	8.79%
-0.005d 1c/epoch20.pt	-3.43%	-3.09%	3.97%	10.40%
-0.005d 1c/epoch24.pt	-3.76%	-3.44%	3.48%	10.67%
-0.005d 1c/epoch28.pt	-4.01%	-4.35%	4.63%	-3.72%
-0.005d 1c/epoch30.pt	-5.20%	-4.35%	4.80%	3.21%
				
-0.05d 1c/epoch2.pt	24.39%	30.23%	20.44%	-9.45%
-0.05d 1c/epoch4.pt	31.64%	35.06%	19.84%	-3.55%
-0.05d 1c/epoch6.pt	20.39%	23.94%	16.07%	-17.81%
-0.05d 1c/epoch8.pt	15.53%	21.39%	15.82%	-8.02%
-0.05d 1c/epoch10.pt	12.28%	16.05%	12.04%	-8.99%
-0.05d 1c/epoch12.pt	15.78%	15.03%	11.78%	-25.07%
-0.05d 1c/epoch14.pt	12.27%	15.30%	11.07%	-25.57%
-0.05d 1c/epoch16.pt

## Decompose Connotation -d +c models 
Similarity should increase for party platform.

In [63]:
# Evaluate every '-d +c' checkpoint against the frozen pretrained word2vec
# embedding on the party-platform pairs; values are printed as percentages.
# NOTE(review): `party_platform` is whatever the most recently run data cell
# assigned.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(os.path.join(base_dir, model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	11.76%	15.89%	19.39%	-15.66%
-0.005d 1c/epoch4.pt	6.12%	7.55%	16.83%	1.52%
-0.005d 1c/epoch6.pt	5.72%	4.41%	15.28%	1.52%
-0.005d 1c/epoch8.pt	1.86%	0.62%	13.03%	13.64%
-0.005d 1c/epoch10.pt	3.18%	4.49%	12.66%	1.52%
-0.005d 1c/epoch12.pt	-0.13%	-0.38%	14.91%	2.53%
-0.005d 1c/epoch14.pt	1.69%	0.51%	9.40%	-13.64%
-0.005d 1c/epoch16.pt	-1.31%	-0.83%	11.08%	-9.60%
-0.005d 1c/epoch20.pt	-2.92%	-2.13%	11.43%	0.51%
-0.005d 1c/epoch24.pt	-1.99%	-1.05%	10.99%	-8.59%
-0.005d 1c/epoch28.pt	-1.52%	-1.67%	10.75%	-3.54%
-0.005d 1c/epoch30.pt	-3.87%	-2.36%	12.01%	-7.58%
				
-0.05d 1c/epoch2.pt	69.25%	61.04%	20.77%	-45.98%
-0.05d 1c/epoch4.pt	71.93%	64.30%	22.37%	-45.98%
-0.05d 1c/epoch6.pt	49.16%	46.24%	17.84%	-45.98%
-0.05d 1c/epoch8.pt	43.90%	41.94%	16.74%	-45.98%
-0.05d 1c/epoch10.pt	40.99%	35.43%	16.46%	-44.97%
-0.05d 1c/epoch12.pt	34.39%	33.31%	15.79%	-43.96%
-0.05d 1c/epoch14.pt	37.08%	32.72%	15.80%	-42.95%
-0.05d 1c/epoc

In [64]:
# Evaluate every '-d +c' checkpoint on the party-platform pairs against the
# '0d 1c' model from the same epoch (ceteris paribus reference).
# NOTE(review): the reference checkpoint depends only on `epoch` but is
# reloaded for every model.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'

        embed = Embedding(os.path.join(base_dir, model_path))
        reference_embed = Embedding(
            os.path.join(base_dir, reference_model_path))

        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)

        # One tab-separated row per checkpoint.
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # A row of bare tabs separates consecutive models in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.42%	-1.58%	4.00%	17.69%
-0.005d 1c/epoch4.pt	-1.22%	-1.32%	3.94%	-8.59%
-0.005d 1c/epoch6.pt	-1.84%	-1.68%	4.79%	5.56%
-0.005d 1c/epoch8.pt	-1.57%	-1.86%	4.82%	2.53%
-0.005d 1c/epoch10.pt	-1.24%	-1.51%	3.83%	4.55%
-0.005d 1c/epoch12.pt	-2.03%	-1.29%	4.22%	-3.54%
-0.005d 1c/epoch14.pt	-0.97%	-1.13%	3.74%	4.55%
-0.005d 1c/epoch16.pt	-2.69%	-2.14%	5.21%	0.51%
-0.005d 1c/epoch20.pt	-1.00%	-1.94%	5.45%	-3.54%
-0.005d 1c/epoch24.pt	-3.32%	-3.63%	7.06%	2.53%
-0.005d 1c/epoch28.pt	-2.09%	-3.87%	8.44%	2.53%
-0.005d 1c/epoch30.pt	-2.91%	-2.33%	7.49%	5.56%
				
-0.05d 1c/epoch2.pt	41.48%	43.58%	26.00%	-46.99%
-0.05d 1c/epoch4.pt	60.99%	55.44%	26.01%	-46.99%
-0.05d 1c/epoch6.pt	38.12%	40.15%	22.63%	-46.99%
-0.05d 1c/epoch8.pt	41.65%	39.46%	20.57%	-46.99%
-0.05d 1c/epoch10.pt	29.07%	29.43%	19.31%	-45.98%
-0.05d 1c/epoch12.pt	31.45%	32.40%	19.71%	-45.98%
-0.05d 1c/epoch14.pt	30.58%	31.08%	17.44%	-43.96%
-0.05d 1c/epoch16.pt