In [1]:
import pickle
import csv
import os
from typing import Set, Tuple, NamedTuple, List, Dict, Counter, Optional

import torch
import numpy as np
from scipy.spatial import distance
from scipy.stats import spearmanr

from decomposer import AdversarialDecomposer, AdversarialConfig

# Seed NumPy's and PyTorch's global random number generators so that code
# drawing from them (e.g. the np.random.randint call in
# Embedding.export_web_projector) is repeatable from a fresh kernel.
np.random.seed(42)
torch.manual_seed(42)

class Embedding():
    """An embedding matrix bundled with its vocabulary lookups.

    Every loader sets three attributes:

    * ``embedding`` -- indexable by word id; ``embedding[i]`` is the vector
      of the word whose id is ``i``.
    * ``word_to_id`` / ``id_to_word`` -- the vocabulary mappings.

    The adversarial loader additionally sets ``Dem_frequency`` and
    ``GOP_frequency``.
    """

    def __init__(self, path: str, source: Optional[str] = None):
        """Load an embedding from ``path`` using the loader named by ``source``.

        Args:
            path: Location of the saved embedding. For
                ``source='skip_gram'`` this must be a
                ``(model_path, vocab_path)`` pair instead of a single string.
            source: ``None`` or ``'adversarial'`` (default loader),
                ``'skip_gram'`` or ``'plain_text'``.

        Raises:
            ValueError: If ``source`` is not one of the names above.
        """
        if source is None or source == 'adversarial':
            self.init_from_adversarial(path)
        elif source == 'skip_gram':
            self.init_from_skip_gram(path)
        elif source == 'plain_text':
            self.init_from_plain_text(path)
        else:
            raise ValueError('Unknown embedding source.')

    def init_from_adversarial(self, path: str, device=torch.device('cpu')):
        """Load a checkpoint whose ``'model'`` entry is a full model object.

        The model must expose ``word_to_id``, ``id_to_word``,
        ``Dem_frequency``, ``GOP_frequency`` and
        ``export_encoded_embedding(device=...)``.

        Args:
            path: Checkpoint file readable by ``torch.load``.
            device: Passed as ``map_location`` when loading and as ``device``
                when exporting the embedding.
        """
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        payload = torch.load(path, map_location=device)
        model = payload['model']
        self.word_to_id = model.word_to_id
        self.id_to_word = model.id_to_word
        self.Dem_frequency: Counter[str] = model.Dem_frequency
        self.GOP_frequency: Counter[str] = model.GOP_frequency

        # encoded layer
        self.embedding = model.export_encoded_embedding(device=device)
#         self.embedding = model.export_decomposed_embedding(device=device)

#         # manually choose which layer to export
#         all_vocab_ids = torch.arange(
#             len(self.word_to_id), dtype=torch.long, device=device)
#         with torch.no_grad():
#             embed = model.embedding(all_vocab_ids)
#             encoded = model.encoder(embed)
#             self.cono_logits = model.cono_decoder(encoded)

#     def init_from_adversarial(self, path: str):
#         config = DenotationEncoderConfig()
#         config.input_dir = '../../data/processed/adversarial/44_Obama_1e-5'
#         data = AdversarialDataset(config)
#         model = DenotationEncoder(config, data)
#         model.load_state_dict(torch.load(path))
#         self.embedding = model.export_decomposed_embedding().to('cpu')
#         self.word_to_id = model.word_to_id
#         self.id_to_word = model.id_to_word

    def init_from_skip_gram(self, paths: Tuple[str, str]) -> None:
        """Directly extract the weights of a single layer.

        Args:
            paths: ``(model_path, vocab_path)``. ``model_path`` is a state
                dict containing ``'center_embedding.weight'``; ``vocab_path``
                is a pickle of a 3-tuple whose first two items are the
                word-to-id and id-to-word mappings.
        """
        model_path, vocab_path = paths
        # NOTE(review): both torch.load and pickle.load execute pickled code;
        # only use files from a trusted source.
        with open(model_path, 'rb') as model_file:
            state_dict = torch.load(model_file, map_location='cpu')
    #     print(state_dict.keys())
        self.embedding = state_dict['center_embedding.weight'].numpy()
        with open(vocab_path, 'rb') as vocab_file:
            self.word_to_id, self.id_to_word, _ = pickle.load(vocab_file)

    def init_from_plain_text(self, path: str) -> None:
        """Load vectors from a whitespace-separated text file.

        The first line holds ``vocab_size num_dimensions``. Every following
        non-blank line holds a word followed by its vector; the last
        ``num_dimensions`` fields of the line are parsed as the vector.

        Args:
            path: Text file to read.
        """
        word_to_id: Dict[str, int] = {}
        embeddings: List[np.ndarray] = []
        # The context manager closes the file even if a line fails to parse.
        with open(path) as embedding_file:
            vocab_size, num_dimensions = map(
                int, embedding_file.readline().split())
            print(f'vocab_size = {vocab_size:,}, num_dimensions = {num_dimensions}')
            print(f'Loading embeddings from {path}', flush=True)
            for line in embedding_file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                # Use the row index as the id so ids stay aligned with the
                # rows of the matrix even when blank lines were skipped.
                word_to_id[fields[0]] = len(embeddings)
                embeddings.append(
                    np.array(fields[-num_dimensions:], dtype=np.float64))
        print('Done')
        self.id_to_word = {val: key for key, val in word_to_id.items()}
        self.word_to_id = word_to_id
        self.embedding = np.array(embeddings)

    def write_to_tensorboard_projector(self, tb_dir: str) -> None:
        """Write the first 9,999 vectors and their words as a TensorBoard embedding.

        Args:
            tb_dir: Log directory handed to ``SummaryWriter``.
        """
        from torch.utils import tensorboard
        tb = tensorboard.SummaryWriter(log_dir=tb_dir)
        try:
            all_vocab_ids = range(len(self.word_to_id))
            embedding_labels = [
                self.id_to_word[word_id]
                for word_id in all_vocab_ids]
            tb.add_embedding(
                self.embedding[:9999],
                embedding_labels[:9999],
                global_step=0)
        finally:
            # Close the writer so pending events are flushed to disk.
            tb.close()

    def export_web_projector(self, out_dir: str) -> None:
        """Write 10,000 sampled vectors and their labels as two TSV files.

        Creates ``tensorboard.tsv`` (one tab-separated vector per line) and
        ``tensorboard_labels.tsv`` (one word per line, same order) inside
        ``out_dir``.

        Args:
            out_dir: Existing directory to write into.
        """
        # NOTE(review): randint draws ids independently, so the sample can
        # contain repeated words.
        random_indices = np.random.randint(len(self.embedding), size=10000)
        subset_embedding = self.embedding[random_indices].tolist()

        vector_path = os.path.join(out_dir, 'tensorboard.tsv')
        with open(vector_path, 'w') as vector_file:
            for vector in subset_embedding:
                vector_file.write('\t'.join(map(str, vector)) + '\n')

        label_path = os.path.join(out_dir, 'tensorboard_labels.tsv')
        with open(label_path, 'w') as label_file:
            for index in random_indices:
                label_file.write(self.id_to_word[index] + '\n')

    def cosine_similarity(self, query1: str, query2: str) -> float:
        """Return the cosine similarity between the vectors of two words.

        Args:
            query1: First word.
            query2: Second word.

        Returns:
            ``1 - scipy.spatial.distance.cosine`` of the two vectors.

        Raises:
            KeyError: If either word is missing from ``word_to_id``; the
                missing word is printed before the error propagates.
        """
        try:
            query1_id = self.word_to_id[query1]
        except KeyError:
            print(f'Out of vocabulary: {query1}')
            raise
        try:
            query2_id = self.word_to_id[query2]
        except KeyError:
            print(f'Out of vocabulary: {query2}')
            raise
        vectors = self.embedding[(query1_id, query2_id), :]
        similarity = 1 - distance.cosine(vectors[0], vectors[1])
        return similarity

    def nearest_neighbor(self, query: str, top_k: int = 10):
        """Print up to ``top_k`` words closest to ``query`` by cosine distance.

        Each printed row is the cosine similarity, a tab, then the word. Fewer
        than ``top_k`` rows are printed when the vocabulary is smaller.

        Args:
            query: Word to look up.
            top_k: Maximum number of neighbours to print.

        Raises:
            KeyError: If ``query`` is missing from ``word_to_id``.
        """
        try:
            query_id = self.word_to_id[query]
        except KeyError:
            raise KeyError(f'{query} is out of vocabulary. Sorry!')
        query_vec = self.embedding[query_id]

        distances = [distance.cosine(query_vec, vec)
                     for vec in self.embedding]
        # Drop the query by id instead of assuming it sorts to rank 0: ties
        # and rounding can put another word first. Slicing also keeps a
        # top_k larger than the vocabulary from indexing out of range.
        neighbors = [word_id for word_id in np.argsort(distances)
                     if word_id != query_id][:max(top_k, 0)]
        print(f"{query}'s neareset neighbors:")
        for word_id in neighbors:
            word = self.id_to_word[word_id]
            cosine_similarity = 1 - distances[word_id]
            print(f'{cosine_similarity:.4f}\t{word}')
        print()
        

class PhrasePair(NamedTuple):
    """One labelled (query, neighbor) phrase pair used for evaluation."""
    # The two phrases; they are looked up in Embedding.word_to_id when scored.
    query: str
    neighbor: str
    # Label read from 'semantic_similarity' (load_cherry) or 'median_deno'
    # (load_MTurk_results).
    deno_sim: float
    # Label read from 'cono_similarity' (load_cherry) or derived from the two
    # median connotation ratings (load_MTurk_results).
    cono_sim: float
        

def load_cherry(path, exclude_hard_examples=True):
    """Load hand-labelled phrase pairs from a CSV/TSV file.

    Rows are kept only when both ``semantic_similarity`` and
    ``cono_similarity`` are non-empty. The number of loaded pairs is printed.

    Args:
        path: File to read. A path ending in ``'tsv'`` is parsed as
            tab-separated, anything else as comma-separated.
        exclude_hard_examples: When true, skip rows whose ``comment`` contains
            ``'hard example'`` (case-insensitive).

    Returns:
        A list of ``PhrasePair`` in file order.
    """
    data = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(path, newline='') as file:
        if path.endswith('tsv'):
            reader = csv.DictReader(file, dialect=csv.excel_tab)
        else:
            reader = csv.DictReader(file)
        for row in reader:
            if not (row['semantic_similarity'] and row['cono_similarity']):
                continue
            # A short row yields None for 'comment' and some files may lack
            # the column entirely; treat both as "no comment".
            comment = row.get('comment') or ''
            if exclude_hard_examples and 'hard example' in comment.lower():
                continue
            data.append(PhrasePair(
                row['query'],
                row['neighbor'],
#                     row['query_words'],
#                     row['neighbor_words'],
                float(row['semantic_similarity']),
                float(row['cono_similarity'])))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def load_MTurk_results(path):
    """Load crowd-sourced phrase-pair ratings from a CSV/TSV file.

    Rows are kept only when ``median_deno``, ``median_query_cono`` and
    ``median_neighbor_cono`` are all non-empty. The number of loaded pairs is
    printed.

    Args:
        path: File to read. A path ending in ``'tsv'`` is parsed as
            tab-separated, anything else as comma-separated.

    Returns:
        A list of ``PhrasePair`` built from ``query_words`` and
        ``neighbor_words``, with ``deno_sim = median_deno`` and
        ``cono_sim = 5 - |median_query_cono - median_neighbor_cono|``.
    """
    data = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(path, newline='') as file:
        if path.endswith('tsv'):
            reader = csv.DictReader(file, dialect=csv.excel_tab)
        else:
            reader = csv.DictReader(file)
        for row in reader:
            # All three ratings are parsed below, so all three must be
            # present; an empty neighbor rating would make float() raise.
            if not (row['median_deno']
                    and row['median_query_cono']
                    and row['median_neighbor_cono']):
                continue

            # Turn the gap between the two connotation ratings into a
            # similarity. NOTE(review): 5 is presumably the top of the rating
            # scale -- confirm against the survey design.
            cono_sim = 5 - abs(
                float(row['median_query_cono']) -
                float(row['median_neighbor_cono']))

            data.append(PhrasePair(
                row['query_words'],
                row['neighbor_words'],
                float(row['median_deno']),
                cono_sim))
    print(f'Loaded {len(data)} labeled entries at {path}')
    return data


def correlate_sim_deltas(model, ref_model, phrase_pairs, verbose=False):
    """Compare how a model shifts pair similarities relative to a reference.

    For every pair that both embeddings can score, the model shift is
    ``model_sim - ref_sim`` and the label gap is ``deno_sim - cono_sim``.
    Pairs for which either lookup raises ``KeyError`` are skipped.

    Args:
        model: Object with ``cosine_similarity(query, neighbor)``.
        ref_model: Reference object with the same method.
        phrase_pairs: Iterable of items with ``query``, ``neighbor``,
            ``deno_sim`` and ``cono_sim`` attributes.
        verbose: When true, print a header and one line per scored pair.

    Returns:
        ``(rho, median, mean, stddev)``: Spearman correlation between model
        shifts and label gaps, then the median, mean and standard deviation
        of the model shifts.
    """
    if verbose:
        print('deno_sim\tcono_sim\tref_sim\tmodel_sim')

    # One (model shift, label gap) tuple per pair that both embeddings cover.
    scored = []
    for pair in phrase_pairs:
        try:
            sim = model.cosine_similarity(pair.query, pair.neighbor)
            ref_sim = ref_model.cosine_similarity(pair.query, pair.neighbor)
        except KeyError:
            continue
        scored.append((sim - ref_sim, pair.deno_sim - pair.cono_sim))

        if verbose:
            print(f'{pair.deno_sim}  {pair.cono_sim}  {ref_sim:.2%}  {sim:.2%}  '
                  f'{pair.query}  {pair.neighbor}')

    model_deltas = [shift for shift, _ in scored]
    label_deltas = [gap for _, gap in scored]

    summary = (
        np.median(model_deltas),
        np.mean(model_deltas),
        np.std(model_deltas),
    )
    rho, _ = spearmanr(model_deltas, label_deltas)
    return (rho,) + summary

## Cherry Data

In [None]:
# Load the hand-labelled Dem and GOP cherry samples (hard examples dropped)
# and pool them into one evaluation set.
# NOTE(review): the Pilot Batch cell assigns the same names (test_data,
# euphemism, party_platform); whichever of the two cells ran last decides what
# the evaluation cells use.
Dem_pairs = load_cherry(    
    '../../data/evaluation/cherries/labeled_Dem_samples.tsv',
    exclude_hard_examples=True)
GOP_pairs = load_cherry(
    '../../data/evaluation/cherries/labeled_GOP_samples.tsv',
    exclude_hard_examples=True)
test_data = Dem_pairs + GOP_pairs

# Same entity denotation, different party connotation.
euphemism = [pair for pair in test_data
             if pair.deno_sim > pair.cono_sim]

# Different entity denotation, same party connotation.
party_platform = [pair for pair in test_data
                  if pair.deno_sim < pair.cono_sim]
# Every labelled pair in remove_deno.tsv is appended as-is: hard examples are
# kept and the deno_sim < cono_sim filter is not applied to them.
party_platform += load_cherry(
    '../../data/evaluation/cherries/remove_deno.tsv',
    exclude_hard_examples=False)

print(f'{len(euphemism)} euphemism (deno_sim > cono_sim)')
print(f'{len(party_platform)} party platform (deno_sim < cono_sim)')

## Pilot Batch

In [2]:
# Load the MTurk ratings and split them by which label is larger. Pairs with
# deno_sim == cono_sim end up in neither subset.
# test_data = load_MTurk_results('../../data/evaluation/qualification_30.csv')
test_data = load_MTurk_results('../../data/evaluation/combined_result.csv')

# Same entity denotation, different party connotation.
euphemism = [pair for pair in test_data
             if pair.deno_sim > pair.cono_sim]

# Different entity denotation, same party connotation.
party_platform = [pair for pair in test_data
                  if pair.deno_sim < pair.cono_sim]

print(f'{len(euphemism)} euphemism (deno_sim > cono_sim)')
print(f'{len(party_platform)} party platform (deno_sim < cono_sim)')

Loaded 370 labeled entries at ../../data/evaluation/combined_result.csv
14 euphemism (deno_sim > cono_sim)
325 party platform (deno_sim < cono_sim)


In [4]:
# Preview the euphemism pairs, one per row:
# deno_sim, cono_sim, query, neighbor (tab-separated).
for query, neighbor, deno_sim, cono_sim in euphemism:
    print(deno_sim, cono_sim, query, neighbor, sep='\t')

2.5	2.0	swapo	sovietbacked
2.5	2.0	hanoi	north_vietnam
4.5	4.0	laidoff_workers	displaced_workers
3.0	2.0	class_sizes	reducing_class_size
5.0	1.0	death_tax	estate_taxes
4.0	3.0	government_option	government_health_care
2.0	1.0	national_energy_tax	carbon_pollution
2.5	2.0	livable_wage	per_child_tax
4.0	3.0	living_wage	minimum_wage
4.0	2.0	occupation_of_haiti	invasion_of_haiti
5.0	1.0	private_accounts	personal_accounts
3.0	2.0	libyan_oil	libyan
2.0	1.0	elitists	far_right
3.0	2.0	genocide_convention	convention


### Load Pretrained Embedding

In [3]:
# Load the pretrained vectors through the plain-text loader (first line of the
# file: vocabulary size and dimensionality; then one word vector per line).
# The "frozen pretrained" evaluation cells use this as their reference.
pretrained = Embedding('../../data/pretrained_word2vec/for_real.txt', 'plain_text')

vocab_size = 111,387, num_dimensions = 300
Loading embeddings from ../../data/pretrained_word2vec/for_real.txt
Done


## Decompose Denotation +d -c models 
Similarity should increase for euphemism.

In [6]:
# Evaluate against frozen pretrained word2vec
# Sweep every (model, epoch) checkpoint under base_dir and score it on the
# euphemism pairs. "Delta" is the checkpoint's cosine similarity minus the
# reference embedding's, per pair (see correlate_sim_deltas).
base_dir = '../../results/for_real_NS/' 
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.016	0.011	0.025	-0.112
1d -1c/epoch4.pt	0.012	0.006	0.027	-0.228
1d -1c/epoch6.pt	0.009	0.002	0.024	0.045
1d -1c/epoch8.pt	0.002	-0.001	0.021	-0.045
1d -1c/epoch10.pt	0.005	0.000	0.025	-0.052
1d -1c/epoch12.pt	0.004	-0.001	0.022	-0.228
1d -1c/epoch14.pt	-0.007	-0.003	0.027	-0.131
1d -1c/epoch16.pt	0.004	0.000	0.030	-0.228
1d -1c/epoch20.pt	0.004	-0.004	0.032	-0.262
1d -1c/epoch24.pt	-0.001	-0.006	0.036	-0.183
1d -1c/epoch28.pt	0.009	-0.001	0.032	-0.281
1d -1c/epoch30.pt	0.011	0.002	0.029	-0.228
				
1d -2c/epoch2.pt	0.020	0.019	0.031	0.150
1d -2c/epoch4.pt	0.017	0.009	0.035	0.060
1d -2c/epoch6.pt	0.007	-0.005	0.036	0.105
1d -2c/epoch8.pt	0.014	-0.007	0.039	0.045
1d -2c/epoch10.pt	0.001	-0.009	0.037	-0.098
1d -2c/epoch12.pt	0.003	-0.008	0.036	-0.052
1d -2c/epoch14.pt	0.004	-0.007	0.037	-0.295
1d -2c/epoch16.pt	-0.009	-0.012	0.030	-0.190
1d -2c/epoch20.pt	-0.005	-0.010	0.033	0.071
1d -2c/epoch24.pt	-0.008	-0.012	0.03

In [7]:
# Evaluate against 1d 0c ceteris paribus trained models
# Same sweep over (model, epoch) checkpoints on the euphemism pairs, but the
# reference is the '1d 0c' checkpoint from the same epoch.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference path depends only on epoch, yet it is
        # reloaded for every model; consider caching if loading is slow.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.005	0.006	0.016	-0.355
1d -1c/epoch4.pt	0.015	0.013	0.015	-0.478
1d -1c/epoch6.pt	0.006	0.010	0.010	-0.359
1d -1c/epoch8.pt	0.010	0.011	0.013	-0.445
1d -1c/epoch10.pt	0.007	0.010	0.020	-0.228
1d -1c/epoch12.pt	0.006	0.008	0.017	-0.478
1d -1c/epoch14.pt	0.007	0.008	0.015	-0.426
1d -1c/epoch16.pt	0.012	0.010	0.021	-0.367
1d -1c/epoch20.pt	0.011	0.009	0.019	-0.433
1d -1c/epoch24.pt	0.009	0.008	0.020	-0.262
1d -1c/epoch28.pt	0.021	0.016	0.023	-0.359
1d -1c/epoch30.pt	0.018	0.019	0.020	-0.516
				
1d -2c/epoch2.pt	0.016	0.014	0.017	-0.060
1d -2c/epoch4.pt	0.017	0.015	0.024	-0.150
1d -2c/epoch6.pt	0.011	0.004	0.024	-0.052
1d -2c/epoch8.pt	0.009	0.004	0.021	-0.262
1d -2c/epoch10.pt	0.001	0.001	0.020	-0.367
1d -2c/epoch12.pt	0.007	0.001	0.029	-0.243
1d -2c/epoch14.pt	0.008	0.004	0.021	-0.452
1d -2c/epoch16.pt	-0.012	-0.003	0.025	-0.219
1d -2c/epoch20.pt	0.001	0.002	0.029	-0.024
1d -2c/epoch24.pt	0.003	0.002	0.029	-0.419
1d

## Decompose Denotation +d -c models 
Similarity should decrease for party platform.

In [8]:
# Evaluate against frozen pretrained word2vec
# Sweep every (model, epoch) checkpoint under base_dir and score it on the
# party_platform pairs. "Delta" is the checkpoint's cosine similarity minus
# the reference embedding's, per pair (see correlate_sim_deltas).
base_dir = '../../results/for_real_NS/'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.017	0.016	0.033	-0.076
1d -1c/epoch4.pt	0.008	0.005	0.035	-0.022
1d -1c/epoch6.pt	-0.002	-0.003	0.037	0.007
1d -1c/epoch8.pt	-0.006	-0.005	0.037	0.010
1d -1c/epoch10.pt	-0.004	-0.006	0.040	0.050
1d -1c/epoch12.pt	-0.007	-0.008	0.037	0.009
1d -1c/epoch14.pt	-0.004	-0.006	0.039	0.045
1d -1c/epoch16.pt	-0.004	-0.008	0.040	0.077
1d -1c/epoch20.pt	-0.009	-0.009	0.041	0.073
1d -1c/epoch24.pt	-0.009	-0.009	0.044	0.088
1d -1c/epoch28.pt	-0.006	-0.010	0.045	0.072
1d -1c/epoch30.pt	-0.008	-0.010	0.044	0.048
				
1d -2c/epoch2.pt	0.027	0.027	0.035	-0.010
1d -2c/epoch4.pt	0.015	0.015	0.035	-0.027
1d -2c/epoch6.pt	0.006	0.005	0.036	-0.027
1d -2c/epoch8.pt	-0.000	-0.002	0.039	-0.001
1d -2c/epoch10.pt	-0.000	-0.006	0.038	-0.048
1d -2c/epoch12.pt	-0.002	-0.007	0.039	0.001
1d -2c/epoch14.pt	-0.007	-0.009	0.040	-0.033
1d -2c/epoch16.pt	-0.004	-0.011	0.039	-0.005
1d -2c/epoch20.pt	-0.010	-0.010	0.040	-0.002
1d -2c/epoch24.pt	-0.007	-

In [9]:
# Evaluate against 1d 0c ceteris paribus trained models
# Same sweep over (model, epoch) checkpoints on the party_platform pairs, but
# the reference is the '1d 0c' checkpoint from the same epoch.
base_dir = '../../results/for_real_NS/'
reference_model = '1d 0c'
models = ['1d -1c', '1d -2c', '1d -4c', '1d -8c', '1d -10c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference path depends only on epoch, yet it is
        # reloaded for every model; consider caching if loading is slow.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.3f}\t{mean:.3f}\t{stddev:.3f}\t{spearman_rho:.3f}')
#         print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
1d -1c/epoch2.pt	0.008	0.010	0.020	-0.017
1d -1c/epoch4.pt	0.009	0.011	0.022	0.001
1d -1c/epoch6.pt	0.009	0.009	0.023	0.027
1d -1c/epoch8.pt	0.008	0.009	0.024	0.031
1d -1c/epoch10.pt	0.008	0.009	0.026	0.047
1d -1c/epoch12.pt	0.008	0.009	0.025	-0.035
1d -1c/epoch14.pt	0.012	0.012	0.029	0.016
1d -1c/epoch16.pt	0.010	0.010	0.028	0.028
1d -1c/epoch20.pt	0.009	0.010	0.031	0.012
1d -1c/epoch24.pt	0.009	0.011	0.035	0.042
1d -1c/epoch28.pt	0.007	0.010	0.033	0.027
1d -1c/epoch30.pt	0.009	0.010	0.034	0.001
				
1d -2c/epoch2.pt	0.018	0.021	0.027	0.035
1d -2c/epoch4.pt	0.019	0.020	0.028	-0.006
1d -2c/epoch6.pt	0.015	0.016	0.029	-0.008
1d -2c/epoch8.pt	0.011	0.012	0.029	-0.002
1d -2c/epoch10.pt	0.009	0.009	0.028	-0.059
1d -2c/epoch12.pt	0.009	0.010	0.032	-0.050
1d -2c/epoch14.pt	0.006	0.009	0.033	-0.049
1d -2c/epoch16.pt	0.006	0.007	0.033	-0.074
1d -2c/epoch20.pt	0.008	0.009	0.036	-0.059
1d -2c/epoch24.pt	0.007	0.008	0.036	-0.094
1d -2c/epoch28.

## Decompose Connotation -d +c models 
Similarity should decrease for euphemism.

In [10]:
# Evaluate against frozen pretrained word2vec
# Sweep every (model, epoch) checkpoint under base_dir and score it on the
# euphemism pairs. "Delta" is the checkpoint's cosine similarity minus the
# reference embedding's, per pair (see correlate_sim_deltas). Values are
# printed as percentages in this cell.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-7.94%	-7.94%	12.94%	-31.18%
-0.005d 1c/epoch4.pt	-4.34%	-7.42%	8.94%	-21.42%
-0.005d 1c/epoch6.pt	-5.97%	-8.99%	9.78%	15.23%
-0.005d 1c/epoch8.pt	-4.40%	-6.75%	9.00%	-24.51%
-0.005d 1c/epoch10.pt	-6.71%	-9.34%	11.85%	-2.38%
-0.005d 1c/epoch12.pt	-7.95%	-11.25%	11.01%	-8.33%
-0.005d 1c/epoch14.pt	-9.84%	-10.04%	9.88%	-8.09%
-0.005d 1c/epoch16.pt	-11.01%	-9.72%	7.20%	-10.23%
-0.005d 1c/epoch20.pt	-10.03%	-10.45%	6.26%	-25.94%
-0.005d 1c/epoch24.pt	-13.80%	-12.54%	8.22%	-7.62%
-0.005d 1c/epoch28.pt	-13.17%	-10.69%	7.87%	-4.76%
-0.005d 1c/epoch30.pt	-12.88%	-13.20%	9.90%	11.42%
				
-0.05d 1c/epoch2.pt	21.40%	25.12%	10.16%	-39.51%
-0.05d 1c/epoch4.pt	21.42%	25.63%	10.47%	-36.18%
-0.05d 1c/epoch6.pt	15.67%	16.75%	8.61%	-39.51%
-0.05d 1c/epoch8.pt	13.77%	14.92%	8.86%	-42.13%
-0.05d 1c/epoch10.pt	11.09%	11.61%	8.85%	-33.80%
-0.05d 1c/epoch12.pt	10.56%	11.13%	9.15%	-29.27%
-0.05d 1c/epoch14.pt	10.82%	10.45%	9.23%	-38.32

In [11]:
# Evaluate against 0d 1c ceteris paribus trained models
# Same sweep over (model, epoch) checkpoints on the euphemism pairs, but the
# reference is the '0d 1c' checkpoint from the same epoch. Values are printed
# as percentages in this cell.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference path depends only on epoch, yet it is
        # reloaded for every model; consider caching if loading is slow.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, euphemism, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.31%	-1.72%	3.72%	-0.24%
-0.005d 1c/epoch4.pt	-3.02%	-2.98%	3.74%	39.03%
-0.005d 1c/epoch6.pt	-3.14%	-2.34%	2.23%	41.17%
-0.005d 1c/epoch8.pt	-3.41%	-4.03%	4.02%	36.65%
-0.005d 1c/epoch10.pt	-2.72%	-3.94%	4.09%	28.08%
-0.005d 1c/epoch12.pt	-4.18%	-3.69%	4.67%	5.47%
-0.005d 1c/epoch14.pt	-1.16%	-2.46%	3.52%	0.95%
-0.005d 1c/epoch16.pt	-2.28%	-3.73%	5.34%	-3.33%
-0.005d 1c/epoch20.pt	-2.46%	-2.66%	3.68%	-8.57%
-0.005d 1c/epoch24.pt	-4.92%	-4.60%	4.29%	-34.51%
-0.005d 1c/epoch28.pt	-4.78%	-5.49%	5.42%	-25.70%
-0.005d 1c/epoch30.pt	-4.24%	-4.82%	7.51%	-4.76%
				
-0.05d 1c/epoch2.pt	24.30%	31.34%	20.04%	10.95%
-0.05d 1c/epoch4.pt	29.49%	30.07%	15.27%	11.66%
-0.05d 1c/epoch6.pt	19.78%	23.40%	13.32%	-10.71%
-0.05d 1c/epoch8.pt	16.70%	17.65%	10.27%	5.47%
-0.05d 1c/epoch10.pt	13.02%	17.02%	13.15%	-20.71%
-0.05d 1c/epoch12.pt	14.81%	18.70%	14.83%	-19.28%
-0.05d 1c/epoch14.pt	14.31%	18.03%	12.83%	-15.47%
-0.05d 1c/epoch1

## Decompose Connotation -d +c models 
Similarity should increase for party platform.

In [12]:
# Evaluate against frozen pretrained word2vec
# Sweep every (model, epoch) checkpoint under base_dir and score it on the
# party_platform pairs. "Delta" is the checkpoint's cosine similarity minus
# the reference embedding's, per pair (see correlate_sim_deltas). Values are
# printed as percentages in this cell.
base_dir = '../../results/for_real_NS/'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]
reference_embed = pretrained

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        embed = Embedding(base_dir + model_path)

        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-1.22%	-4.62%	14.51%	9.38%
-0.005d 1c/epoch4.pt	-3.01%	-5.28%	11.75%	5.70%
-0.005d 1c/epoch6.pt	-4.90%	-7.31%	10.50%	10.38%
-0.005d 1c/epoch8.pt	-6.16%	-7.83%	9.71%	5.10%
-0.005d 1c/epoch10.pt	-6.68%	-7.85%	8.16%	8.27%
-0.005d 1c/epoch12.pt	-7.15%	-9.04%	9.08%	12.39%
-0.005d 1c/epoch14.pt	-8.33%	-9.75%	8.97%	4.32%
-0.005d 1c/epoch16.pt	-9.40%	-10.57%	9.23%	2.73%
-0.005d 1c/epoch20.pt	-10.65%	-11.34%	9.20%	4.75%
-0.005d 1c/epoch24.pt	-11.45%	-11.77%	9.12%	7.39%
-0.005d 1c/epoch28.pt	-10.98%	-12.43%	9.55%	3.08%
-0.005d 1c/epoch30.pt	-10.99%	-12.53%	9.22%	11.09%
				
-0.05d 1c/epoch2.pt	27.94%	29.01%	10.94%	-18.71%
-0.05d 1c/epoch4.pt	28.23%	29.25%	11.49%	-16.78%
-0.05d 1c/epoch6.pt	18.18%	19.83%	9.63%	-15.77%
-0.05d 1c/epoch8.pt	16.18%	17.22%	9.20%	-12.64%
-0.05d 1c/epoch10.pt	12.81%	13.94%	9.23%	-12.63%
-0.05d 1c/epoch12.pt	11.89%	12.78%	9.19%	-12.67%
-0.05d 1c/epoch14.pt	10.89%	11.93%	9.16%	-11.72%
-0.05d 1c/epoc

In [13]:
# Evaluate against 0d 1c ceteris paribus trained models
# Same sweep over (model, epoch) checkpoints on the party_platform pairs, but
# the reference is the '0d 1c' checkpoint from the same epoch. Values are
# printed as percentages in this cell.
base_dir = '../../results/for_real_NS/'
reference_model = '0d 1c'
models = ['-0.005d 1c', '-0.05d 1c', '-0.01d 1c', '-0.1d 1c', '-0.2d 1c', '-0.4d 1c', '-0.8d 1c']
epochs = [2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 30]

print('Model\tMedian Delta\tMean Delta\tStd. Dev.\tSpearman Rho')
for model in models:
    for epoch in epochs:
        model_path = f'{model}/epoch{epoch}.pt'
        reference_model_path = f'{reference_model}/epoch{epoch}.pt'
        
        embed = Embedding(base_dir + model_path)
        # NOTE(review): the reference path depends only on epoch, yet it is
        # reloaded for every model; consider caching if loading is slow.
        reference_embed = Embedding(base_dir + reference_model_path)
        
        # correlate_sim_deltas returns rho first; the row prints it last to
        # match the header.
        spearman_rho, median, mean, stddev = correlate_sim_deltas(
            embed, reference_embed, party_platform, verbose=False)
        
        print(f'{model_path}\t{median:.2%}\t{mean:.2%}\t{stddev:.2%}\t{spearman_rho:.2%}')
    # Tab-only row separates one model's epochs from the next in the table.
    print('\t\t\t\t')

Model	Median Delta	Mean Delta	Std. Dev.	Spearman Rho
-0.005d 1c/epoch2.pt	-2.12%	-2.15%	3.41%	4.94%
-0.005d 1c/epoch4.pt	-1.97%	-2.53%	3.92%	8.06%
-0.005d 1c/epoch6.pt	-3.34%	-3.51%	3.94%	3.09%
-0.005d 1c/epoch8.pt	-3.53%	-4.14%	4.18%	7.01%
-0.005d 1c/epoch10.pt	-4.16%	-4.23%	3.87%	12.60%
-0.005d 1c/epoch12.pt	-4.32%	-4.44%	4.18%	7.39%
-0.005d 1c/epoch14.pt	-4.11%	-4.58%	4.42%	11.72%
-0.005d 1c/epoch16.pt	-4.16%	-4.29%	4.52%	-1.37%
-0.005d 1c/epoch20.pt	-5.08%	-5.03%	5.11%	9.61%
-0.005d 1c/epoch24.pt	-4.43%	-4.88%	5.18%	8.11%
-0.005d 1c/epoch28.pt	-5.47%	-5.76%	5.71%	3.62%
-0.005d 1c/epoch30.pt	-5.02%	-5.33%	6.37%	8.62%
				
-0.05d 1c/epoch2.pt	28.41%	31.48%	19.46%	-17.91%
-0.05d 1c/epoch4.pt	29.19%	32.00%	17.93%	-11.98%
-0.05d 1c/epoch6.pt	21.19%	23.63%	14.54%	-15.03%
-0.05d 1c/epoch8.pt	18.81%	20.90%	13.29%	-8.84%
-0.05d 1c/epoch10.pt	16.56%	17.56%	11.96%	-9.34%
-0.05d 1c/epoch12.pt	15.53%	17.38%	12.79%	-14.15%
-0.05d 1c/epoch14.pt	15.23%	17.10%	12.81%	-6.89%
-0.05d 1c/epoch16.pt	26.