In [1]:
import pickle
import csv
import os
from typing import Set, Tuple, List, Dict, Counter

import torch
import numpy as np
import scipy
from sklearn.metrics import pairwise
from tqdm import tqdm

from adversarial import AdversarialDecomposer, AdversarialConfig
# from preprocessing.S4_export_training_corpus import Document

np.random.seed(42)
torch.manual_seed(42)

class Embedding():
    """A word-embedding matrix together with its vocabulary lookups.

    After construction an instance exposes ``embedding`` (one row per word
    id), ``word_to_id`` and ``id_to_word``. Instances loaded from an
    adversarial checkpoint additionally carry ``Dem_frequency`` and
    ``GOP_frequency``.
    """

    def __init__(self, source: str, path: str):
        """Load an embedding from ``path`` with the loader named by ``source``.

        Args:
            source: One of ``'adversarial'``, ``'skip_gram'`` or
                ``'plain_text'``.
            path: Location handed to the loader. For ``'skip_gram'`` it is
                unpacked as a ``(model_path, vocab_path)`` pair instead of
                being used as a single string.

        Raises:
            ValueError: If ``source`` is not one of the names above.
        """
        if source == 'adversarial':
            self.init_from_adversarial(path)
        elif source == 'skip_gram':
            self.init_from_skip_gram(path)
        elif source == 'plain_text':
            self.init_from_plain_text(path)
        else:
            raise ValueError('Unknown embedding source.')

    def init_from_adversarial(self, path: str, device=torch.device('cpu')):
        """Load vocabulary, party frequencies and the encoded embedding.

        The checkpoint at ``path`` must be a mapping whose ``'model'`` entry
        is the model object itself; its vocabulary, frequency counters and
        ``export_encoded_embedding`` output are copied onto this instance.
        """
        # NOTE: torch.load unpickles arbitrary objects, so only open
        # checkpoints that come from a trusted source.
        payload = torch.load(path, map_location=device)
        model = payload['model']
        self.word_to_id = model.word_to_id
        self.id_to_word = model.id_to_word
        self.Dem_frequency: Counter[str] = model.Dem_frequency
        self.GOP_frequency: Counter[str] = model.GOP_frequency

        # encoded layer
        self.embedding = model.export_encoded_embedding(device=device)
#         self.embedding = model.export_decomposed_embedding(device=device)

#         # manually choose which layer to export
#         all_vocab_ids = torch.arange(
#             len(self.word_to_id), dtype=torch.long, device=device)
#         with torch.no_grad():
#             embed = model.embedding(all_vocab_ids)
#             encoded = model.encoder(embed)
#             self.cono_logits = model.cono_decoder(encoded)

#     def init_from_adversarial(self, path: str):
#         config = DenotationEncoderConfig()
#         config.input_dir = '../../data/processed/adversarial/44_Obama_1e-5'
#         data = AdversarialDataset(config)
#         model = DenotationEncoder(config, data)
#         model.load_state_dict(torch.load(path))
#         self.embedding = model.export_decomposed_embedding().to('cpu')
#         self.word_to_id = model.word_to_id
#         self.id_to_word = model.id_to_word

    def init_from_skip_gram(self, paths: Tuple[str, str]) -> None:
        """Directly extract the weights of a single layer.

        Args:
            paths: ``(model_path, vocab_path)``. The first file is a saved
                state dict containing ``'center_embedding.weight'``; the
                second is a pickle of a 3-tuple whose first two items are
                ``word_to_id`` and ``id_to_word``.
        """
        model_path, vocab_path = paths
        with open(model_path, 'rb') as model_file:
            state_dict = torch.load(model_file, map_location='cpu')
        self.embedding = state_dict['center_embedding.weight'].numpy()
        # NOTE: pickle.load executes code embedded in the file; only open
        # vocabulary files that come from a trusted source.
        with open(vocab_path, 'rb') as vocab_file:
            self.word_to_id, self.id_to_word, _ = pickle.load(vocab_file)

    def init_from_plain_text(self, path: str) -> None:
        """Load embeddings from a text file with a ``vocab_size dims`` header.

        Every following non-empty line is ``word v1 ... vN``. The last
        ``num_dimensions`` fields are taken as the vector and the first field
        as the word; words receive consecutive ids in file order. Blank lines
        (for example a trailing one) are skipped.
        """
        word_to_id: Dict[str, int] = {}
        embeddings: List[np.ndarray] = []
        # The context manager closes the file even if a line fails to parse.
        with open(path) as embedding_file:
            vocab_size, num_dimensions = map(
                int, embedding_file.readline().split())
            print(f'vocab_size = {vocab_size:,}, num_dimensions = {num_dimensions}')
            print(f'Loading embeddings from {path}', flush=True)
            for line in embedding_file:
                fields = line.split()
                if not fields:
                    continue
                word_to_id[fields[0]] = len(embeddings)
                embeddings.append(
                    np.array(fields[-num_dimensions:], dtype=np.float64))
        print('Done')
        self.id_to_word = {val: key for key, val in word_to_id.items()}
        self.word_to_id = word_to_id
        self.embedding = np.array(embeddings)

    def write_to_tensorboard_projector(self, tb_dir: str) -> None:
        """Log the first 9999 vectors and their words as a TensorBoard embedding."""
        from torch.utils import tensorboard
        max_points = 9999
        num_points = min(max_points, len(self.word_to_id))
        embedding_labels = [
            self.id_to_word[word_id] for word_id in range(num_points)]
        tb = tensorboard.SummaryWriter(log_dir=tb_dir)
        try:
            tb.add_embedding(
                self.embedding[:max_points],
                embedding_labels,
                global_step=0)
        finally:
            # Closing flushes pending events and releases the log file.
            tb.close()

    def export_web_projector(self, out_dir: str) -> None:
        """Write a random sample of vectors and labels as two TSV files.

        At most 10000 distinct words are sampled. ``tensorboard.tsv`` gets
        one tab-separated vector per line and ``tensorboard_labels.tsv`` the
        matching words in the same order. ``out_dir`` is created if missing.
        """
        vocab_size = len(self.embedding)
        # Sample without replacement so that no word is exported twice.
        random_indices = np.random.choice(
            vocab_size, size=min(10000, vocab_size), replace=False)
        subset_embedding = self.embedding[random_indices].tolist()

        os.makedirs(out_dir, exist_ok=True)
        vector_path = os.path.join(out_dir, 'tensorboard.tsv')
        with open(vector_path, 'w') as vector_file:
            for vector in subset_embedding:
                vector_file.write('\t'.join(map(str, vector)) + '\n')

        label_path = os.path.join(out_dir, 'tensorboard_labels.tsv')
        with open(label_path, 'w') as label_file:
            for index in random_indices:
                label_file.write(self.id_to_word[int(index)] + '\n')

    def _word_id(self, word: str) -> int:
        """Return the id of ``word``; report and re-raise if it is unknown."""
        try:
            return self.word_to_id[word]
        except KeyError:
            print(f'Out of vocabulary: {word}')
            raise

    def cosine_similarity(self, query1: str, query2: str) -> float:
        """Return the cosine similarity between the vectors of two words.

        Raises:
            KeyError: If either word is out of vocabulary (a notice naming
                the word is printed first).
        """
        # Import the submodule explicitly: a bare ``import scipy`` is not
        # guaranteed to expose ``scipy.spatial`` on every SciPy version.
        from scipy.spatial import distance
        query1_id = self._word_id(query1)
        query2_id = self._word_id(query2)
        similarity = 1 - distance.cosine(
            self.embedding[query1_id], self.embedding[query2_id])
        return similarity

    def nearest_neighbor(self, query: str, top_k: int = 10):
        """Print the ``top_k`` words closest to ``query`` by cosine similarity.

        The query word itself is never listed. When the vocabulary holds
        fewer than ``top_k`` other words, all of them are printed.

        Raises:
            KeyError: If ``query`` is out of vocabulary.
        """
        from scipy.spatial import distance
        try:
            query_id = self.word_to_id[query]
        except KeyError:
            raise KeyError(f'{query} is out of vocabulary. Sorry!')

        matrix = np.asarray(self.embedding)
        # One vectorised call instead of one scipy call per vocabulary row.
        distances = distance.cdist(
            matrix[query_id][None, :], matrix, metric='cosine')[0]
        ranked = np.argsort(distances)
        # Drop the query by id rather than assuming it sorts to rank 0, which
        # does not hold when duplicate or degenerate vectors are present.
        neighbors = ranked[ranked != query_id][:max(top_k, 0)]
        print(f"{query}'s neareset neighbors:")
        for word_id in neighbors:
            word = self.id_to_word[int(word_id)]
            cosine_similarity = 1 - distances[word_id]
            print(f'{cosine_similarity:.4f}\t{word}')
        print()
        

In [None]:
# models['cono'].write_to_tensorboard_projector(
#     '../../results/adversarial/Obama/p8_.55to.75/d0_c1/embedding_projector')
# models['cono'].export_web_projector('../../results/adversarial/Obama/p8_.55to.75/d0_c1/web_projector')

## Nearest Neighbors

In [None]:
# Print the nearest neighbours of a few hand-picked terms under the
# '-10c' model.
model = models['-10c']
cherries = [
    'estate_tax',
    'death_tax',
    'undocumented',
    'illegal_aliens',
    'music',
    'language',
]
for term in cherries:
    model.nearest_neighbor(term)

## Cherry-Picking

In [None]:
# Read every row of the tab-separated sample into a list of dicts keyed by
# the header row. newline='' lets the csv module do its own newline
# handling, as its documentation requires for files passed to a reader.
with open('GOP_sample.tsv', newline='') as file:
    data = list(csv.DictReader(file, dialect=csv.excel_tab))
print(len(data))

In [None]:
# Keep only the rows where at least one of the two annotation columns is
# non-empty.
labeled_data = list(filter(
    lambda obs: obs['same_denotation'] or obs['same_connotation'],
    data))
print(len(labeled_data))

In [11]:
def cherry_pick(model1, model2):
    """Print hand-picked pair similarities under two models, side by side.

    For three groups of word pairs, each row shows the cosine similarity
    under ``model1``, under ``model2``, and the change (model2 - model1).
    Pairs for which either model raises ``KeyError`` are skipped. Each group
    ends with the median change over its reported pairs.
    """
    def compare_similarities(pairs):
        changes = []
        for left, right in pairs:
            try:
                before = model1.cosine_similarity(left, right)
                after = model2.cosine_similarity(left, right)
            except KeyError:
                # At least one word is missing from one of the models.
                continue
            change = after - before
            changes.append(change)
            print(f'{before:.1%}\t{after:.1%}\t{change:+.1%}  '
                  f'{left:<30}{right:<30}')
        median_delta = np.median(changes)
        print(f'Median Delta = {median_delta:+.1%}')

    # Same entity, named differently by the two parties.
    cherries = [
        ('estate_tax', 'death_tax'),
        ('undocumented_immigrants', 'illegals'),

        ('health_care_reform', 'obamacare'),
        ('public_option', 'governmentrun'),
        ('national_health_insurance', 'government_takeover'),
        ('national_health_insurance', 'welfare_state'),
        ('singlepayer', 'governmentrun_health_care'),
        ('singlepayer', 'socialized_medicine'),
        ('universal_health_care', 'socialized_medicine'),

        ('campaign_spending', 'political_speech'),
        ('independent_expenditures', 'political_speech'),

        ('recovery_and_reinvestment', 'stimulus_bill'),  # Note
        ('military_spending', 'washington_spending'),
        ('progrowth', 'create_jobs'),

        ('unborn', 'fetus'),
        ('prochoice', 'proabortion'),
        ('family_planning', 'proabortion'),
        # Candidate pairs that are currently switched off:
        #   ('icbms', 'star_wars_program'),
        #   ('excessive_speculation', 'highfrequency'),
        #   ('corporate_profits', 'earnings'),
        #   ('megabanks', 'aig'),
        #   ('unemployment_insurance_benefits', 'stimulus'),
        #   ('retroactive_immunity', 'the_fisa_bill'),
        #   ('give_tax_breaks', 'cut_taxes'),
        #   ('sodomy', 'sex'),
        #   ('flat_tax', 'income_tax')
        # Other terms noted for later:
        #   capandtax, wall_street_reform, the_recovery_act, lesbian,
        #   inequality, health_care_bill,
        #   the_wall_street_reform_legislation, financial_stability,
        #   capital_gains_tax, deficit_spending, bush_tax_cuts
    ]

    # Different entities associated with the same party.
    ideologies = [
        ('tax_cuts', 'right_to_life'),
        ('new_entitlements', 'religious_freedom'),
        ('illegals', 'unborn'),
        ('antitrust_laws', 'lesbian'),
        ('wall_street_reform', 'the_recovery_act'),
        ('nuclear_option', 'clean_energy_jobs'),
        ('record_deficits', 'living_wage'),
        ('manmade_global_warming', 'radical_jihadists'),
        # Switched off because they are difficult to distinguish:
        #   ('tax_cuts', 'new_entitlements'),
        #   ('religious_freedom', 'right_to_life')
    ]

    # Control group.
    controls = [
        ('traditional_marriage', 'lgbt'),
        ('taxes', 'antitrust_laws'),
        ('carbon', 'guns'),
        ('abortion', 'minimum_wage'),
        ('apple', 'piano'),
        ('beef', 'computer'),
    ]

    # The disabled counterpart lines of the first two headers read
    # 'Removing denotation should decrease similarity.' and
    # 'Removing denotation should increase similarity.' respectively.
    sections = (
        ('Same entity, different parties.\n'
         'Removing connotation should increase similarity.\n',
         cherries),
        ('\n\nDifferent entities, same party.\n'
         'Removing connotation should decresase similarity.\n',
         ideologies),
        ('\n\nDifferent entities, different parties. Control group.',
         controls),
    )
    for header, pairs in sections:
        print(header)
        compare_similarities(pairs)

## Load Models

In [83]:
# Registry of loaded embeddings, keyed by a short model tag.
models = dict()
w2v_path = '../../data/pretrained_word2vec/for_real.txt'
models['w2v'] = Embedding('plain_text', w2v_path)

vocab_size = 111,387, num_dimensions = 300
Loading embeddings from ../../data/pretrained_word2vec/for_real.txt
Done


In [75]:
# Load one checkpoint per run of the 'with decoder' grid, all taken at the
# same epoch. Each run lives in a directory named '1d <tag>'.
base_dir = '../../results/grid/with decoder/'
epoch = 4
for tag in ('0c', '-1c', '-2c', '-4c', '-8c'):
    models[tag] = Embedding(
        'adversarial', base_dir + f'1d {tag}/epoch{epoch}.pt')

In [51]:
# Load two epoch-7 checkpoints from the 'sans decoder' grid.
base_dir = '../../results/grid/sans decoder/'
# NOTE(review): the '-1c' key is filled from the '1d -8c' run directory;
# confirm that this mismatch is intended.
for tag, run_dir in (('0c', '1d 0c'), ('-1c', '1d -8c')):
    models[tag] = Embedding('adversarial', base_dir + f'{run_dir}/epoch7.pt')

In [89]:
# Locally Trained
base_dir = '../../results/with decoder/'
epoch = 8
models['0c'] = Embedding('adversarial', base_dir + f'1d 0c/epoch{epoch}.pt')
models['-10c'] = Embedding('adversarial', base_dir + f'1d -10c/epoch{epoch}.pt')

In [90]:
# Similarity change going from the 'w2v' model to the '0c' model.
cherry_pick(model1=models['w2v'], model2=models['0c'])

Same entity, different parties.
Removing connotation should increase similarity.
Removing denotation should decrease similarity.
89.2%	91.1%	+1.8%  estate_tax                    death_tax                     
75.9%	76.0%	+0.1%  undocumented_immigrants       illegals                      
57.4%	62.5%	+5.1%  health_care_reform            obamacare                     
72.7%	75.3%	+2.6%  public_option                 governmentrun                 
48.9%	39.2%	-9.7%  national_health_insurance     government_takeover           
37.6%	39.0%	+1.4%  national_health_insurance     welfare_state                 
80.4%	82.0%	+1.6%  singlepayer                   governmentrun_health_care     
69.9%	69.7%	-0.3%  singlepayer                   socialized_medicine           
49.1%	49.1%	+0.0%  universal_health_care         socialized_medicine           
68.2%	63.4%	-4.8%  campaign_spending             political_speech              
62.4%	57.7%	-4.7%  independent_expenditures      political_speech      

In [91]:
# Similarity change going from the 'w2v' model to the '-10c' model.
cherry_pick(model1=models['w2v'], model2=models['-10c'])

Same entity, different parties.
Removing connotation should increase similarity.
Removing denotation should decrease similarity.
89.2%	88.0%	-1.2%  estate_tax                    death_tax                     
75.9%	87.4%	+11.5%  undocumented_immigrants       illegals                      
57.4%	68.6%	+11.3%  health_care_reform            obamacare                     
72.7%	79.1%	+6.4%  public_option                 governmentrun                 
48.9%	75.5%	+26.5%  national_health_insurance     government_takeover           
37.6%	70.5%	+32.9%  national_health_insurance     welfare_state                 
80.4%	78.9%	-1.4%  singlepayer                   governmentrun_health_care     
69.9%	63.2%	-6.8%  singlepayer                   socialized_medicine           
49.1%	65.3%	+16.2%  universal_health_care         socialized_medicine           
68.2%	67.8%	-0.3%  campaign_spending             political_speech              
62.4%	66.0%	+3.6%  independent_expenditures      political_speech 

In [92]:
# Similarity change going from the '0c' model to the '-10c' model.
cherry_pick(model1=models['0c'], model2=models['-10c'])

Same entity, different parties.
Removing connotation should increase similarity.
Removing denotation should decrease similarity.
91.1%	88.0%	-3.1%  estate_tax                    death_tax                     
76.0%	87.4%	+11.3%  undocumented_immigrants       illegals                      
62.5%	68.6%	+6.2%  health_care_reform            obamacare                     
75.3%	79.1%	+3.8%  public_option                 governmentrun                 
39.2%	75.5%	+36.2%  national_health_insurance     government_takeover           
39.0%	70.5%	+31.5%  national_health_insurance     welfare_state                 
82.0%	78.9%	-3.0%  singlepayer                   governmentrun_health_care     
69.7%	63.2%	-6.5%  singlepayer                   socialized_medicine           
49.1%	65.3%	+16.2%  universal_health_care         socialized_medicine           
63.4%	67.8%	+4.5%  campaign_spending             political_speech              
57.7%	66.0%	+8.3%  independent_expenditures      political_speech  

In [None]:
# Similarity change going from the '-10c' model to the '-12c' model.
# NOTE(review): no cell visible here loads models['-12c']; confirm it is
# defined before running this cell.
cherry_pick(model1=models['-10c'], model2=models['-12c'])

In [None]:
# does not work
# NOTE(review): no cell visible here loads 'vanilla cono' or
# 'cono minus deno'; confirm whether that is why this fails.
cherry_pick(
    model1=models['vanilla cono'],
    model2=models['cono minus deno'])

## Top Changes in Vector Spaces

In [None]:
def compare_all_vocab(model1, model2, top_k=100, min_freq=100, max_freq=300):
    """Rank word pairs by how much their cosine similarity differs.

    Only words whose combined Dem + GOP frequency lies in
    ``[min_freq, max_freq]`` are compared. For each such word, the ``top_k``
    most negative and ``top_k`` most positive differences against the words
    that come after it are collected, so every unordered pair is considered
    once and a word is never paired with itself.

    Args:
        model1: First model; must expose ``embedding``, ``id_to_word``,
            ``Dem_frequency`` and ``GOP_frequency``.
        model2: Second model; its vocabulary and frequencies must equal
            those of ``model1``.
        top_k: Number of pairs kept at each extreme for every word.
        min_freq: Smallest combined frequency a word may have.
        max_freq: Largest combined frequency a word may have.

    Returns:
        A list of ``(difference, word_a, word_b)`` tuples sorted by
        difference, largest first, where difference is the similarity under
        ``model1`` minus the similarity under ``model2``. The list is empty
        when fewer than two words pass the filter or ``top_k`` is not
        positive.

    Raises:
        AssertionError: If the two models disagree on vocabulary or on
            either frequency table.
    """
    assert model1.id_to_word == model2.id_to_word
    assert model1.Dem_frequency == model2.Dem_frequency
    assert model1.GOP_frequency == model2.GOP_frequency
    id_to_word = model1.id_to_word
    freq: Counter[str] = model1.Dem_frequency + model1.GOP_frequency

    kept_ids = [
        word_id for word_id in range(len(model1.embedding))
        if min_freq <= freq[id_to_word[word_id]] <= max_freq]
    # Row i of the filtered matrices belongs to word kept_ids[i], not to
    # vocabulary id i, so labels must go through this list.
    kept_words = [id_to_word[word_id] for word_id in kept_ids]
    print(f'min_freq = {min_freq}, filtered vocab size = {len(kept_ids):,}')
    if len(kept_ids) < 2 or top_k <= 0:
        return []

    # np.asarray accepts ndarrays as well as CPU tensors; the same rows are
    # selected from both models so that their indices stay aligned.
    embed1 = np.asarray(model1.embedding)[kept_ids]
    embed2 = np.asarray(model2.embedding)[kept_ids]
    sim_diff = (pairwise.cosine_similarity(embed1)
                - pairwise.cosine_similarity(embed2))

    top_changed = []
    for i in range(len(kept_ids) - 1):
        # Only look at columns to the right of the diagonal: the matrix is
        # symmetric and the diagonal compares a word with itself.
        later = sim_diff[i, i + 1:]
        order = np.argsort(later)
        if len(order) > 2 * top_k:
            order = np.concatenate((order[:top_k], order[-top_k:]))
        top_changed += [
            (later[j], kept_words[i], kept_words[i + 1 + j])
            for j in order]

    top_changed.sort(key=lambda tup: tup[0], reverse=True)
    return top_changed

#     top_changed[:100]
    
#     top_changed.sort(key=lambda tup: tup[0], reverse=True)
#     for sim_delta, x, y in top_changed:
#         if 0.3 < sim_delta < 0.4:
#             print(f'{sim_delta:.4f}  {x:<25}{y:<25}')

In [None]:
# Rank similarity changes between the 'vanilla deno' and '-10c' models
# using the default top_k and frequency window.
stuff = compare_all_vocab(
    model1=models['vanilla deno'],
    model2=models['-10c'])

In [None]:
# Notebook display: show the first 500 entries of `stuff`.
stuff[:500]

In [None]:
# Full pairwise cosine-similarity matrices for the two models. np.triu
# zeroes everything below the diagonal before the difference is taken.
# Filter > 100 freq
deno_minus_cono = models['deno minus cono'].embedding
vanilla_deno = models['vanilla deno'].embedding
sim1 = np.triu(pairwise.cosine_similarity(deno_minus_cono))
sim2 = np.triu(pairwise.cosine_similarity(vanilla_deno))
sim_diff = sim1 - sim2

In [None]:
# For every row of sim_diff, record the top_k smallest and top_k largest
# entries as (difference, row word, column word).
vocab = models['vanilla deno'].id_to_word
top_k = 5
top_changed = []
for i, row in enumerate(sim_diff):
    ranked = np.argsort(row)
    extremes = np.concatenate((ranked[:top_k], ranked[-top_k:]))
    for j in extremes:
        top_changed.append((sim_diff[i, j], vocab[i], vocab[j]))

In [None]:
# Order by difference, largest first, then display the first 100 entries.
top_changed.sort(reverse=True, key=lambda entry: entry[0])
top_changed[0:100]

In [None]:
# Order by difference, largest first, then print only the entries whose
# difference lies strictly between 0.3 and 0.4.
top_changed.sort(reverse=True, key=lambda entry: entry[0])
for sim_delta, x, y in top_changed:
    if not (0.3 < sim_delta < 0.4):
        continue
    print(f'{sim_delta:.4f}  {x:<25}{y:<25}')