In [1]:
import pandas
from collections import Counter, defaultdict
import math
from gensim.models import Word2Vec


def read_data():
    """
    Reads data from the data/ folder.

    Returns:
        pd.DataFrame: dataframe containing the text and metadata
    """
    data = open("congress_data/cr_111-114.txt").readlines()
    meta_df = pandas.read_csv("congress_data/cr_metadata.csv")
    
    ### verifying the length of text file and metadata file are the same
    assert(len(data) == len(meta_df))
    
    ### merging files
    meta_df["text"] = data
    
    return meta_df

politics_words = [
                  'freedom', 'justice', 'equality', 'democracy', # political abstractions
                  'abortion', 'immigration', 'welfare', 'taxes', # partisan political issues   
                  'democrat', 'republican' # political parties               
                 ] # from Rodriguez and Spirling 2021



In [2]:
data = read_data()
data.head()

Unnamed: 0,gender,party,state,chamber,nonvoting,session_id,text
0,M,D,CT,H,voting,111,our democracy renews itself every years member...
1,M,R,IN,H,voting,111,madam clerk chairman the republican conference...
2,M,R,MO,S,voting,111,mack lee\n
3,M,R,OH,H,voting,111,madam speaker leader hoyer fellow members and ...
4,F,D,CA,H,voting,111,thank you very much leader boehner together we...


In [3]:
data_df = read_data()
r_df = data_df[data_df.party == "R"]
d_df = data_df[data_df.party == "D"]

window = 50
min_count = 10
seed = 42
workers = 16

r_model = Word2Vec(r_df.text.apply(lambda x: x.split()), window=window, min_count=min_count, seed=seed,
                   workers=workers)

In [4]:
d_model = Word2Vec(d_df.text.apply(lambda x: x.split()), window=window, min_count=min_count, seed=seed,
                   workers=workers)

In [5]:
r_model.wv.most_similar("the")

[('that', 0.6664916276931763),
 ('and', 0.6591695547103882),
 ('this', 0.6179782748222351),
 ('which', 0.5493958592414856),
 ('not', 0.49469226598739624),
 ('moreover', 0.45700719952583313),
 ('omit', 0.43984830379486084),
 ('only', 0.4277924597263336),
 ('envisions', 0.41959047317504883),
 ('drawings', 0.418395459651947)]

In [6]:
r_model.wv.key_to_index["the"]

0

In [7]:
# find most used words
print(r_model.wv.index_to_key[:100])
print(d_model.wv.index_to_key[:100])

['the', 'and', 'that', 'this', 'for', 'have', 'are', 'not', 'they', 'our', 'with', 'from', 'will', 'was', 'has', 'you', 'would', 'but', 'what', 'their', 'bill', 'about', 'people', 'who', 'all', 'president', 'there', 'time', 'more', 'speaker', 'going', 'these', 'can', 'his', 'been', 'one', 'were', 'when', 'because', 'out', 'just', 'which', 'those', 'now', 'its', 'american', 'amendment', 'years', 'government', 'had', 'them', 'states', 'here', 'other', 'get', 'some', 'over', 'want', 'year', 'think', 'today', 'know', 'than', 'country', 'health', 'congress', 'very', 'house', 'senate', 'new', 'care', 'said', 'make', 'work', 'federal', 'should', 'also', 'need', 'many', 'like', 'dont', 'say', 'back', 'state', 'yield', 'senator', 'support', 'way', 'law', 'how', 'percent', 'committee', 'united', 'well', 'tax', 'budget', 'legislation', 'act', 'only', 'into']
['the', 'and', 'that', 'for', 'this', 'have', 'are', 'our', 'with', 'they', 'not', 'from', 'will', 'was', 'their', 'has', 'but', 'bill', 'wo

In [8]:
print(len(r_model.wv.index_to_key))
print(len(d_model.wv.index_to_key))

33693
35968


In [9]:
def find_intersect_words(model1, model2):
    return set(model1.wv.index_to_key).intersection(set(model2.wv.index_to_key))

In [10]:
intersect_words = find_intersect_words(r_model, d_model)

In [11]:
print(len(intersect_words))

28824


In [12]:
top_100_r = r_model.wv.index_to_key[:100]
top_100_d = d_model.wv.index_to_key[:100]

In [13]:
interset = set(top_100_r).intersection(set(top_100_d))

In [14]:
len(interset)

93

In [15]:
n = 300
top_n_r = r_model.wv.index_to_key[:n]
top_n_d = d_model.wv.index_to_key[:n]

intersect = []
for item in top_n_r:
    if item in top_n_d:
        intersect.append(item)


In [16]:
frequent_word_list = []
for item in intersect:
    r_index = r_model.wv.key_to_index[item]
    d_index = d_model.wv.key_to_index[item]
    frequent_word_list.append((item, item, r_index, d_index))

In [17]:
with open("words_congress/frerquent_words.txt", "w") as f:
    for item in frequent_word_list:
        f.write(f"{item[0]}\t{item[1]}\t{item[2]}\t{item[3]}\n")

In [18]:
from utils import w2v_to_numpy
import numpy as np

r_embeddings, (r_idx, r_iidx) = w2v_to_numpy(r_model)
d_embeddings, (d_idx, d_iidx) = w2v_to_numpy(d_model)

In [19]:
r_intersect_index = [r_model.wv.key_to_index[item] for item in intersect]
d_intersect_index = [d_model.wv.key_to_index[item] for item in intersect]


In [20]:
r_embeddings_trunc = r_embeddings[r_intersect_index]
d_embeddings_trunc = d_embeddings[d_intersect_index]

In [21]:
print(r_embeddings_trunc.shape)
print(d_embeddings_trunc.shape)
print(r_index)

(268, 100)
(268, 100)
299


In [22]:
print(r_intersect_index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 205, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 224, 225, 226, 227, 228, 229, 230,

In [23]:
freq_weights = []
idiff_weights = []

freq_sum = [x + y for x, y in zip(r_intersect_index, d_intersect_index)]
max_sum = max(freq_sum)
min_sum = min(freq_sum)
idiff_list = [abs(x - y) for x, y in zip(r_intersect_index, d_intersect_index)]
max_idiff = max(idiff_list)
min_idiff = min(idiff_list)


for i in range(len(r_intersect_index)):
    freq_weights.append(2 - (freq_sum[i] - min_sum) / (max_sum - min_sum))
    idiff_weights.append(2 - (idiff_list[i] - min_idiff) / (max_idiff - min_idiff))

In [24]:
idiff_weights

[2.0,
 2.0,
 2.0,
 1.9932432432432432,
 1.9932432432432432,
 2.0,
 2.0,
 1.9797297297297298,
 1.9932432432432432,
 1.9864864864864864,
 1.9864864864864864,
 2.0,
 2.0,
 2.0,
 1.9932432432432432,
 1.9662162162162162,
 1.9864864864864864,
 1.9932432432432432,
 1.9594594594594594,
 1.9662162162162162,
 1.9797297297297298,
 1.9864864864864864,
 2.0,
 1.972972972972973,
 1.9797297297297298,
 1.9864864864864864,
 2.0,
 1.9864864864864864,
 1.9932432432432432,
 1.9391891891891893,
 1.885135135135135,
 1.9797297297297298,
 1.9864864864864864,
 1.9932432432432432,
 1.9797297297297298,
 1.9864864864864864,
 2.0,
 1.9797297297297298,
 1.9932432432432432,
 1.912162162162162,
 1.9324324324324325,
 1.9594594594594594,
 1.9662162162162162,
 1.9797297297297298,
 1.972972972972973,
 2.0,
 1.9797297297297298,
 1.9594594594594594,
 1.6621621621621623,
 1.9256756756756757,
 1.9932432432432432,
 1.9189189189189189,
 1.8513513513513513,
 1.9932432432432432,
 1.945945945945946,
 1.912162162162162,
 1.9121621

In [25]:
import torch

In [26]:
from alignment_methods import procrustes, direct_alignment, weighted_lr, closed_form_linear_regression, weighted_lr_train_only
X = procrustes(r_embeddings_trunc, d_embeddings_trunc)
r_embeddings_aligned = r_embeddings @ X 

X = direct_alignment(r_embeddings_trunc, d_embeddings_trunc)
r_embeddings_aligned_direct = r_embeddings @ X

X = closed_form_linear_regression(r_embeddings_trunc, d_embeddings_trunc)
r_embeddings_aligned_lr = r_embeddings @ X

X = weighted_lr_train_only(r_embeddings_trunc, d_embeddings_trunc, freq_weights)
r_embeddings_aligned_wlr_freq = X(torch.tensor(r_embeddings, dtype=torch.float32)).detach().numpy()

X = weighted_lr_train_only(r_embeddings_trunc, d_embeddings_trunc, idiff_weights)
r_embeddings_aligned_wlr_idiff = X(torch.tensor(r_embeddings, dtype=torch.float32)).detach().numpy()

In [27]:
from utils import get_accuracy_scores
accuracy_scores = get_accuracy_scores(d_embeddings, r_embeddings_aligned, d_intersect_index, r_intersect_index, d_iidx, 3, verbose=True)

the ['the', 'that', 'and']
and ['and', 'the', 'our']
that ['that', 'not', 'but']
this ['this', 'the', 'that']
for ['for', 'nts', 'and']
have ['have', 'has', 'havent']
are ['are', 'arent', 'being']
not ['not', 'that', 'but']
they ['they', 'them', 'because']
our ['our', 'and', 'americas']
with ['with', 'and', 'ultimatums']
from ['from', 'georgia', 'junior']
will ['will', 'can', 'wont']
was ['was', 'had', 'went']
has ['has', 'hasnt', 'have']
you ['you', 'your', 'yourself']
would ['would', 'could', 'might']
but ['but', 'because', 'not']
what ['what', 'that', 'something']
their ['their', 'themselves', 'they']
bill ['bill', 'legislation', 'provision']
about ['about', 'aboutand', 'aboutthe']
people ['people', 'folks', 'americans']
who ['who', 'whom', 'whose']
all ['all', 'what', 'that']
president ['president', 'barack', 'presidentelect']
there ['there', 'some', 'but']
time ['time', 'days', 'congressto']
more ['more', 'less', 'faster']
speaker ['speaker', 'chair', 'chairperson']
going ['going'

In [28]:
accuracy_scores

(1.0, 268)

In [29]:
print(r_intersect_index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, 175, 176, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 193, 194, 195, 196, 197, 198, 200, 201, 202, 203, 205, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 221, 222, 224, 225, 226, 227, 228, 229, 230,

In [30]:
print(intersect)

['the', 'and', 'that', 'this', 'for', 'have', 'are', 'not', 'they', 'our', 'with', 'from', 'will', 'was', 'has', 'you', 'would', 'but', 'what', 'their', 'bill', 'about', 'people', 'who', 'all', 'president', 'there', 'time', 'more', 'speaker', 'going', 'these', 'can', 'his', 'been', 'one', 'were', 'when', 'because', 'out', 'just', 'which', 'those', 'now', 'its', 'american', 'amendment', 'years', 'government', 'had', 'them', 'states', 'here', 'other', 'get', 'some', 'over', 'want', 'year', 'think', 'today', 'know', 'than', 'country', 'health', 'congress', 'very', 'house', 'senate', 'new', 'care', 'said', 'make', 'work', 'federal', 'should', 'also', 'need', 'many', 'like', 'dont', 'say', 'back', 'state', 'yield', 'senator', 'support', 'way', 'law', 'how', 'percent', 'committee', 'united', 'well', 'tax', 'budget', 'legislation', 'act', 'only', 'into', 'chairman', 'any', 'jobs', 'right', 'where', 'last', 'gentleman', 'money', 'could', 'take', 'every', 'first', 'even', 'thank', 'colleagues',

In [31]:
print(r_model.wv.index_to_key[53])

other


In [32]:
intersect_all = []
for word in r_model.wv.index_to_key:
    if word in d_model.wv.index_to_key:
        intersect_all.append(word)

In [33]:
intersect_all[:10]

['the', 'and', 'that', 'this', 'for', 'have', 'are', 'not', 'they', 'our']

In [34]:
r_intersect_index_all = [r_model.wv.key_to_index[item] for item in intersect_all]
d_intersect_index_all = [d_model.wv.key_to_index[item] for item in intersect_all]

In [35]:
r_embeddings_trunc_all = r_embeddings[r_intersect_index_all]
d_embeddings_trunc_all = d_embeddings[d_intersect_index_all]


In [36]:
X_all = procrustes(r_embeddings_trunc_all, d_embeddings_trunc_all)
r_embeddings_aligned_all = r_embeddings @ X_all

In [37]:
accuracy_scores_all = get_accuracy_scores(d_embeddings, r_embeddings_aligned_all, d_intersect_index_all, r_intersect_index_all, d_iidx, 3)

In [38]:
accuracy_scores_all

(0.4216278101582015, 28824)

In [39]:
# now let's try again with X being top 179 words
accuracy_scores_all = get_accuracy_scores(d_embeddings, r_embeddings_aligned, d_intersect_index_all, r_intersect_index_all, d_iidx, 3)


In [40]:
accuracy_scores_all

(0.38662225922842075, 28824)

In [41]:
# find the words that have largest difference in cosine similarity
def find_largest_diff_words(d_embeddings, r_embeddings_aligned, d_intersect_index, r_intersect_index, d_iidx, top_n = 10):
    diff_dict = {}
    for i in range(len(r_intersect_index)):
        r_index = r_intersect_index[i]
        d_index = d_intersect_index[i]
        d_word = d_iidx[d_index]
        cos_sim = np.dot(d_embeddings[d_index], r_embeddings_aligned[r_index]) / (np.linalg.norm(d_embeddings[d_index]) * np.linalg.norm(r_embeddings_aligned[r_index]))
        diff_dict[d_word] = cos_sim
    diff_dict = dict(sorted(diff_dict.items(), key=lambda item: item[1]))
    return list(diff_dict.items())[:top_n]

In [42]:
item = find_largest_diff_words(d_embeddings, r_embeddings_aligned, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [43]:
item

[('meto', -0.31950065),
 ('wedded', -0.2653914),
 ('increaseand', -0.24787006),
 ('extracts', -0.24365585),
 ('structuring', -0.239897),
 ('excepted', -0.23194943),
 ('thingsthat', -0.21693844),
 ('othersto', -0.21677633),
 ('calabro', -0.2156397),
 ('infinitum', -0.21418825)]

In [44]:
item = find_largest_diff_words(d_embeddings, r_embeddings_aligned_all, d_intersect_index_all, r_intersect_index_all, d_iidx)
item

[('calabro', -0.32215145),
 ('meto', -0.30824497),
 ('wedded', -0.28844658),
 ('thingsthat', -0.2592696),
 ('unattended', -0.24431552),
 ('structuring', -0.24011657),
 ('extracts', -0.2296219),
 ('stateto', -0.22482207),
 ('othersto', -0.2207074),
 ('wildeyed', -0.21878976)]

In [45]:
print(d_intersect_index_all)

[0, 1, 2, 4, 3, 5, 6, 10, 9, 7, 8, 11, 12, 13, 15, 20, 18, 16, 24, 14, 17, 23, 22, 19, 21, 27, 26, 25, 29, 38, 47, 28, 30, 32, 31, 33, 36, 34, 39, 52, 50, 35, 37, 46, 48, 45, 43, 41, 98, 60, 51, 63, 74, 54, 62, 68, 69, 55, 73, 82, 57, 72, 61, 49, 40, 76, 71, 79, 42, 66, 67, 104, 58, 44, 87, 75, 64, 70, 59, 88, 105, 94, 90, 78, 85, 53, 65, 100, 115, 91, 89, 77, 96, 107, 112, 117, 80, 56, 97, 99, 120, 108, 81, 114, 118, 103, 137, 302, 153, 119, 122, 109, 106, 132, 92, 83, 102, 146, 86, 123, 133, 93, 128, 130, 138, 95, 121, 141, 134, 84, 163, 164, 116, 140, 158, 124, 174, 279, 136, 161, 149, 142, 127, 170, 152, 145, 165, 150, 296, 172, 113, 101, 155, 129, 159, 212, 176, 181, 135, 110, 144, 189, 111, 162, 215, 139, 182, 208, 157, 169, 304, 184, 178, 196, 166, 230, 233, 341, 206, 221, 216, 180, 177, 223, 183, 193, 262, 250, 168, 207, 126, 209, 373, 143, 286, 179, 191, 173, 167, 316, 205, 255, 151, 195, 575, 156, 313, 248, 200, 224, 210, 237, 190, 175, 201, 171, 198, 213, 211, 220, 334, 245,

In [46]:
# find the words that have largest difference in cosine similarity
def find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned, d_intersect_index, r_intersect_index, d_iidx, top_n = 10000, top_diff_count = 10):
    diff_dict = {}
    for i in range(top_n):
        d_index = d_intersect_index[i]
        r_index = r_intersect_index[i]
        d_word = d_iidx[d_index]
        cos_sim = np.dot(d_embeddings[d_index], r_embeddings_aligned[r_index]) / (np.linalg.norm(d_embeddings[d_index]) * np.linalg.norm(r_embeddings_aligned[r_index]))
        diff_dict[d_word] = cos_sim
    diff_dict = dict(sorted(diff_dict.items(), key=lambda item: item[1]))
    return list(diff_dict.items())[:top_diff_count]


In [47]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned_all, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [63]:
#print words only
[item[0] for item in item]

['monstrosity',
 'kermit',
 'loaf',
 'majoring',
 'wyomings',
 'nationalization',
 'partial',
 'rhode',
 'practically',
 'north']

In [64]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [65]:
[item[0] for item in item]


['monstrosity',
 'partial',
 'loaf',
 'majoring',
 'nationalized',
 'stepbystep',
 'practically',
 'kermit',
 'rhode',
 'nationalization']

In [66]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned_direct, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [67]:
[item[0] for item in item]


['monstrosity',
 'kermit',
 'loaf',
 'majoring',
 'wyomings',
 'nationalization',
 'partial',
 'rhode',
 'north',
 'practically']

In [68]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned_lr, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [69]:
[item[0] for item in item]


['monstrosity',
 'kermit',
 'loaf',
 'majoring',
 'wyomings',
 'nationalization',
 'partial',
 'rhode',
 'north',
 'practically']

In [70]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned_wlr_freq, d_intersect_index_all, r_intersect_index_all, d_iidx)

In [71]:
[item[0] for item in item]


['monstrosity',
 'kermit',
 'loaf',
 'majoring',
 'partial',
 'wyomings',
 'nationalization',
 'rhode',
 'practically',
 'north']

In [72]:
item = find_largest_diff_words_with_limit(d_embeddings, r_embeddings_aligned_wlr_idiff, d_intersect_index_all, r_intersect_index_all, d_iidx)


In [73]:
[item[0] for item in item]

['monstrosity',
 'kermit',
 'loaf',
 'majoring',
 'wyomings',
 'nationalization',
 'partial',
 'rhode',
 'practically',
 'north']

In [91]:
def find_politic_word_disagreement(d_embeddings, r_embeddings_aligned, politics_words):
    for word in politics_words:
        d_index = d_model.wv.key_to_index[word]
        r_index = r_model.wv.key_to_index[word]
        d_word = d_iidx[d_index]
        cos_sim = np.dot(d_embeddings[d_index], r_embeddings_aligned[r_index]) / (np.linalg.norm(d_embeddings[d_index]) * np.linalg.norm(r_embeddings_aligned[r_index]))
        print(word)

In [92]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned, politics_words)

freedom
justice
equality
democracy
abortion
immigration
welfare
taxes
democrat
republican


In [83]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned_all, politics_words)

0.8271054
0.81216997
0.74653965
0.80377513
0.594437
0.73678476
0.63298404
0.73294187
0.5833646
0.561932


In [84]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned_direct, politics_words)

0.71143943
0.71216804
0.6343141
0.7583278
0.6178261
0.604306
0.53548837
0.7378518
0.57620066
0.5848397


In [85]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned_lr, politics_words)

0.71143925
0.71216804
0.6343142
0.75832784
0.6178261
0.6043055
0.53548837
0.7378521
0.5762012
0.58483994


In [86]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned_wlr_freq, politics_words)

0.71572083
0.7097303
0.63120216
0.76314104
0.616808
0.600491
0.5395277
0.74076515
0.57942134
0.585039


In [87]:
find_politic_word_disagreement(d_embeddings, r_embeddings_aligned_wlr_idiff, politics_words)

0.7076318
0.7123462
0.6292194
0.7543711
0.620811
0.6052799
0.53074217
0.7395971
0.57742107
0.58340424


In [112]:
# find most similar / different words in the top 1000 words based on cosine similarity
import nltk
def find_most_similar_and_different(d_embeddings, r_aligned, d_intersect_index, r_intersect_index, top_n=10000):
    greatest_similarity = {}
    least_similarity = {}
    for i in range(top_n):
        d_index = d_intersect_index[i + 300]
        r_index = r_intersect_index[i + 300]
        d_word = d_iidx[d_index]
        pos_tag = nltk.pos_tag([d_word])[0][1]
        cos_sim = np.dot(d_embeddings[d_index], r_aligned[r_index]) / (np.linalg.norm(d_embeddings[d_index]) * np.linalg.norm(r_aligned[r_index]))
        if pos_tag in greatest_similarity:
            if cos_sim > greatest_similarity[pos_tag][1]:
                greatest_similarity[pos_tag] = (d_word, cos_sim)
        else:
            greatest_similarity[pos_tag] = (d_word, cos_sim)
            
        if pos_tag in least_similarity:
            if cos_sim < least_similarity[pos_tag][1]:
                least_similarity[pos_tag] = (d_word, cos_sim)
        else:
            least_similarity[pos_tag] = (d_word, cos_sim)
    return greatest_similarity, least_similarity
        
g, l = find_most_similar_and_different(d_embeddings, r_embeddings_aligned, d_intersect_index_all, r_intersect_index_all)
g_all, l_all = find_most_similar_and_different(d_embeddings, r_embeddings_aligned_all, d_intersect_index_all, r_intersect_index_all)
g_direct, l_direct = find_most_similar_and_different(d_embeddings, r_embeddings_aligned_direct, d_intersect_index_all, r_intersect_index_all)
g_lr, l_lr = find_most_similar_and_different(d_embeddings, r_embeddings_aligned_lr, d_intersect_index_all, r_intersect_index_all)
g_wlr_freq, l_wlr_freq = find_most_similar_and_different(d_embeddings, r_embeddings_aligned_wlr_freq, d_intersect_index_all, r_intersect_index_all)
g_wlr_idiff, l_wlr_idiff = find_most_similar_and_different(d_embeddings, r_embeddings_aligned_wlr_idiff, d_intersect_index_all, r_intersect_index_all)


In [119]:
# print all similar words
pos_tags = set(g.keys())
for tag in pos_tags:
    print(tag, end=": ")
    #print seperated by comma
    print(g[tag][0], g_all[tag][0], g_direct[tag][0], g_lr[tag][0], g_wlr_freq[tag][0], g_wlr_idiff[tag][0], sep=", ")

JJ: fiscal, fiscal, several, several, several, several
VBN: asked, worked, taken, taken, taken, taken
WDT: whatever, whatever, whatever, whatever, whatever, whatever
JJS: worst, greatest, greatest, greatest, greatest, greatest
PRP: themselves, themselves, myself, myself, myself, myself
IN: despite, despite, despite, despite, despite, despite
WP: whom, whom, whom, whom, whom, whom
NNS: efforts, efforts, nations, nations, nations, nations
VBZ: makes, requires, makes, makes, makes, makes
CC: nor, nor, nor, nor, nor, nor
JJR: greater, easier, worse, worse, worse, worse
WP$: whose, whose, whose, whose, whose, whose
VBD: served, served, went, went, went, went
DT: each, each, each, each, each, each
RB: rather, rather, rather, rather, rather, rather
NN: colleague, honor, colleague, colleague, colleague, colleague
MD: might, might, might, might, might, might
RBR: less, earlier, less, less, less, less
VBG: making, including, making, making, making, making
CD: three, three, three, three, three, t

In [121]:
for tag in pos_tags:
    print(tag, end=": ")
    #print seperated by comma
    print(l_all[tag][0], l[tag][0], l_direct[tag][0], l_lr[tag][0], l_wlr_freq[tag][0], l_wlr_idiff[tag][0], sep=", ")

JJ: partial, partial, partial, partial, partial, partial
VBN: nationalized, nationalized, nationalized, nationalized, nationalized, nationalized
WDT: whatever, whatever, whatever, whatever, whatever, whatever
JJS: weakest, weakest, weakest, weakest, weakest, weakest
PRP: yourself, himself, himself, himself, himself, himself
IN: besides, besides, besides, besides, besides, besides
WP: whom, whom, whom, whom, whom, whom
NNS: wyomings, carolinas, wyomings, wyomings, wyomings, wyomings
VBZ: expects, succeeds, succeeds, succeeds, succeeds, succeeds
CC: plus, plus, plus, plus, plus, plus
JJR: slower, slower, slower, slower, slower, slower
WP$: whose, whose, whose, whose, whose, whose
VBD: wrapped, wrapped, wrapped, wrapped, wrapped, wrapped
DT: neither, neither, neither, neither, neither, neither
RB: practically, massively, massively, massively, massively, massively
NN: monstrosity, monstrosity, monstrosity, monstrosity, monstrosity, monstrosity
MD: shall, shall, shall, shall, shall, shall
R

In [114]:
print(pos_tags)

{'NN'}


In [115]:
print(g)


{'NNS': ('efforts', 0.93569857), 'DT': ('each', 0.8841167), 'VBG': ('making', 0.9420995), 'IN': ('despite', 0.9007039), 'NN': ('colleague', 0.9386699), 'JJ': ('fiscal', 0.9242697), 'VBN': ('asked', 0.9233071), 'VB': ('ensure', 0.91148734), 'RBR': ('less', 0.93427026), 'CD': ('three', 0.9137507), 'RB': ('rather', 0.95849943), 'MD': ('might', 0.8663486), 'PRP': ('themselves', 0.87364966), 'JJS': ('worst', 0.8339169), 'VBD': ('served', 0.91650134), 'VBZ': ('makes', 0.91624725), 'JJR': ('greater', 0.8707279), 'WP': ('whom', 0.8104981), 'WDT': ('whatever', 0.8435412), 'WP$': ('whose', 0.8289185), 'CC': ('nor', 0.73260224)}


In [111]:
for i in l:
    # print(i)
    print(l[i][0])

carolinas
neither
majoring
besides
monstrosity
partial
nationalized
soar
earlier
nine
massively
shall
himself
weakest
wrapped
succeeds
slower
whom
whatever
whose
plus


In [104]:
#create table for most similar and different words
def create_table(g, l):
    table = []
    for key in g:
        table.append((key, g[key][0], g[key][1], l[key][0], l[key][1]))
    return table


In [105]:
create_table(g, l)

[('NNS', 'efforts', 0.93569857, 'carolinas', 0.15722075),
 ('DT', 'each', 0.8841167, 'neither', 0.6387919),
 ('VBG', 'making', 0.9420995, 'majoring', 0.0273699),
 ('IN', 'despite', 0.9007039, 'besides', 0.46316662),
 ('NN', 'colleague', 0.9386699, 'monstrosity', -0.05395666),
 ('JJ', 'fiscal', 0.9242697, 'partial', 0.015069652),
 ('VBN', 'asked', 0.9233071, 'nationalized', 0.03547022),
 ('VB', 'ensure', 0.91148734, 'soar', 0.49870375),
 ('RBR', 'less', 0.93427026, 'earlier', 0.92373115),
 ('CD', 'three', 0.9137507, 'nine', 0.7183859),
 ('RB', 'rather', 0.95849943, 'massively', 0.05930778),
 ('MD', 'might', 0.8663486, 'shall', 0.7907694),
 ('PRP', 'themselves', 0.87364966, 'himself', 0.74708164),
 ('JJS', 'worst', 0.8339169, 'weakest', 0.32085222),
 ('VBD', 'served', 0.91650134, 'wrapped', 0.3048239),
 ('VBZ', 'makes', 0.91624725, 'succeeds', 0.47568902),
 ('JJR', 'greater', 0.8707279, 'slower', 0.6480937),
 ('WP', 'whom', 0.8104981, 'whom', 0.8104981),
 ('WDT', 'whatever', 0.8435412, '

In [122]:
print(len(intersect_all))

28824
