In [1]:
from google.colab import drive
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import math
from prettytable import PrettyTable

# Mount

In [2]:
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Experiments/Galvan-MyVersion-Test
!ls

Mounted at /content/drive
/content/drive/MyDrive/Experiments/Galvan-MyVersion-Test
max_score_model.torch  wordsim_relatedness_goldstandard.txt
min_loss_model.torch   wordsim_similarity_goldstandard.txt
voca.txt


# Classes for the model

## Galvan Model Class

In [3]:
class Modello(nn.Module):
    def __init__(self, n_vocab, n_embed):
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed

        self.in_embed = nn.Embedding(n_vocab, n_embed, dtype=torch.float64)
        self.out_embed = nn.Embedding(n_vocab, n_embed, dtype=torch.float64)
        # initrange = 0.5 / n_embed
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words): # takes a batch of input words and returns their embeddings.
        input_vector = self.in_embed(input_words)
        return input_vector

    def forward_output(self, output_words1, output_words2): # takes two batches of output words and returns their embeddings.
        output_vector1 = self.out_embed(output_words1)
        output_vector2 = self.out_embed(output_words2)
        return output_vector1, output_vector2

    def input_embeddings(self): # returns the input embeddings as a numpy array
        return self.in_embed.weight.data.cpu().numpy()

    def embeddinginput_dictionary(self, id2word): # return dictionary that map words to their corresponding input embeddings
        embedding = self.in_embed.weight.cpu().data.numpy()
        E = {}
        for wid, w in id2word.items():
            E[w] = embedding[wid]
        return E

    def embeddingoutput_dictionary(self, id2word): # return dictionary that map words to their corresponding input embeddings
        embedding = self.out_embed.weight.cpu().data.numpy()
        E = {}
        for wid, w in id2word.items():
            E[w] = embedding[wid]
        return E

    def forward_noise(self, noise_words): # takes a batch of noise words and returns their embeddings
        noise_vector = self.out_embed(noise_words)
        return noise_vector


# Reading Words

## Galvan

In [4]:
f = open('voca.txt')
line = f.readline()

galvan_vocab = []
galvan_wordindex = dict()
index = 0

while line:
    word = line.strip().split()[0]
    galvan_vocab.append(word)
    galvan_wordindex[word] = index
    index = index + 1
    line = f.readline()

f.close()

# Getting Models

## Galvan

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
embedding_dim = 300

galvan_model_score = Modello(63503, embedding_dim).to(device)
galvan_model_loss = Modello(63503, embedding_dim).to(device)

In [6]:
galvan_model_score.load_state_dict(torch.load("max_score_model.torch", map_location=device))
galvan_model_loss.load_state_dict(torch.load("min_loss_model.torch", map_location=device))
galvan_model_loss.eval()

Modello(
  (in_embed): Embedding(63503, 300)
  (out_embed): Embedding(63503, 300)
)

In [7]:
galvan_score_embed = galvan_model_score.input_embeddings()
galvan_loss_embed = galvan_model_loss.input_embeddings()

In [8]:
# Get embedding from given word and vice versa

def galvan_score_emb_from_word(word):
  try:
    return galvan_score_embed[int(galvan_wordindex[word])]
  except KeyError:
    return None

def galvan_score_word_from_embed(embed):
  try:
    ex_idx = next((idx for idx, emb in enumerate(galvan_score_embed) if np.array_equal(emb, embed)), None)
    return next((key for key, value in galvan_wordindex.items() if value == ex_idx), None)
  except KeyError:
    return None


In [9]:
print(galvan_score_word_from_embed(galvan_score_emb_from_word("bread")))
print(galvan_score_word_from_embed(galvan_score_emb_from_word("tunusia")))

bread
None


In [10]:
# Get embedding from given word and vice versa

def galvan_loss_emb_from_word(word):
  try:
    return galvan_loss_embed[int(galvan_wordindex[word])]
  except KeyError:
    return None

def galvan_loss_word_from_embed(embed):
  try:
    ex_idx = next((idx for idx, emb in enumerate(galvan_loss_embed) if np.array_equal(emb, embed)), None)
    return next((key for key, value in galvan_wordindex.items() if value == ex_idx), None)
  except KeyError:
    return None

In [11]:
print(galvan_loss_word_from_embed(galvan_loss_emb_from_word("bread")))
print(galvan_loss_word_from_embed(galvan_loss_emb_from_word("tunusia")))

bread
None


# Standards

In [12]:
similarity_standard = dict()

with open('wordsim_similarity_goldstandard.txt', 'r') as f:
    lines = f.readlines()
    lines = [line.strip().split() for line in lines]

    similarity_word_pairs = [(line[0], line[1]) for line in lines]
    similarity_human_scores = [float(line[2]) for line in lines] #  / 10 * 2 - 1

    similarity_word_pairs.extend([(line[1], line[0]) for line in lines])
    similarity_human_scores.extend([float(line[2]) for line in lines])
    
    similarity_voc = {pair[0] for pair in similarity_word_pairs}

    for x in range(len(similarity_human_scores)):
      similarity_standard[similarity_word_pairs[x]] = similarity_human_scores[x]


In [13]:
similarity_standard[('school','center')]

3.44

In [14]:
relatedness_standard = dict()

with open('wordsim_relatedness_goldstandard.txt', 'r') as f:
    lines = f.readlines()
    lines = [line.strip().split() for line in lines]

    relatedness_word_pairs = [(line[0], line[1]) for line in lines]
    relatedness_human_scores = [float(line[2]) for line in lines] #  / 10 * 2 - 1

    relatedness_word_pairs.extend([(line[1], line[0]) for line in lines])
    relatedness_human_scores.extend([float(line[2]) for line in lines])
    
    relatedness_voc = {pair[0] for pair in relatedness_word_pairs}

    for x in range(len(relatedness_human_scores)):
      relatedness_standard[relatedness_word_pairs[x]] = relatedness_human_scores[x]

In [15]:
relatedness_standard[('morality', 'marriage')]

3.69

# Similarity Function

In [16]:
def cosine_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    
    norm_v1 = np.linalg.norm(v1)
    
    norm_v2 = np.linalg.norm(v2)
    
    cos_sim = dot_product / (norm_v1 * norm_v2)
    
    return cos_sim

In [17]:
print(cosine_similarity(galvan_loss_emb_from_word("school"), galvan_loss_emb_from_word("student")))
print(cosine_similarity(galvan_score_emb_from_word("school"), galvan_score_emb_from_word("student")))

-0.06073248893174742
-0.06073248893174742


# Nearest Neighbors

## Galvan

In [20]:
def galvan_neighbors_from_word(word, model = "score", topk = 10):

  neighbor_words = ["<NULL>"] * topk
  neighbor_similarities = [-1.0] * topk
  min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  if word not in galvan_wordindex.keys():
    return neighbor_words, neighbor_similarities

  word2emb = galvan_score_emb_from_word

  if model == "loss":
    word2emb = galvan_loss_emb_from_word

  word_vector = word2emb(word)

  for w in galvan_wordindex.keys():
    emb = word2emb(w)
    sim = cosine_similarity(word_vector, emb)

    if sim >= neighbor_similarities[min_neighbor]:
      neighbor_similarities[min_neighbor] = sim
      neighbor_words[min_neighbor] = w

      min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  return neighbor_words, neighbor_similarities  

In [21]:
print(galvan_neighbors_from_word("car", "loss", 5)[0])
print(galvan_neighbors_from_word("car", "score", 5)[0])

['rectum', 'angered', 'viewpoint', 'car', 'monuc']
['rectum', 'angered', 'viewpoint', 'car', 'monuc']


## Standards

In [22]:
def standards_neighbors_from_word(word, std_type = "similarity", topk = 10):
  std = similarity_standard
  std_voc = similarity_voc

  if std_type == "relatedness":
    std_voc = relatedness_voc
    std = relatedness_standard

  neighbor_words = ["<NULL>"] * topk
  neighbor_similarities = [0.0] * topk
  min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  if word not in std_voc:
    return neighbor_words, neighbor_similarities

  similarities = [(pair[1], std[pair]) for pair in std.keys() if pair[0] == word]

  for s in similarities:
    w, sim = s

    if sim >= neighbor_similarities[min_neighbor]:
      neighbor_similarities[min_neighbor] = sim
      neighbor_words[min_neighbor] = w

      min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  return neighbor_words, neighbor_similarities

In [23]:
print(standards_neighbors_from_word("car", "similarity"))
print(standards_neighbors_from_word("car", "relatedness"))

(['automobile', 'flight', 'plane', 'train', 'jaguar', 'drink', '<NULL>', '<NULL>', '<NULL>', '<NULL>'], [8.94, 4.94, 5.77, 6.31, 7.27, 3.04, 0.0, 0.0, 0.0, 0.0])
(['flight', 'luxury', 'journey', 'drink', '<NULL>', '<NULL>', '<NULL>', '<NULL>', '<NULL>', '<NULL>'], [4.94, 6.47, 5.85, 3.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])


# Results

In [24]:
def neighbors_table(word, topk = 10, print_scores = False):
  print("\n\nFor given word: ", word, "\n")

  data = []
  data.append(standards_neighbors_from_word(word, "similarity", topk))
  data.append(standards_neighbors_from_word(word, "relatedness", topk))
  data.append(galvan_neighbors_from_word(word, "loss", topk))
  data.append(galvan_neighbors_from_word(word, "score", topk))

  table = PrettyTable()
  table.add_column("Similarity Standard", data[0][0])
  table.add_column("Relatedness Standard", data[1][0])
  table.add_column("Galvan Min Loss", data[2][0])
  table.add_column("Galvan Max Score", data[3][0])

  print("--- NEIGHBORS ---")   
  print(table)


  table2 = PrettyTable()
  table2.add_column("Similarity Standard", data[0][1])
  table2.add_column("Relatedness Standard", data[1][1])
  table2.add_column("Galvan Min Loss", data[2][1])
  table2.add_column("Galvan Max Score", data[3][1])

  if print_scores:
    print("\n\n--- SIMILARITIES ---")
    print(table2)

In [25]:
from collections import Counter

def most_frequent_items(lst1, lst2, n):
    counter = Counter(lst1)
    counter.update(lst2)
    most_common = counter.most_common(n)
    return [item for item, count in most_common]

lst1 = [pair[0] for pair in similarity_word_pairs]
lst2 = [pair[0] for pair in relatedness_word_pairs]
common = most_frequent_items(lst1, lst2, 5)

print("Most common words in standards: ", common)

Most common words in standards:  ['stock', 'money', 'cup', 'psychology', 'tiger']


In [26]:
topk = 5
print_scores = False

for word in common:
  neighbors_table(word, topk, print_scores)



For given word:  stock 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galvan Max Score |
+---------------------+----------------------+-----------------+------------------+
|         live        |        market        |      stock      |      stock       |
|         egg         |         live         |    animation    |    animation     |
|        phone        |         egg          |  socialization  |  socialization   |
|          CD         |         oil          |      franjo     |      franjo      |
|        jaguar       |       company        |   standardised  |   standardised   |
+---------------------+----------------------+-----------------+------------------+


For given word:  money 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galva

In [27]:
word = "car"
topk = 10
print_scores = True

neighbors_table(word, topk, print_scores)



For given word:  car 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galvan Max Score |
+---------------------+----------------------+-----------------+------------------+
|      automobile     |        flight        |      monuc      |      monuc       |
|        flight       |        luxury        |      rectum     |      rectum      |
|        plane        |       journey        |       car       |       car        |
|        train        |        drink         |     robbers     |     robbers      |
|        jaguar       |        <NULL>        |     angered     |     angered      |
|        drink        |        <NULL>        |       cong      |       cong       |
|        <NULL>       |        <NULL>        |    viewpoint    |    viewpoint     |
|        <NULL>       |        <NULL>        |     classics    |     classics     |
|        <NULL>       |        <N

# Phase 2

# Context Embeddings

In [28]:
galvan_score_context = galvan_model_score.out_embed.weight.data.cpu().numpy()
galvan_loss_context = galvan_model_loss.out_embed.weight.data.cpu().numpy()

# Utils

In [29]:
def galvan_loss_emb_from_word2(word):
  try:
    return galvan_loss_context[int(galvan_wordindex[word])]
  except KeyError:
    return None

def galvan_loss_word_from_embed2(embed):
  try:
    ex_idx = next((idx for idx, emb in enumerate(galvan_loss_context) if np.array_equal(emb, embed)), None)
    return next((key for key, value in galvan_wordindex.items() if value == ex_idx), None)
  except KeyError:
    return None

In [30]:
def galvan_score_emb_from_word2(word):
  try:
    return galvan_score_context[int(galvan_wordindex[word])]
  except KeyError:
    return None

def galvan_score_word_from_embed2(embed):
  try:
    ex_idx = next((idx for idx, emb in enumerate(galvan_score_context) if np.array_equal(emb, embed)), None)
    return next((key for key, value in galvan_wordindex.items() if value == ex_idx), None)
  except KeyError:
    return None


# Neighbor Functions

In [31]:
def galvan_neighbors_from_word2(word, model = "score", topk = 10):

  neighbor_words = ["<NULL>"] * topk
  neighbor_similarities = [-1.0] * topk
  min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  if word not in galvan_wordindex.keys():
    return neighbor_words, neighbor_similarities

  word2emb = galvan_score_emb_from_word2

  if model == "loss":
    word2emb = galvan_loss_emb_from_word2

  word_vector = word2emb(word)

  for w in galvan_wordindex.keys():
    emb = word2emb(w)
    sim = cosine_similarity(word_vector, emb)

    if sim >= neighbor_similarities[min_neighbor]:
      neighbor_similarities[min_neighbor] = sim
      neighbor_words[min_neighbor] = w

      min_neighbor = neighbor_similarities.index(min(neighbor_similarities))

  return neighbor_words, neighbor_similarities 

# Results

In [32]:
def neighbors_table2(word, topk = 10, print_scores = False):
  print("\n\nFor given word: ", word, "\n")

  data = []
  data.append(standards_neighbors_from_word(word, "similarity", topk))
  data.append(standards_neighbors_from_word(word, "relatedness", topk))
  data.append(galvan_neighbors_from_word2(word, "loss", topk))
  data.append(galvan_neighbors_from_word2(word, "score", topk))

  table = PrettyTable()
  table.add_column("Similarity Standard", data[0][0])
  table.add_column("Relatedness Standard", data[1][0])
  table.add_column("Galvan Min Loss", data[2][0])
  table.add_column("Galvan Max Score", data[3][0])

  print("--- NEIGHBORS ---")   
  print(table)


  table2 = PrettyTable()
  table2.add_column("Similarity Standard", data[0][1])
  table2.add_column("Relatedness Standard", data[1][1])
  table2.add_column("Galvan Min Loss", data[2][1])
  table2.add_column("Galvan Max Score", data[3][1])

  if print_scores:
    print("\n\n--- SIMILARITIES ---")
    print(table2)

In [33]:
topk = 5
print_scores = False

for word in common:
  neighbors_table2(word, topk, print_scores)



For given word:  stock 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galvan Max Score |
+---------------------+----------------------+-----------------+------------------+
|         live        |        market        |      ravens     |      ravens      |
|         egg         |         live         |    subspecies   |    subspecies    |
|        phone        |         egg          |      corso      |      corso       |
|          CD         |         oil          |      stock      |      stock       |
|        jaguar       |       company        |       dump      |       dump       |
+---------------------+----------------------+-----------------+------------------+


For given word:  money 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galva

In [34]:
word = "tango"
topk = 3
print_scores = True

neighbors_table2(word, topk, print_scores)



For given word:  tango 

--- NEIGHBORS ---
+---------------------+----------------------+-----------------+------------------+
| Similarity Standard | Relatedness Standard | Galvan Min Loss | Galvan Max Score |
+---------------------+----------------------+-----------------+------------------+
|        <NULL>       |        <NULL>        |     arrakis     |     arrakis      |
|        <NULL>       |        <NULL>        |     layouts     |     layouts      |
|        <NULL>       |        <NULL>        |      tango      |      tango       |
+---------------------+----------------------+-----------------+------------------+


--- SIMILARITIES ---
+---------------------+----------------------+---------------------+---------------------+
| Similarity Standard | Relatedness Standard |   Galvan Min Loss   |   Galvan Max Score  |
+---------------------+----------------------+---------------------+---------------------+
|         0.0         |         0.0          |  0.2541713391603402 |  0

# Phase 3

In [35]:
from statistics import mean 

In [36]:
galvan_loss_ct = []
galvan_score_ct = []
sgns_ct = []

for word in galvan_wordindex.keys():
  galvan_loss_ct.append(cosine_similarity(galvan_loss_emb_from_word(word), galvan_loss_emb_from_word2(word)))
  galvan_score_ct.append(cosine_similarity(galvan_score_emb_from_word(word), galvan_score_emb_from_word2(word)))

print("Average cosine similarity of target & context embeddings of each word in...")
print("Galvan Min Loss: ", mean(galvan_loss_ct))
print("Galvan Max Score: ", mean(galvan_score_ct))

Average cosine similarity of target & context embeddings of each word in...
Galvan Min Loss:  4.941992656110668e-05
Galvan Max Score:  4.941992656110668e-05
