## This is a starter program for training and apply word embeddings

In [23]:
# -----------------------------------------------------------------
# Import text8 file, which is the first 100MB of the English Wikipedia.
# The file is available at   https://www.kaggle.com/datasets/includelgc/word2vectext8
# The file is already downloaded and saved in the folder text8 Wikipedia.
# The file is a single line of text with 17,005,207 words and 100,000,000 characters.
# The file is read and the first 1,000,000 characters are read and stored in the variable text.
# The text is split into words and the punctuation is removed.
# -----------------------------------------------------------------



import numpy as np
import tensorflow as tf
import time
# put a time stamp with hour, min, sec
print(time.strftime("%H:%M:%S", time.gmtime()))


# filename = './text8'
filename = './reviews_Video_Games_5.json'

def read_data(filename, char):
    with open(filename, 'r') as f:
        data = f.read(char)
    return data


text = read_data(filename, 1000000)

print(time.strftime("%H:%M:%S", time.gmtime()))

len(text)

print(time.strftime("%H:%M:%S", time.gmtime()))

# split the text into words
words = text.split()
len(words)
print(time.strftime("%H:%M:%S", time.gmtime()))

# remove the punctuation
import string
table = str.maketrans('', '', string.punctuation)
words = [w.translate(table) for w in words]
print(len(words))
print(time.strftime("%H:%M:%S", time.gmtime()))
print(words)

21:52:03
21:52:03
21:52:03
21:52:03
165805
21:52:03


In [24]:
# -----------------------------------------------------------------
# Preprocessing the text
# remove the non-alphabetic tokens
# convert the words to lower case
# remove the stop words
# remove the short words
# count the frequency of each word
# remove the words that appear only once
# create a vocabulary
# create a dictionary to convert words to integers
# -----------------------------------------------------------------



print(time.strftime("%H:%M:%S", time.gmtime()))

# remove the non-alphabetic tokens
words = [word for word in words if word.isalpha()]
print("step 1", len(words))

print(time.strftime("%H:%M:%S", time.gmtime()))


# convert the words to lower case
words = [word.lower() for word in words]
print("step 2", len(words))

print(time.strftime("%H:%M:%S", time.gmtime()))

# remove the stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

words = [word for word in words if word not in stop_words]
print("step 3", len(words))
print(time.strftime("%H:%M:%S", time.gmtime()))


# # remove the short words
# words = [word for word in words if len(word) > 1]
# print("step 4", len(words))

# print(time.strftime("%H:%M:%S", time.gmtime()))

# count the frequency of each word
from collections import Counter
word_freq = Counter(words)
print("step 5", len(word_freq))

print(time.strftime("%H:%M:%S", time.gmtime()))

# remove the words that appear only once
words = [word for word in words if word_freq[word] > 1]
print("step 6", len(words))

print(time.strftime("%H:%M:%S", time.gmtime()))

# create a vocabulary
vocab = set(words)
print("step 7", len(vocab))

print(time.strftime("%H:%M:%S", time.gmtime()))


# create a dictionary to convert words to integers
word_to_int = {w: i for i, w in enumerate(vocab)}
print(len(word_to_int))

# convert the words to integers
int_words = [word_to_int[word] for word in words]
print(len(int_words))

# create a dictionary to convert integers to words
int_to_word = {i: w for i, w in enumerate(vocab)}
print(len(int_to_word))

print(time.strftime("%H:%M:%S", time.gmtime()))

# create corpus by converting words to numbers


21:52:11
step 1 155335
21:52:11
step 2 155335
21:52:11
step 3 84961
21:52:11
step 5 11896
21:52:11
step 6 78205
21:52:11
step 7 5140
21:52:11
5140
78205
5140
21:52:11


In [25]:
# -----------------------------------------------------------------
# Word2Vec is a predictive model that learns to predict the context from the target word.
# The input to the model is a word and the output is the context words.
# The window size is the maximum distance between the target word and the context words.
# The input to the model is a one-hot encoded vector of the target word.
# The output of the model is a probability distribution over the vocabulary.
# The probability distribution is the likelihood of the context words given the target word.
# The model is trained to maximize the likelihood of the context words given the target word (Skip-gram model).
# The model is trained using the negative log likelihood loss function.
# The model is trained using the stochastic gradient descent optimization algorithm.
# The word2vec algorithm is a two-layer neural network.
# -----------------------------------------------------------------


from gensim.models import word2vec

model_cb = word2vec.Word2Vec(sentences=[words],  window = 2 ,epochs = 50, vector_size=300,  min_count=1)

print(time.strftime("%H:%M:%S", time.gmtime()))

# # Optionally save the model
# model.save('word2vec.model_cb')

# print(time.strftime("%H:%M:%S", time.gmtime()))

# # load the model


# model_cb = word2vec.Word2Vec.load('word2vec.model_cb')
print(time.strftime("%H:%M:%S", time.gmtime()))

# # get the word vector for a word
# print(model_cb.wv['diabetes'].shape)

print(time.strftime("%H:%M:%S", time.gmtime()))

# get the most similar words
print(model_cb.wv.most_similar('one', topn=5))

print(time.strftime("%H:%M:%S", time.gmtime()))



21:52:25
21:52:25
21:52:25
[('exactly', 0.9967808723449707), ('made', 0.9964603781700134), ('silicone', 0.9964349269866943), ('cover', 0.9963006973266602), ('best', 0.9957575798034668)]
21:52:25


In [26]:
# print the vocabulary in text file
# len(vocab)
vocab

{'heat',
 'market',
 'named',
 'functions',
 'fisher',
 'ward',
 'teepo',
 'swinging',
 'checking',
 'trying',
 'hyrule',
 'remaining',
 'gold',
 'argue',
 'ultimate',
 'hair',
 'possible',
 'constantly',
 'classic',
 'row',
 'time',
 'drawback',
 'chamber',
 'unlock',
 'anytime',
 'actually',
 'hang',
 'item',
 'necessity',
 'owners',
 'blends',
 'ended',
 'knock',
 'yet',
 'knew',
 'screw',
 'crowbar',
 'defining',
 'solved',
 'raquel',
 'silo',
 'important',
 'tm',
 'knight',
 'gotta',
 'anthony',
 'tunes',
 'show',
 'met',
 'sacred',
 'crapthe',
 'fit',
 'line',
 'frustration',
 'fingers',
 'involves',
 'gf',
 'trex',
 'asin',
 'zombies',
 'uzis',
 'radio',
 'satisfy',
 'mass',
 'tradition',
 'mission',
 'legends',
 'retrieve',
 'learning',
 'allowed',
 'neo',
 'nuke',
 'structures',
 'wanted',
 'phantasmagoria',
 'multitude',
 'touch',
 'fiddling',
 'z',
 'always',
 'availablethe',
 'whenever',
 'rich',
 'left',
 'alter',
 'todays',
 'consoles',
 'deep',
 'nuts',
 'petersen',
 'in

In [27]:
# list the words in the vocabulary that the model learned
len(model_cb.wv.index_to_key)
model_cb.wv.index_to_key

['game',
 'overall',
 'helpful',
 'reviewerid',
 'asin',
 'summary',
 'reviewtext',
 'reviewtime',
 'unixreviewtime',
 'reviewername',
 'one',
 'games',
 'time',
 'like',
 'get',
 'great',
 'play',
 'good',
 'best',
 'fun',
 'first',
 'zelda',
 'graphics',
 'even',
 'really',
 'still',
 'also',
 'much',
 'ever',
 'dont',
 'played',
 'well',
 'many',
 'better',
 'levels',
 'go',
 'back',
 'mario',
 'would',
 'way',
 'characters',
 'find',
 'playing',
 'story',
 'new',
 'level',
 'world',
 'series',
 'link',
 'make',
 'buy',
 'lot',
 'pretty',
 'music',
 'nintendo',
 'never',
 'raider',
 'different',
 'tomb',
 'little',
 'gameplay',
 'around',
 'thing',
 'made',
 'know',
 'could',
 'say',
 'classic',
 'love',
 'ocarina',
 'got',
 'though',
 'old',
 'people',
 'think',
 'things',
 'right',
 'original',
 'want',
 'im',
 'easy',
 'years',
 'hard',
 'look',
 'youre',
 'beat',
 'take',
 'use',
 'long',
 'every',
 'cant',
 'mode',
 'version',
 'weapons',
 'two',
 'part',
 'ive',
 'crash',
 'se

In [29]:
#   -----------------------------------------------------------------
# get the similarity between two words
#   -----------------------------------------------------------------

print(model_cb.wv.similarity('zelda', 'heart'))
print(time.strftime("%H:%M:%S", time.gmtime()))



0.8097283
21:52:56


In [30]:
# Experimentation with data structures
# Convert model.wv to a list of dictionaries


import pandas as pd

# Convert model.wv to a list of dictionaries
word_vectors = [{'word': word, 'vector': model_cb.wv[word]} for word in model_cb.wv.index_to_key]

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame.from_records(word_vectors)

# Print the DataFrame
print(df.head(n = 50))



              word                                             vector
0             game  [0.14726841, 0.2675027, 0.15350933, 0.18677944...
1          overall  [-0.41355503, 0.50909615, 0.06568519, 0.265697...
2          helpful  [0.6562877, 0.104419164, 0.44994003, 0.2843785...
3       reviewerid  [0.2581557, 0.23815838, 0.5039665, 0.39773974,...
4             asin  [0.39486924, 0.19696124, 0.59134626, 0.2907761...
5          summary  [-0.2557342, 0.41270193, 0.17242137, 0.4297283...
6       reviewtext  [0.49070364, 0.10820619, 0.3717794, 0.33909672...
7       reviewtime  [0.22230281, 0.3045503, 0.5091604, 0.445033, -...
8   unixreviewtime  [0.049603473, 0.28855324, 0.37950322, 0.320178...
9     reviewername  [0.5175663, 0.21741618, 0.5265212, 0.4151329, ...
10             one  [0.07360341, 0.29183272, 0.15452932, 0.1891607...
11           games  [0.13681239, 0.4731109, 0.21195054, 0.14671946...
12            time  [0.18521364, 0.3131428, 0.18284409, 0.1700498,...
13            like  

In [31]:
#   -----------------------------------------------------------------
# Training a Skip-gram model
#   -----------------------------------------------------------------

from gensim.models import Word2Vec

model_sg = Word2Vec([words], window=2, min_count=1, sg=1, vector_size=300, epochs=50)

# Print the most similar words
print(model_sg.wv.most_similar("king"))


[('diminishing', 0.9932238459587097), ('innovation', 0.9912392497062683), ('features', 0.9892754554748535), ('returns', 0.9883993864059448), ('addition', 0.9878091812133789), ('gorgeous', 0.9870482087135315), ('combination', 0.9869081377983093), ('gameif', 0.986084520816803), ('ego', 0.9857894778251648), ('obsession', 0.9851407408714294)]


In [34]:
# -----------------------------------------------------------------
# Finding the cosine distance between the word embeddings
# The cosine distance between two vectors is the cosine of the angle between the two vectors.
# The cosine distance is a measure of similarity between two vectors.
# The cosine distance ranges from -1 to 1.
# -----------------------------------------------------------------


from sklearn.metrics.pairwise import cosine_similarity

# Get the word embeddings for "america" from Skip-gram and CBOW models
embedding_sg = model_sg.wv["world"].reshape(1, -1)
embedding_cb = model_cb.wv["world"].reshape(1, -1)

# Calculate the cosine similarity between the two embeddings
cosine_distance = 1 - cosine_similarity(embedding_sg, embedding_cb)[0][0]

print("Cosine distance between Skip-gram and CBOW embeddings for 'world':", cosine_distance)



Cosine distance between Skip-gram and CBOW embeddings for 'world': 0.08574259281158447


## GloVe: Global Vectors

In [35]:
# -----------------------------------------------------------------
# import glove model and print embeddings for a word
# 100 dimension model
# -----------------------------------------------------------------

from gensim.models import KeyedVectors


print(time.strftime("%H:%M:%S", time.gmtime()))

# -----------------------------------------------------------------
# # Load the GloVe model
# The key is to use the no_header parameter to indicate that the file does not contain a header
# This caused the model to load successfully and the embeddings to be accessible.
# -----------------------------------------------------------------
glove_model3_100 = KeyedVectors.load_word2vec_format('./glove.6B/glove.6B.100d.txt', binary=False, no_header=True)

print(time.strftime("%H:%M:%S", time.gmtime()))
# 

22:07:11
22:07:41


In [37]:
# -----------------------------------------------------------------
# 300 dimension model
# -----------------------------------------------------------------
print(time.strftime("%H:%M:%S", time.gmtime()))
glove_model3_300 = KeyedVectors.load_word2vec_format('./glove.6B/glove.6B.300d.txt', binary=False, no_header=True)
print(time.strftime("%H:%M:%S", time.gmtime()))


22:08:03
22:09:14


In [38]:
# -----------------------------------------------------------------
# Print vocaublary size for both glove models
# -----------------------------------------------------------------
print(len(glove_model3_100.index_to_key))
print(len(glove_model3_300.index_to_key))


400000
400000


In [39]:
# -----------------------------------------------------------------
# print embeddings for a word in glove_model3
# -----------------------------------------------------------------
# print(glove_model3['america'])

print(glove_model3_300['america'])

# # Print the most similar words
print(glove_model3_300.most_similar("king"))


[ 5.8489e-02 -2.5842e-01 -1.3714e-02 -1.2991e-01 -2.4317e-01  8.0817e-01
 -7.3859e-01  6.8369e-01  3.5096e-02 -1.4020e+00  9.0090e-01  1.6326e-01
  1.6109e-01 -2.7931e-01  2.1604e-01 -2.6137e-01  1.4417e-01 -2.6277e-01
 -2.4252e-01  8.2353e-02 -6.2638e-02  3.7031e-01  6.5962e-01  2.3178e-01
 -4.4202e-01 -1.8971e-01  2.1155e-01 -8.6515e-02 -1.1751e-01 -1.7021e-01
 -4.3983e-02  5.4723e-01 -1.4636e-01 -4.0266e-01 -8.3730e-01  1.2009e-01
 -7.8500e-02 -4.2377e-01 -2.0081e-01 -1.8051e-01 -5.1198e-02 -1.6930e-01
 -1.3624e-01  4.8605e-01  8.1530e-02  6.3648e-02  3.1479e-01  2.3262e-01
  1.1605e-01 -2.5602e-01  1.7535e-01  5.9225e-02  5.4992e-03  3.5402e-01
 -4.2574e-01  1.0849e-01 -3.1691e-01 -2.0802e-01  1.2963e-01 -6.8191e-02
  2.4378e-01 -2.4580e-02  9.8606e-02 -1.8751e-01 -2.9935e-01  4.0835e-01
 -4.0737e-02  7.1439e-01  2.2632e-02 -2.0012e-02 -2.4854e-01  5.9211e-02
 -1.2204e-01  7.5426e-02 -1.6433e-01 -7.0031e-02 -2.1207e-01  2.0083e-02
 -3.9992e-01 -3.7857e-01 -3.9331e-01  2.3015e-01 -3

## Fasttext

In [None]:
print(time.strftime("%H:%M:%S", time.gmtime()))
# Load Fasttext model
from gensim.models import KeyedVectors

# Load the FastText model
fasttext_model = KeyedVectors.load_word2vec_format('/Users/p.mittal/Library/Mobile Documents/com~apple~CloudDocs/Roux/Courses/NLP/Data/Embeddings/Fasttext/cc.en.300.vec', binary=False)


# Print the word embedding for "america"
print(fasttext_model["america"])
print(time.strftime("%H:%M:%S", time.gmtime()))


In [None]:
# print the length of total words in the model
print(len(fasttext_model.index_to_key))

# print a list of 100 randomly chosen words in the model
import random
random_words = random.sample(fasttext_model.index_to_key, 100)
print(random_words)

# Print all subwords of diabetes in the model
subwords = [subword for subword in fasttext_model.index_to_key if 'diabetes' in subword]
print(subwords)

print(len(subwords))


In [None]:
# print the most similar words to a word using fasttext model
# limit to 5 most similar words
print(fasttext_model.most_similar("diabetes", topn=10))


In [None]:
# print dimensions of all the models to compare
print("word2vec model:", model_sg.wv['america'].shape)
print("glove model 300:", glove_model3_300['america'].shape)
print("fasttext model:", fasttext_model['america'].shape)





In [None]:
# Compare all of the above models using the word "diabetes"
# Get the word embeddings for "diabetes" from all models
embedding_glove_model3_300 = glove_model3_300["diabetes"].reshape(1, -1)
embedding_fasttext = fasttext_model["diabetes"].reshape(1, -1)
embedding_sg = model_sg.wv["diabetes"].reshape(1, -1)
embedding_cb = model_cb.wv["diabetes"].reshape(1, -1)

# # Calculate the cosine similarity between the embeddings
cosine_distance_glove_fasttext = 1 - cosine_similarity(embedding_glove_model3_300, embedding_fasttext)[0][0]
cosine_distance_sg_cb = 1 - cosine_similarity(embedding_sg, embedding_cb)[0][0]
cosine_distance_glove_sg = 1 - cosine_similarity(embedding_glove_model3_300, embedding_sg)[0][0]
cosine_distance_glove_cb = 1 - cosine_similarity(embedding_glove_model3_300, embedding_cb)[0][0]
cosine_distance_fasttext_sg = 1 - cosine_similarity(embedding_fasttext, embedding_sg)[0][0]
cosine_distance_fasttext_cb = 1 - cosine_similarity(embedding_fasttext, embedding_cb)[0][0]

print("Cosine distance between GloVe and FastText embeddings for 'diabetes':", cosine_distance_glove_fasttext)
print("Cosine distance between Skip-gram and CBOW embeddings for 'diabetes':", cosine_distance_sg_cb)
print("Cosine distance between GloVe and Skip-gram embeddings for 'diabetes':", cosine_distance_glove_sg)
print("Cosine distance between GloVe and CBOW embeddings for 'diabetes':", cosine_distance_glove_cb)
print("Cosine distance between FastText and Skip-gram embeddings for 'diabetes':", cosine_distance_fasttext_sg)
print("Cosine distance between FastText and CBOW embeddings for 'diabetes':", cosine_distance_fasttext_cb)


# Since the cosine distance is larger than 1 for some of the models, it is not a valid distance measure.
# let's normalize the embeddings and then calculate the cosine distance

# Normalize the embeddings
embedding_glove_model3_300_nm = embedding_glove_model3_300/ np.linalg.norm(embedding_glove_model3_300)
embedding_fasttext_nm = embedding_fasttext / np.linalg.norm(embedding_fasttext)
embedding_sg_nm = embedding_sg / np.linalg.norm(embedding_sg)
embedding_cb_nm = embedding_cb / np.linalg.norm(embedding_cb)


# # Calculate the cosine similarity between the embeddings
cosine_distance_glove_fasttext = 1 - cosine_similarity(embedding_glove_model3_300_nm, embedding_fasttext_nm)[0][0]
cosine_distance_sg_cb = 1 - cosine_similarity(embedding_sg_nm, embedding_cb_nm)[0][0]
cosine_distance_glove_sg = 1 - cosine_similarity(embedding_glove_model3_300_nm, embedding_sg_nm)[0][0]
cosine_distance_glove_cb = 1 - cosine_similarity(embedding_glove_model3_300_nm, embedding_cb_nm)[0][0]
cosine_distance_fasttext_sg = 1 - cosine_similarity(embedding_fasttext_nm, embedding_sg_nm)[0][0]
cosine_distance_fasttext_cb = 1 - cosine_similarity(embedding_fasttext_nm, embedding_cb_nm)[0][0]

print("Cosine distance between GloVe and FastText embeddings for 'diabetes':", cosine_distance_glove_fasttext)
print("Cosine distance between Skip-gram and CBOW embeddings for 'diabetes':", cosine_distance_sg_cb)
print("Cosine distance between GloVe and Skip-gram embeddings for 'diabetes':", cosine_distance_glove_sg)
print("Cosine distance between GloVe and CBOW embeddings for 'diabetes':", cosine_distance_glove_cb)
print("Cosine distance between FastText and Skip-gram embeddings for 'diabetes':", cosine_distance_fasttext_sg)
print("Cosine distance between FastText and CBOW embeddings for 'diabetes':", cosine_distance_fasttext_cb)




In [None]:
# Find the norm of the embeddings
print("Norm of GloVe model 300:", np.linalg.norm(embedding_glove_model3_300))
print("Norm of FastText model:", np.linalg.norm(embedding_fasttext))
print("Norm of Skip-gram model:", np.linalg.norm(embedding_sg))
print("Norm of CBOW model:", np.linalg.norm(embedding_cb))

print("Norm of GloVe model 300:", np.linalg.norm(embedding_glove_model3_300_nm))
print("Norm of FastText model:", np.linalg.norm(embedding_fasttext_nm))
print("Norm of Skip-gram model:", np.linalg.norm(embedding_sg_nm))
print("Norm of CBOW model:", np.linalg.norm(embedding_cb_nm))


In [None]:
# Extracting word vectors from the model glove_model3_300

# glove_model3_300.vectors.shape



# -------------------------
# Reducing dimensions to 3
# -------------------------

umap_model = UMAP(n_components=3)

glove_umap_embeddings = umap_model.fit_transform(glove_model3_300.vectors)

glove_umap_embeddings.shape


In [None]:
# print the first 5 rows of the embeddings
# print(glove_umap_embeddings[:5])

# 3d plot first 10000 words using plotly
import plotly.express as px

# Create a DataFrame with the UMAP embeddings and the words
df = pd.DataFrame(glove_umap_embeddings[:10000], columns=["x", "y", "z"])
df["word"] = glove_model3_300.index_to_key[:10000]

# Create a 3D scatter plot

fig = px.scatter_3d(df, x="x", y="y", z="z", text="word", title="UMAP 3D Embeddings")
fig.update_traces(textposition='top center')
fig.show()




## Experimentation this point onward

In [None]:


# filename = '/Users/p.mittal/Library/Mobile Documents/com~apple~CloudDocs/Roux/Courses/NLP/Data/text8 Wikipedia/text8'

# def read_data(filename, char):
#     with open(filename, 'r') as f:
#         data = f.read(char)
#     return data


# text = read_data(filename, 100000)

# print(time.strftime("%H:%M:%S", time.gmtime()))

# len(text)

# print(time.strftime("%H:%M:%S", time.gmtime()))

# # split the text into words
# words = text.split()
# len(words)
# print(time.strftime("%H:%M:%S", time.gmtime()))

# # remove the punctuation
# import string
# table = str.maketrans('', '', string.punctuation)
# words = [w.translate(table) for w in words]
# print(len(words))
# print(time.strftime("%H:%M:%S", time.gmtime()))



In [None]:
# # Build subwords vocabulary using fasttext model
# from gensim.models import FastText

# # Train the FastText model
# fasttext_model = FastText([words], window=5, min_count=1, sg=1, vector_size=300, epochs=50)

# # Print the most similar words
# # print(fasttext_model.wv.most_similar("king"))

# # print the total voculabulary size including subwords
# print(len(fasttext_model.wv.index_to_key))


# # how many subwords are there for the word "diabetes"

In [None]:
# # print subwords in the model
# subwords = [subword for subword in fasttext_model.wv.index_to_key if 'anarchist' in subword]
# print(subwords)

In [None]:
# # create fasttext embeddings using the text file
# from gensim.models import FastText

# # # Train the FastText model
# model_ft = FastText([words], window=2, min_count=1, sg=1, vector_size=300, epochs=50)

# # # Print the most similar words
# # print(model_ft.wv.most_similar("king"))

# # # Get the word embeddings for "america" from FastText model
# # create fasttext embeddings using the text file
# # Train the FastText model
# model_ft = FastText(window=2, min_count=1, sg=1, vector_size=300, epochs=50)

# # Build the vocabulary with subwords
# model_ft.build_vocab(corpus_file='/path/to/your/text/file')

# # Train the model on the corpus
# model_ft.train(corpus_file='/path/to/your/text/file', total_examples=model_ft.corpus_count, epochs=model_ft.epochs)

# # Print the most similar words
# print(model_ft.wv.most_similar("king"))

# # Get the word embeddings for "america" from FastText model
# embedding_ft = model_ft.wv["america"].reshape(1, -1)

# # Calculate the cosine similarity between the embeddings
# cosine_distance = 1 - cosine_similarity(embedding_ft, embedding_glove_model3_300)[0][0]

# print("Cosine distance between FastText and GloVe embeddings for 'america':", cosine_distance)

# # embedding_ft = model_ft.wv["america"].reshape(1, -1)

# # # Calculate the cosine similarity between the embeddings
# # cosine_distance = 1 - cosine_similarity(embedding_ft, embedding_glove_model3_300)[0][0]

# # print("Cosine distance between FastText and GloVe embeddings for 'america':", cosine_distance)

