In [9]:
import os

# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"

import pathlib
import numpy as np
import pandas as pd
import tensorflow.data as tf_data
import keras
from keras import layers
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [4]:
# Load the 50-dimensional GloVe vectors into a {word: float32 vector} dict.
# The path is built with pathlib so the separator is correct on every OS
# (a hard-coded "\" only resolves on Windows).
path_to_glove_file = pathlib.Path("glove.6B") / "glove.6B.50d.txt"

embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <c0> <c1> ...": split off the word once,
        # then parse the remaining space-separated floats in one call.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Raw (unnormalised) dot product between the "man" and "woman" vectors.
man_vector = embeddings_index["man"]
woman_vector = embeddings_index["woman"]
np.dot(man_vector, woman_vector)

In [6]:
# One row per word (the dict keys become the index), one column per embedding dimension.
glove_df = pd.DataFrame.from_dict(
    data=embeddings_index,
    orient="index",
)

In [7]:
# Preview the first five rows of the embedding table.
glove_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
the,0.418,0.24968,-0.41242,0.1217,0.34527,-0.044457,-0.49688,-0.17862,-0.00066,-0.6566,...,-0.29871,-0.15749,-0.34758,-0.045637,-0.44251,0.18785,0.002785,-0.18411,-0.11514,-0.78581
",",0.013441,0.23682,-0.16899,0.40951,0.63812,0.47709,-0.42852,-0.55641,-0.364,-0.23938,...,-0.080262,0.63003,0.32111,-0.46765,0.22786,0.36034,-0.37818,-0.56657,0.044691,0.30392
.,0.15164,0.30177,-0.16763,0.17684,0.31719,0.33973,-0.43478,-0.31086,-0.44999,-0.29486,...,-6.4e-05,0.068987,0.087939,-0.10285,-0.13931,0.22314,-0.080803,-0.35652,0.016413,0.10216
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
to,0.68047,-0.039263,0.30186,-0.17792,0.42962,0.032246,-0.41376,0.13228,-0.29847,-0.085253,...,-0.094375,0.018324,0.21048,-0.03088,-0.19722,0.082279,-0.09434,-0.073297,-0.064699,-0.26044


In [19]:
# Cluster the word vectors with K-Means (fixed seed for repeatable runs).
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(glove_df)

# Centroids of the fitted clusters, one row per cluster.
cluster_centers = kmeans.cluster_centers_

# For every centroid, rank all words by cosine similarity to it and keep the
# five best matches (argsort is ascending, hence the reversal before slicing).
closest_words = [
    glove_df.index[cosine_similarity([center], glove_df).argsort()[0][::-1][:5]]
    for center in cluster_centers
]

# Report the representative words, numbering clusters from 1.
for i, words in enumerate(closest_words):
    print(f"Cluster {i+1}: {words}")

Cluster 1: Index(['kwit', 'darthard', 'nohn', 'yood', 'duhm'], dtype='object')
Cluster 2: Index(['se5', 'rrg', 'el1l', 'emcc', 'kds'], dtype='object')
Cluster 3: Index(['chanjindamanee', 'rungfapaisarn', 'zety', 'barinov', 'dainis'], dtype='object')
Cluster 4: Index(['amphiprion', 'saurolophus', 'atractaspis', 'auctor', 'pesma'], dtype='object')
Cluster 5: Index(['rather', 'even', 'making', 'instead', 'their'], dtype='object')
Cluster 6: Index(['raucousness', 'self-reflexive', 'essayistic', 'twisties',
       'dark-coloured'],
      dtype='object')
Cluster 7: Index(['belmullet', 'chiplun', 'segbwema', 'lugazi', 'castledawson'], dtype='object')
Cluster 8: Index(['36.39', '56.94', '67.71', '63.39', '59.17'], dtype='object')
Cluster 9: Index(['dukker', 'kaufer', 'juday', 'goldgeier', 'rumbiak'], dtype='object')
Cluster 10: Index(['coutinho', 'aleixo', 'guedes', 'laureano', 'manelli'], dtype='object')


In [35]:
# Ten words most similar to "frog", ranked by cosine similarity.
frog_vector = glove_df.loc["frog"]
frog_similarities = cosine_similarity([frog_vector], glove_df)
top_positions = frog_similarities.argsort()[0][::-1][:10]
words_similar_to_frog = glove_df.index[top_positions]

# NOTE: the ranking above uses cosine similarity, but the value printed here is
# the raw dot product, so the printed scores need not be in decreasing order.
for word in words_similar_to_frog:
    dot_with_frog = np.dot(embeddings_index["frog"], embeddings_index[word])
    print(word + ":\t" + str(dot_with_frog))

frog:	23.030615
snake:	19.313877
ape:	14.616077
toad:	17.365572
monkey:	17.25082
spider:	17.16853
lizard:	17.575285
spiny:	18.676533
orchid:	18.294285
cat:	15.7096195


In [59]:
# Ten words whose vectors have the largest raw dot product with "queen".
queen_dot_products = np.dot(glove_df, glove_df.loc["queen"])
top_queen_positions = queen_dot_products.argsort()[::-1][:10]
glove_df.index[top_queen_positions]

Index(['queen', 'princess', 'king', 'throne', 'royal', 'daughter', 'her',
       'empress', 'prince', 'mother'],
      dtype='object')

In [56]:
# Ten words whose embedding vectors have the largest norm (computed per row).
row_norms = np.linalg.norm(glove_df, axis=1)
largest_norm_positions = row_norms.argsort()[::-1][:10]
glove_df.index[largest_norm_positions]

Index(['non-families', '202-383-7824', 'non-institutionalized', 'www.star',
       'non-obligatory', 'officership', 'republish', 'http://www.nyse.com',
       '20003', '25-64'],
      dtype='object')

In [57]:
# Maybe there's a game here?
# Dot product of the analogy vector (king - man + woman) with "queen".
analogy_vector = glove_df.loc["king"] - glove_df.loc["man"] + glove_df.loc["woman"]
np.dot(analogy_vector, glove_df.loc["queen"])

24.641138

In [61]:
# Baseline for the analogy cell: plain dot product of "king" with "queen".
king_vector = glove_df.loc["king"]
queen_vector = glove_df.loc["queen"]
np.dot(king_vector, queen_vector)

21.877506

In [None]:
# Cluster the GloVe vectors with K-Means and visualise the clusters in 2-D.
#
# Fix: this cell previously read from `glove_data`, a name that is never
# defined in the notebook (NameError on a fresh kernel). The word vectors
# live in `glove_df`, so that is what is clustered and projected here.

# Initialize KMeans with the number of clusters you want
num_clusters = 10  # You can choose any number of clusters you want
# Fixed seed so repeated runs produce the same cluster assignment.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit KMeans to the GloVe vectors
kmeans.fit(glove_df)

# Get cluster labels (one per word) and cluster centers (one per cluster)
cluster_labels = kmeans.labels_
cluster_centers = kmeans.cluster_centers_

# Perform dimensionality reduction for visualization. The centers are projected
# with the PCA fitted on the data so points and centers share the same 2-D axes.
# random_state keeps the projection repeatable if a randomized solver is chosen.
pca = PCA(n_components=2, random_state=42)
reduced_data = pca.fit_transform(glove_df)
reduced_centers = pca.transform(cluster_centers)

# Visualize the clusters: one scatter series per cluster, then the centers on top
plt.figure(figsize=(12, 8))
for i in range(num_clusters):
    cluster_points = reduced_data[cluster_labels == i]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {i}')
plt.scatter(reduced_centers[:, 0], reduced_centers[:, 1], marker='x', color='black', s=100, label='Cluster Centers')
plt.title('K-Means Clustering of GloVe Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()