In [10]:
import numpy as np
from sklearn.manifold import TSNE
from transformers import BertTokenizer, BertModel
import plotly.graph_objs as go
import torch

# Tokenizer and encoder are loaded from the same pre-trained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Words to visualise, laid out one semantic group per line.
# The order matters: the parallel `categories` list is matched by position.
words = [
    "woman", "man", "child",                              # people
    "king", "queen",                                      # royalty
    "teacher", "professor", "student", "doctor", "nurse", # profession
    "car", "bus", "train", "airplane", "bicycle",         # vehicle
    "banana", "apple", "orange", "grape",                 # food
    "skyscraper", "house", "castle",                      # building
    "bridge",                                             # structure
    "mountain", "river", "ocean",                         # nature
    "school", "college", "university",                    # institution
    "city", "village",                                    # location
]

# Tokenize a word and return its BERT embedding
def get_word_embedding(word):
    """Return the final-layer BERT vector at the [CLS] position for ``word``.

    The word is tokenized on its own and passed through the module-level
    ``model``. The vector returned is the hidden state at sequence position 0
    (the [CLS] token), not the hidden state of the word's own token(s).

    Args:
        word: Text to embed; passed straight to the module-level ``tokenizer``.

    Returns:
        NumPy array taken from ``last_hidden_state[:, 0, :]``: one row per
        input sequence, so a single row when one word is passed.
    """
    inputs = tokenizer(word, return_tensors="pt")
    # Inference only: disabling autograd avoids building a computation graph
    # for every word, and lets the result be converted to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of the sequence is the [CLS] token
    return outputs.last_hidden_state[:, 0, :].numpy()

# Get embeddings for all words: one row per word, in the same order as `words`
word_embeddings = np.array([get_word_embedding(word).squeeze() for word in words])

# Reduce dimensionality using t-SNE (from 768D to 3D).
# `max_iter` is the current name of the iteration cap: scikit-learn 1.5 renamed
# `n_iter` to `max_iter` and removes the old name in 1.7.
# perplexity must stay below the number of samples, hence the small value here.
tsne = TSNE(n_components=3, perplexity=5, max_iter=1000, random_state=42)
word_embeddings_3d = tsne.fit_transform(word_embeddings)

# Manual category labels, written as (label, run length) pairs and expanded
# into one label per word. Runs follow the order of the `words` list.
category_runs = [
    ('people', 3),       # woman, man, child
    ('royalty', 2),      # king, queen
    ('profession', 5),   # teacher, professor, student, doctor, nurse
    ('vehicle', 5),      # car, bus, train, airplane, bicycle
    ('food', 4),         # banana, apple, orange, grape
    ('building', 3),     # skyscraper, house, castle
    ('structure', 1),    # bridge
    ('nature', 3),       # mountain, river, ocean
    ('institution', 3),  # school, college, university
    ('location', 2),     # city, village
]
categories = [label for label, run_length in category_runs for _ in range(run_length)]

# One fixed marker colour per category
category_colors = dict(
    people='blue',
    royalty='purple',
    profession='green',
    vehicle='orange',
    food='red',
    building='brown',
    structure='cyan',
    nature='darkgreen',
    institution='pink',
    location='yellow',
)

# Per-point colour list, aligned with `categories`
colors = [category_colors[label] for label in categories]

# Split the (n_words, 3) t-SNE output into its three coordinate columns
x, y, z = word_embeddings_3d.T

# Single 3D scatter trace: every point is labelled with its word and
# coloured by its semantic category
marker_style = dict(size=8, color=colors, opacity=0.8)
trace = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    text=words,
    mode='markers+text',
    marker=marker_style,
    textposition='top center',
)

# Figure title and axis captions
scene_axes = dict(
    xaxis=dict(title='Dimension 1'),
    yaxis=dict(title='Dimension 2'),
    zaxis=dict(title='Dimension 3'),
)
layout = go.Layout(
    title='3D t-SNE Visualization of Word Embeddings (BERT)',
    scene=scene_axes,
)

fig = go.Figure(data=[trace], layout=layout)

# Render the interactive plot
fig.show()



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [16]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load FastText pretrained model from Gensim
print("Loading FastText Model...")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
print("Model loaded!")

# Word whose nearest neighbours we want
query_word = 'student'
student_embedding = fasttext_model.get_vector(query_word)

# The model already stores every vector as one (vocab_size, vector_size)
# matrix, row i belonging to index_to_key[i]. Reusing it avoids copying the
# whole vocabulary row by row into a second, float64 matrix.
vocab_size = len(fasttext_model.index_to_key)
embedding_matrix = fasttext_model.vectors

# Compute cosine similarities in a vectorized way (all at once)
similarities = cosine_similarity([student_embedding], embedding_matrix)[0]

# Take one spare candidate, then drop the query word by its own index rather
# than assuming it is always ranked first.
top_k = 10
query_index = fasttext_model.key_to_index[query_word]
candidates = similarities.argsort()[::-1][:top_k + 1]
top_indices = candidates[candidates != query_index][:top_k]

# Get the corresponding words for the top indices
top_words = [fasttext_model.index_to_key[i] for i in top_indices]

print(f"The {top_k} closest words to '{query_word}' are: {top_words}")


Loading FastText Model...
Model loaded!
The 10 closest words to 'student' are: ['students', 'studen', 'college-student', 'students-and', 'ex-student', 'non-student', 'teacher', 'grad-student', 'graduate-student', 'student-student']


In [24]:
# Look up a single word's vector in the FastText model
def get_embedding(word):
    """Return the vector that ``fasttext_model`` stores for ``word``."""
    embedding = fasttext_model.get_vector(word)
    return embedding

# Signed sum of word vectors, e.g. king - man + woman
def word_vector_math(positive_words, negative_words):
    """Add the vectors of ``positive_words`` and subtract those of ``negative_words``.

    Args:
        positive_words: Iterable of words whose embeddings are added.
        negative_words: Iterable of words whose embeddings are subtracted.

    Returns:
        NumPy array of length ``fasttext_model.vector_size`` holding the
        accumulated result (all zeros when both iterables are empty).
    """
    # Accumulator starts at the zero vector and is updated in place
    total = np.zeros(fasttext_model.vector_size)

    for added in map(get_embedding, positive_words):
        np.add(total, added, out=total)
    for removed in map(get_embedding, negative_words):
        np.subtract(total, removed, out=total)

    return total

# Worked example of the analogy arithmetic: king - man + woman
vector = word_vector_math(
    positive_words=['king', 'woman'],
    negative_words=['man'],
)


def find_similar_words(vector, top_k=10, exclude=()):
    """Describe the ``top_k`` vocabulary words closest to ``vector``.

    Closeness is cosine similarity against every vector in the module-level
    ``fasttext_model``.

    Args:
        vector: 1-D query vector of length ``fasttext_model.vector_size``.
        top_k: Number of neighbours to report. Defaults to 10.
        exclude: Optional iterable of words to leave out of the result, such
            as the words the query vector was built from. Words missing from
            the vocabulary are ignored. Defaults to excluding nothing.

    Returns:
        A sentence (str) listing the closest words, most similar first.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # The model already holds all vectors as one matrix whose row i belongs to
    # index_to_key[i]; rebuilding a copy on every call is unnecessary.
    embedding_matrix = fasttext_model.vectors

    # Match the matrix dtype so the comparison does not have to upcast the
    # whole matrix to the query's wider dtype.
    query = np.asarray(vector, dtype=embedding_matrix.dtype).reshape(1, -1)

    # Compute cosine similarities with the query vector
    similarities = cosine_similarity(query, embedding_matrix)[0]

    key_to_index = fasttext_model.key_to_index
    excluded = {key_to_index[word] for word in exclude if word in key_to_index}

    # Rank descending and take enough candidates to survive the exclusions.
    # Slicing from the front also handles top_k == 0, where `[-top_k:]` would
    # have returned the entire vocabulary.
    candidates = similarities.argsort()[::-1][:top_k + len(excluded)]
    kept = [i for i in candidates if i not in excluded][:top_k]

    # Get the corresponding words for the selected indices
    top_words = [fasttext_model.index_to_key[i] for i in kept]

    return f"The {top_k} closest words to this vector are: {top_words}"


In [25]:
# Analogy: paris - france + germany
vector_1 = word_vector_math(
    positive_words=['paris', 'germany'],
    negative_words=['france'],
)

find_similar_words(vector_1)



"The 10 closest words to this vector are: ['paris', 'germany', 'berlin', 'germany.', 'munich', 'germanys', 'frankfurt', 'german', 'austria', 'dresden']"

In [26]:
# Analogy: he - man + woman
vector_2 = word_vector_math(
    positive_words=['he', 'woman'],
    negative_words=['man'],
)

find_similar_words(vector_2)


'The 10 closest words to this vector are: [\'he\', \'she\', \'He\', \'--she\', \'She\', \'-she\', \'heshe\', \'he-she\', \'.she\', "\'she"]'

In [28]:
# Analogy: rich - poor + wealth
vector = word_vector_math(
    positive_words=['rich', 'wealth'],
    negative_words=['poor'],
)

find_similar_words(vector)


"The 10 closest words to this vector are: ['wealth', 'rich', 'riches', 'wealth-', 'wealths', 'richness', 'wealth.', 'treasures', 'richest', 'wealthly']"