In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tools as Tools


# Load the three pickled artefacts, in this order: the data stored as
# "X.pickle" (presumably the training matrix), the vectorizer whose
# vocabulary_ is consulted below, and the word -> embedding lookup.
X_train, vectorizer_X, omni_embeddings = (
    Tools.read_pickle_data(pickle_name)
    for pickle_name in ("X.pickle", "vectorizer_X.pickle", "omni_embeddings.pickle")
)

# Probe words for the embedding plot, grouped by theme for readability.
# A set literal is used on purpose: "river", "mountain" and "desert" are listed
# under both Nature and Geography, and without de-duplication they would be
# embedded and labelled twice in the plot.
# NOTE(review): "AI" is the only entry containing capitals — if the vectorizer
# vocabulary is lowercased it will not match; confirm and adjust if needed.
words = sorted({
    # Emotions
    "happy", "sad", "joy", "anger", "fear", "love", "hate", "excited", "nervous", "calm",
    # Professions
    "doctor", "engineer", "teacher", "lawyer", "artist", "scientist", "nurse", "chef", "pilot", "writer",
    # Nature
    "tree", "river", "mountain", "ocean", "flower", "desert", "forest", "sky", "cloud", "animal",
    # Technology
    "computer", "internet", "robot", "AI", "software", "hardware", "phone", "camera", "network", "algorithm",
    # Relationships
    "father", "mother", "brother", "sister", "friend", "husband", "wife", "child", "partner", "neighbor",
    # Food
    "bread", "apple", "pizza", "coffee", "chocolate", "milk", "soup", "rice", "cake", "cheese",
    # Geography
    "city", "village", "country", "continent", "river", "lake", "mountain", "valley", "desert", "island",
    # Abstract Concepts
    "freedom", "justice", "peace", "war", "knowledge", "power", "truth", "beauty", "faith", "wealth",
    # Animals
    "cat", "dog", "lion", "tiger", "elephant", "bird", "fish", "whale", "dolphin", "butterfly",
    # Vehicles
    "car", "truck", "bicycle", "train", "airplane", "ship", "boat", "motorcycle", "subway", "helicopter",
    # Sports
    "soccer", "basketball", "tennis", "cricket", "baseball", "golf", "hockey", "boxing", "running", "swimming",
    # Royalty/Leadership
    "king", "queen", "prince", "princess", "leader", "president", "minister", "senator", "governor", "mayor",
    # Miscellaneous
    "book", "music", "movie", "art", "language", "history", "science", "medicine", "education", "philosophy",
})


# Keep only the probe words that are present in both the vectorizer vocabulary
# and the embedding lookup. A single pass builds the final list, so labels and
# embedding rows are aligned one-to-one by construction.
valid_words = []
for word in words:
    if word not in vectorizer_X.vocabulary_:
        continue
    if word not in omni_embeddings:
        # In the vocabulary but without an embedding: report it and skip it.
        print(f"Word '{word}' not found in token embeddings.")
        continue
    valid_words.append(word)

# Fail early with a clear message instead of an obscure error further down.
if not valid_words:
    raise ValueError(
        "None of the probe words are present in both the vocabulary and the embeddings."
    )

# Row i of subset_embeddings is the embedding of valid_words[i].
subset_embeddings = np.array([omni_embeddings[word] for word in valid_words])

# Project the selected embeddings down to 2-D with t-SNE.
n_samples = len(subset_embeddings)
if n_samples < 2:
    raise ValueError(f"t-SNE needs at least 2 embeddings, got {n_samples}.")

# scikit-learn requires perplexity < n_samples. The word list is filtered
# against the vocabulary, so fall back to a smaller value when few words survive.
perplexity = min(30, n_samples - 1)

# The iteration-count argument was renamed from n_iter to max_iter in
# scikit-learn 1.5 (n_iter was removed in 1.7); use whichever name the
# installed version accepts.
iter_param = "max_iter" if "max_iter" in TSNE().get_params() else "n_iter"

tsne = TSNE(
    n_components=2,
    perplexity=perplexity,
    learning_rate=200,
    random_state=42,  # fixed seed so the layout is reproducible
    **{iter_param: 2000},
)
reduced_subset = tsne.fit_transform(subset_embeddings)


# Draw the 2-D projection on a 6x6 inch canvas at 100 dpi
fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
xs, ys = reduced_subset[:, 0], reduced_subset[:, 1]
ax.scatter(xs, ys, alpha=0.7, color='#1f77b4', s=20)  # small, slightly transparent markers

# Write each word next to its point (small font to limit overlap)
for label, x, y in zip(valid_words, xs, ys):
    ax.text(x, y, label, fontsize=6)

# Hide the ticks and tick labels on both axes
ax.set_xticks([])
ax.set_yticks([])

# Write the figure to a PDF file, trimmed to its contents
fig.savefig('word_embeddings_visualization.pdf', format='pdf', bbox_inches='tight')

# Also display the figure interactively
plt.show()