In [None]:
import numpy as np
import cupy as cp
from cuml.decomposition import TruncatedSVD
from scipy.stats import spearmanr

In [None]:
class ReducedEmbeddingModel:
    """Minimal word-embedding container: a vector table plus its vocabulary.

    Attributes:
        vectors: Indexable collection of embedding rows; row ``i`` belongs to
            ``index_to_key[i]``.
        index_to_key: Vocabulary in row order.
        key_to_index: Reverse mapping from word to row position. When a word
            appears more than once in ``index_to_key`` the last position wins.
    """

    def __init__(self, vectors, index_to_key):
        self.vectors = vectors
        self.index_to_key = index_to_key
        lookup = {}
        for position, token in enumerate(index_to_key):
            lookup[token] = position
        self.key_to_index = lookup

    def __contains__(self, word):
        """Return True when ``word`` is part of the vocabulary."""
        return word in self.key_to_index

    def __getitem__(self, word):
        """Return the embedding row stored for ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        if word not in self.key_to_index:
            raise KeyError(f"Word '{word}' not found in vocabulary")
        return self.vectors[self.key_to_index[word]]

def load_glove_model(file_path):
    """Load GloVe-style text embeddings into a ReducedEmbeddingModel.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated float components. Blank lines are ignored.

    Args:
        file_path: Path to a UTF-8 encoded embedding text file.

    Returns:
        ReducedEmbeddingModel whose ``vectors`` is a float32 array of shape
        (n_words, dim) and whose ``index_to_key`` lists the tokens in file order.

    Raises:
        ValueError: If the file contains no embeddings, if a line's
            dimensionality differs from that of the first embedding line, or
            (raised by numpy) if a component cannot be parsed as a float.
    """
    index_to_key = []
    vectors = []
    expected_dim = None
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no embedding.
                continue
            vector = np.array(parts[1:], dtype=np.float32)
            if expected_dim is None:
                expected_dim = vector.shape[0]
            elif vector.shape[0] != expected_dim:
                # Fail here with the offending line instead of later in np.vstack.
                raise ValueError(
                    f"Inconsistent embedding size on line {line_number} of '{file_path}': "
                    f"expected {expected_dim} values, found {vector.shape[0]}"
                )
            index_to_key.append(parts[0])
            vectors.append(vector)
    if not vectors:
        raise ValueError(f"No embeddings found in '{file_path}'")
    return ReducedEmbeddingModel(np.vstack(vectors), index_to_key)
    
# Load SimLex-999 word similarity data
def load_simlex_999(file_path):
    """Load SimLex-999 word pairs together with their similarity ratings.

    The file is read as tab-separated text whose first line is a header.
    The two words come from columns 0 and 1 and the rating from column 3.
    Blank lines are ignored.

    Args:
        file_path: Path to the tab-separated SimLex-999 file (read as UTF-8).

    Returns:
        List of ``(word1, word2, similarity_score)`` tuples in file order, with
        the score converted to ``float``. Empty when the file has no data rows.
    """
    data = []
    # Explicit encoding keeps the result independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        next(file, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines such as a trailing newline at end of file.
                continue
            parts = stripped.split('\t')
            word1, word2, similarity_score = parts[0], parts[1], float(parts[3])
            data.append((word1, word2, similarity_score))
    return data

# Cosine similarity function for similarity evaluation
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    The dot product is divided by the product of the Euclidean norms.
    If either vector has a norm of zero the result is 0.
    """
    magnitude_a = np.linalg.norm(v1)
    magnitude_b = np.linalg.norm(v2)
    if magnitude_a != 0 and magnitude_b != 0:
        return np.dot(v1, v2) / (magnitude_a * magnitude_b)
    # Undefined angle for a zero vector: report no similarity.
    return 0

# Evaluate embeddings on the word similarity data
def evaluate_embeddings(model, word_pairs):
    """Score embeddings against similarity ratings using Spearman correlation.

    Args:
        model: Object exposing ``index_to_key`` and ``vectors``, where row ``i``
            of ``vectors`` belongs to ``index_to_key[i]``. If it also exposes a
            ``key_to_index`` mapping, that mapping is reused instead of
            rebuilding a word lookup over the whole vocabulary on every call.
        word_pairs: Iterable of ``(word1, word2, true_score)`` tuples.

    Returns:
        Spearman rank correlation between the true scores and the cosine
        similarities, computed over the pairs whose words are both in the
        vocabulary. Pairs with an unknown word are skipped.
    """
    key_to_index = getattr(model, 'key_to_index', None)
    if key_to_index is None:
        # Fallback for models that only provide the ordered vocabulary.
        key_to_index = {word: idx for idx, word in enumerate(model.index_to_key)}
    vectors = model.vectors

    predicted_scores = []
    true_scores = []

    for word1, word2, true_score in word_pairs:
        if word1 in key_to_index and word2 in key_to_index:
            sim_score = cosine_similarity(vectors[key_to_index[word1]],
                                          vectors[key_to_index[word2]])
            predicted_scores.append(sim_score)
            true_scores.append(true_score)

    # Calculate Spearman correlation for word similarity performance
    correlation, _ = spearmanr(true_scores, predicted_scores)
    return correlation

def task_driven_dimensionality_reduction(embeddings, target_dim, word_pairs, model,
                                         initial_components=300):
    """Search for the ``target_dim``-dimensional projection that best fits the similarity task.

    The embeddings are first projected with TruncatedSVD onto
    ``min(original_dim, initial_components)`` components. Then, for every
    ``num_remove`` from 0 to ``initial_n_components - target_dim``, the first
    ``num_remove`` columns of that projection are dropped, the remainder is
    reduced to ``target_dim`` columns by a second TruncatedSVD (skipped when it
    already has exactly ``target_dim`` columns), and the candidate is scored
    with ``evaluate_embeddings``.

    Args:
        embeddings: 2-D array-like of shape (n_words, original_dim); copied to
            the GPU with ``cp.asarray``.
        target_dim: Number of columns of the returned embeddings.
        word_pairs: ``(word1, word2, score)`` tuples passed to ``evaluate_embeddings``.
        model: Object whose ``index_to_key`` labels the rows of ``embeddings``.
        initial_components: Upper bound on the number of components kept by
            the first SVD. Defaults to 300.

    Returns:
        Tuple ``(best_embeddings, best_correlation, results)``:
        ``best_embeddings`` is the NumPy array of the best-scoring candidate,
        ``best_correlation`` its correlation, and ``results`` a list of
        ``(num_remove, correlation)`` tuples for every candidate.
        ``best_embeddings`` is ``None`` (and ``best_correlation`` is -1.0) when
        no candidate scored above -1.0, e.g. when every correlation is NaN.

    Raises:
        ValueError: If ``target_dim`` or ``initial_components`` is below 1, or
            if ``target_dim`` exceeds the number of components of the first SVD.
    """
    # Validate the cheap arguments before copying anything to the GPU.
    if target_dim < 1:
        raise ValueError(f"target_dim must be at least 1, got {target_dim}")
    if initial_components < 1:
        raise ValueError(f"initial_components must be at least 1, got {initial_components}")

    cu_embeddings = cp.asarray(embeddings)
    original_dim = cu_embeddings.shape[1]

    initial_n_components = min(original_dim, initial_components)
    if target_dim > initial_n_components:
        # Without this check the search loop would be empty and None would be returned silently.
        raise ValueError(
            f"target_dim ({target_dim}) cannot exceed the number of initial "
            f"components ({initial_n_components})"
        )

    best_correlation = -1.0
    best_embeddings = None
    results = []

    # Perform initial TruncatedSVD to a larger dimension to capture more variance initially
    svd = TruncatedSVD(n_components=initial_n_components, algorithm="full")
    reduced_embeddings = svd.fit_transform(cu_embeddings)

    for num_remove in range(initial_n_components - target_dim + 1):
        # Remove top components
        temp_embeddings = reduced_embeddings[:, num_remove:]

        # Further reduce to target_dim if necessary
        if temp_embeddings.shape[1] > target_dim:
            svd_final = TruncatedSVD(n_components=target_dim, algorithm="full")
            final_embeddings = svd_final.fit_transform(temp_embeddings)
        else:
            final_embeddings = temp_embeddings

        # Scoring runs on the CPU, so bring the candidate back as a NumPy array.
        final_embeddings_np = cp.asnumpy(final_embeddings)

        # Create a reduced model for evaluation
        reduced_model = ReducedEmbeddingModel(final_embeddings_np, model.index_to_key)

        # Evaluate
        correlation = evaluate_embeddings(reduced_model, word_pairs)
        results.append((num_remove, correlation))
        print(f"Removed top {num_remove} components, new dim = {final_embeddings.shape[1]}, correlation = {correlation:.4f}")

        # A NaN correlation never compares greater, so it can never become the best.
        if correlation > best_correlation:
            best_correlation = correlation
            best_embeddings = final_embeddings_np

    print(f"\nBest correlation: {best_correlation:.4f}")
    return best_embeddings, best_correlation, results

In [None]:
# Notebook configuration: input file locations and the output dimensionality.
glove_file_path = '/teamspace/uploads/glove.6B.300d.txt'  # Update this path to your GloVe file
simlex_file_path = '/teamspace/studios/pca/SimLex-999.txt'  # Update this path to your SimLex-999 file
# Number of dimensions to keep; the CSV export further down writes exactly two columns ('x', 'y').
target_dim = 2

# Parse the GloVe text file into a ReducedEmbeddingModel (vectors plus vocabulary).
print("Loading GloVe model...")
original_model = load_glove_model(glove_file_path)

# Load the SimLex-999 (word1, word2, score) tuples used to score each candidate reduction.
word_pairs = load_simlex_999(simlex_file_path)

In [None]:
# Run the component-removal search: project the GloVe vectors with TruncatedSVD,
# drop leading components one at a time, reduce each candidate to target_dim and
# keep the candidate with the highest Spearman correlation on the SimLex-999 pairs.
# `results` holds one (num_removed, correlation) tuple per candidate.
optimized_embeddings, best_correlation, results = task_driven_dimensionality_reduction(
    original_model.vectors,
    target_dim=target_dim,
    word_pairs=word_pairs,
    model=original_model
)

In [None]:
import pandas as pd
# Save the 2D embeddings with word labels to a CSV file
def save_embeddings_to_csv(words, embeddings, file_path):
    """Write 2-D embeddings and their word labels to a CSV file.

    The CSV has the columns ``x``, ``y`` and ``word`` and no index column.

    Args:
        words: Sequence of labels, one per embedding row.
        embeddings: Array-like of shape (n_words, 2).
        file_path: Destination path of the CSV file.

    Raises:
        ValueError: If ``embeddings`` is None, does not have shape
            (n_words, 2), or its row count differs from ``len(words)``.
    """
    if embeddings is None:
        # pandas would otherwise build an empty frame and emit rows of NaN coordinates.
        raise ValueError("embeddings is None; there is nothing to save")
    coords = np.asarray(embeddings)
    if coords.ndim != 2 or coords.shape[1] != 2:
        raise ValueError(
            f"embeddings must have shape (n_words, 2), got {coords.shape}"
        )
    if len(words) != coords.shape[0]:
        raise ValueError(
            f"Got {len(words)} words for {coords.shape[0]} embedding rows"
        )

    print(f"Saving 2D embeddings to {file_path}...")
    df = pd.DataFrame(coords, columns=['x', 'y'])
    df['word'] = words
    df.to_csv(file_path, index=False)
    print("2D embeddings saved successfully.")

In [None]:
# Get words and their high-dimensional embeddings
# Vocabulary in row order: row i of optimized_embeddings is the reduced vector of words[i].
words = original_model.index_to_key  # List of words in vocabulary
# Output file for the reduced embeddings.
file_path="Glove_TSVD_2d_word_embeddings.csv"
# Save the embeddings and words to a CSV file
save_embeddings_to_csv(words, optimized_embeddings, file_path)