In [1]:
import numpy as np
import cupy as cp
import cuml
from cuml.decomposition import PCA as cumlPCA
from gensim.models import KeyedVectors
from scipy.stats import spearmanr
import csv
import pandas as pd

In [2]:
class ReducedEmbeddingModel:
    """Minimal word -> vector lookup over a vector table and its vocabulary list.

    Attributes are stored exactly as passed in:
      vectors       -- indexable container; ``vectors[i]`` is the vector of word ``i``
      index_to_key  -- iterable of words, position ``i`` naming row ``i``
      key_to_index  -- dict built from ``index_to_key`` mapping word -> row index
    """

    def __init__(self, vectors, index_to_key):
        self.vectors = vectors
        self.index_to_key = index_to_key
        # Later entries overwrite earlier ones, so a repeated word maps to its last position.
        self.key_to_index = dict(
            (word, position) for position, word in enumerate(index_to_key)
        )

    def __contains__(self, word):
        """Return True when ``word`` is part of the vocabulary."""
        return word in self.key_to_index

    def __getitem__(self, word):
        """Return the vector stored for ``word``; raise KeyError for unknown words."""
        position = self.key_to_index.get(word)
        if position is None:
            raise KeyError(f"Word '{word}' not found in vocabulary")
        return self.vectors[position]

def load_word2vec_model(file_path):
    """Read binary word2vec-format vectors and wrap them in a ReducedEmbeddingModel.

    Args:
        file_path: Path handed to ``KeyedVectors.load_word2vec_format`` with ``binary=True``.

    Returns:
        ReducedEmbeddingModel holding the loaded ``vectors`` and ``index_to_key``.
    """
    print("Loading Word2Vec model...")
    keyed_vectors = KeyedVectors.load_word2vec_format(file_path, binary=True)
    return ReducedEmbeddingModel(keyed_vectors.vectors, keyed_vectors.index_to_key)

# Load SimLex-999 word similarity data
def load_simlex_999(file_path):
    """Load SimLex-999 word pairs and their similarity scores.

    The file is read as tab-separated text whose first line is a header.
    Columns 0 and 1 are taken as the two words and column 3 is parsed as the
    float similarity score. Blank lines (for example a trailing newline at the
    end of the file) are skipped.

    Args:
        file_path: Path to the tab-separated SimLex-999 file.

    Returns:
        list of ``(word1, word2, similarity_score)`` tuples in file order;
        empty when the file has no data rows.

    Raises:
        IndexError: a non-blank row has fewer than four tab-separated columns.
        ValueError: column 3 of a row is not a valid float.
    """
    data = []
    with open(file_path, 'r') as file:
        # Skip header; the default keeps an empty file from raising StopIteration.
        next(file, None)
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no pair and would otherwise fail on parts[1].
                continue
            parts = stripped.split('\t')
            word1, word2, similarity_score = parts[0], parts[1], float(parts[3])
            data.append((word1, word2, similarity_score))
    return data

# Cosine similarity function for similarity evaluation
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Yields the integer 0 when either vector has zero Euclidean norm, so the
    division is never attempted with a zero denominator.
    """
    length_a = np.linalg.norm(v1)
    length_b = np.linalg.norm(v2)
    if length_a != 0 and length_b != 0:
        return np.dot(v1, v2) / (length_a * length_b)
    return 0

# Evaluate embeddings on the word similarity data
def evaluate_embeddings(model, word_pairs):
    """Score a word -> vector mapping against human similarity ratings.

    Pairs with either word missing from ``model`` are ignored. For the rest,
    the cosine similarity of the two vectors is compared with the reference
    score using Spearman rank correlation.

    Args:
        model: Object supporting ``word in model`` and ``model[word]``.
        word_pairs: Iterable of ``(word1, word2, true_score)`` triples.

    Returns:
        The Spearman correlation between reference and predicted scores.
    """
    scored_pairs = [
        (reference, cosine_similarity(model[first], model[second]))
        for first, second, reference in word_pairs
        if first in model and second in model
    ]
    true_scores = [reference for reference, _ in scored_pairs]
    predicted_scores = [predicted for _, predicted in scored_pairs]

    # Only the correlation coefficient is needed; the p-value is discarded.
    rho, _p_value = spearmanr(true_scores, predicted_scores)
    return rho

# Task-driven dimensionality reduction using cuML's PCA with optimization for similarity
def task_driven_dimensionality_reduction(embeddings, target_dim=2, word_pairs=None, model=None,
                                         top_n_values=None):
    """Reduce embeddings with PCA and keep the component window that best fits the similarity task.

    PCA is fitted once on the GPU with enough components for every candidate.
    The baseline keeps principal components ``[0, target_dim)``. Each candidate
    ``top_n`` instead discards the first ``top_n`` components and keeps
    ``[top_n, top_n + target_dim)``. Every projection is scored with
    ``evaluate_embeddings`` against ``word_pairs``, progress is printed, and the
    best-scoring projection is returned.

    Args:
        embeddings: 2-D array-like of shape (n_words, n_features); row ``i`` is the
            vector of ``model.index_to_key[i]``.
        target_dim: Number of components in the returned embeddings.
        word_pairs: Iterable of ``(word1, word2, score)`` triples. Required.
        model: Object exposing ``index_to_key``. Required.
        top_n_values: Iterable of non-negative ints giving the numbers of leading
            components to try discarding. Defaults to ``range(297, 299)``, the
            range that was previously hard-coded.

    Returns:
        tuple ``(best_embeddings, best_correlation)``: a NumPy array of shape
        (n_words, target_dim) and the Spearman correlation it achieved.

    Raises:
        ValueError: ``word_pairs`` or ``model`` is missing, a candidate is negative,
            or ``target_dim + max(top_n_values)`` exceeds the number of features.
    """
    if word_pairs is None or model is None:
        raise ValueError("word_pairs and model are both required")

    # The pairs are scored once per candidate, so they must be re-iterable.
    word_pairs = list(word_pairs)
    candidates = list(range(297, 299) if top_n_values is None else top_n_values)
    if any(top_n < 0 for top_n in candidates):
        raise ValueError("top_n_values must contain only non-negative integers")

    # Convert numpy array to CuPy array for GPU processing
    cu_embeddings = cp.asarray(embeddings)

    n_components = target_dim + max(candidates, default=0)
    n_features = cu_embeddings.shape[1]
    if n_components > n_features:
        raise ValueError(
            f"target_dim + max(top_n_values) = {n_components} exceeds the "
            f"{n_features} available embedding dimensions"
        )

    # Fit once instead of once per candidate: the leading principal components do not
    # depend on how many further components are requested, so every candidate is a
    # column window of this single projection.
    # NOTE(review): relies on cuML's default covariance-eigendecomposition solver.
    pca = cumlPCA(n_components=n_components)
    projected = cp.asnumpy(pca.fit_transform(cu_embeddings))

    # Only words that occur in word_pairs are ever looked up during evaluation, so
    # resolve their row indices once rather than rebuilding a full-vocabulary dict
    # for every candidate.
    needed_words = {word for word1, word2, _ in word_pairs for word in (word1, word2)}
    row_of = {
        word: row
        for row, word in enumerate(model.index_to_key)
        if word in needed_words
    }

    def score(columns):
        """Spearman correlation of the given projection on word_pairs."""
        lookup = {word: columns[row] for word, row in row_of.items()}
        return evaluate_embeddings(lookup, word_pairs)

    # Evaluate the plain PCA reduction (first target_dim components)
    reduced_embeddings = projected[:, :target_dim]
    initial_correlation = score(reduced_embeddings)
    print(f"Initial Spearman Correlation after PCA to {target_dim}D: {initial_correlation:.4f}")

    best_correlation = initial_correlation
    best_embeddings = reduced_embeddings

    for top_n in candidates:
        # Drop the first top_n components and keep the next target_dim ones.
        adjusted_embeddings = projected[:, top_n:top_n + target_dim]

        # Evaluate correlation on SimLex-999
        correlation = score(adjusted_embeddings)
        print(f"Adjusted Correlation after removing top {top_n} components: {correlation:.4f}")

        # A NaN best score compares False against everything, so replace it explicitly.
        if np.isnan(best_correlation) or correlation > best_correlation:
            best_correlation = correlation
            best_embeddings = adjusted_embeddings

    print(f"Best Spearman Correlation with Task-Driven Reduction: {best_correlation:.4f}")

    # Copy the winning window so the full projection can be garbage-collected.
    return np.ascontiguousarray(best_embeddings), best_correlation

In [3]:
# Notebook configuration: input file locations and the output dimensionality.
# NOTE(review): both paths are absolute and machine-specific; edit them before running elsewhere.
word2vec_file_path = '/teamspace/uploads/GoogleNews-vectors-negative300.bin'  # Binary word2vec file read by load_word2vec_model
simlex_file_path = '/teamspace/studios/pca/SimLex-999.txt'  # Tab-separated SimLex-999 file read by load_simlex_999
target_dim = 2  # Number of dimensions kept after reduction; passed as target_dim to the reduction call

In [4]:
# Load the Word2Vec vectors once; later cells reuse `original_model` for its
# vectors and its vocabulary order.
original_model = load_word2vec_model(word2vec_file_path)

Loading Word2Vec model...


In [5]:
# Report the vocabulary size as a sanity check on the load.
num_embeddings = len(original_model.index_to_key)
print(f"Number of embeddings in the dataset: {num_embeddings}")

Number of embeddings in the dataset: 3000000


In [5]:
# Load the SimLex-999 (word1, word2, score) triples used to score the embeddings
word_pairs = load_simlex_999(simlex_file_path)

In [6]:
# Reduce the full vocabulary to `target_dim` dimensions and keep the projection
# with the best Spearman correlation on the loaded word pairs.
optimized_embeddings, best_correlation = task_driven_dimensionality_reduction(
    original_model.vectors, 
    target_dim=target_dim, 
    word_pairs=word_pairs, 
    model=original_model
)

print(f"\nFinal Optimized Spearman Correlation at {target_dim}D: {best_correlation:.4f}")

Initial Spearman Correlation after PCA to 2D: 0.0417
Adjusted Correlation after removing top 297 components: 0.1081
Adjusted Correlation after removing top 298 components: 0.1231
Best Spearman Correlation with Task-Driven Reduction: 0.1231

Final Optimized Spearman Correlation at 2D: 0.1231


In [7]:
# Perform task-driven dimensionality reduction
# NOTE(review): this cell repeats the previous cell verbatim and overwrites
# `optimized_embeddings` / `best_correlation`. The output recorded under it lists
# removals of the top 0, 1, 2, ... components, which the function as currently
# defined (top_n 297-298) does not print, so it appears to come from an earlier
# revision of the loop range. Re-run before relying on it.
optimized_embeddings, best_correlation = task_driven_dimensionality_reduction(
    original_model.vectors, 
    target_dim=target_dim, 
    word_pairs=word_pairs, 
    model=original_model
)

print(f"\nFinal Optimized Spearman Correlation at {target_dim}D: {best_correlation:.4f}")

Initial Spearman Correlation after PCA to 2D: 0.0417
Adjusted Correlation after removing top 0 components: 0.0417
Adjusted Correlation after removing top 1 components: -0.0316
Adjusted Correlation after removing top 2 components: -0.0272
Adjusted Correlation after removing top 3 components: 0.0526
Adjusted Correlation after removing top 4 components: 0.0694
Adjusted Correlation after removing top 5 components: 0.0693
Adjusted Correlation after removing top 6 components: 0.0812
Adjusted Correlation after removing top 7 components: 0.0598
Adjusted Correlation after removing top 8 components: 0.0755
Adjusted Correlation after removing top 9 components: 0.1276
Adjusted Correlation after removing top 10 components: 0.1133
Adjusted Correlation after removing top 11 components: 0.0879
Adjusted Correlation after removing top 12 components: 0.0479
Adjusted Correlation after removing top 13 components: 0.0519
Adjusted Correlation after removing top 14 components: 0.0949
Adjusted Correlation afte

In [20]:
# Save the embeddings with word labels to a CSV file
def save_embeddings_to_csv(words, embeddings, file_path):
    """Write one CSV row per word: its embedding coordinates followed by the word.

    Coordinate columns are named ``x``, ``y``, ``z`` for embeddings of up to
    three dimensions (so 2-D output keeps the ``x,y,word`` layout) and
    ``dim_0 .. dim_{k-1}`` for anything wider. The printed messages report the
    actual dimensionality instead of assuming 2-D.

    Args:
        words: Sequence of labels, one per embedding row.
        embeddings: 2-D array-like of shape (n_words, n_dims).
        file_path: Destination CSV path.
    """
    df = pd.DataFrame(embeddings)
    n_dims = df.shape[1]
    if n_dims <= 3:
        df.columns = ['x', 'y', 'z'][:n_dims]
    else:
        df.columns = [f"dim_{i}" for i in range(n_dims)]

    print(f"Saving {n_dims}D embeddings to {file_path}...")
    df['word'] = words
    df.to_csv(file_path, index=False)
    print(f"{n_dims}D embeddings saved successfully.")

In [19]:
# Vocabulary labels, in the same order as the rows of the reduced embeddings
words = original_model.index_to_key  # List of words in vocabulary
file_path="Word2Vec_CuPCA_2d_word_embeddings.csv"
# Save the embeddings and words to a CSV file
save_embeddings_to_csv(words, optimized_embeddings, file_path)

Saving 2D embeddings to Word2Vec_CuPCA_2d_word_embeddings.csv...


3D embeddings saved successfully.
