# Text Embedding and Dimensionality Reduction

This notebook generates semantic embeddings from research papers and performs dimensionality reduction:
1. **Text Embedding**: Convert paper abstracts to high-dimensional vectors using Cohere API
2. **UMAP Reduction**: Reduce embeddings to 10 dimensions for clustering and to 2D coordinates for visualization
3. **HDBSCAN Clustering**: Group papers by semantic similarity

**Input**: Cleaned paper data from previous step  
**Output**: Papers with embeddings, UMAP coordinates, and cluster assignments

## 1. Setup and Configuration

In [None]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import cohere
import umap
import hdbscan

# Configuration
# The API key is read from the COHERE_API_KEY environment variable so that a real
# key never has to be written into (and committed with) the notebook. The original
# placeholder is kept as the fallback value for backward compatibility.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "your_cohere_api_key_here")
EMBEDDING_MODEL = "embed-english-v3.0"
BATCH_SIZE = 48      # number of texts sent per embed request
SLEEP_TIME = 1.2     # seconds to pause after each successful batch
MAX_RETRIES = 5      # attempts per batch before giving up

# File paths
INPUT_FILE = "Embedding Table.csv"
OUTPUT_FILE = "papers_with_embeddings.pkl"

# Build the Cohere client from the configured key
co = cohere.Client(COHERE_API_KEY)

# Smoke-test the client with a single-sentence embed request.
# Any failure is reported on stdout instead of being raised.
try:
    test_response = co.embed(
        texts=["test sentence"],
        model=EMBEDDING_MODEL,
        input_type="search_document",
    )
    embedding_dim = len(test_response.embeddings[0])
    print(f"‚úÖ Cohere API connected. Embedding dimension: {embedding_dim}")
except Exception as err:
    print(f"‚ùå API connection failed: {err}")

## 2. Load and Prepare Data

In [None]:
# Load cleaned paper data
df = pd.read_csv(INPUT_FILE)
print(f"Loaded dataset: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Fail fast if a column needed by the embedding step is absent, rather than
# discovering it part-way through a long-running embedding loop.
required_columns = ["EID", "Year", "Embedding Text"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{INPUT_FILE!r} is missing required columns: {missing_columns}")

# astype(str) below turns missing values into the literal text "nan", which would
# be embedded as if it were a real abstract. Surface that instead of hiding it.
n_missing_text = int(df["Embedding Text"].isna().sum())
if n_missing_text:
    print(f"Warning: {n_missing_text} rows have no 'Embedding Text' and will be embedded as 'nan'")

# Prepare texts for embedding (one text per row, in row order)
texts = df["Embedding Text"].astype(str).tolist()
print(f"Ready to embed {len(texts)} texts")

# Display sample text (guarded so an empty dataset does not raise IndexError)
if texts:
    print(f"\nSample text: {texts[0][:200]}...")
else:
    print("\nSample text: <dataset is empty>")

## 3. Generate Text Embeddings

This process converts text to high-dimensional vectors with robust error handling and progress tracking.

In [None]:
def _save_embedding_checkpoint(embeddings, save_file):
    """Pickle the first ``len(embeddings)`` rows of ``df`` together with their embeddings."""
    n_done = len(embeddings)
    checkpoint = pd.DataFrame({
        "EID": df["EID"].iloc[:n_done],
        "Year": df["Year"].iloc[:n_done],
        "Embedding Text": df["Embedding Text"].iloc[:n_done],
        "embedding": embeddings,
    })
    checkpoint.to_pickle(save_file)
    return checkpoint


def generate_embeddings_with_resume(texts, save_file):
    """Embed ``texts`` in batches, checkpointing to ``save_file`` after every batch.

    If ``save_file`` already exists, the embeddings stored in it are reused and
    only the remaining texts are sent to the API.

    Relies on notebook-level names: ``co`` (Cohere client), ``df`` (the frame the
    texts were taken from; row ``i`` is paired with ``texts[i]``), ``EMBEDDING_MODEL``,
    ``BATCH_SIZE``, ``SLEEP_TIME`` and ``MAX_RETRIES``.

    Args:
        texts: list of strings to embed, in the same order as the rows of ``df``.
        save_file: path of the pickle used as checkpoint and final output.

    Returns:
        DataFrame with columns "EID", "Year", "Embedding Text" and "embedding".

    Raises:
        ValueError: if the checkpoint holds more embeddings than there are texts
            (it was produced from a different input).
        RuntimeError: if a batch still fails after ``MAX_RETRIES`` attempts. Progress
            up to the previous batch is already saved, so rerunning resumes there.
    """
    result = None

    # Check for existing progress
    if os.path.exists(save_file):
        # NOTE: unpickling can execute arbitrary code; only resume from a
        # checkpoint that this notebook itself wrote.
        result = pd.read_pickle(save_file)
        all_embeddings = result["embedding"].tolist()
        print(f"üìÑ Resuming from {len(all_embeddings)} existing embeddings")
    else:
        all_embeddings = []
        print("üÜï Starting fresh embedding generation")

    start_idx = len(all_embeddings)
    if start_idx > len(texts):
        raise ValueError(
            f"Checkpoint {save_file!r} holds {start_idx} embeddings but only "
            f"{len(texts)} texts were supplied; it does not belong to this input."
        )

    # Give tqdm a total that includes the already-finished batches, otherwise a
    # resumed bar starts at `initial` and runs past its own total.
    batch_starts = range(start_idx, len(texts), BATCH_SIZE)
    batches_done = start_idx // BATCH_SIZE

    # Process remaining texts in batches
    for i in tqdm(batch_starts,
                  desc="Generating embeddings",
                  initial=batches_done,
                  total=batches_done + len(batch_starts)):

        batch = texts[i:i + BATCH_SIZE]
        batch_embeddings = None
        last_error = None

        # Only the API call is retried. Extending the list and saving happen once,
        # after success, so a failed save cannot duplicate a batch.
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response = co.embed(
                    model=EMBEDDING_MODEL,
                    texts=batch,
                    input_type="search_document"
                )
                if len(response.embeddings) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(response.embeddings)}"
                    )
                batch_embeddings = response.embeddings
                break
            except Exception as e:
                last_error = e
                print(f"\n‚ö†Ô∏è  Batch {i} failed (attempt {attempt}/{MAX_RETRIES}): {e}")
                if attempt < MAX_RETRIES:
                    wait_time = 10 * attempt
                    print(f"‚è≥ Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
        else:
            # Skipping the batch and carrying on would pair every later embedding
            # with the wrong row, so stop here instead.
            raise RuntimeError(
                f"Batch {i} failed after {MAX_RETRIES} attempts; "
                f"{len(all_embeddings)} embeddings are saved in {save_file!r}. "
                "Rerun to resume from that point."
            ) from last_error

        # Add to collection and save progress incrementally
        all_embeddings.extend(batch_embeddings)
        result = _save_embedding_checkpoint(all_embeddings, save_file)

        time.sleep(SLEEP_TIME)

    if result is None:
        # Nothing to embed and no earlier checkpoint: write an empty checkpoint so
        # the function still returns a frame with the expected columns.
        result = _save_embedding_checkpoint(all_embeddings, save_file)

    return result

# Run (or resume) embedding generation for every prepared text, then report the result
df_with_embeddings = generate_embeddings_with_resume(texts, OUTPUT_FILE)

print("\n‚úÖ Embedding generation complete!")
print(f"üìä Dataset shape: {df_with_embeddings.shape}")
embedding_dim = len(df_with_embeddings["embedding"][0])
print(f"üßÆ Embedding dimension: {embedding_dim}")

## 4. Dimensionality Reduction with UMAP

Create 2D coordinates for visualization and reduced dimensions for clustering.

In [None]:
# Stack the per-paper embedding vectors into a single 2-D matrix (one row per paper)
X = np.vstack(df_with_embeddings["embedding"].to_numpy())
print(f"Embedding matrix shape: {X.shape}")

# Settings shared by both UMAP projections
_umap_common = {"metric": "cosine", "random_state": 42}

# Projection used as clustering input (10 components)
print("üîÑ Reducing dimensions for clustering...")
umap_cluster = umap.UMAP(n_neighbors=15, n_components=10, min_dist=0.0, **_umap_common)
X_cluster = umap_cluster.fit_transform(X)
print(f"Cluster embedding shape: {X_cluster.shape}")

# Projection used for plotting (2 components)
print("üîÑ Creating 2D visualization coordinates...")
umap_viz = umap.UMAP(n_neighbors=30, n_components=2, min_dist=0.1, **_umap_common)
X_viz = umap_viz.fit_transform(X)

# Attach the two plotting coordinates as dataframe columns
for _axis, _column in enumerate(("umap_x", "umap_y")):
    df_with_embeddings[_column] = X_viz[:, _axis]

print("‚úÖ UMAP reduction complete")

## 5. Semantic Clustering with HDBSCAN

Group papers by semantic similarity using the reduced dimensional embeddings.

In [None]:
# Perform HDBSCAN clustering on the 10-component UMAP projection
print("üîÑ Performing semantic clustering...")
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=5,
    cluster_selection_method="eom",
    metric="euclidean"
)

cluster_labels = clusterer.fit_predict(X_cluster)
df_with_embeddings["topic"] = cluster_labels

# Analyze clustering results (label -1 is treated as noise, not as a cluster)
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
noise_ratio = (cluster_labels == -1).mean()
cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()

# Sizes of real clusters only. The noise label is dropped by name rather than by
# position: when no point is labelled -1, the first entry of `cluster_sizes` is a
# genuine cluster and must not be discarded.
real_cluster_sizes = cluster_sizes.drop(labels=-1, errors="ignore")
largest_size = int(real_cluster_sizes.max()) if n_clusters > 0 else 0
average_size = float(real_cluster_sizes.mean()) if n_clusters > 0 else 0.0

print(f"\nüìä Clustering Results:")
print(f"   üìå Number of clusters: {n_clusters}")
print(f"   üîá Noise ratio: {noise_ratio:.1%}")
print(f"   üìà Largest cluster: {largest_size} papers")
print(f"   üìâ Average cluster size: {average_size:.1f} papers")

# Show sample from largest cluster
if n_clusters > 0:
    largest_cluster = real_cluster_sizes.idxmax()
    sample_texts = df_with_embeddings[df_with_embeddings["topic"] == largest_cluster]["Embedding Text"].head(3)
    print(f"\nüìù Sample from cluster {largest_cluster}:")
    for i, text in enumerate(sample_texts, 1):
        print(f"   {i}. {text[:100]}...")

## 6. Save Final Results

In [None]:
# Persist the full frame, embedding vectors included
df_with_embeddings.to_pickle("research_with_topics_umap.pkl")
print("‚úÖ Saved complete dataset with embeddings")

# Write a lighter CSV for plotting by leaving out the embedding column
df_viz = df_with_embeddings.drop(columns=["embedding"])
df_viz.to_csv("research_map_ready.csv", index=False)
print("‚úÖ Saved visualization-ready CSV")

# Closing summary: total rows and rows whose topic is not the noise label (-1)
print("\nüìã Final dataset summary:")
print(f"   üìÑ Total papers: {len(df_with_embeddings)}")
is_clustered = df_with_embeddings["topic"] != -1
print(f"   üè∑Ô∏è  Clustered papers: {int(is_clustered.sum())}")
print("   üîç Ready for visualization and analysis")