In [None]:
!pip install --upgrade transformers accelerate

In [None]:
!pip uninstall -y numpy transformers

In [None]:
!pip install "numpy<2.0" "transformers<4.41" accelerate sentencepiece packaging

In [None]:
# Imports
import os
import json
import torch
import numpy as np
import shutil

# Pre-flight check for common numpy/transformers compatibility issues
import sys
from packaging import version
# Abort early, with remediation hints, when the installed NumPy is 2.x while
# Transformers is older than 4.41 -- the combination this check treats as
# incompatible.
try:
    from transformers import __version__ as transformers_version
    if version.parse(np.__version__) >= version.parse("2.0.0") and \
       version.parse(transformers_version) < version.parse("4.41.0"):
        # All diagnostics go to stderr so they stand out from normal output.
        print("="*80, file=sys.stderr)
        print("ERROR: Incompatible library versions detected!", file=sys.stderr)
        print(f"Numpy version {np.__version__} is not compatible with Transformers version {transformers_version}.", file=sys.stderr)
        print("\nTo fix, run ONE of the following commands in your notebook and RESTART the runtime:", file=sys.stderr)
        print("1. Downgrade NumPy: !pip install --upgrade \"numpy<2.0\"", file=sys.stderr)
        print("2. Upgrade Transformers: !pip install --upgrade transformers accelerate", file=sys.stderr)
        print("="*80, file=sys.stderr)
        # Raises SystemExit with status 1; SystemExit is not an ImportError, so
        # the handler below does not swallow it.
        sys.exit(1)
except (ImportError, ModuleNotFoundError):
    # If transformers isn't installed, the script will fail on the next import anyway.
    # This check is specifically for the version mismatch.
    pass

from pathlib import Path
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from typing import List, Dict, Any
import gc
import time
import psutil
# Optional: speed/precision helpers
from contextlib import contextmanager

# Skip Drive import when not in Colab; this keeps the script runnable locally
try:
    from google.colab import drive  # type: ignore
except ModuleNotFoundError:
    drive = None  # noqa: F401

class BatchOptimizer:
    """Dynamic batch size optimizer that adjusts based on GPU memory usage.

    The optimizer keeps a current batch size, grows it while reported memory
    utilization stays comfortably below the target, shrinks it when
    utilization exceeds the target, and halves it after an out-of-memory
    error. A cooldown counter spaces out consecutive adjustments; it is
    decremented once per call to ``adjust_batch_size``.
    """

    def __init__(self, initial_batch_size: int = 8, max_batch_size: int = 256,
                 target_memory_utilization: float = 0.85, device: str = "cuda"):
        """
        Initialize the batch optimizer

        Args:
            initial_batch_size: Starting batch size
            max_batch_size: Maximum allowed batch size
            target_memory_utilization: Target GPU memory utilization (0.0-1.0)
            device: Device type ("cuda", "cpu", "mps")
        """
        self.current_batch_size = initial_batch_size
        self.initial_batch_size = initial_batch_size
        self.max_batch_size = max_batch_size
        self.target_memory_utilization = target_memory_utilization
        self.device = device

        # Tracking variables
        self.successful_batches = 0
        self.memory_usage_history = []
        self.batch_size_history = []
        self.last_oom = False
        self.adjustment_cooldown = 0

        print(f"BatchOptimizer initialized - Initial batch size: {self.current_batch_size}")

    def get_gpu_memory_info(self) -> Dict[str, float]:
        """Get current GPU memory usage information.

        Returns:
            Dict with "used" and "total" (GiB) and "utilization" (used/total).
            A neutral ``{"used": 0.0, "total": 1.0, "utilization": 0.0}`` is
            returned when the device is not CUDA, CUDA is unavailable, or the
            query fails.
        """
        if self.device != "cuda" or not torch.cuda.is_available():
            return {"used": 0.0, "total": 1.0, "utilization": 0.0}

        try:
            # Query allocated and total memory for the SAME device; previously
            # the total was always read from device 0 while the allocated
            # figure came from the current device.
            device_index = torch.cuda.current_device()
            # NOTE(review): memory_allocated() reports what is allocated at the
            # moment of the call; sampled after a forward pass has finished it
            # may understate the peak reached during the pass -- confirm
            # whether peak statistics would suit this heuristic better.
            memory_used = torch.cuda.memory_allocated(device_index) / 1024**3  # GB
            memory_total = torch.cuda.get_device_properties(device_index).total_memory / 1024**3  # GB
            utilization = memory_used / memory_total if memory_total > 0 else 0.0

            return {
                "used": memory_used,
                "total": memory_total,
                "utilization": utilization
            }
        except Exception as e:
            # Best effort: a failed query must not interrupt embedding.
            print(f"Error getting GPU memory info: {e}")
            return {"used": 0.0, "total": 1.0, "utilization": 0.0}

    def should_increase_batch_size(self, memory_info: Dict[str, float]) -> bool:
        """Determine if batch size should be increased"""
        if self.last_oom or self.adjustment_cooldown > 0:
            return False

        # Only increase if we have successful batches and low memory usage
        return (self.successful_batches >= 3 and
                memory_info["utilization"] < self.target_memory_utilization - 0.1 and
                self.current_batch_size < self.max_batch_size)

    def should_decrease_batch_size(self, memory_info: Dict[str, float]) -> bool:
        """Determine if batch size should be decreased"""
        # Decrease if memory usage is too high
        return memory_info["utilization"] > self.target_memory_utilization + 0.05

    def adjust_batch_size(self, memory_info: Dict[str, float]) -> int:
        """Adjust batch size based on current memory usage.

        Args:
            memory_info: Dict as returned by ``get_gpu_memory_info``.

        Returns:
            The (possibly updated) current batch size.
        """
        old_batch_size = self.current_batch_size

        if self.adjustment_cooldown > 0:
            self.adjustment_cooldown -= 1
            return self.current_batch_size

        if self.should_increase_batch_size(memory_info):
            # Increase batch size gradually
            increase_factor = 1.5 if memory_info["utilization"] < 0.6 else 1.2
            # int() truncation alone leaves small sizes stuck (e.g. 2 * 1.2 -> 2),
            # so always grow by at least one sample.
            grown = max(int(old_batch_size * increase_factor), old_batch_size + 1)
            new_batch_size = min(grown, self.max_batch_size)
            self.current_batch_size = new_batch_size
            self.adjustment_cooldown = 2  # Wait 2 adjustment calls before the next change
            print(f"üìà Increased batch size: {old_batch_size} ‚Üí {new_batch_size} (GPU: {memory_info['utilization']:.1%})")

        elif self.should_decrease_batch_size(memory_info):
            # Decrease batch size more aggressively
            decrease_factor = 0.7 if memory_info["utilization"] > 0.95 else 0.8
            new_batch_size = max(int(old_batch_size * decrease_factor), 1)
            # Already at the floor: nothing changed, so neither report a
            # decrease nor start a cooldown.
            if new_batch_size != old_batch_size:
                self.current_batch_size = new_batch_size
                self.adjustment_cooldown = 3  # Wait longer after decreasing
                print(f"üìâ Decreased batch size: {old_batch_size} ‚Üí {new_batch_size} (GPU: {memory_info['utilization']:.1%})")

        return self.current_batch_size

    def handle_oom_error(self):
        """Handle out-of-memory error by halving the batch size (minimum 1)."""
        old_batch_size = self.current_batch_size
        self.current_batch_size = max(1, self.current_batch_size // 2)
        self.last_oom = True
        self.adjustment_cooldown = 5  # Wait longer after OOM
        self.successful_batches = 0 # Reset confidence after OOM
        print(f"üí• OOM Error! Reduced batch size: {old_batch_size} ‚Üí {self.current_batch_size}")

        # Drop unreachable Python objects first so the tensors they hold can
        # be released, then return cached blocks to the driver.
        if self.device == "cuda":
            gc.collect()
            torch.cuda.empty_cache()

    def record_successful_batch(self, memory_info: Dict[str, float]):
        """Record a successful batch processing"""
        self.successful_batches += 1
        self.last_oom = False
        self.memory_usage_history.append(memory_info["utilization"])
        self.batch_size_history.append(self.current_batch_size)

        # Keep only recent history
        if len(self.memory_usage_history) > 50:
            self.memory_usage_history = self.memory_usage_history[-50:]
            self.batch_size_history = self.batch_size_history[-50:]

    def get_stats(self) -> Dict[str, Any]:
        """Get optimization statistics.

        Returns:
            Dict that always contains "current_batch_size",
            "initial_batch_size" and "successful_batches". When no batch has
            been recorded yet it additionally carries
            ``"status": "No data available"``; otherwise it carries the
            average/maximum recorded utilization and the batch size range.
        """
        stats: Dict[str, Any] = {
            "current_batch_size": self.current_batch_size,
            "initial_batch_size": self.initial_batch_size,
            "successful_batches": self.successful_batches,
        }

        if not self.memory_usage_history:
            # Keep the counters available so callers indexing them do not fail
            # when nothing has been recorded (e.g. a fully resumed file).
            stats["status"] = "No data available"
            return stats

        stats["avg_memory_utilization"] = float(np.mean(self.memory_usage_history))
        stats["max_memory_utilization"] = float(np.max(self.memory_usage_history))
        stats["batch_size_range"] = f"{min(self.batch_size_history)}-{max(self.batch_size_history)}"
        return stats

# This class handles loading a transformer model and embedding comment texts
class CommentEmbedder:
    """Embed comment texts with a HuggingFace encoder model.

    The tokenizer and model are loaded once in ``__init__``. ``process_file``
    embeds every comment of one JSON file batch by batch, checkpoints each
    batch to a temporary directory so an interrupted run can resume, and
    finally writes a single ``.npz`` file with ``embeddings`` and ``ids``.
    """

    def __init__(self,
                 model_name: str = "intfloat/multilingual-e5-small",
                 device: str = "cuda" if torch.cuda.is_available() else "cpu",
                 batch_size: int = 8,
                 use_fp16: bool = True,
                 compile_model: bool = True,
                 optimize_batch_size: bool = True,
                 max_batch_size: int = 256):
        """Initialise tokenizer/model with optional half-precision and Torch compile.

        Args:
            model_name: HuggingFace model id
            device: "cuda" | "cpu" | "mps"
            batch_size: initial batch size for embedding
            use_fp16: Cast model to float16 when running on GPU for speed & memory
            compile_model: Run `torch.compile` (PyTorch >= 2.0) for kernel fusion
            optimize_batch_size: Enable dynamic batch size optimization
            max_batch_size: Maximum allowed batch size for optimization
        """

        self.device = device
        self.model_name = model_name

        # Dynamic batch sizing relies on CUDA memory statistics, so it is only
        # enabled on CUDA devices.
        self.optimize_batch_size = optimize_batch_size and device == "cuda"
        if self.optimize_batch_size:
            self.batch_optimizer = BatchOptimizer(
                initial_batch_size=batch_size,
                max_batch_size=max_batch_size,
                device=device
            )
            self.batch_size = self.batch_optimizer.current_batch_size
        else:
            self.batch_size = batch_size
            self.batch_optimizer = None

        # Load tokenizer / model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Half precision is only ever applied on CUDA; remember whether it is
        # actually active so the status message below is truthful.
        fp16_active = bool(use_fp16 and self.device == "cuda")
        if self.device == "cuda":
            torch.backends.cuda.matmul.allow_tf32 = True  # potentially faster
            if use_fp16:
                self.model = self.model.half()

        self.model = self.model.to(self.device)

        # Optionally compile (PyTorch 2.x)
        if compile_model and version.parse(torch.__version__) >= version.parse("2.0") and self.device == "cuda":
            try:
                self.model = torch.compile(self.model)
                print("Model compiled with torch.compile()")
            except Exception as compile_err:  # pragma: no cover
                # Compilation is an optimisation only; fall back to eager mode.
                print(f"torch.compile failed: {compile_err}. Continuing without compilation.")

        print(f"Model loaded on {self.device} (fp16={fp16_active})")
        if self.device == "cuda":
            total_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"GPU Memory: {total_mem:.2f} GB")
            if self.optimize_batch_size:
                print("üöÄ Dynamic batch size optimization enabled")

    def prepare_comment(self, text: str) -> str:
        """Prepare comment text following E5 format.

        The text is converted to ``str``, stripped and prefixed with
        ``"query: "``; blank input becomes ``"query: empty comment"``.
        """
        text = str(text).strip()
        if not text:
            return "query: empty comment"
        return f"query: {text}"

    @torch.no_grad()
    def embed_batch(self, texts: List[str]) -> np.ndarray:
        """Embed a single batch of texts.

        Args:
            texts: Raw comment texts; each is passed through ``prepare_comment``.

        Returns:
            Array with one L2-normalised embedding row per input text.

        Raises:
            torch.cuda.OutOfMemoryError: Propagated to the caller, which
                decides whether to retry with a smaller batch.
        """
        prepared_texts = [self.prepare_comment(text) for text in texts]

        # Tokenize the batch (padded to the longest item, truncated at 512 tokens)
        encoded = self.tokenizer(
            prepared_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        outputs = self.model(**encoded)
        # Use the first-token (CLS) embedding as the sentence embedding.
        # NOTE(review): the E5 model card appears to pool with an
        # attention-masked mean over tokens instead -- confirm CLS pooling is
        # intended. Left unchanged so existing outputs stay comparable.
        embeddings = outputs.last_hidden_state[:, 0]
        # Normalize embeddings to unit length
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

        # Record successful batch if optimizing
        if self.batch_optimizer:
            memory_info_after = self.batch_optimizer.get_gpu_memory_info()
            self.batch_optimizer.record_successful_batch(memory_info_after)

        return embeddings.cpu().numpy()

    @staticmethod
    def _save_npz_atomic(target: Path, **arrays) -> None:
        """Write ``arrays`` to ``target`` (a ``.npz`` path) via a temp file + rename.

        An interrupted write then leaves only a ``.partial_*`` file behind
        instead of a truncated archive under the final name.
        """
        partial = target.with_name(f".partial_{target.name}")
        np.savez_compressed(partial, **arrays)
        partial.replace(target)

    def _load_checkpoints(self, tmp_dir: Path, comment_ids: List[Any]):
        """Load batch checkpoints from ``tmp_dir`` and validate them against ``comment_ids``.

        Returns:
            Tuple ``(embeddings_list, processed_count, next_batch_number)``.
            If any checkpoint is unreadable, out of sequence, or does not match
            the ids of the input file, the directory is wiped and
            ``([], 0, 0)`` is returned so processing restarts cleanly.
        """
        import zipfile  # local import: only needed to recognise truncated archives

        loaded: List[np.ndarray] = []
        processed = 0
        try:
            # Sort by batch number to ensure correct order
            batch_files = sorted(
                tmp_dir.glob("batch_*.npz"),
                key=lambda p: int(p.stem.split('_')[1])
            )
            if batch_files:
                print(f"Resuming from {len(batch_files)} completed batches...")

            for expected_number, batch_file in enumerate(batch_files):
                # A gap in the numbering means a batch is missing, which would
                # silently shift every later embedding.
                if int(batch_file.stem.split('_')[1]) != expected_number:
                    raise ValueError(f"missing checkpoint batch_{expected_number}.npz")

                with np.load(batch_file) as batch_data:
                    batch_embeddings = batch_data['embeddings']
                    batch_ids = batch_data['ids']

                expected_ids = comment_ids[processed:processed + len(batch_ids)]
                if (len(batch_embeddings) != len(batch_ids)
                        or len(expected_ids) != len(batch_ids)
                        or not np.array_equal(batch_ids, np.asarray(expected_ids))):
                    raise ValueError(f"{batch_file.name} does not match the input file")

                loaded.append(batch_embeddings)
                processed += len(batch_ids)

        except (ValueError, IndexError, KeyError, TypeError, OSError, EOFError,
                zipfile.BadZipFile) as e:
            print(f"Warning: Could not resume from checkpoints in {tmp_dir}. Starting from scratch. Error: {e}")
            shutil.rmtree(tmp_dir, ignore_errors=True)
            tmp_dir.mkdir(exist_ok=True, parents=True)
            # Discard anything loaded before the failure; keeping it would
            # duplicate rows once processing restarts from comment 0.
            return [], 0, 0

        if processed:
            print(f"Resuming from comment #{processed}")
        return loaded, processed, len(batch_files)

    def process_file(self, input_path: Path, output_path: Path = None) -> Dict[str, Any]:
        """Process a single JSON file with dynamic batch optimization and checkpointing.

        Args:
            input_path: JSON file holding a list of strings, or of objects with
                a ``'comment'`` key and an optional ``'id'`` key.
            output_path: Destination ``.npz``; defaults to
                ``<input stem>_embeddings.npz`` next to the input.

        Returns:
            Dict with "input_file", "output_file", "num_comments",
            "embedding_dim" and, when batch optimization is active,
            "optimization_stats".

        Raises:
            ValueError: If the file contains no usable comments.
            torch.cuda.OutOfMemoryError: If a batch cannot fit even after the
                batch size has been reduced to 1, or when no optimizer is active.
            RuntimeError: If the number of embeddings does not match the
                number of comments.
        """
        if output_path is None:
            output_path = input_path.parent / f"{input_path.stem}_embeddings.npz"

        # Temporary directory for checkpointing batches
        tmp_dir = output_path.parent / f".tmp_{input_path.stem}"
        tmp_dir.mkdir(exist_ok=True, parents=True)

        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle both array of strings and array of objects; items of any
        # other shape are skipped.
        comments = []
        comment_ids = []
        for idx, item in enumerate(data):
            if isinstance(item, dict) and 'comment' in item:
                comments.append(item['comment'])
                comment_ids.append(item.get('id', idx))
            elif isinstance(item, str):
                comments.append(item)
                comment_ids.append(idx)

        if not comments:
            # Clean up temp dir if no comments are found
            if tmp_dir.exists():
                shutil.rmtree(tmp_dir)
            raise ValueError(f"No valid comments found in {input_path}")

        print(f"\nProcessing {len(comments)} comments from {input_path.name}")

        # --- Resume logic ---
        all_embeddings, processed_count, batch_counter = self._load_checkpoints(tmp_dir, comment_ids)

        # Process comments in batches with dynamic sizing
        i = processed_count

        with tqdm(total=len(comments), initial=i, desc=f"Embedding {input_path.name}") as pbar:
            while i < len(comments):
                # Get current batch size (may change dynamically); never below
                # 1 so the loop always makes progress.
                current_batch_size = self.batch_optimizer.current_batch_size if self.batch_optimizer else self.batch_size
                current_batch_size = max(1, current_batch_size)

                end_idx = min(i + current_batch_size, len(comments))
                batch_comments = comments[i:end_idx]
                batch_ids = comment_ids[i:end_idx]

                try:
                    embeddings = self.embed_batch(batch_comments)

                    # Save the current batch as a checkpoint
                    self._save_npz_atomic(
                        tmp_dir / f"batch_{batch_counter}.npz",
                        embeddings=embeddings,
                        ids=batch_ids
                    )
                    all_embeddings.append(embeddings)

                    # Update progress
                    pbar.update(end_idx - i)
                    i = end_idx
                    batch_counter += 1

                    # Optimize batch size periodically
                    if self.batch_optimizer and batch_counter % 5 == 0:
                        memory_info = self.batch_optimizer.get_gpu_memory_info()
                        self.batch_optimizer.adjust_batch_size(memory_info)

                    # Clear CUDA cache periodically
                    if self.device == "cuda" and batch_counter % 10 == 0:
                        torch.cuda.empty_cache()
                        gc.collect()

                except torch.cuda.OutOfMemoryError:
                    print(f"\nCaught OOM error while processing {input_path.name}.")
                    if not self.batch_optimizer:
                        # If not optimizing, we cannot recover, so we raise.
                        print("Cannot recover from OOM without batch optimizer. Exiting file processing.")
                        raise
                    if current_batch_size <= 1:
                        # The batch cannot shrink any further; retrying would
                        # loop forever on the same comment.
                        print("Cannot recover from OOM: batch size is already 1. Exiting file processing.")
                        raise
                    self.batch_optimizer.handle_oom_error()
                    print("Retrying batch with a smaller size...")
                    # `i` is not advanced, so the same slice is retried with
                    # the new, smaller size.
                    continue
                except Exception as e:
                    print(f"\nAn unexpected error occurred while processing {input_path.name}: {e}")
                    # For other errors, stop processing this file.
                    raise

        # Print optimization stats
        if self.batch_optimizer:
            stats = self.batch_optimizer.get_stats()
            print(f"üéØ Optimization Stats:")
            # .get(): the stats dict may lack these keys when no batch was
            # recorded (e.g. the file was completed purely from checkpoints).
            print(f"   Final batch size: {stats.get('current_batch_size', 'N/A')}")
            print(f"   Successful batches: {stats.get('successful_batches', 'N/A')}")
            print(f"   Avg GPU utilization: {stats.get('avg_memory_utilization', 0):.1%}")
            print(f"   Batch size range: {stats.get('batch_size_range', 'N/A')}")

        # Concatenate all batch embeddings into a single array
        embeddings_array = np.vstack(all_embeddings)
        if embeddings_array.shape[0] != len(comment_ids):
            raise RuntimeError(
                f"Embedding count {embeddings_array.shape[0]} does not match "
                f"comment count {len(comment_ids)} for {input_path}"
            )

        # np.savez_compressed appends ".npz" when the name lacks it; resolve
        # the same final name here so the atomic rename lands where a direct
        # save would have written.
        if output_path.name.endswith(".npz"):
            final_path = output_path
        else:
            final_path = output_path.with_name(output_path.name + ".npz")
        self._save_npz_atomic(
            final_path,
            embeddings=embeddings_array,
            ids=comment_ids
        )

        # Clean up temporary directory on success
        shutil.rmtree(tmp_dir)

        result = {
            "input_file": str(input_path),
            "output_file": str(output_path),
            "num_comments": len(comments),
            "embedding_dim": embeddings_array.shape[1]
        }

        # Add optimization stats to result
        if self.batch_optimizer:
            result["optimization_stats"] = self.batch_optimizer.get_stats()

        return result

# Main function to process all comment files in a directory

def main():
    """Embed every ``*.json`` comment file and write a processing summary.

    Comment files are read from ``<base>/comments`` and embeddings are written
    to ``<base>/embeddings``, where ``<base>`` is the Google Drive project
    folder when running in Colab and the working directory otherwise. Files
    whose output already exists are skipped; a failure on one file is reported
    and does not stop the remaining files.
    """
    # Pick the project root; in Colab the Drive has to be mounted first.
    if drive:
        drive.mount('/content/drive')
        project_root = Path('/content/drive/My Drive/youtube_embeddings_project')
    else:
        project_root = Path('.')
    comments_dir = project_root / 'comments'
    output_dir = project_root / 'embeddings'

    # Make sure the destination exists before anything is written.
    output_dir.mkdir(exist_ok=True, parents=True)

    print(f"Reading comments from: {comments_dir}")
    print(f"Saving embeddings to: {output_dir}")

    # One embedder (and therefore one loaded model) is shared by all files.
    embedder = CommentEmbedder(
        model_name="intfloat/multilingual-e5-small",
        batch_size=8,               # starting batch size, tuned at runtime
        max_batch_size=256,         # upper bound for the dynamic batch size
        use_fp16=True,              # half precision for faster inference
        compile_model=True,         # torch.compile for faster inference
        optimize_batch_size=True    # dynamic batch size optimization
    )

    # Sorted so that runs visit the files in a deterministic order.
    json_files = sorted(comments_dir.glob("*.json"))
    print(f"Found {len(json_files)} JSON files to process")

    results = []
    for source_file in tqdm(json_files, desc="Overall Progress"):
        target_file = output_dir / f"{source_file.stem}_embeddings.npz"

        # Checkpoint: an existing final output marks the file as done.
        if target_file.exists():
            print(f"\n‚úîÔ∏è Skipping already completed file: {source_file.name}")
            continue

        try:
            outcome = embedder.process_file(source_file, target_file)
            results.append(outcome)
            print(f"‚úÖ Saved {outcome['num_comments']} embeddings to {outcome['output_file']}")
            print(f"   Embedding dimension: {outcome['embedding_dim']}")
        except Exception as err:
            print(f"\n‚ùå Error processing {source_file.name}: {str(err)}")
            print("   Moving to the next file. This file's progress is saved and will be resumed on the next run.")
            continue

    # Persist a summary covering the files processed in this run.
    summary_path = output_dir / "processing_summary.json"
    print(f"\nProcessing complete. Saving summary to {summary_path}")
    with open(summary_path, 'w') as summary_file:
        json.dump(results, summary_file, indent=2)

# Run main if this script is executed directly
if __name__ == "__main__":
    main()

In [None]:
!pip install "numpy>=1.21.0" "pandas>=1.3.0" "matplotlib>=3.5.0" "seaborn>=0.11.0" "scipy>=1.7.0" "tqdm>=4.62.0" "packaging>=21.0"

In [None]:
#!/usr/bin/env python3
"""
Phase 1: Prepare Embedding Dataset
Load and analyze comment embeddings for clustering pipeline
"""

import numpy as np
import pandas as pd
import os
from pathlib import Path
import json
from typing import Dict, Any, Tuple
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: Google Colab drive mounting
# Detect the runtime: if the google.colab package can be imported, mount
# Google Drive and flag the session as Colab; otherwise fall back to local
# mode. IN_COLAB is read later by EmbeddingDatasetAnalyzer.__init__ to choose
# the base directory.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
    print("üîó Google Drive mounted successfully")
except (ImportError, ModuleNotFoundError):
    # google.colab is not importable here, so treat this as a local run.
    IN_COLAB = False
    print("üìç Running in local environment")

class EmbeddingDatasetAnalyzer:
    """Analyzer for comment embedding datasets"""

    def __init__(self, base_dir: str = None):
        """Initialize with base directory for data files"""
        if IN_COLAB:
            self.base_dir = Path('/content/drive/My Drive/youtube_embeddings_project')
        elif base_dir:
            self.base_dir = Path(base_dir)
        else:
            self.base_dir = Path('.')

        self.embeddings_dir = self.base_dir / 'embeddings'
        self.data = None
        self.embeddings = None
        self.ids = None
        self.stats = {}

        print(f"üìÇ Looking for embeddings in: {self.embeddings_dir}")

    def load_embeddings(self, filename: str) -> Dict[str, Any]:
        """
        Load embeddings from .npz file and perform initial validation

        Args:
            filename: Name of the .npz file (e.g., 'Lose_Yourself_Eminem_embeddings.npz')

        Returns:
            Dictionary with loading results and basic info
        """
        file_path = self.embeddings_dir / filename

        if not file_path.exists():
            raise FileNotFoundError(f"Embedding file not found: {file_path}")

        print(f"\nüîÑ Loading embeddings from: {filename}")

        # Load the .npz file
        self.data = np.load(file_path)

        # Extract embeddings and IDs
        self.embeddings = self.data['embeddings']
        self.ids = self.data['ids']

        # Basic information
        result = {
            'filename': filename,
            'file_path': str(file_path),
            'file_size_mb': file_path.stat().st_size / (1024 * 1024),
            'num_comments': len(self.embeddings),
            'embedding_dim': self.embeddings.shape[1],
            'embeddings_shape': self.embeddings.shape,
            'ids_shape': self.ids.shape,
            'data_keys': list(self.data.keys())
        }

        print(f"‚úÖ Successfully loaded {result['num_comments']:,} embeddings")
        print(f"üìä Embedding dimension: {result['embedding_dim']}")
        print(f"üíæ File size: {result['file_size_mb']:.2f} MB")
        print(f"üîë Available keys: {result['data_keys']}")

        return result

    def perform_sanity_checks(self) -> Dict[str, Any]:
        """
        Perform comprehensive sanity checks on the loaded embeddings

        Returns:
            Dictionary with all sanity check results
        """
        if self.embeddings is None:
            raise ValueError("No embeddings loaded. Call load_embeddings() first.")

        print("\nüîç Performing sanity checks...")

        checks = {}

        # 1. Check for missing values (NaN, inf)
        nan_count = np.isnan(self.embeddings).sum()
        inf_count = np.isinf(self.embeddings).sum()
        checks['missing_values'] = {
            'nan_count': int(nan_count),
            'inf_count': int(inf_count),
            'has_missing': nan_count > 0 or inf_count > 0
        }

        # 2. Check data types
        checks['data_types'] = {
            'embeddings_dtype': str(self.embeddings.dtype),
            'ids_dtype': str(self.ids.dtype),
            'embeddings_memory_usage_mb': self.embeddings.nbytes / (1024 * 1024)
        }

        # 3. Check shapes consistency
        checks['shape_consistency'] = {
            'embeddings_2d': len(self.embeddings.shape) == 2,
            'ids_1d': len(self.ids.shape) == 1,
            'length_match': len(self.embeddings) == len(self.ids),
            'expected_dim': self.embeddings.shape[1] == 384  # multilingual-e5-small dimension
        }

        # 4. Statistical checks
        checks['statistics'] = {
            'min_value': float(np.min(self.embeddings)),
            'max_value': float(np.max(self.embeddings)),
            'mean_value': float(np.mean(self.embeddings)),
            'std_value': float(np.std(self.embeddings)),
            'norm_range': {
                'min_norm': float(np.min(np.linalg.norm(self.embeddings, axis=1))),
                'max_norm': float(np.max(np.linalg.norm(self.embeddings, axis=1))),
                'mean_norm': float(np.mean(np.linalg.norm(self.embeddings, axis=1)))
            }
        }

        # 5. Check for duplicate embeddings
        unique_embeddings = len(np.unique(self.embeddings.view(np.void), axis=0))
        checks['duplicates'] = {
            'total_embeddings': len(self.embeddings),
            'unique_embeddings': unique_embeddings,
            'duplicate_count': len(self.embeddings) - unique_embeddings,
            'has_duplicates': unique_embeddings < len(self.embeddings)
        }

        # 6. ID checks
        unique_ids = len(np.unique(self.ids))
        checks['id_analysis'] = {
            'total_ids': len(self.ids),
            'unique_ids': unique_ids,
            'duplicate_ids': len(self.ids) - unique_ids,
            'has_duplicate_ids': unique_ids < len(self.ids)
        }

        self.stats = checks
        return checks

    def print_sanity_check_results(self):
        """Print formatted sanity check results"""
        if not self.stats:
            print("‚ùå No sanity checks performed yet. Call perform_sanity_checks() first.")
            return

        print("\n" + "="*60)
        print("üîç SANITY CHECK RESULTS")
        print("="*60)

        # Missing values
        missing = self.stats['missing_values']
        status = "‚ùå FAIL" if missing['has_missing'] else "‚úÖ PASS"
        print(f"\n1. Missing Values Check: {status}")
        print(f"   NaN values: {missing['nan_count']:,}")
        print(f"   Infinite values: {missing['inf_count']:,}")

        # Data types
        dtypes = self.stats['data_types']
        print(f"\n2. Data Types:")
        print(f"   Embeddings dtype: {dtypes['embeddings_dtype']}")
        print(f"   IDs dtype: {dtypes['ids_dtype']}")
        print(f"   Memory usage: {dtypes['embeddings_memory_usage_mb']:.2f} MB")

        # Shape consistency
        shapes = self.stats['shape_consistency']
        all_shapes_ok = all(shapes.values())
        status = "‚úÖ PASS" if all_shapes_ok else "‚ùå FAIL"
        print(f"\n3. Shape Consistency: {status}")
        print(f"   Embeddings 2D: {shapes['embeddings_2d']}")
        print(f"   IDs 1D: {shapes['ids_1d']}")
        print(f"   Length match: {shapes['length_match']}")
        print(f"   Expected dimension (384): {shapes['expected_dim']}")

        # Statistics
        stats = self.stats['statistics']
        print(f"\n4. Statistical Summary:")
        print(f"   Value range: [{stats['min_value']:.4f}, {stats['max_value']:.4f}]")
        print(f"   Mean: {stats['mean_value']:.4f}")
        print(f"   Std: {stats['std_value']:.4f}")
        print(f"   Norm range: [{stats['norm_range']['min_norm']:.4f}, {stats['norm_range']['max_norm']:.4f}]")
        print(f"   Mean norm: {stats['norm_range']['mean_norm']:.4f}")

        # Duplicates
        dups = self.stats['duplicates']
        status = "‚ö†Ô∏è  WARNING" if dups['has_duplicates'] else "‚úÖ PASS"
        print(f"\n5. Duplicate Embeddings: {status}")
        print(f"   Total: {dups['total_embeddings']:,}")
        print(f"   Unique: {dups['unique_embeddings']:,}")
        print(f"   Duplicates: {dups['duplicate_count']:,}")

        # IDs
        ids = self.stats['id_analysis']
        status = "‚ö†Ô∏è  WARNING" if ids['has_duplicate_ids'] else "‚úÖ PASS"
        print(f"\n6. ID Analysis: {status}")
        print(f"   Total IDs: {ids['total_ids']:,}")
        print(f"   Unique IDs: {ids['unique_ids']:,}")
        print(f"   Duplicate IDs: {ids['duplicate_ids']:,}")

    def create_sample_dataframe(self, sample_size: int = 1000) -> pd.DataFrame:
        """
        Create a sample DataFrame for exploration

        Args:
            sample_size: Number of samples to include

        Returns:
            Pandas DataFrame with embeddings and metadata
        """
        if self.embeddings is None:
            raise ValueError("No embeddings loaded. Call load_embeddings() first.")

        print(f"\nüìã Creating sample DataFrame with {sample_size} comments...")

        # Sample indices
        total_comments = len(self.embeddings)
        if sample_size >= total_comments:
            sample_size = total_comments
            indices = np.arange(total_comments)
        else:
            indices = np.random.choice(total_comments, size=sample_size, replace=False)

        # Create DataFrame
        df_data = {
            'comment_id': self.ids[indices],
            'embedding_norm': np.linalg.norm(self.embeddings[indices], axis=1),
        }

        # Add first few embedding dimensions for exploration
        for i in range(min(5, self.embeddings.shape[1])):
            df_data[f'emb_dim_{i}'] = self.embeddings[indices, i]

        df = pd.DataFrame(df_data)

        print(f"‚úÖ Sample DataFrame created with shape: {df.shape}")
        print(f"üìä Columns: {list(df.columns)}")

        return df

    def visualize_embeddings(self, sample_size: int = 5000):
        """
        Create visualizations of the embedding data

        Draws a 2x2 figure (histogram of L2 norms, histogram of the first
        dimension, correlation matrix of the leading dimensions, per-dimension
        mean with std error bars), saves it as 'embedding_analysis.png' under
        self.base_dir and then shows it.

        Args:
            sample_size: Number of samples to use for visualization; every row
                is used when this is at least the dataset size

        Raises:
            ValueError: If no embeddings have been loaded
        """
        if self.embeddings is None:
            raise ValueError("No embeddings loaded. Call load_embeddings() first.")

        print(f"\nüìà Creating visualizations with {sample_size} samples...")

        # Sample data for visualization
        total_comments = len(self.embeddings)
        if sample_size >= total_comments:
            indices = np.arange(total_comments)
        else:
            indices = np.random.choice(total_comments, size=sample_size, replace=False)

        sample_embeddings = self.embeddings[indices]

        # Clamp the dimension windows to what the embeddings actually provide,
        # so narrow embeddings (< 10 or < 50 dimensions) still plot correctly.
        embedding_dim = sample_embeddings.shape[1]
        n_corr_dims = min(10, embedding_dim)
        n_stat_dims = min(50, embedding_dim)

        # Create subplots
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Embedding Dataset Analysis', fontsize=16)

        # 1. Distribution of embedding norms
        norms = np.linalg.norm(sample_embeddings, axis=1)
        mean_norm = np.mean(norms)
        axes[0, 0].hist(norms, bins=50, alpha=0.7, edgecolor='black')
        axes[0, 0].set_title('Distribution of Embedding Norms')
        axes[0, 0].set_xlabel('L2 Norm')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].axvline(mean_norm, color='red', linestyle='--', label=f'Mean: {mean_norm:.3f}')
        axes[0, 0].legend()

        # 2. Distribution of first embedding dimension
        axes[0, 1].hist(sample_embeddings[:, 0], bins=50, alpha=0.7, edgecolor='black')
        axes[0, 1].set_title('Distribution of First Embedding Dimension')
        axes[0, 1].set_xlabel('Value')
        axes[0, 1].set_ylabel('Frequency')

        # 3. Correlation matrix of the leading dimensions.
        # corrcoef collapses to a scalar for a single dimension; imshow needs 2-D.
        corr_matrix = np.atleast_2d(np.corrcoef(sample_embeddings[:, :n_corr_dims].T))
        im = axes[1, 0].imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
        axes[1, 0].set_title(f'Correlation Matrix (First {n_corr_dims} Dimensions)')
        axes[1, 0].set_xlabel('Dimension')
        axes[1, 0].set_ylabel('Dimension')
        plt.colorbar(im, ax=axes[1, 0])

        # 4. Mean and std per dimension (leading dimensions).
        # x must have the same length as means/stds or errorbar raises.
        means = np.mean(sample_embeddings[:, :n_stat_dims], axis=0)
        stds = np.std(sample_embeddings[:, :n_stat_dims], axis=0)
        x = np.arange(n_stat_dims)
        axes[1, 1].errorbar(x, means, yerr=stds, alpha=0.7, capsize=2)
        axes[1, 1].set_title(f'Mean ¬± Std per Dimension (First {n_stat_dims})')
        axes[1, 1].set_xlabel('Dimension')
        axes[1, 1].set_ylabel('Value')
        axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()

        # Save the plot
        output_path = self.base_dir / 'embedding_analysis.png'
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"üìä Visualization saved to: {output_path}")

        plt.show()

    def save_analysis_report(self, filename: str = "phase1_analysis_report.json"):
        """Save analysis results to JSON file"""
        if not self.stats:
            print("‚ùå No analysis performed yet. Run perform_sanity_checks() first.")
            return

        report_path = self.base_dir / filename

        # Convert numpy types to JSON-serializable types
        def convert_to_json_serializable(obj):
            """Recursively convert numpy types to JSON-serializable types"""
            if isinstance(obj, dict):
                return {k: convert_to_json_serializable(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [convert_to_json_serializable(v) for v in obj]
            elif isinstance(obj, (np.bool_, bool)):
                return bool(obj)
            elif isinstance(obj, (np.integer, np.floating)):
                return float(obj) if isinstance(obj, np.floating) else int(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            else:
                return obj

        # Prepare comprehensive report
        report = {
            'analysis_timestamp': pd.Timestamp.now().isoformat(),
            'dataset_info': {
                'filename': getattr(self, 'current_filename', 'unknown'),
                'total_comments': int(len(self.embeddings)),
                'embedding_dimension': int(self.embeddings.shape[1]),
                'file_size_mb': float(self.embeddings.nbytes / (1024 * 1024))
            },
            'sanity_checks': convert_to_json_serializable(self.stats),
            'recommendations': self._generate_recommendations()
        }

        with open(report_path, 'w') as f:
            json.dump(report, f, indent=2)

        print(f"üìÑ Analysis report saved to: {report_path}")

    def _generate_recommendations(self) -> list:
        """Generate recommendations based on analysis results"""
        recommendations = []

        if not self.stats:
            return recommendations

        # Check for missing values
        if self.stats['missing_values']['has_missing']:
            recommendations.append("CRITICAL: Dataset contains NaN or infinite values. Clean data before clustering.")

        # Check embedding norms
        norm_stats = self.stats['statistics']['norm_range']
        if abs(norm_stats['mean_norm'] - 1.0) > 0.1:
            recommendations.append("INFO: Embeddings may not be normalized. Consider normalizing before clustering.")

        # Check for duplicates
        if self.stats['duplicates']['has_duplicates']:
            dup_pct = (self.stats['duplicates']['duplicate_count'] / self.stats['duplicates']['total_embeddings']) * 100
            recommendations.append(f"WARNING: {dup_pct:.1f}% duplicate embeddings found. Consider deduplication.")

        # Check dataset size for clustering
        total_comments = len(self.embeddings)
        if total_comments < 1000:
            recommendations.append("WARNING: Dataset is very small for clustering. Consider gathering more data.")
        elif total_comments > 100000:
            recommendations.append("INFO: Large dataset detected. Consider using batch processing or sampling for initial experiments.")

        # Memory usage recommendation
        memory_mb = self.stats['data_types']['embeddings_memory_usage_mb']
        if memory_mb > 1000:  # > 1GB
            recommendations.append("INFO: High memory usage. Consider using float32 instead of float64 for memory efficiency.")

        return recommendations


def main():
    """Run the Phase 1 workflow: load, sanity-check, sample, plot and report."""
    rule = "=" * 60
    print("üöÄ Starting Phase 1: Prepare Embedding Dataset")
    print(rule)

    # The analyzer object carries every inspection step
    analyzer = EmbeddingDatasetAnalyzer()

    # Embedding archive inspected by this run
    target_file = "Lose_Yourself_Eminem_embeddings.npz"

    try:
        # Step 1: read the embeddings and remember which file they came from
        print(f"\nüìÇ Step 1: Loading {target_file}")
        load_result = analyzer.load_embeddings(target_file)
        analyzer.current_filename = target_file

        # Step 2: sanity checks and their printed summary
        print("\nüîç Step 2: Performing sanity checks")
        analyzer.perform_sanity_checks()
        analyzer.print_sanity_check_results()

        # Step 3: sample DataFrame with a preview and descriptive statistics
        print("\nüìã Step 3: Creating sample DataFrame")
        sample_df = analyzer.create_sample_dataframe(sample_size=1000)
        print("\nüìä Sample DataFrame preview:")
        print(sample_df.head())
        print("\nüìà Sample DataFrame statistics:")
        print(sample_df.describe())

        # Step 4: plots
        print("\nüìà Step 4: Creating visualizations")
        analyzer.visualize_embeddings(sample_size=5000)

        # Step 5: JSON report
        print("\nüíæ Step 5: Saving analysis report")
        analyzer.save_analysis_report()

        # Closing summary
        print("\n" + rule)
        print("‚úÖ PHASE 1 COMPLETE - DATASET READY FOR CLUSTERING")
        print(rule)
        print(f"üìä Total comments: {load_result['num_comments']:,}")
        print(f"üî¢ Embedding dimension: {load_result['embedding_dim']}")
        memory_mb = analyzer.stats['data_types']['embeddings_memory_usage_mb']
        print(f"üíæ Memory usage: {memory_mb:.2f} MB")

        # Numbered list of recommendations, if there are any
        recommendations = analyzer._generate_recommendations()
        if recommendations:
            print("\nüí° Recommendations:")
            for number, text in enumerate(recommendations, 1):
                print(f"   {number}. {text}")

        print("\nüéØ Next step: Proceed to Phase 2 - Run Clustering Pipeline")

    except FileNotFoundError as missing:
        print(f"‚ùå Error: {missing}")
        print(f"üìÇ Expected location: {analyzer.embeddings_dir}")
        print("üí° Make sure the embedding file exists in the correct directory.")
    except Exception as err:
        # Top-level boundary: report the failure with a traceback instead of crashing the cell
        print(f"‚ùå Unexpected error: {err}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

In [None]:
!pip install psutil tqdm

In [None]:
#!/usr/bin/env python3
"""
Colab-Optimized Embedding Deduplication Script
Removes duplicate embeddings efficiently with memory management and progress tracking
"""

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import json
import gc
import psutil
import os

# Mount Google Drive when running in Colab, otherwise work from the current
# directory. Only the import is guarded: an ImportError raised from inside
# drive.mount() should surface rather than be mistaken for "not in Colab".
try:
    from google.colab import drive
except ImportError:
    print("üìç Running outside Colab - using local paths")
    base_dir = Path('.')
else:
    drive.mount('/content/drive')
    print("üîó Google Drive mounted successfully")
    base_dir = Path('/content/drive/My Drive/youtube_embeddings_project')

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, converted from bytes to MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

def deduplicate_embeddings_colab(input_file: str, output_file: str = None, chunk_size: int = 10000):
    """
    Memory-efficient deduplication for Colab environment

    Loads 'embeddings' and 'ids' from <base_dir>/embeddings/<input_file>, keeps
    the first occurrence of every byte-identical embedding row (original order
    preserved), writes the result as a compressed .npz into the same directory
    and stores a JSON summary in base_dir.

    Args:
        input_file: Input .npz file with embeddings
        output_file: Output .npz file (optional); defaults to the input name
            with '.npz' replaced by '_deduplicated.npz'
        chunk_size: Chunk size passed to find_unique_chunked() when the fast
            numpy path runs out of memory

    Returns:
        The summary dict built by create_deduplication_summary()

    Raises:
        ValueError: If the input file contains no embeddings
    """

    print("üöÄ Starting Colab-Optimized Deduplication")
    print("="*60)

    # Set paths
    embeddings_dir = base_dir / 'embeddings'
    input_path = embeddings_dir / input_file

    if output_file is None:
        output_file = input_file.replace('.npz', '_deduplicated.npz')
    output_path = embeddings_dir / output_file

    print(f"üìÇ Input:  {input_path}")
    print(f"üíæ Output: {output_path}")
    print(f"üß† Initial memory usage: {get_memory_usage():.1f} MB")

    # Load original data
    print(f"\nüìä Loading embeddings...")
    with np.load(input_path) as data:
        original_embeddings = data['embeddings']
        original_ids = data['ids']

    # Fail early with a clear message; the percentage/ratio maths below
    # cannot handle an empty dataset.
    if len(original_embeddings) == 0:
        raise ValueError(f"No embeddings found in {input_path}")

    print(f"‚úÖ Loaded {len(original_embeddings):,} embeddings")
    print(f"üìè Shape: {original_embeddings.shape}")
    print(f"üíæ Data size: {original_embeddings.nbytes / (1024**2):.2f} MB")
    print(f"üß† Memory after loading: {get_memory_usage():.1f} MB")

    # Method 1: Try fast numpy unique first (if memory allows)
    print(f"\nüîç Finding unique embeddings...")

    embeddings_bytes = None
    try:
        print("‚ö° Attempting fast numpy unique method...")

        # Create a view for comparison (memory efficient)
        # Convert to bytes for exact comparison
        embeddings_bytes = original_embeddings.view(np.uint8).reshape(len(original_embeddings), -1)

        print(f"üß† Memory after creating view: {get_memory_usage():.1f} MB")

        # Find unique embeddings (return_index gives the first occurrence of each row)
        print("üîç Running unique detection...")
        _, unique_indices = np.unique(embeddings_bytes, axis=0, return_index=True)

        # Sort indices to maintain original order
        unique_indices = np.sort(unique_indices)

        print(f"‚úÖ Fast method successful!")
        print(f"üß† Memory after unique detection: {get_memory_usage():.1f} MB")

    except MemoryError:
        print("‚ö†Ô∏è  Memory insufficient for fast method, switching to chunk-based approach...")
        unique_indices = find_unique_chunked(original_embeddings, chunk_size)

    # Extract unique embeddings and IDs (fancy indexing copies the rows)
    print(f"\nüì§ Extracting unique data...")
    unique_embeddings = original_embeddings[unique_indices]
    unique_ids = original_ids[unique_indices]

    # Calculate statistics
    original_count = len(original_embeddings)
    unique_count = len(unique_embeddings)
    duplicate_count = original_count - unique_count
    duplicate_percentage = (duplicate_count / original_count) * 100
    size_reduction = (1 - (unique_embeddings.nbytes / original_embeddings.nbytes)) * 100

    print(f"‚úÖ Deduplication statistics:")
    print(f"   üìä Original: {original_count:,} embeddings")
    print(f"   üîπ Unique: {unique_count:,} embeddings")
    print(f"   ‚ùå Duplicates: {duplicate_count:,} ({duplicate_percentage:.1f}%)")
    print(f"   üíæ Size reduction: {size_reduction:.1f}%")
    print(f"   üìà Compression ratio: {original_count/unique_count:.2f}x")

    # Clear original data from memory. The byte view shares the original
    # buffer, so it has to go too or the array is never actually released.
    del original_embeddings, original_ids, embeddings_bytes
    gc.collect()
    print(f"üß† Memory after cleanup: {get_memory_usage():.1f} MB")

    # Save deduplicated embeddings
    print(f"\nüíæ Saving deduplicated embeddings...")
    np.savez_compressed(
        output_path,
        embeddings=unique_embeddings,
        ids=unique_ids
    )

    print(f"‚úÖ Deduplicated embeddings saved!")
    print(f"üìè Final shape: {unique_embeddings.shape}")
    print(f"üíæ Final file size: {output_path.stat().st_size / (1024**2):.2f} MB")

    # Create detailed summary report
    summary = create_deduplication_summary(
        input_file, output_file, original_count, unique_count,
        duplicate_count, duplicate_percentage, size_reduction,
        unique_embeddings.shape[1]
    )

    # Save summary
    summary_path = base_dir / f"deduplication_summary_{input_file.replace('.npz', '.json')}"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"üìÑ Summary saved to: {summary_path}")

    return summary

def find_unique_chunked(embeddings, chunk_size=10000):
    """
    Memory-efficient chunked approach for finding unique embeddings

    Walks the rows in chunks and remembers a 128-bit BLAKE2b digest of each
    row's raw bytes; a row is kept the first time its digest is seen.

    Args:
        embeddings: Array whose first axis enumerates the embeddings
        chunk_size: Number of rows handled per progress step

    Returns:
        1-D integer array with the index of the first occurrence of every
        distinct row, in ascending order (empty for empty input)

    Raises:
        ValueError: If chunk_size is not positive
    """
    import hashlib  # local import: not provided by this cell's import block

    # A negative step would make the range below empty and silently report
    # "no unique rows"; zero would fail with an unrelated range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    print(f"üîÑ Using chunked approach with chunk size: {chunk_size:,}")

    total_embeddings = len(embeddings)
    seen_digests = set()
    unique_indices = []

    # Process in chunks
    chunk_starts = range(0, total_embeddings, chunk_size)
    for chunk_number, start_idx in enumerate(tqdm(chunk_starts, desc="Processing chunks")):
        chunk = embeddings[start_idx:start_idx + chunk_size]

        for offset, row in enumerate(chunk):
            # The builtin 64-bit hash() is not safe as the sole identity of a
            # row: a collision would drop a distinct embedding. A 128-bit
            # digest keeps the per-row memory small without that risk.
            digest = hashlib.blake2b(row.tobytes(), digest_size=16).digest()

            if digest not in seen_digests:
                seen_digests.add(digest)
                unique_indices.append(start_idx + offset)

        # Periodic garbage collection
        if chunk_number % 10 == 0:
            gc.collect()

    # Explicit integer dtype: an empty list would otherwise become float64,
    # which cannot be used to index the embeddings array.
    return np.array(unique_indices, dtype=np.intp)

def create_deduplication_summary(input_file, output_file, original_count, unique_count,
                                duplicate_count, duplicate_percentage, size_reduction, embedding_dim):
    """
    Create comprehensive deduplication summary

    Args:
        input_file: Name of the source .npz file
        output_file: Name of the deduplicated .npz file
        original_count: Number of embeddings before deduplication
        unique_count: Number of embeddings after deduplication
        duplicate_count: Number of embeddings removed
        duplicate_percentage: Removed share of the original, in percent
        size_reduction: Reduction of the in-memory array size, in percent
        embedding_dim: Width of one embedding vector

    Returns:
        JSON-serializable dict with file names, before/after dataset sizes
        (size estimates assume 2 bytes per value, i.e. float16), the
        deduplication figures and follow-up recommendations.
    """
    # With nothing left there is nothing to divide by; report a neutral ratio
    # instead of raising ZeroDivisionError.
    compression_ratio = original_count / unique_count if unique_count else 1.0

    return {
        'deduplication_timestamp': pd.Timestamp.now().isoformat(),
        'processing_environment': 'Google Colab',
        'files': {
            'input_file': input_file,
            'output_file': output_file,
            'input_path': f"embeddings/{input_file}",
            'output_path': f"embeddings/{output_file}"
        },
        'original_dataset': {
            'total_embeddings': int(original_count),
            'embedding_dimension': int(embedding_dim),
            'estimated_file_size_mb': float((original_count * embedding_dim * 2) / (1024**2))  # float16
        },
        'deduplicated_dataset': {
            'total_embeddings': int(unique_count),
            'embedding_dimension': int(embedding_dim),
            'estimated_file_size_mb': float((unique_count * embedding_dim * 2) / (1024**2))  # float16
        },
        'deduplication_results': {
            'duplicates_removed': int(duplicate_count),
            'duplicate_percentage': float(duplicate_percentage),
            'size_reduction_percentage': float(size_reduction),
            'compression_ratio': float(compression_ratio),
            'memory_efficiency': f"Reduced from {original_count:,} to {unique_count:,} embeddings"
        },
        'recommendations': [
            f"Use {output_file} for Phase 2 clustering",
            f"Dataset reduced by {duplicate_percentage:.1f}% - clustering will be faster",
            f"No information loss - only duplicate embeddings removed",
            f"Ready for HDBSCAN clustering with {unique_count:,} unique embeddings"
        ]
    }

def verify_deduplication(original_file: str, deduplicated_file: str):
    """
    Verify the deduplication results

    Two byte-wise checks are run on the 'embeddings' arrays of both files
    (read from <base_dir>/embeddings):
      1. no row occurs twice in the deduplicated file;
      2. every row of the deduplicated file also occurs in the original file.

    Rows are compared through Python-level bytes objects, so this is meant as
    a one-off check rather than a hot path.

    Args:
        original_file: Name of the .npz file before deduplication
        deduplicated_file: Name of the .npz file after deduplication

    Returns:
        True when both checks pass, False otherwise
    """
    print("üîç Verifying deduplication results...")

    embeddings_dir = base_dir / 'embeddings'

    # Load both files
    with np.load(embeddings_dir / original_file) as orig_data:
        orig_embeddings = orig_data['embeddings']

    with np.load(embeddings_dir / deduplicated_file) as dedup_data:
        dedup_embeddings = dedup_data['embeddings']

    print(f"‚úÖ Original: {len(orig_embeddings):,} embeddings")
    print(f"‚úÖ Deduplicated: {len(dedup_embeddings):,} embeddings")

    # Check if all deduplicated embeddings are unique
    dedup_rows = [row.tobytes() for row in dedup_embeddings]
    all_unique = len(set(dedup_rows)) == len(dedup_rows)

    if all_unique:
        print("‚úÖ Verification passed: All embeddings in deduplicated file are unique")
    else:
        print("‚ùå Verification failed: Duplicates still exist in deduplicated file")

    # Check if deduplicated embeddings exist in original
    print("üîç Checking if deduplicated embeddings are subset of original...")
    orig_rows = {row.tobytes() for row in orig_embeddings}
    missing_count = sum(1 for row in dedup_rows if row not in orig_rows)

    if missing_count == 0:
        print("‚úÖ Verification passed: All deduplicated embeddings exist in the original file")
    else:
        print(f"‚ùå Verification failed: {missing_count:,} deduplicated embeddings not found in the original file")

    print("‚úÖ Verification complete!")

    return all_unique and missing_count == 0

def main():
    """Deduplicate the configured embedding file, print the outcome and verify it."""
    rule = "=" * 60

    print("üîß Colab Embedding Deduplication Tool")
    print(rule)

    # Run settings
    input_file = "Lose_Yourself_Eminem_embeddings.npz"
    chunk_size = 10000  # Adjust based on Colab memory

    try:
        # Deduplicate and collect the summary dict
        summary = deduplicate_embeddings_colab(input_file, chunk_size=chunk_size)

        # Headline
        print("\n" + rule)
        print("üéâ DEDUPLICATION COMPLETE!")
        print(rule)

        # Figures reported below
        results = summary['deduplication_results']
        original_total = summary['original_dataset']['total_embeddings']
        unique_total = summary['deduplicated_dataset']['total_embeddings']
        output_name = summary['files']['output_file']

        print(f"üìä Original embeddings: {original_total:,}")
        print(f"üîπ Unique embeddings: {unique_total:,}")
        print(f"‚ùå Duplicates removed: {results['duplicates_removed']:,} ({results['duplicate_percentage']:.1f}%)")
        print(f"üíæ Size reduction: {results['size_reduction_percentage']:.1f}%")
        print(f"üìà Compression ratio: {results['compression_ratio']:.2f}x")

        print("\nüéØ Next Steps:")
        print(f"   1. Use file: {output_name}")
        print("   2. Proceed to Phase 2 clustering")
        print(f"   3. Expect faster clustering with {unique_total:,} unique embeddings")

        # Re-read both files and check the result
        print("\nüîç Running verification...")
        verify_deduplication(input_file, output_name)

    except Exception as err:
        # Top-level boundary: show the failure with a traceback instead of crashing the cell
        print(f"‚ùå Error during deduplication: {err}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
Colab-Optimized Embedding Deduplication Script
Removes duplicate embeddings efficiently with memory management and progress tracking
"""

import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import json
import gc
import psutil
import os

# Mount Google Drive when running in Colab, otherwise work from the current
# directory. Only the import is guarded: an ImportError raised from inside
# drive.mount() should surface rather than be mistaken for "not in Colab".
try:
    from google.colab import drive
except ImportError:
    print("üìç Running outside Colab - using local paths")
    base_dir = Path('.')
else:
    drive.mount('/content/drive')
    print("üîó Google Drive mounted successfully")
    base_dir = Path('/content/drive/My Drive/youtube_embeddings_project')

def get_memory_usage():
    """Return the resident set size (RSS) of the current process, converted from bytes to MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024

def deduplicate_embeddings_colab(input_file: str, output_file: str = None, chunk_size: int = 10000):
    """
    Memory-efficient deduplication for Colab environment

    Loads 'embeddings' and 'ids' from <base_dir>/embeddings/<input_file>, keeps
    the first occurrence of every byte-identical embedding row (original order
    preserved), writes the result (plus the kept rows' original positions as
    'original_indices') as a compressed .npz into the same directory and
    stores a JSON summary in base_dir.

    Args:
        input_file: Input .npz file with embeddings
        output_file: Output .npz file (optional); defaults to the input name
            with '.npz' replaced by '_deduplicated.npz'
        chunk_size: Chunk size passed to find_unique_chunked() when the fast
            numpy path runs out of memory

    Returns:
        The summary dict built by create_deduplication_summary()

    Raises:
        ValueError: If the input file contains no embeddings
    """

    print("üöÄ Starting Colab-Optimized Deduplication")
    print("="*60)

    # Set paths
    embeddings_dir = base_dir / 'embeddings'
    input_path = embeddings_dir / input_file

    if output_file is None:
        output_file = input_file.replace('.npz', '_deduplicated.npz')
    output_path = embeddings_dir / output_file

    print(f"üìÇ Input:  {input_path}")
    print(f"üíæ Output: {output_path}")
    print(f"üß† Initial memory usage: {get_memory_usage():.1f} MB")

    # Load original data
    print(f"\nüìä Loading embeddings...")
    with np.load(input_path) as data:
        original_embeddings = data['embeddings']
        original_ids = data['ids']

    # Fail early with a clear message; the percentage/ratio maths below
    # cannot handle an empty dataset.
    if len(original_embeddings) == 0:
        raise ValueError(f"No embeddings found in {input_path}")

    print(f"‚úÖ Loaded {len(original_embeddings):,} embeddings")
    print(f"üìè Shape: {original_embeddings.shape}")
    print(f"üíæ Data size: {original_embeddings.nbytes / (1024**2):.2f} MB")
    print(f"üß† Memory after loading: {get_memory_usage():.1f} MB")

    # Method 1: Try fast numpy unique first (if memory allows)
    print(f"\nüîç Finding unique embeddings...")

    embeddings_bytes = None
    try:
        print("‚ö° Attempting fast numpy unique method...")

        # Create a view for comparison (memory efficient)
        # Convert to bytes for exact comparison
        embeddings_bytes = original_embeddings.view(np.uint8).reshape(len(original_embeddings), -1)

        print(f"üß† Memory after creating view: {get_memory_usage():.1f} MB")

        # Find unique embeddings (return_index gives the first occurrence of each row)
        print("üîç Running unique detection...")
        _, unique_indices = np.unique(embeddings_bytes, axis=0, return_index=True)

        # Sort indices to maintain original order
        unique_indices = np.sort(unique_indices)

        print(f"‚úÖ Fast method successful!")
        print(f"üß† Memory after unique detection: {get_memory_usage():.1f} MB")

    except MemoryError:
        print("‚ö†Ô∏è  Memory insufficient for fast method, switching to chunk-based approach...")
        unique_indices = find_unique_chunked(original_embeddings, chunk_size)

    # Extract unique embeddings and IDs (fancy indexing copies the rows)
    print(f"\nüì§ Extracting unique data...")
    unique_embeddings = original_embeddings[unique_indices]
    unique_ids = original_ids[unique_indices]

    # Calculate statistics
    original_count = len(original_embeddings)
    unique_count = len(unique_embeddings)
    duplicate_count = original_count - unique_count
    duplicate_percentage = (duplicate_count / original_count) * 100
    size_reduction = (1 - (unique_embeddings.nbytes / original_embeddings.nbytes)) * 100

    print(f"‚úÖ Deduplication statistics:")
    print(f"   üìä Original: {original_count:,} embeddings")
    print(f"   üîπ Unique: {unique_count:,} embeddings")
    print(f"   ‚ùå Duplicates: {duplicate_count:,} ({duplicate_percentage:.1f}%)")
    print(f"   üíæ Size reduction: {size_reduction:.1f}%")
    print(f"   üìà Compression ratio: {original_count/unique_count:.2f}x")

    # Clear original data from memory. The byte view shares the original
    # buffer, so it has to go too or the array is never actually released.
    del original_embeddings, original_ids, embeddings_bytes
    gc.collect()
    print(f"üß† Memory after cleanup: {get_memory_usage():.1f} MB")

    # Save deduplicated embeddings
    print(f"\nüíæ Saving deduplicated embeddings...")
    np.savez_compressed(
        output_path,
        embeddings=unique_embeddings,
        ids=unique_ids,
        original_indices=unique_indices  # preserve original positions for downstream mapping
    )

    print(f"‚úÖ Deduplicated embeddings saved!")
    print(f"üìè Final shape: {unique_embeddings.shape}")
    print(f"üíæ Final file size: {output_path.stat().st_size / (1024**2):.2f} MB")

    # Create detailed summary report
    summary = create_deduplication_summary(
        input_file, output_file, original_count, unique_count,
        duplicate_count, duplicate_percentage, size_reduction,
        unique_embeddings.shape[1]
    )

    # Save summary
    summary_path = base_dir / f"deduplication_summary_{input_file.replace('.npz', '.json')}"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"üìÑ Summary saved to: {summary_path}")

    return summary

def find_unique_chunked(embeddings, chunk_size=10000):
    """
    Memory-efficient chunked approach for finding unique embeddings

    Walks the rows in chunks and remembers a 128-bit BLAKE2b digest of each
    row's raw bytes; a row is kept the first time its digest is seen.

    Args:
        embeddings: Array whose first axis enumerates the embeddings
        chunk_size: Number of rows handled per progress step

    Returns:
        1-D integer array with the index of the first occurrence of every
        distinct row, in ascending order (empty for empty input)

    Raises:
        ValueError: If chunk_size is not positive
    """
    import hashlib  # local import: not provided by this cell's import block

    # A negative step would make the range below empty and silently report
    # "no unique rows"; zero would fail with an unrelated range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    print(f"üîÑ Using chunked approach with chunk size: {chunk_size:,}")

    total_embeddings = len(embeddings)
    seen_digests = set()
    unique_indices = []

    # Process in chunks
    chunk_starts = range(0, total_embeddings, chunk_size)
    for chunk_number, start_idx in enumerate(tqdm(chunk_starts, desc="Processing chunks")):
        chunk = embeddings[start_idx:start_idx + chunk_size]

        for offset, row in enumerate(chunk):
            # The builtin 64-bit hash() is not safe as the sole identity of a
            # row: a collision would drop a distinct embedding. A 128-bit
            # digest keeps the per-row memory small without that risk.
            digest = hashlib.blake2b(row.tobytes(), digest_size=16).digest()

            if digest not in seen_digests:
                seen_digests.add(digest)
                unique_indices.append(start_idx + offset)

        # Periodic garbage collection
        if chunk_number % 10 == 0:
            gc.collect()

    # Explicit integer dtype: an empty list would otherwise become float64,
    # which cannot be used to index the embeddings array.
    return np.array(unique_indices, dtype=np.intp)

def create_deduplication_summary(input_file, output_file, original_count, unique_count,
                                duplicate_count, duplicate_percentage, size_reduction, embedding_dim):
    """
    Create comprehensive deduplication summary

    Args:
        input_file: Name of the source .npz file
        output_file: Name of the deduplicated .npz file
        original_count: Number of embeddings before deduplication
        unique_count: Number of embeddings after deduplication
        duplicate_count: Number of embeddings removed
        duplicate_percentage: Removed share of the original, in percent
        size_reduction: Reduction of the in-memory array size, in percent
        embedding_dim: Width of one embedding vector

    Returns:
        JSON-serializable dict with file names, before/after dataset sizes
        (size estimates assume 2 bytes per value, i.e. float16), the
        deduplication figures and follow-up recommendations.
    """
    # With nothing left there is nothing to divide by; report a neutral ratio
    # instead of raising ZeroDivisionError.
    compression_ratio = original_count / unique_count if unique_count else 1.0

    return {
        'deduplication_timestamp': pd.Timestamp.now().isoformat(),
        'processing_environment': 'Google Colab',
        'files': {
            'input_file': input_file,
            'output_file': output_file,
            'input_path': f"embeddings/{input_file}",
            'output_path': f"embeddings/{output_file}"
        },
        'original_dataset': {
            'total_embeddings': int(original_count),
            'embedding_dimension': int(embedding_dim),
            'estimated_file_size_mb': float((original_count * embedding_dim * 2) / (1024**2))  # float16
        },
        'deduplicated_dataset': {
            'total_embeddings': int(unique_count),
            'embedding_dimension': int(embedding_dim),
            'estimated_file_size_mb': float((unique_count * embedding_dim * 2) / (1024**2))  # float16
        },
        'deduplication_results': {
            'duplicates_removed': int(duplicate_count),
            'duplicate_percentage': float(duplicate_percentage),
            'size_reduction_percentage': float(size_reduction),
            'compression_ratio': float(compression_ratio),
            'memory_efficiency': f"Reduced from {original_count:,} to {unique_count:,} embeddings"
        },
        'recommendations': [
            f"Use {output_file} for Phase 2 clustering",
            f"Dataset reduced by {duplicate_percentage:.1f}% - clustering will be faster",
            f"No information loss - only duplicate embeddings removed",
            f"Ready for HDBSCAN clustering with {unique_count:,} unique embeddings"
        ]
    }

def verify_deduplication(original_file: str, deduplicated_file: str):
    """
    Verify the deduplication results

    Two byte-wise checks are run on the 'embeddings' arrays of both files
    (read from <base_dir>/embeddings):
      1. no row occurs twice in the deduplicated file;
      2. every row of the deduplicated file also occurs in the original file.

    Rows are compared through Python-level bytes objects, so this is meant as
    a one-off check rather than a hot path.

    Args:
        original_file: Name of the .npz file before deduplication
        deduplicated_file: Name of the .npz file after deduplication

    Returns:
        True when both checks pass, False otherwise
    """
    print("üîç Verifying deduplication results...")

    embeddings_dir = base_dir / 'embeddings'

    # Load both files
    with np.load(embeddings_dir / original_file) as orig_data:
        orig_embeddings = orig_data['embeddings']

    with np.load(embeddings_dir / deduplicated_file) as dedup_data:
        dedup_embeddings = dedup_data['embeddings']

    print(f"‚úÖ Original: {len(orig_embeddings):,} embeddings")
    print(f"‚úÖ Deduplicated: {len(dedup_embeddings):,} embeddings")

    # Check if all deduplicated embeddings are unique
    dedup_rows = [row.tobytes() for row in dedup_embeddings]
    all_unique = len(set(dedup_rows)) == len(dedup_rows)

    if all_unique:
        print("‚úÖ Verification passed: All embeddings in deduplicated file are unique")
    else:
        print("‚ùå Verification failed: Duplicates still exist in deduplicated file")

    # Check if deduplicated embeddings exist in original
    print("üîç Checking if deduplicated embeddings are subset of original...")
    orig_rows = {row.tobytes() for row in orig_embeddings}
    missing_count = sum(1 for row in dedup_rows if row not in orig_rows)

    if missing_count == 0:
        print("‚úÖ Verification passed: All deduplicated embeddings exist in the original file")
    else:
        print(f"‚ùå Verification failed: {missing_count:,} deduplicated embeddings not found in the original file")

    print("‚úÖ Verification complete!")

    return all_unique and missing_count == 0

def main():
    """Deduplicate the configured embedding file, print the outcome and verify it."""
    rule = "=" * 60

    print("üîß Colab Embedding Deduplication Tool")
    print(rule)

    # Run settings
    input_file = "Lose_Yourself_Eminem_embeddings.npz"
    chunk_size = 10000  # Adjust based on Colab memory

    try:
        # Deduplicate and collect the summary dict
        summary = deduplicate_embeddings_colab(input_file, chunk_size=chunk_size)

        # Headline
        print("\n" + rule)
        print("üéâ DEDUPLICATION COMPLETE!")
        print(rule)

        # Figures reported below
        results = summary['deduplication_results']
        original_total = summary['original_dataset']['total_embeddings']
        unique_total = summary['deduplicated_dataset']['total_embeddings']
        output_name = summary['files']['output_file']

        print(f"üìä Original embeddings: {original_total:,}")
        print(f"üîπ Unique embeddings: {unique_total:,}")
        print(f"‚ùå Duplicates removed: {results['duplicates_removed']:,} ({results['duplicate_percentage']:.1f}%)")
        print(f"üíæ Size reduction: {results['size_reduction_percentage']:.1f}%")
        print(f"üìà Compression ratio: {results['compression_ratio']:.2f}x")

        print("\nüéØ Next Steps:")
        print(f"   1. Use file: {output_name}")
        print("   2. Proceed to Phase 2 clustering")
        print(f"   3. Expect faster clustering with {unique_total:,} unique embeddings")

        # Re-read both files and check the result
        print("\nüîç Running verification...")
        verify_deduplication(input_file, output_name)

    except Exception as err:
        # Top-level boundary: show the failure with a traceback instead of crashing the cell
        print(f"‚ùå Error during deduplication: {err}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
Phase 2: K-Means Clustering Pipeline (Colab-Friendly)
- Uses Google Drive paths consistent with embedComments.py/deDuplication.py
- Prefers GPU acceleration via FAISS if available; falls back to scikit-learn
- Includes optional Elbow Method and visualizations
"""

import numpy as np
import pandas as pd
from pathlib import Path
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import gc
import time
from typing import Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Colab detection: the google.colab import only succeeds inside a Colab
# runtime. IN_COLAB records the outcome and `drive` is None everywhere else.
try:
    from google.colab import drive  # type: ignore
except ModuleNotFoundError:
    IN_COLAB = False
    drive = None  # type: ignore
else:
    IN_COLAB = True


def resolve_paths():
    """
    Work out the project, embeddings and clustering-results directories.

    In Colab, Google Drive is mounted and the project folder on Drive is the
    root; otherwise the current directory is used. The results directory is
    created when missing.

    Returns:
        Tuple (base_dir, embeddings_dir, results_dir) of Path objects.
    """
    if not IN_COLAB:
        root = Path('.')
        print("üìç Running outside Colab - using local paths")
    else:
        drive.mount('/content/drive')
        root = Path('/content/drive/My Drive/youtube_embeddings_project')
        print("üîó Google Drive mounted")

    clustering_dir = root / 'clustering'
    clustering_dir.mkdir(parents=True, exist_ok=True)
    return root, root / 'embeddings', clustering_dir


class KMeansClusteringPipeline:
    """Complete K-Means clustering pipeline for comment embeddings (Colab)."""
    def __init__(self, base_dir: Path, embeddings_dir: Path, results_dir: Path):
        """Store the project directories, reset clustering state, and pick a backend.

        Args:
            base_dir: Project root; the ``comments`` folder is looked up beneath it.
            embeddings_dir: Directory searched for embeddings ``.npz`` files.
            results_dir: Output directory; created (including parents) if missing.
        """
        self.base_dir = base_dir
        self.embeddings_dir = embeddings_dir
        self.results_dir = results_dir
        # parents=True mirrors resolve_paths(), so a nested results path whose
        # parents do not exist yet no longer raises FileNotFoundError.
        self.results_dir.mkdir(parents=True, exist_ok=True)

        # Data storage
        self.embeddings: Optional[np.ndarray] = None
        self.ids: Optional[np.ndarray] = None
        # Filled by load_deduplicated_embeddings() when the archive provides it;
        # initialised here so the attribute always exists.
        self.original_indices: Optional[np.ndarray] = None
        self.labels: Optional[np.ndarray] = None
        self.cluster_centers: Optional[np.ndarray] = None
        self.inertia: Optional[float] = None
        self.results_df: Optional[pd.DataFrame] = None

        # Statistics
        self.n_clusters: int = 0
        self.cluster_stats: Dict[str, object] = {}

        # Determine backend: try FAISS (GPU/CPU), else sklearn
        self.backend = self._detect_backend()
        print(f"üß† Using backend: {self.backend}")

    def _detect_backend(self) -> str:
        """Pick the clustering backend: FAISS on GPU, FAISS on CPU, or scikit-learn.

        When FAISS cannot be imported, the code runs in Colab, and torch reports
        a CUDA device, a pip install of ``faiss-gpu`` (falling back to
        ``faiss-cpu``) is attempted once before giving up on FAISS.

        Returns:
            ``'faiss_gpu'``, ``'faiss_cpu'`` or ``'sklearn'``.
        """
        def _try_import_faiss():
            # Broad catch on purpose: any failure while importing FAISS is
            # treated as "FAISS unavailable".
            try:
                import faiss  # type: ignore
                return faiss
            except Exception:
                return None

        faiss = _try_import_faiss()

        # If in Colab with GPU but faiss missing, attempt to install it
        if faiss is None and IN_COLAB:
            try:
                import torch
                if torch.cuda.is_available():
                    print("üì¶ Installing faiss-gpu for acceleration...")
                    import importlib
                    import subprocess
                    import sys
                    pip_install = [sys.executable, "-m", "pip", "install", "-q"]
                    # Prefer faiss-gpu; fallback to faiss-cpu if install fails
                    try:
                        subprocess.check_call(pip_install + ["faiss-gpu"])
                    except Exception:
                        print("‚ö†Ô∏è  faiss-gpu install failed, trying faiss-cpu...")
                        subprocess.check_call(pip_install + ["faiss-cpu"])
                    # The package was installed while this interpreter was
                    # running; refresh the import finders' caches so the
                    # retry below can see it.
                    importlib.invalidate_caches()
                    faiss = _try_import_faiss()
            except Exception as e:
                print(f"‚ö†Ô∏è  Auto-install of FAISS failed: {e}")

        if faiss is not None:
            try:
                if hasattr(faiss, 'get_num_gpus') and faiss.get_num_gpus() > 0:
                    print("‚úÖ FAISS GPU detected.")
                    return 'faiss_gpu'
            except Exception:
                # A failing GPU probe just means the CPU build is used.
                pass
            print("‚öôÔ∏è  FAISS CPU detected.")
            return 'faiss_cpu'

        # Fallback to scikit-learn
        print("‚ö†Ô∏è  FAISS not available. Using scikit-learn.")
        return 'sklearn'

    def _auto_pick_input(self, preferred_file: Optional[str]) -> Path:
        """Pick a deduplicated embeddings file from embeddings dir.

        Priority:
        1) User-provided filename if exists
        2) First *_deduplicated.npz
        3) Fallback to first *.npz
        """
        if preferred_file:
            path = self.embeddings_dir / preferred_file
            if path.exists():
                return path
            raise FileNotFoundError(f"Provided embeddings file not found: {path}")

        dedup = sorted(self.embeddings_dir.glob("*_deduplicated.npz"))
        if dedup:
            return dedup[0]

        any_npz = sorted(self.embeddings_dir.glob("*.npz"))
        if any_npz:
            return any_npz[0]

        raise FileNotFoundError(f"No embeddings .npz found in {self.embeddings_dir}")

    def load_deduplicated_embeddings(self, file_path: Path):
        """Load the deduplicated embeddings from a given path."""
        print("üìÇ Loading embeddings...")
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        with np.load(file_path) as data:
            self.embeddings = data['embeddings']
            self.ids = data['ids']
            # Optional: original indices before deduplication
            self.original_indices = data['original_indices'] if 'original_indices' in data else None
        print(f"‚úÖ Loaded {len(self.embeddings):,} embeddings")
        print(f"üìè Shape: {self.embeddings.shape}")
        print(f"üíæ Memory usage: {self.embeddings.nbytes / (1024**2):.2f} MB")
        norms = np.linalg.norm(self.embeddings, axis=1)
        print(f"üìê Mean norm: {np.mean(norms):.4f} (should be ~1.0)")
        return len(self.embeddings)

    def run_kmeans_clustering(self,
                              n_clusters: int = 10,
                              n_init: int = 10,
                              max_iter: int = 300,
                              random_state: int = 42):
        """Run K-Means clustering with the best available backend."""
        print(f"üîÑ Running K-Means clustering with k={n_clusters}...")
        start_time = time.time()

        if self.backend.startswith('faiss'):
            # FAISS expects float32
            import faiss  # type: ignore
            data = self.embeddings.astype('float32', copy=False)
            d = data.shape[1]
            # Initialize FAISS kmeans
            if self.backend == 'faiss_gpu' and faiss.get_num_gpus() > 0:
                print("‚ö° Using FAISS KMeans (GPU)...")
                res = faiss.StandardGpuResources()
                clus = faiss.Clustering(d, n_clusters)
                clus.niter = max_iter
                clus.seed = random_state
                clus.nredo = n_init
                index = faiss.IndexFlatL2(d)
                index = faiss.index_cpu_to_gpu(res, 0, index)
                clus.train(data, index)
                centroids = faiss.vector_float_to_array(clus.centroids).reshape(n_clusters, d)
                # Assign labels
                index_cent = faiss.IndexFlatL2(d)
                index_cent = faiss.index_cpu_to_gpu(res, 0, index_cent)
                index_cent.add(centroids)
                distances, labels = index_cent.search(data, 1)
                self.labels = labels.reshape(-1)
                self.cluster_centers = centroids
                self.inertia = float(np.sum(distances))
            else:
                print("üíª Using FAISS KMeans (CPU)...")
                clus = faiss.Clustering(d, n_clusters)
                clus.niter = max_iter
                clus.seed = random_state
                clus.nredo = n_init
                index = faiss.IndexFlatL2(d)
                clus.train(data, index)
                centroids = faiss.vector_float_to_array(clus.centroids).reshape(n_clusters, d)
                index_cent = faiss.IndexFlatL2(d)
                index_cent.add(centroids)
                distances, labels = index_cent.search(data, 1)
                self.labels = labels.reshape(-1)
                self.cluster_centers = centroids
                self.inertia = float(np.sum(distances))

        else:
            # scikit-learn fast path: prefer MiniBatchKMeans for speed/memory on Colab
            try:
                from sklearn.cluster import MiniBatchKMeans
                print("üíª Using scikit-learn MiniBatchKMeans (faster)...")
                # Heuristic batch_size: a few thousand fits within memory
                batch_size = min(10000, max(1000, len(self.embeddings) // 20))
                mbk = MiniBatchKMeans(
                    n_clusters=n_clusters,
                    random_state=random_state,
                    batch_size=batch_size,
                    n_init=n_init,
                    max_iter=max_iter,
                    reassignment_ratio=0.01,
                    verbose=0
                )
                self.labels = mbk.fit_predict(self.embeddings)
                self.cluster_centers = mbk.cluster_centers_
                # Compute inertia against centers for consistency
                diffs = self.embeddings - self.cluster_centers[self.labels]
                self.inertia = float(np.sum(np.einsum('ij,ij->i', diffs, diffs)))
            except Exception:
                from sklearn.cluster import KMeans
                print("üíª Using scikit-learn KMeans...")
                kmeans_model = KMeans(
                    n_clusters=n_clusters,
                    n_init=n_init,
                    max_iter=max_iter,
                    random_state=random_state,
                    verbose=0
                )
                self.labels = kmeans_model.fit_predict(self.embeddings)
                self.cluster_centers = kmeans_model.cluster_centers_
                self.inertia = float(kmeans_model.inertia_)

        end_time = time.time()
        self.n_clusters = n_clusters
        print(f"‚è±Ô∏è  Clustering completed in {end_time - start_time:.2f} seconds")
        print("üìä Clustering Results:")
        print(f"   üéØ Assigned all {len(self.labels):,} points to {self.n_clusters} clusters")
        if self.inertia is not None:
            print(f"   üìâ Inertia (within-cluster sum of squares): {self.inertia:.2f}")
        return self.labels

    def analyze_cluster_statistics(self):
        print("üìä Analyzing cluster statistics...")
        unique_labels, counts = np.unique(self.labels, return_counts=True)

        self.cluster_stats = {
            'total_points': int(len(self.labels)),
            'n_clusters': int(self.n_clusters),
            'cluster_sizes': {},
            'size_distribution': {
                'min_size': 0,
                'max_size': 0,
                'mean_size': 0.0,
                'median_size': 0.0,
                'std_size': 0.0
            }
        }

        if len(counts) > 0:
            self.cluster_stats['size_distribution'] = {
                'min_size': int(np.min(counts)),
                'max_size': int(np.max(counts)),
                'mean_size': float(np.mean(counts)),
                'median_size': float(np.median(counts)),
                'std_size': float(np.std(counts))
            }

        for label, count in zip(unique_labels, counts):
            self.cluster_stats['cluster_sizes'][f'cluster_{int(label)}'] = int(count)

        print("‚úÖ Cluster analysis complete")
        return self.cluster_stats

    def create_results_dataframe(self):
        print("üìã Creating results DataFrame...")
        df_data = {
            'comment_id': self.ids,
            'cluster_label': self.labels,
        }
        df_data['embedding_norm'] = np.linalg.norm(self.embeddings, axis=1)
        self.results_df = pd.DataFrame(df_data)
        cluster_sizes = self.results_df.groupby('cluster_label').size()
        self.results_df['cluster_size'] = self.results_df['cluster_label'].map(cluster_sizes)
        print(f"‚úÖ Results DataFrame created with {len(self.results_df):,} rows")
        return self.results_df

    def _try_load_combined_comments(self) -> Optional[Dict[str, str]]:
        """Try to load a single combined comments JSON to map id->text by position.

        Looks for files like 'all_comments.json', 'combined_comments.json', or a
        file with the same stem as the embeddings file + '_comments.json'.
        """
        comments_dir = self.base_dir / 'comments'
        if not comments_dir.exists():
            return None
        candidates = []
        # Common combined names
        candidates += list(comments_dir.glob('all*_comments*.json'))
        candidates += list(comments_dir.glob('combined*_comments*.json'))
        candidates += list(comments_dir.glob('all*_comments.json'))
        candidates += list(comments_dir.glob('*all*comments*.json'))
        # Fallback: any single large JSON
        json_files = sorted(list(comments_dir.glob('*.json')))
        if len(candidates) == 0 and len(json_files) == 1:
            candidates = json_files
        if not candidates:
            return None

        try:
            import json as _json
            with open(candidates[0], 'r', encoding='utf-8') as f:
                data = _json.load(f)
            mapping: Dict[str, str] = {}
            for idx, item in enumerate(data):
                if isinstance(item, dict) and 'comment' in item:
                    cid = str(item.get('id', idx))
                    mapping[cid] = str(item.get('comment', ''))
                elif isinstance(item, str):
                    mapping[str(idx)] = item
            return mapping if mapping else None
        except Exception:
            return None

    def enrich_results_with_comments(self) -> Optional[pd.DataFrame]:
        """Add a 'comment' column to results_df by mapping ids to text.

        Tries combined comments first, then per-file multi-source mapping.
        Saves an additional CSV: '<prefix>_results_with_text.csv'.
        """
        if self.results_df is None or self.ids is None:
            return None
        # Build mappings
        mapping_id = self._try_load_combined_comments()
        if mapping_id is None:
            mapping_id = self._build_id_to_text_map()
        mapping_index = self._build_combined_index_map()

        if mapping_id is None and mapping_index is None:
            print("‚ÑπÔ∏è  Could not enrich results with comment text (no mapping found).")
            return None

        # Prefer mapping via original_indices when available to handle dedup shifts
        if getattr(self, 'original_indices', None) is not None:
            # Build a series aligned with results_df: map row -> original index
            orig_index_series = pd.Series(self.original_indices)
            if len(orig_index_series) == len(self.results_df):
                # Try by original global index first
                if mapping_index is not None:
                    self.results_df['comment'] = orig_index_series.map(mapping_index)
                else:
                    self.results_df['comment'] = None
                # Fallback to id-based if some remain
                missing = self.results_df['comment'].isna()
                if missing.any() and mapping_id is not None:
                    self.results_df.loc[missing, 'comment'] = self.results_df.loc[missing, 'comment_id'].astype(str).map(mapping_id)
            else:
                # Length mismatch; fall back to id-based mapping
                if mapping_id is not None:
                    self.results_df['comment'] = self.results_df['comment_id'].astype(str).map(mapping_id)
                else:
                    self.results_df['comment'] = None
        else:
            # Fallback: map by stored ids
            if mapping_id is not None:
                self.results_df['comment'] = self.results_df['comment_id'].astype(str).map(mapping_id)
            else:
                self.results_df['comment'] = None
        return self.results_df

    def _build_id_to_text_map(self) -> Optional[Dict[str, str]]:
        """Build a global id->comment text map from base_dir/comments/*.json.

        - If items are objects with 'id' and 'comment', uses str(id) as key.
        - If items are plain strings, uses a composite key of "<file>:<idx>".

        Note: The embeddings npz stores ids from embedComments.py as either the
        provided 'id' or the per-file index. Only when 'id' was provided during
        embedding can we reliably match across multiple files.
        """
        comments_dir = self.base_dir / 'comments'
        if not comments_dir.exists():
            return None
        json_files = sorted(list(comments_dir.glob('*.json')))
        if not json_files:
            return None
        mapping: Dict[str, str] = {}
        import json as _json
        for jf in json_files:
            try:
                with open(jf, 'r', encoding='utf-8') as f:
                    data = _json.load(f)
                file_key = jf.stem
                for idx, item in enumerate(data):
                    if isinstance(item, dict) and 'comment' in item:
                        cid = str(item.get('id', idx))
                        mapping[cid] = str(item.get('comment', ''))
                    elif isinstance(item, str):
                        # Fallback composite key for plain-list files
                        mapping[f"{file_key}:{idx}"] = item
            except Exception:
                continue
        return mapping if mapping else None

    def _build_combined_index_map(self) -> Optional[Dict[int, str]]:
        """Create a global 0..N-1 index -> text map by concatenating
        all `comments/*.json` in sorted filename order. This mirrors a
        deterministic embedding order across files.
        """
        comments_dir = self.base_dir / 'comments'
        if not comments_dir.exists():
            return None
        json_files = sorted(list(comments_dir.glob('*.json')))
        if not json_files:
            return None
        import json as _json
        index_map: Dict[int, str] = {}
        global_idx = 0
        for jf in json_files:
            try:
                with open(jf, 'r', encoding='utf-8') as f:
                    data = _json.load(f)
                for item in data:
                    if isinstance(item, dict) and 'comment' in item:
                        index_map[global_idx] = str(item.get('comment', ''))
                    elif isinstance(item, str):
                        index_map[global_idx] = item
                    else:
                        index_map[global_idx] = ''
                    global_idx += 1
            except Exception:
                continue
        return index_map if index_map else None

    def sample_cluster_extremes(self, nearest_per_cluster: int = 100, farthest_per_cluster: int = 500,
                                output_prefix: str = 'kmeans'):
        """For each cluster, pick nearest and farthest comments to the centroid.

        Saves two files in results_dir:
          - {output_prefix}_cluster_extremes.json
          - {output_prefix}_cluster_extremes.csv
        """
        if self.embeddings is None or self.labels is None or self.cluster_centers is None:
            raise RuntimeError("Embeddings, labels, or centers missing. Run clustering first.")

        print("üìå Sampling nearest and farthest comments per cluster...")

        # Build text lookups
        id_to_text = self._build_id_to_text_map()
        index_to_text = self._build_combined_index_map()
        if id_to_text is None and index_to_text is None:
            print("‚ÑπÔ∏è  Comment text enrichment unavailable. Saving entries without text.")

        # Compute per-point squared distances to its assigned center efficiently
        centers_by_label = self.cluster_centers
        assigned_centers = centers_by_label[self.labels]
        diffs = self.embeddings - assigned_centers
        dists = np.einsum('ij,ij->i', diffs, diffs)  # squared Euclidean

        # Prepare outputs
        extremes_json = {
            'parameters': {
                'nearest_per_cluster': int(nearest_per_cluster),
                'farthest_per_cluster': int(farthest_per_cluster),
                'n_clusters': int(self.n_clusters)
            },
            'clusters': {}
        }
        csv_rows = []  # cluster_id, kind, rank, distance, comment(optional)

        for cluster_id in range(self.n_clusters):
            mask = (self.labels == cluster_id)
            idxs = np.nonzero(mask)[0]
            if idxs.size == 0:
                continue
            cluster_dists = dists[idxs]
            # Nearest
            nearest_k = min(nearest_per_cluster, idxs.size)
            nearest_order = np.argpartition(cluster_dists, nearest_k - 1)[:nearest_k]
            nearest_sorted = nearest_order[np.argsort(cluster_dists[nearest_order])]
            nearest_indices = idxs[nearest_sorted]
            # Farthest
            farthest_k = min(farthest_per_cluster, idxs.size)
            # Use argpartition for largest
            farthest_order = np.argpartition(-cluster_dists, farthest_k - 1)[:farthest_k]
            farthest_sorted = farthest_order[np.argsort(-cluster_dists[farthest_order])]
            farthest_indices = idxs[farthest_sorted]

            def build_entries(indices: np.ndarray, kind: str):
                entries = []
                for rank, global_idx in enumerate(indices, start=1):
                    dist = float(dists[global_idx])
                    # Resolve text: prefer original index ‚Üí text, then id ‚Üí text
                    text: Optional[str] = None
                    if getattr(self, 'original_indices', None) is not None and index_to_text is not None:
                        orig_idx = int(self.original_indices[global_idx])
                        text = index_to_text.get(orig_idx)
                    if text is None and id_to_text is not None and self.ids is not None:
                        key = str(self.ids[global_idx])
                        text = id_to_text.get(key)
                    entry = {
                        'distance': dist,
                        **({'comment': text} if text is not None else {})
                    }
                    entries.append(entry)
                    csv_rows.append({
                        'cluster_id': int(cluster_id),
                        'kind': kind,
                        'rank': int(rank),
                        'distance': dist,
                        **({'comment': text} if text is not None else {})
                    })
                return entries

            nearest_entries = build_entries(nearest_indices, 'nearest')
            farthest_entries = build_entries(farthest_indices, 'farthest')

            extremes_json['clusters'][str(cluster_id)] = {
                'size': int(idxs.size),
                'nearest': nearest_entries,
                'farthest': farthest_entries
            }

        # Save JSON
        json_path = self.results_dir / f"{output_prefix}_cluster_extremes.json"
        with open(json_path, 'w') as f:
            json.dump(extremes_json, f, indent=2)
        print(f"üíæ Saved extremes JSON to: {json_path}")

        # Save CSV
        csv_df = pd.DataFrame(csv_rows)
        csv_path = self.results_dir / f"{output_prefix}_cluster_extremes.csv"
        csv_df.to_csv(csv_path, index=False)
        print(f"üíæ Saved extremes CSV to: {csv_path}")

    def visualize_cluster_results(self, save_plots: bool = True):
        """Draw a two-panel overview figure: points per cluster and embedding norms.

        The left panel is a bar chart of cluster sizes; the right panel is a
        50-bin histogram of the L2 norm of every embedding. With
        ``save_plots`` the figure is written to
        ``results_dir/kmeans_analysis.png`` before being shown.
        """
        print("üìà Creating cluster visualizations...")
        cluster_ids, sizes = np.unique(self.labels, return_counts=True)

        fig, (sizes_ax, norms_ax) = plt.subplots(1, 2, figsize=(18, 8))

        # 1. Cluster sizes bar chart
        sizes_ax.bar(cluster_ids, sizes, alpha=0.8)
        sizes_ax.set_xlabel('Cluster Label')
        sizes_ax.set_ylabel('Count')
        sizes_ax.set_title('Points per Cluster')
        sizes_ax.grid(True, alpha=0.3)

        # 2. Embedding norm distribution
        row_norms = np.linalg.norm(self.embeddings, axis=1)
        norms_ax.hist(row_norms, bins=50, alpha=0.7, edgecolor='black')
        norms_ax.set_xlabel('Embedding Norm')
        norms_ax.set_ylabel('Frequency')
        norms_ax.set_title('Distribution of Embedding Norms')
        norms_ax.grid(True, alpha=0.3)

        fig.tight_layout()
        if save_plots:
            plot_path = self.results_dir / 'kmeans_analysis.png'
            fig.savefig(plot_path, dpi=200, bbox_inches='tight')
            print(f"üìä K-Means analysis plot saved to: {plot_path}")
        plt.show()

    def run_umap_visualization(self, n_neighbors: int = 15, min_dist: float = 0.1,
                               n_components: int = 2, sample_size: int = 10000):
        print("üé® Creating UMAP visualization...")
        try:
            import umap
            if len(self.embeddings) > sample_size:
                print(f"üìä Sampling {sample_size:,} points for visualization...")
                indices = np.random.choice(len(self.embeddings), size=sample_size, replace=False)
                sample_embeddings = self.embeddings[indices]
                sample_labels = self.labels[indices]
            else:
                sample_embeddings = self.embeddings
                sample_labels = self.labels

            reducer = umap.UMAP(
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                n_components=n_components,
                metric='cosine',
                random_state=42
            )
            embedding_2d = reducer.fit_transform(sample_embeddings)

            plt.figure(figsize=(12, 8))
            scatter = plt.scatter(embedding_2d[:, 0], embedding_2d[:, 1],
                                  c=sample_labels, cmap='tab10', alpha=0.7, s=3)
            plt.colorbar(scatter, label='Cluster Label')
            plt.title(f'UMAP of K-Means Clusters ({len(sample_embeddings):,} points, {self.n_clusters} clusters)')
            plt.xlabel('UMAP-1')
            plt.ylabel('UMAP-2')
            umap_path = self.results_dir / 'umap_kmeans_clusters.png'
            plt.savefig(umap_path, dpi=200, bbox_inches='tight')
            print(f"üé® UMAP visualization saved to: {umap_path}")
            plt.show()
            return embedding_2d
        except ImportError:
            print("‚ùå UMAP not available. Skipping UMAP visualization.")
            return None

    def save_clustering_results(self, filename_prefix: str = "kmeans"):
        print("üíæ Saving clustering results...")
        results_path = self.results_dir / f"{filename_prefix}_results.csv"
        self.results_df.to_csv(results_path, index=False)
        print(f"üìÑ Results DataFrame saved to: {results_path}")

        def convert(obj):
            if isinstance(obj, dict):
                return {k: convert(v) for k, v in obj.items()}
            if isinstance(obj, list):
                return [convert(v) for v in obj]
            if isinstance(obj, (np.bool_, bool)):
                return bool(obj)
            if isinstance(obj, (np.integer, np.floating)):
                return float(obj) if isinstance(obj, np.floating) else int(obj)
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            return obj

        stats_path = self.results_dir / f"{filename_prefix}_statistics.json"
        with open(stats_path, 'w') as f:
            json.dump(convert(self.cluster_stats), f, indent=2)
        print(f"üìä Clustering statistics saved to: {stats_path}")

        summary = {
            'clustering_timestamp': pd.Timestamp.now().isoformat(),
            'dataset_info': {
                'total_embeddings': int(len(self.embeddings)),
                'embedding_dimension': int(self.embeddings.shape[1])
            },
            'kmeans_parameters': {
                'n_clusters': int(self.n_clusters),
                'backend': self.backend,
            },
            'clustering_results': convert(self.cluster_stats),
            'files_created': {
                'results_csv': str(results_path),
                'statistics_json': str(stats_path),
                'visualizations': [
                    str(self.results_dir / 'kmeans_analysis.png'),
                    str(self.results_dir / 'umap_kmeans_clusters.png')
                ]
            }
        }
        summary_path = self.results_dir / f"{filename_prefix}_summary.json"
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)
        print(f"üìã Comprehensive summary saved to: {summary_path}")
        return summary

    def elbow_method(self, k_range: Tuple[int, int] = (2, 20), step: int = 1):
        """Run K-Means for a range of k values and plot inertia against k.

        Every k from ``k_range[0]`` to ``k_range[1]`` (inclusive, in
        increments of ``step``) is clustered with ``run_kmeans_clustering``,
        which overwrites the labels/centres/inertia stored on the instance.
        The curve is saved to ``results_dir/elbow_method_plot.png`` and shown.

        Returns:
            Dictionary mapping each k to its inertia (NaN when unavailable).
        """
        print("üîç Running Elbow Method to determine optimal k...")
        candidate_ks = range(k_range[0], k_range[1] + 1, step)
        inertia_by_k: Dict[int, float] = {}
        for k in tqdm(candidate_ks):
            self.run_kmeans_clustering(n_clusters=k)
            inertia_by_k[k] = np.nan if self.inertia is None else float(self.inertia)
            # Release whatever the previous fit left behind before the next one
            gc.collect()

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(list(inertia_by_k), list(inertia_by_k.values()), marker='o')
        ax.set_xlabel('Number of Clusters (k)')
        ax.set_ylabel('Inertia')
        ax.set_title('Elbow Method for Optimal k')
        ax.grid(True)
        fig.tight_layout()
        elbow_plot_path = self.results_dir / 'elbow_method_plot.png'
        fig.savefig(elbow_plot_path, dpi=200, bbox_inches='tight')
        print(f"üìä Elbow Method plot saved to: {elbow_plot_path}")
        plt.show()
        return inertia_by_k


def main(
    embeddings_filename: Optional[str] = None,
    n_clusters: int = 10,
    run_elbow: bool = False,
    elbow_range: Tuple[int, int] = (2, 15)
):
    """Run the Phase 2 pipeline end to end: load, cluster, analyse, plot, save.

    Args:
        embeddings_filename: Name of the ``.npz`` file inside the embeddings
            directory; ``None`` lets the pipeline pick one automatically.
        n_clusters: Number of clusters for the final K-Means run.
        run_elbow: When true, run the elbow sweep before the final clustering.
        elbow_range: Inclusive ``(min_k, max_k)`` range for the elbow sweep.

    Path resolution and pipeline construction run unguarded. Every later step
    is wrapped: an exception is printed with its traceback and not re-raised.
    Exporting cluster extremes and enriching results with comment text are
    additionally best-effort and only print a warning on failure.
    """
    divider = "=" * 60
    print("üöÄ PHASE 2: K-MEANS CLUSTERING PIPELINE (Colab)")
    print(divider)

    base_dir, embeddings_dir, results_dir = resolve_paths()
    pipeline = KMeansClusteringPipeline(base_dir, embeddings_dir, results_dir)
    fname_prefix = f"kmeans_k{n_clusters}"

    try:
        input_path = pipeline._auto_pick_input(embeddings_filename)
        print(f"üìÇ Using embeddings file: {input_path}")
        pipeline.load_deduplicated_embeddings(input_path)

        if run_elbow:
            print("\nüîç Determining optimal k using Elbow Method...")
            pipeline.elbow_method(k_range=elbow_range, step=1)

        print(f"\n‚ö° Running K-Means clustering with k={n_clusters}...")
        pipeline.run_kmeans_clustering(
            n_clusters=n_clusters, n_init=10, max_iter=300, random_state=42
        )

        print("\nüìä Analyzing cluster statistics...")
        pipeline.analyze_cluster_statistics()

        print("\nüìã Creating results DataFrame...")
        pipeline.create_results_dataframe()

        print("\nüìà Creating visualizations...")
        pipeline.visualize_cluster_results()

        print("\nüé® Creating UMAP visualization (optional)...")
        pipeline.run_umap_visualization(sample_size=10000)

        print("\nüíæ Saving results...")
        summary = pipeline.save_clustering_results(filename_prefix=fname_prefix)

        # Best-effort extra: nearest/farthest comments per cluster
        try:
            pipeline.sample_cluster_extremes(
                nearest_per_cluster=100,
                farthest_per_cluster=500,
                output_prefix=fname_prefix
            )
        except Exception as e:
            print(f"‚ö†Ô∏è  Sampling extremes failed: {e}")

        # Best-effort extra: second CSV that also carries the comment text
        try:
            enriched = pipeline.enrich_results_with_comments()
            if enriched is not None:
                enriched_path = results_dir / f"{fname_prefix}_results_with_text.csv"
                enriched.to_csv(enriched_path, index=False)
                print(f"üìÑ Results with text saved to: {enriched_path}")
        except Exception as e:
            print(f"‚ö†Ô∏è  Comment enrichment failed: {e}")

        total_embeddings = summary['dataset_info']['total_embeddings']
        print("\n" + divider)
        print("üéâ CLUSTERING COMPLETE!")
        print(divider)
        print(f"üìä Dataset: {total_embeddings:,} embeddings")
        print(f"üéØ Clusters created: {pipeline.n_clusters}")
        print(f"üìÅ Results saved to: {results_dir}")

    except Exception as e:
        print(f"‚ùå Error during clustering: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Defaults are sensible for Colab Free. You can change n_clusters after elbow inspection.
    # With run_elbow=True, main() first sweeps k over elbow_range (both ends
    # inclusive) and then still runs the final clustering with n_clusters.
    main(
        embeddings_filename=None,  # e.g., "Lose_Yourself_Eminem_embeddings_deduplicated.npz"
        n_clusters=10,
        run_elbow=False,
        elbow_range=(2, 15)
    )




In [None]:
import os
import re
import json
import difflib
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans


"""
Colab-ready script to:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Run KMeans into 10 clusters
3) For each cluster, extract:
   - 100 closest comments to the centroid
   - 500 farthest comments from the centroid
4) Save results as JSON files next to the .npz

Notes:
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory, then in the known comments directories (exact filename first,
  then a fuzzy filename match). If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings or a list of objects with
  fields {'id': ..., 'comment': ...}. Objects are matched to the .npz ids by 'id';
  a list of strings is matched by position (each id is treated as a list index).

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the values below as needed) ====
# Embeddings archive to cluster; main() passes this to resolve_npz_path().
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/Lose_Yourself_Eminem_embeddings.npz"

# If None, the script will try to infer the JSON path from the .npz filename.
COMMENTS_JSON_PATH: Optional[str] = None

# Number of clusters and retrieval sizes
NUM_CLUSTERS = 10  # number of KMeans clusters
NUM_CLOSEST = 100  # comments nearest to each centroid to export
NUM_FARTHEST = 500  # comments farthest from each centroid to export

def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the embedding matrix and the matching ids from an .npz archive.

    Args:
        npz_path: Path to an .npz file holding 'embeddings' and 'ids' arrays.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta["keys"]`` lists every array
        name stored in the archive.

    Raises:
        KeyError: If the archive lacks 'embeddings' or 'ids'.
    """
    # SECURITY NOTE: allow_pickle=True is needed for object-dtype arrays, but
    # unpickling can execute arbitrary code. Only open archives you trust.
    # The context manager closes the archive's file handle, which a bare
    # np.load() on an .npz otherwise leaves open.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        # Indexing reads each array fully into memory, so both stay valid after close.
        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings .npz.

    Lookup order:
    1) ``<base>.json`` next to the .npz, where ``<base>`` is the .npz stem
       without a trailing "_embeddings"
    2) ``<base>.json`` inside each known comments directory
    3) Best fuzzy filename match across the known comments directories
       (accepted only when its score is at least 0.5)

    Returns the chosen path as a string, or None when nothing qualifies.
    """
    npz_file = Path(npz_path)
    suffix = "_embeddings"
    base = npz_file.stem
    # Expected naming: XYZ_embeddings.npz -> XYZ.json
    if base.endswith(suffix):
        base = base[:-len(suffix)]
    json_name = f"{base}.json"

    # 1) Sibling of the .npz
    sibling = npz_file.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in the known comments directories
    _maybe_mount_drive()
    search_dirs = _default_comment_dirs()
    for folder in search_dirs:
        exact = Path(folder) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match over every JSON in those directories
    wanted = _normalize_name(base)
    json_files: List[str] = [
        found
        for folder in search_dirs
        for found in sorted(glob(os.path.join(folder, "*.json")))
    ]
    if not json_files:
        return None

    def similarity(path: str) -> float:
        # Containment of the normalized target counts as a perfect match.
        name = _normalize_name(Path(path).stem)
        if wanted and wanted in name:
            return 1.0
        return difflib.SequenceMatcher(None, wanted, name).ratio()

    ranked: List[Tuple[float, str]] = [(similarity(p), p) for p in json_files]
    # max() keeps the first of equally scored candidates, like a stable descending sort.
    top_score, top_path = max(ranked, key=lambda pair: pair[0])
    return top_path if top_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path, a filename, or a song stem.

    Strategy:
    1) If the path exists as given, return it. This is retried once after
       attempting to mount Drive, because the given path may live on Drive.
    2) If it ends with ".npz", look for that filename in each directory returned
       by _default_embed_dirs().
    3) Fuzzy-match the stem against every .npz in those directories: a candidate
       whose normalized name contains the normalized target scores 1.0,
       otherwise the difflib similarity ratio is used.

    Returns:
        The path of the chosen .npz file.

    Raises:
        FileNotFoundError: If no .npz exists in any searched directory, or the
            best fuzzy score is below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # Every remaining step may need Drive, so mount it first and retry the path.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())

    # 2) Try as a filename inside the known embeddings directories.
    #    (This step used to reference an undefined DEFAULT_EMBED_DIR and crashed
    #    with NameError for every path that did not already exist.)
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across the known directories
    norm_target = _normalize_name(Path(path_or_stem).stem)

    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    # Score candidates
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # Stable sort: among equal scores the earliest directory/filename wins.
    scored.sort(key=lambda x: x[0], reverse=True)
    best_score, best_path = scored[0]
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for every entry of ``ids``, in the same order.

    Supported JSON layouts:
    - list of objects with 'comment' (and 'id'): matched to ``ids`` by 'id'.
      An id that has no exact match is retried by its string form, so integer
      ids in the .npz still match string ids in the JSON (and vice versa).
    - list of strings: each id is treated as a 0-based index into the list.

    Ids that cannot be matched yield "".

    Args:
        ids: Ids aligned with the embeddings rows.
        comments_json_path: Path to the comments JSON; None raises.

    Raises:
        FileNotFoundError: If ``comments_json_path`` is None.
        ValueError: If the JSON is empty, not a list, has an unsupported item
            type, or is a list of objects without any usable 'id'.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or len(data) == 0:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]

    # Build mapping from id -> comment (object layout only)
    comments_by_id: Dict[Any, str] = {}
    if isinstance(first, dict) and 'comment' in first:
        for item in data:
            cid = item.get('id')
            # Items without an id cannot be matched reliably, so they are skipped.
            if cid is not None:
                comments_by_id[cid] = item.get('comment', "")
    elif not isinstance(first, str):
        raise ValueError("Unsupported JSON format for comments.")

    resolved_comments: List[str] = []
    if comments_by_id:
        # Secondary index keyed by str(id): rescues int-vs-string id mismatches
        # that would otherwise silently produce an empty comment for every row.
        comments_by_text_id = {str(key): text for key, text in comments_by_id.items()}
        for cid in ids:
            if cid in comments_by_id:
                resolved_comments.append(comments_by_id[cid])
            else:
                resolved_comments.append(comments_by_text_id.get(str(cid), ""))
        return resolved_comments

    # Positional fallback: assume ids are 0..N-1 indices
    if not isinstance(first, str):
        raise ValueError("Cannot map ids to comments: ids present but JSON lacks 'id' fields; and data is not a list of strings for positional mapping.")
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            resolved_comments.append("")
            continue
        # Reject negative positions explicitly: data[-1] would wrap to the last comment.
        resolved_comments.append(data[position] if 0 <= position < len(data) else "")

    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Fit KMeans on ``embeddings`` and return ``(labels, centers, fitted_model)``.

    The model is created with ``n_init=10`` and the given ``random_state``.
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    assignments = model.fit_predict(embeddings)
    return assignments, model.cluster_centers_, model


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its centroid.

    Returns ``(members, closest, farthest)`` as arrays of row indices into
    ``embeddings``: ``closest`` is ordered by increasing Euclidean distance to
    the centroid, ``farthest`` by decreasing distance. Each list is capped at
    the cluster size. An empty cluster yields three empty arrays.
    """
    members = np.nonzero(labels == cluster_index)[0]
    if members.size == 0:
        nothing_near = np.array([], dtype=int)
        nothing_far = np.array([], dtype=int)
        return members, nothing_near, nothing_far

    offsets = embeddings[members] - centers[cluster_index]
    by_distance = np.argsort(np.linalg.norm(offsets, axis=1))

    # Slicing past the end simply returns everything, so no explicit cap is needed.
    closest = members[by_distance[:num_closest]]
    farthest = members[by_distance[::-1][:num_farthest]]
    return members, closest, farthest


def main():
    """Cluster the configured embeddings and export nearest/farthest comments.

    Reads EMBED_NPZ_PATH, COMMENTS_JSON_PATH, NUM_CLUSTERS, NUM_CLOSEST and
    NUM_FARTHEST. Writes two JSON files per cluster plus one summary JSON into
    the directory that holds the .npz, then prints the output paths.
    """
    # Locate and load the inputs (exact path, filename, or fuzzy stem).
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    embeddings, ids, _ = load_npz_embeddings(npz_path)

    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)

    # Run KMeans
    labels, centers, _ = compute_cluster_members(
        embeddings, num_clusters=NUM_CLUSTERS, random_state=42
    )

    npz_file = Path(npz_path)
    out_dir = str(npz_file.parent)
    base_name = npz_file.stem.replace("_embeddings", "")

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        # One record per row index: position, original id, comment text.
        return [
            {
                "index": int(i),
                "id": ids[i].item() if hasattr(ids[i], 'item') else ids[i],
                "comment": comments[i],
            }
            for i in indices.tolist()
        ]

    def dump_json(path: str, payload: Any) -> None:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    per_cluster_counts: Dict[str, Any] = {}
    outputs: List[Dict[str, Any]] = []
    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points": int(embeddings.shape[0]),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(NUM_CLUSTERS),
        "per_cluster_counts": per_cluster_counts,
        "outputs": outputs,
    }

    for k in range(NUM_CLUSTERS):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )
        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # File names carry the number of records actually saved.
        closest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )
        dump_json(closest_path, closest_payload)
        dump_json(farthest_path, farthest_payload)

        per_cluster_counts[str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        outputs.append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    # Write summary
    summary_path = os.path.join(out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_summary.json")
    dump_json(summary_path, summary)

    print("\nClustering complete.")
    print(f"Summary written to: {summary_path}")
    for entry in outputs:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Run the export only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()




In [None]:
import os
import re
import json
import difflib
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans


"""
Colab-ready script to:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Run KMeans into 10 clusters
3) For each cluster, extract:
   - 100 closest comments to the centroid
   - 500 farthest comments from the centroid
4) Save results as JSON files in the project's "sample comments form cluster" folder
   (created next to the folder that holds the .npz)

Notes:
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory, then in the known comments directories (exact filename first,
  then a fuzzy filename match). If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings or a list of objects with
  fields {'id': ..., 'comment': ...}. Objects are matched to the .npz ids by 'id';
  a list of strings is matched by position (each id is treated as a list index).

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the values below as needed) ====
# Embeddings archive to cluster; main() passes this to resolve_npz_path().
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/7_Years_Lukas_Graham_embeddings.npz"

# If None, the script will try to infer the JSON path from the .npz filename.
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/7_Years_Lukas_Graham.json"

# Number of clusters and retrieval sizes
NUM_CLUSTERS = 10  # number of KMeans clusters
NUM_CLOSEST = 100  # comments nearest to each centroid to export
NUM_FARTHEST = 500  # comments farthest from each centroid to export

def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the embedding matrix and the matching ids from an .npz archive.

    Args:
        npz_path: Path to an .npz file holding 'embeddings' and 'ids' arrays.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta["keys"]`` lists every array
        name stored in the archive.

    Raises:
        KeyError: If the archive lacks 'embeddings' or 'ids'.
    """
    # SECURITY NOTE: allow_pickle=True is needed for object-dtype arrays, but
    # unpickling can execute arbitrary code. Only open archives you trust.
    # The context manager closes the archive's file handle, which a bare
    # np.load() on an .npz otherwise leaves open.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        # Indexing reads each array fully into memory, so both stay valid after close.
        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings .npz.

    Lookup order:
    1) ``<base>.json`` next to the .npz, where ``<base>`` is the .npz stem
       without a trailing "_embeddings"
    2) ``<base>.json`` inside each known comments directory
    3) Best fuzzy filename match across the known comments directories
       (accepted only when its score is at least 0.5)

    Returns the chosen path as a string, or None when nothing qualifies.
    """
    npz_file = Path(npz_path)
    suffix = "_embeddings"
    base = npz_file.stem
    # Expected naming: XYZ_embeddings.npz -> XYZ.json
    if base.endswith(suffix):
        base = base[:-len(suffix)]
    json_name = f"{base}.json"

    # 1) Sibling of the .npz
    sibling = npz_file.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in the known comments directories
    _maybe_mount_drive()
    search_dirs = _default_comment_dirs()
    for folder in search_dirs:
        exact = Path(folder) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match over every JSON in those directories
    wanted = _normalize_name(base)
    json_files: List[str] = [
        found
        for folder in search_dirs
        for found in sorted(glob(os.path.join(folder, "*.json")))
    ]
    if not json_files:
        return None

    def similarity(path: str) -> float:
        # Containment of the normalized target counts as a perfect match.
        name = _normalize_name(Path(path).stem)
        if wanted and wanted in name:
            return 1.0
        return difflib.SequenceMatcher(None, wanted, name).ratio()

    ranked: List[Tuple[float, str]] = [(similarity(p), p) for p in json_files]
    # max() keeps the first of equally scored candidates, like a stable descending sort.
    top_score, top_path = max(ranked, key=lambda pair: pair[0])
    return top_path if top_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path, a filename, or a song stem.

    Strategy:
    1) If the path exists as given, return it. This is retried once after
       attempting to mount Drive, because the given path may live on Drive.
    2) If it ends with ".npz", look for that filename in each directory returned
       by _default_embed_dirs().
    3) Fuzzy-match the stem against every .npz in those directories: a candidate
       whose normalized name contains the normalized target scores 1.0,
       otherwise the difflib similarity ratio is used.

    Returns:
        The path of the chosen .npz file.

    Raises:
        FileNotFoundError: If no .npz exists in any searched directory, or the
            best fuzzy score is below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # Every remaining step may need Drive, so mount it first and retry the path.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())

    # 2) Try as a filename inside the known embeddings directories.
    #    (This step used to reference an undefined DEFAULT_EMBED_DIR and crashed
    #    with NameError for every path that did not already exist.)
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across the known directories
    norm_target = _normalize_name(Path(path_or_stem).stem)

    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    # Score candidates
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # Stable sort: among equal scores the earliest directory/filename wins.
    scored.sort(key=lambda x: x[0], reverse=True)
    best_score, best_path = scored[0]
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for every entry of ``ids``, in the same order.

    Supported JSON layouts:
    - list of objects with 'comment' (and 'id'): matched to ``ids`` by 'id'.
      An id that has no exact match is retried by its string form, so integer
      ids in the .npz still match string ids in the JSON (and vice versa).
    - list of strings: each id is treated as a 0-based index into the list.

    Ids that cannot be matched yield "".

    Args:
        ids: Ids aligned with the embeddings rows.
        comments_json_path: Path to the comments JSON; None raises.

    Raises:
        FileNotFoundError: If ``comments_json_path`` is None.
        ValueError: If the JSON is empty, not a list, has an unsupported item
            type, or is a list of objects without any usable 'id'.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or len(data) == 0:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]

    # Build mapping from id -> comment (object layout only)
    comments_by_id: Dict[Any, str] = {}
    if isinstance(first, dict) and 'comment' in first:
        for item in data:
            cid = item.get('id')
            # Items without an id cannot be matched reliably, so they are skipped.
            if cid is not None:
                comments_by_id[cid] = item.get('comment', "")
    elif not isinstance(first, str):
        raise ValueError("Unsupported JSON format for comments.")

    resolved_comments: List[str] = []
    if comments_by_id:
        # Secondary index keyed by str(id): rescues int-vs-string id mismatches
        # that would otherwise silently produce an empty comment for every row.
        comments_by_text_id = {str(key): text for key, text in comments_by_id.items()}
        for cid in ids:
            if cid in comments_by_id:
                resolved_comments.append(comments_by_id[cid])
            else:
                resolved_comments.append(comments_by_text_id.get(str(cid), ""))
        return resolved_comments

    # Positional fallback: assume ids are 0..N-1 indices
    if not isinstance(first, str):
        raise ValueError("Cannot map ids to comments: ids present but JSON lacks 'id' fields; and data is not a list of strings for positional mapping.")
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            resolved_comments.append("")
            continue
        # Reject negative positions explicitly: data[-1] would wrap to the last comment.
        resolved_comments.append(data[position] if 0 <= position < len(data) else "")

    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Fit KMeans on ``embeddings`` and return ``(labels, centers, fitted_model)``.

    The model is created with ``n_init=10`` and the given ``random_state``.
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    assignments = model.fit_predict(embeddings)
    return assignments, model.cluster_centers_, model


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its centroid.

    Returns ``(members, closest, farthest)`` as arrays of row indices into
    ``embeddings``: ``closest`` is ordered by increasing Euclidean distance to
    the centroid, ``farthest`` by decreasing distance. Each list is capped at
    the cluster size. An empty cluster yields three empty arrays.
    """
    members = np.nonzero(labels == cluster_index)[0]
    if members.size == 0:
        nothing_near = np.array([], dtype=int)
        nothing_far = np.array([], dtype=int)
        return members, nothing_near, nothing_far

    offsets = embeddings[members] - centers[cluster_index]
    by_distance = np.argsort(np.linalg.norm(offsets, axis=1))

    # Slicing past the end simply returns everything, so no explicit cap is needed.
    closest = members[by_distance[:num_closest]]
    farthest = members[by_distance[::-1][:num_farthest]]
    return members, closest, farthest


def main():
    """Cluster the configured embeddings and export nearest/farthest comments.

    Reads EMBED_NPZ_PATH, COMMENTS_JSON_PATH, NUM_CLUSTERS, NUM_CLOSEST and
    NUM_FARTHEST. Writes two JSON files per cluster plus one summary JSON into
    the "sample comments form cluster" folder beside the embeddings folder
    (created if missing), then prints the output paths.
    """
    # Locate and load the inputs (exact path, filename, or fuzzy stem).
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    embeddings, ids, _ = load_npz_embeddings(npz_path)

    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)

    # Run KMeans
    labels, centers, _ = compute_cluster_members(
        embeddings, num_clusters=NUM_CLUSTERS, random_state=42
    )

    # Save under youtube_embeddings_project/sample comments form cluster
    npz_file = Path(npz_path)
    out_dir_path = npz_file.parent.parent / "sample comments form cluster"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = npz_file.stem.replace("_embeddings", "")

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        # One record per row index: position, original id, comment text.
        return [
            {
                "index": int(i),
                "id": ids[i].item() if hasattr(ids[i], 'item') else ids[i],
                "comment": comments[i],
            }
            for i in indices.tolist()
        ]

    def dump_json(path: str, payload: Any) -> None:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    per_cluster_counts: Dict[str, Any] = {}
    outputs: List[Dict[str, Any]] = []
    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points": int(embeddings.shape[0]),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(NUM_CLUSTERS),
        "per_cluster_counts": per_cluster_counts,
        "outputs": outputs,
    }

    for k in range(NUM_CLUSTERS):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )
        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # File names carry the number of records actually saved.
        closest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )
        dump_json(closest_path, closest_payload)
        dump_json(farthest_path, farthest_payload)

        per_cluster_counts[str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        outputs.append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    # Write summary
    summary_path = os.path.join(out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_summary.json")
    dump_json(summary_path, summary)

    print("\nClustering complete.")
    print(f"Summary written to: {summary_path}")
    for entry in outputs:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Run the export only when this cell/script is executed directly, not on import.
if __name__ == "__main__":
    main()




In [None]:
import os
import re
import json
import difflib
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans


"""
Colab-ready script to:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Run KMeans into NUM_CLUSTERS clusters (3 as configured below)
3) For each cluster, extract:
   - the NUM_CLOSEST (1000) comments closest to the centroid
   - the NUM_FARTHEST (1000) comments farthest from the centroid
4) Save results as JSON files next to the .npz

Notes:
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory, then in the known comments directories (exact filename first,
  then a fuzzy filename match). If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings or a list of objects with
  fields {'id': ..., 'comment': ...}. Objects are matched to the .npz ids by 'id';
  a list of strings is matched by position (each id is treated as a list index).

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the values below as needed) ====
# Embeddings archive (.npz) to cluster.
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/Without_Me_Eminem_embeddings.npz"

# If None, the script will try to infer the JSON path from the .npz filename.
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/Without_Me_Eminem.json"

# Number of clusters and retrieval sizes
NUM_CLUSTERS = 3
NUM_CLOSEST = 1000
NUM_FARTHEST = 1000

def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the embedding matrix and the matching ids from an .npz archive.

    Args:
        npz_path: Path to an .npz file holding 'embeddings' and 'ids' arrays.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta["keys"]`` lists every array
        name stored in the archive.

    Raises:
        KeyError: If the archive lacks 'embeddings' or 'ids'.
    """
    # SECURITY NOTE: allow_pickle=True is needed for object-dtype arrays, but
    # unpickling can execute arbitrary code. Only open archives you trust.
    # The context manager closes the archive's file handle, which a bare
    # np.load() on an .npz otherwise leaves open.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        # Indexing reads each array fully into memory, so both stay valid after close.
        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings .npz.

    Lookup order:
    1) ``<base>.json`` next to the .npz, where ``<base>`` is the .npz stem
       without a trailing "_embeddings"
    2) ``<base>.json`` inside each known comments directory
    3) Best fuzzy filename match across the known comments directories
       (accepted only when its score is at least 0.5)

    Returns the chosen path as a string, or None when nothing qualifies.
    """
    npz_file = Path(npz_path)
    suffix = "_embeddings"
    base = npz_file.stem
    # Expected naming: XYZ_embeddings.npz -> XYZ.json
    if base.endswith(suffix):
        base = base[:-len(suffix)]
    json_name = f"{base}.json"

    # 1) Sibling of the .npz
    sibling = npz_file.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in the known comments directories
    _maybe_mount_drive()
    search_dirs = _default_comment_dirs()
    for folder in search_dirs:
        exact = Path(folder) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match over every JSON in those directories
    wanted = _normalize_name(base)
    json_files: List[str] = [
        found
        for folder in search_dirs
        for found in sorted(glob(os.path.join(folder, "*.json")))
    ]
    if not json_files:
        return None

    def similarity(path: str) -> float:
        # Containment of the normalized target counts as a perfect match.
        name = _normalize_name(Path(path).stem)
        if wanted and wanted in name:
            return 1.0
        return difflib.SequenceMatcher(None, wanted, name).ratio()

    ranked: List[Tuple[float, str]] = [(similarity(p), p) for p in json_files]
    # max() keeps the first of equally scored candidates, like a stable descending sort.
    top_score, top_path = max(ranked, key=lambda pair: pair[0])
    return top_path if top_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path, a filename, or a song stem.

    Strategy:
    1) If the path exists as given, return it. This is retried once after
       attempting to mount Drive, because the given path may live on Drive.
    2) If it ends with ".npz", look for that filename in each directory returned
       by _default_embed_dirs().
    3) Fuzzy-match the stem against every .npz in those directories: a candidate
       whose normalized name contains the normalized target scores 1.0,
       otherwise the difflib similarity ratio is used.

    Returns:
        The path of the chosen .npz file.

    Raises:
        FileNotFoundError: If no .npz exists in any searched directory, or the
            best fuzzy score is below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # Every remaining step may need Drive, so mount it first and retry the path.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())

    # 2) Try as a filename inside the known embeddings directories.
    #    (This step used to reference an undefined DEFAULT_EMBED_DIR and crashed
    #    with NameError for every path that did not already exist.)
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across the known directories
    norm_target = _normalize_name(Path(path_or_stem).stem)

    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    # Score candidates
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # Stable sort: among equal scores the earliest directory/filename wins.
    scored.sort(key=lambda x: x[0], reverse=True)
    best_score, best_path = scored[0]
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for each entry of *ids*, in the same order.

    The JSON file must contain a non-empty list whose shape is judged from its
    first element:
    - objects with a 'comment' field: each id is looked up through the objects'
      'id' field (objects without an 'id' are ignored);
    - plain strings: each id is treated as a 0-based position in the list.

    Ids that cannot be matched (unknown id, non-integer or out-of-range
    position) yield an empty string, so the result always has one entry per id.

    Args:
        ids: Ids aligned with the embedding rows.
        comments_json_path: Path to the source comments JSON.

    Returns:
        One comment string per id.

    Raises:
        FileNotFoundError: If comments_json_path is None.
        ValueError: If the JSON is empty, not a list, of an unsupported shape,
            or a list of objects none of which carries an 'id'.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not data:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]
    if isinstance(first, dict) and 'comment' in first:
        # Build mapping from id -> comment; later duplicates overwrite earlier ones.
        comments_by_id: Dict[Any, str] = {}
        for item in data:
            cid = item.get('id')
            if cid is not None:
                comments_by_id[cid] = item.get('comment', "")
        if not comments_by_id:
            raise ValueError("Cannot map ids to comments: ids present but JSON lacks 'id' fields; and data is not a list of strings for positional mapping.")
        return [comments_by_id.get(cid, "") for cid in ids]

    if not isinstance(first, str):
        raise ValueError("Unsupported JSON format for comments.")

    # Positional fallback: ids are expected to be 0..N-1 indices.
    resolved_comments: List[str] = []
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            position = -1
        # Explicit bounds check: a negative index must not wrap around to the
        # end of the list and return an unrelated comment.
        resolved_comments.append(data[position] if 0 <= position < len(data) else "")
    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Fit KMeans on *embeddings* and return ``(labels, centers, model)``.

    The model is created with ``n_init=10`` and the given ``random_state``.
    ``labels`` is the per-row assignment returned by ``fit_predict`` and
    ``centers`` is the fitted model's ``cluster_centers_``.
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    assignments = model.fit_predict(embeddings)
    return assignments, model.cluster_centers_, model


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its center.

    Distances are Euclidean norms between each member row of *embeddings* and
    ``centers[cluster_index]``.

    Returns:
        ``(members, closest, farthest)`` as arrays of row indices into
        *embeddings*: all members of the cluster, up to *num_closest* of them
        in increasing distance, and up to *num_farthest* in decreasing
        distance. An empty cluster yields three empty arrays.
    """
    members = np.nonzero(labels == cluster_index)[0]
    if members.size == 0:
        return members, np.empty(0, dtype=int), np.empty(0, dtype=int)

    offsets = embeddings[members] - centers[cluster_index]
    ranking = np.argsort(np.linalg.norm(offsets, axis=1))

    n_near = min(num_closest, members.size)
    n_far = min(num_farthest, members.size)
    closest = members[ranking[:n_near]]
    # Walk the same ranking backwards to get the farthest members first.
    farthest = members[ranking[::-1][:n_far]]
    return members, closest, farthest


def main():
    """Cluster deduplicated comment embeddings and export per-cluster samples.

    Steps:
    1) Resolve and load the embeddings .npz and the matching comments JSON.
    2) Drop duplicate embedding rows (first occurrence kept), keeping ids and
       comments aligned with the surviving rows.
    3) Run KMeans with NUM_CLUSTERS clusters.
    4) For every cluster, write up to NUM_CLOSEST nearest and NUM_FARTHEST
       farthest comments (relative to the centroid) to JSON files.
    5) Write a summary JSON and print where everything was saved.
    """
    # Accepts an exact path, a bare filename, or a loose song name.
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    embeddings, ids, _meta = load_npz_embeddings(npz_path)

    # An explicit COMMENTS_JSON_PATH wins; otherwise infer it from the .npz name.
    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)

    # Row-wise deduplication through a void view of the embedding matrix.
    # Sorting the returned indices restores the original row order.
    _, first_seen = np.unique(embeddings.view(np.void), axis=0, return_index=True)
    keep = np.sort(first_seen)

    embeddings_dedup = embeddings[keep]
    ids_dedup = ids[keep]
    comments_dedup = [comments[row] for row in keep]

    original_count = int(embeddings.shape[0])
    unique_count = int(embeddings_dedup.shape[0])
    duplicate_count = original_count - unique_count
    if original_count:
        duplicate_percentage = (duplicate_count / original_count) * 100
    else:
        duplicate_percentage = 0.0

    if duplicate_count > 0:
        print(f"Deduplication: removed {duplicate_count} duplicates ({duplicate_percentage:.1f}%)")

    labels, centers, _model = compute_cluster_members(
        embeddings_dedup, num_clusters=NUM_CLUSTERS, random_state=42
    )

    # Outputs go into a "deduplicated11" directory beside the embeddings directory.
    out_dir_path = Path(npz_path).parent.parent / "deduplicated11"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = Path(npz_path).stem.replace("_embeddings", "")

    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points_original": int(original_count),
        "num_points_deduplicated": int(unique_count),
        "duplicates_removed": int(duplicate_count),
        "duplicate_percentage": float(duplicate_percentage),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(NUM_CLUSTERS),
        "per_cluster_counts": {},
        "outputs": [],
    }

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        """Turn row indices into JSON-serializable index/id/comment records."""
        records: List[Dict[str, Any]] = []
        for idx in indices.tolist():
            raw_id = ids_dedup[idx]
            records.append({
                "index": int(idx),
                # NumPy scalars are unwrapped to plain Python values for json.dump.
                "id": raw_id.item() if hasattr(raw_id, 'item') else raw_id,
                "comment": comments_dedup[idx],
            })
        return records

    def write_json(payload: Any, path: str) -> None:
        """Dump *payload* to *path* as indented UTF-8 JSON."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    for k in range(NUM_CLUSTERS):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings_dedup, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )
        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # File names carry the number of records actually saved.
        closest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )
        write_json(closest_payload, closest_path)
        write_json(farthest_payload, farthest_path)

        summary["per_cluster_counts"][str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        summary["outputs"].append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    summary_path = os.path.join(out_dir, f"{base_name}_kmeans{NUM_CLUSTERS}_summary.json")
    write_json(summary, summary_path)

    print("\nClustering complete.")
    print(f"Summary written to: {summary_path}")
    for entry in summary["outputs"]:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Run the pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()




In [None]:
import os
import re
import json
import difflib
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


"""
Colab-ready script to:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
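2) Drop duplicate embedding rows, choose the number of clusters by silhouette
   score (falling back to NUM_CLUSTERS), and run KMeans
3) For each cluster, extract:
   - up to NUM_CLOSEST comments closest to the centroid
   - up to NUM_FARTHEST comments farthest from the centroid
4) Save results as JSON files in a 'deduplicated' directory beside the
   embeddings directory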

Notes:
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory. If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings (ids are then treated as
  positional indices) or a list of objects with fields {'id': ..., 'comment': ...}.
  Objects without an 'id' are skipped; if no object has one, loading fails.

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the values below as needed) ====
# Embeddings archive to cluster. main() passes this through resolve_npz_path(),
# so an exact path, a bare filename, or a loose song name all work.
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/7_Years_Lukas_Graham_embeddings.npz"

# Source comments JSON. If None, main() infers the path from the .npz filename
# via infer_comments_json_path().
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/7_Years_Lukas_Graham.json"

# NUM_CLUSTERS is only the fallback cluster count, used when the silhouette
# search yields no usable k. NUM_CLOSEST / NUM_FARTHEST cap how many comments
# are saved per cluster on each side.
NUM_CLUSTERS = 10
NUM_CLOSEST = 500
NUM_FARTHEST = 500

# Silhouette search configuration (used to automatically choose K).
# The scanned range is clamped to [2, n_samples - 1] by choose_k_via_silhouette().
K_MIN = 2
K_MAX = 20

def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the 'embeddings' and 'ids' arrays from an .npz archive.

    Args:
        npz_path: Path to the .npz file.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta`` is ``{"keys": [...]}``
        listing every array name stored in the archive.

    Raises:
        KeyError: If the archive lacks 'embeddings' or 'ids'.
    """
    # NOTE(security): allow_pickle=True is kept so object arrays still load,
    # but unpickling can execute arbitrary code; only open trusted archives.
    # The context manager closes the underlying file handle once both arrays
    # have been read into memory.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings .npz.

    The base name is the .npz stem with a trailing "_embeddings" removed.
    Lookup order:
    1) ``<base>.json`` in the same directory as the .npz.
    2) ``<base>.json`` in each known comments directory.
    3) Fuzzy match over every .json in those directories: a stem that contains
       the normalized base scores 1.0, otherwise its difflib ratio is used.

    Returns:
        The matching path, or None when nothing scores at least 0.5.
    """
    npz = Path(npz_path)
    suffix = "_embeddings"
    # Expect pattern like XYZ_embeddings.npz -> XYZ.json
    base = npz.stem[:-len(suffix)] if npz.stem.endswith(suffix) else npz.stem
    json_name = f"{base}.json"

    # 1) Sibling of the .npz
    sibling = npz.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in the known comments directories
    _maybe_mount_drive()
    comment_dirs = _default_comment_dirs()
    for directory in comment_dirs:
        exact = Path(directory) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match across the same directories
    all_jsons: List[str] = [
        found
        for directory in comment_dirs
        for found in sorted(glob(os.path.join(directory, "*.json")))
    ]
    if not all_jsons:
        return None

    norm_target = _normalize_name(base)

    def similarity(path: str) -> float:
        cnorm = _normalize_name(Path(path).stem)
        if norm_target and norm_target in cnorm:
            return 1.0
        return difflib.SequenceMatcher(None, norm_target, cnorm).ratio()

    # max() keeps the first of equally scored candidates.
    best_score, best_path = max(
        ((similarity(path), path) for path in all_jsons),
        key=lambda pair: pair[0],
    )
    return best_path if best_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path or a song stem.

    Strategy:
    1) If the path exists as given, return it.
    2) Otherwise make a best-effort Drive mount, re-check the exact path, and
       (when the input ends with .npz) look for its bare filename in every
       directory returned by _default_embed_dirs().
    3) Fuzzy search over all .npz files in those directories: a candidate whose
       normalized stem contains the normalized target scores 1.0, any other
       candidate scores its difflib similarity ratio.

    Args:
        path_or_stem: Exact path, bare filename, or loose song name.

    Returns:
        The path of the matching .npz file.

    Raises:
        FileNotFoundError: If the searched directories hold no .npz files, or
            the best fuzzy match scores below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # The known directories may live on Drive, so mount before probing them;
    # the exact path itself may only become visible after mounting.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())

    # 2) Try the bare filename inside each known embeddings directory. There is
    #    no single default-directory constant in this script, so every
    #    directory from _default_embed_dirs() is probed in order.
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across the known directories
    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    norm_target = _normalize_name(Path(path_or_stem).stem)
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # max() returns the first of equally scored candidates, i.e. the earliest
    # one in directory/filename order.
    best_score, best_path = max(scored, key=lambda pair: pair[0])
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for each entry of *ids*, in the same order.

    The JSON file must contain a non-empty list whose shape is judged from its
    first element:
    - objects with a 'comment' field: each id is looked up through the objects'
      'id' field (objects without an 'id' are ignored);
    - plain strings: each id is treated as a 0-based position in the list.

    Ids that cannot be matched (unknown id, non-integer or out-of-range
    position) yield an empty string, so the result always has one entry per id.

    Args:
        ids: Ids aligned with the embedding rows.
        comments_json_path: Path to the source comments JSON.

    Returns:
        One comment string per id.

    Raises:
        FileNotFoundError: If comments_json_path is None.
        ValueError: If the JSON is empty, not a list, of an unsupported shape,
            or a list of objects none of which carries an 'id'.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not data:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]
    if isinstance(first, dict) and 'comment' in first:
        # Build mapping from id -> comment; later duplicates overwrite earlier ones.
        comments_by_id: Dict[Any, str] = {}
        for item in data:
            cid = item.get('id')
            if cid is not None:
                comments_by_id[cid] = item.get('comment', "")
        if not comments_by_id:
            raise ValueError("Cannot map ids to comments: ids present but JSON lacks 'id' fields; and data is not a list of strings for positional mapping.")
        return [comments_by_id.get(cid, "") for cid in ids]

    if not isinstance(first, str):
        raise ValueError("Unsupported JSON format for comments.")

    # Positional fallback: ids are expected to be 0..N-1 indices.
    resolved_comments: List[str] = []
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            position = -1
        # Explicit bounds check: a negative index must not wrap around to the
        # end of the list and return an unrelated comment.
        resolved_comments.append(data[position] if 0 <= position < len(data) else "")
    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Fit KMeans on *embeddings* and return ``(labels, centers, model)``.

    The model is created with ``n_init=10`` and the given ``random_state``.
    ``labels`` is the per-row assignment returned by ``fit_predict`` and
    ``centers`` is the fitted model's ``cluster_centers_``.
    """
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    assignments = model.fit_predict(embeddings)
    return assignments, model.cluster_centers_, model


def choose_k_via_silhouette(
    embeddings: np.ndarray,
    k_min: int = K_MIN,
    k_max: int = K_MAX,
    random_state: int = 42,
) -> Tuple[int, List[int], List[float]]:
    """Pick the cluster count whose KMeans labelling has the best silhouette.

    Each candidate k is fitted with KMeans (``n_init=10``) and scored with
    ``silhouette_score`` using the cosine metric. The scanned range is
    ``k_min..k_max`` clamped to ``[2, n_samples - 1]``. Candidates that fail
    or produce fewer than two distinct labels are skipped.

    Returns:
        ``(selected_k, ks_scanned, silhouette_scores)``. With two or fewer
        samples this is ``(1, [1], [0.0])``. When no candidate could be scored,
        the fallback k is NUM_CLUSTERS capped at n_samples (at least 1), with a
        single placeholder score of 0.0.
    """
    n_samples = int(embeddings.shape[0])
    if n_samples <= 2:
        return 1, [1], [0.0]

    lowest_k = max(2, min(k_min, n_samples - 1))
    highest_k = max(lowest_k, min(k_max, n_samples - 1))

    ks: List[int] = []
    scores: List[float] = []
    best_k = None
    best_score = -1.0
    for k in range(lowest_k, highest_k + 1):
        try:
            model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
            labels = model.fit_predict(embeddings)
            # Silhouette needs at least two distinct labels.
            if len(set(labels.tolist())) < 2:
                continue
            score = float(silhouette_score(embeddings, labels, metric="cosine"))
        except Exception:
            # Best-effort scan: a failing k is simply left out.
            continue
        ks.append(k)
        scores.append(score)
        if score > best_score:
            best_score, best_k = score, k

    if best_k is None:
        fallback_k = max(1, min(NUM_CLUSTERS, n_samples))
        return fallback_k, [fallback_k], [0.0]
    return int(best_k), ks, scores


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its center.

    Distances are Euclidean norms between each member row of *embeddings* and
    ``centers[cluster_index]``.

    Returns:
        ``(members, closest, farthest)`` as arrays of row indices into
        *embeddings*: all members of the cluster, up to *num_closest* of them
        in increasing distance, and up to *num_farthest* in decreasing
        distance. An empty cluster yields three empty arrays.
    """
    members = np.nonzero(labels == cluster_index)[0]
    if members.size == 0:
        return members, np.empty(0, dtype=int), np.empty(0, dtype=int)

    offsets = embeddings[members] - centers[cluster_index]
    ranking = np.argsort(np.linalg.norm(offsets, axis=1))

    n_near = min(num_closest, members.size)
    n_far = min(num_farthest, members.size)
    closest = members[ranking[:n_near]]
    # Walk the same ranking backwards to get the farthest members first.
    farthest = members[ranking[::-1][:n_far]]
    return members, closest, farthest


def main():
    """Cluster deduplicated comment embeddings and export per-cluster samples.

    Steps:
    1) Resolve and load the embeddings .npz and the matching comments JSON.
    2) Drop duplicate embedding rows (first occurrence kept), keeping ids and
       comments aligned with the surviving rows.
    3) Choose the number of clusters by silhouette score, falling back to
       NUM_CLUSTERS (capped at the number of unique rows) if that fails, then
       run KMeans.
    4) For every cluster, write up to NUM_CLOSEST nearest and NUM_FARTHEST
       farthest comments (relative to the centroid) to JSON files.
    5) Write a summary JSON and print where everything was saved.
    """
    # Accepts an exact path, a bare filename, or a loose song name.
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    embeddings, ids, _meta = load_npz_embeddings(npz_path)

    # An explicit COMMENTS_JSON_PATH wins; otherwise infer it from the .npz name.
    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)

    # Row-wise deduplication through a void view of the embedding matrix.
    # Sorting the returned indices restores the original row order.
    _, first_seen = np.unique(embeddings.view(np.void), axis=0, return_index=True)
    keep = np.sort(first_seen)

    embeddings_dedup = embeddings[keep]
    ids_dedup = ids[keep]
    comments_dedup = [comments[row] for row in keep]

    original_count = int(embeddings.shape[0])
    unique_count = int(embeddings_dedup.shape[0])
    duplicate_count = original_count - unique_count
    if original_count:
        duplicate_percentage = (duplicate_count / original_count) * 100
    else:
        duplicate_percentage = 0.0

    if duplicate_count > 0:
        print(f"Deduplication: removed {duplicate_count} duplicates ({duplicate_percentage:.1f}%)")

    # Choose K automatically via silhouette score (cosine); fall back to
    # NUM_CLUSTERS, capped at the number of unique rows, if the search raises.
    try:
        selected_k, ks_scanned, sil_scores = choose_k_via_silhouette(embeddings_dedup, K_MIN, K_MAX)
    except Exception:
        selected_k = max(1, min(NUM_CLUSTERS, int(embeddings_dedup.shape[0])))
        ks_scanned, sil_scores = [], []

    labels, centers, _model = compute_cluster_members(
        embeddings_dedup, num_clusters=selected_k, random_state=42
    )

    # Outputs go into a "deduplicated" directory beside the embeddings directory.
    out_dir_path = Path(npz_path).parent.parent / "deduplicated"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = Path(npz_path).stem.replace("_embeddings", "")

    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points_original": int(original_count),
        "num_points_deduplicated": int(unique_count),
        "duplicates_removed": int(duplicate_count),
        "duplicate_percentage": float(duplicate_percentage),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(selected_k),
        "silhouette_scan_ks": ks_scanned,
        "silhouette_scores": sil_scores,
        "per_cluster_counts": {},
        "outputs": [],
    }

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        """Turn row indices into JSON-serializable index/id/comment records."""
        records: List[Dict[str, Any]] = []
        for idx in indices.tolist():
            raw_id = ids_dedup[idx]
            records.append({
                "index": int(idx),
                # NumPy scalars are unwrapped to plain Python values for json.dump.
                "id": raw_id.item() if hasattr(raw_id, 'item') else raw_id,
                "comment": comments_dedup[idx],
            })
        return records

    def write_json(payload: Any, path: str) -> None:
        """Dump *payload* to *path* as indented UTF-8 JSON."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    for k in range(selected_k):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings_dedup, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )
        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # File names carry the number of records actually saved.
        closest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{selected_k}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{selected_k}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )
        write_json(closest_payload, closest_path)
        write_json(farthest_payload, farthest_path)

        summary["per_cluster_counts"][str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        summary["outputs"].append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    summary_path = os.path.join(out_dir, f"{base_name}_kmeans{selected_k}_summary.json")
    write_json(summary, summary_path)

    print("\nClustering complete.")
    print(f"Summary written to: {summary_path}")
    for entry in summary["outputs"]:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Run the pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()




In [None]:
import os
import re
import json
import difflib
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
import time
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize as l2_normalize


"""
Colab-ready script to:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Run KMeans into 10 clusters
3) For each cluster, extract:
   - up to NUM_CLOSEST comments closest to the centroid
   - up to NUM_FARTHEST comments farthest from the centroid
4) Save results as JSON files next to the .npz

Notes:
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory. If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings (ids are then treated as
  positional indices) or a list of objects with fields {'id': ..., 'comment': ...}.
  Objects without an 'id' are skipped; if no object has one, loading fails.

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the values below as needed) ====
# Embeddings archive to cluster.
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/7_Years_Lukas_Graham_embeddings.npz"

# If None, the script will try to infer the JSON path from the .npz filename.
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/7_Years_Lukas_Graham.json"

# Number of clusters and retrieval sizes
# NOTE(review): the code that consumes these three values is outside the
# visible part of this cell; confirm how NUM_CLUSTERS interacts with the
# silhouette-based selection of k.
NUM_CLUSTERS = 10
NUM_CLOSEST = 1000
NUM_FARTHEST = 1000

# Performance and selection configuration
# K_MIN / K_MAX are the default k_min / k_max of choose_k_via_silhouette().
K_MIN = 2
K_MAX = 30  # upper bound; final range adapts to dataset size
# When there are more rows than this, choose_k_via_silhouette() draws a random
# subsample of this size (without replacement).
SIL_SUBSAMPLE = 8000  # number of points to sample for silhouette scoring
# compute_cluster_members() uses MiniBatchKMeans with batch_size=MINIBATCH_SIZE
# when USE_MINIBATCH_KMEANS is True, and plain KMeans otherwise.
USE_MINIBATCH_KMEANS = True
MINIBATCH_SIZE = 4096
PCA_DIM = 100  # set 0 to disable PCA

def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the 'embeddings' and 'ids' arrays from an .npz archive.

    Args:
        npz_path: Path to the .npz file.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta`` is ``{"keys": [...]}``
        listing every array name stored in the archive.

    Raises:
        KeyError: If the archive lacks 'embeddings' or 'ids'.
    """
    # NOTE(security): allow_pickle=True is kept so object arrays still load,
    # but unpickling can execute arbitrary code; only open trusted archives.
    # The context manager closes the underlying file handle once both arrays
    # have been read into memory.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings .npz.

    The base name is the .npz stem with a trailing "_embeddings" removed.
    Lookup order:
    1) ``<base>.json`` in the same directory as the .npz.
    2) ``<base>.json`` in each known comments directory.
    3) Fuzzy match over every .json in those directories: a stem that contains
       the normalized base scores 1.0, otherwise its difflib ratio is used.

    Returns:
        The matching path, or None when nothing scores at least 0.5.
    """
    npz = Path(npz_path)
    suffix = "_embeddings"
    # Expect pattern like XYZ_embeddings.npz -> XYZ.json
    base = npz.stem[:-len(suffix)] if npz.stem.endswith(suffix) else npz.stem
    json_name = f"{base}.json"

    # 1) Sibling of the .npz
    sibling = npz.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in the known comments directories
    _maybe_mount_drive()
    comment_dirs = _default_comment_dirs()
    for directory in comment_dirs:
        exact = Path(directory) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match across the same directories
    all_jsons: List[str] = [
        found
        for directory in comment_dirs
        for found in sorted(glob(os.path.join(directory, "*.json")))
    ]
    if not all_jsons:
        return None

    norm_target = _normalize_name(base)

    def similarity(path: str) -> float:
        cnorm = _normalize_name(Path(path).stem)
        if norm_target and norm_target in cnorm:
            return 1.0
        return difflib.SequenceMatcher(None, norm_target, cnorm).ratio()

    # max() keeps the first of equally scored candidates.
    best_score, best_path = max(
        ((similarity(path), path) for path in all_jsons),
        key=lambda pair: pair[0],
    )
    return best_path if best_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path or a song stem.

    Strategy:
    1) If the path exists as given, return it.
    2) Otherwise make a best-effort Drive mount, re-check the exact path, and
       (when the input ends with .npz) look for its bare filename in every
       directory returned by _default_embed_dirs().
    3) Fuzzy search over all .npz files in those directories: a candidate whose
       normalized stem contains the normalized target scores 1.0, any other
       candidate scores its difflib similarity ratio.

    Args:
        path_or_stem: Exact path, bare filename, or loose song name.

    Returns:
        The path of the matching .npz file.

    Raises:
        FileNotFoundError: If the searched directories hold no .npz files, or
            the best fuzzy match scores below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # The known directories may live on Drive, so mount before probing them;
    # the exact path itself may only become visible after mounting.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())

    # 2) Try the bare filename inside each known embeddings directory. There is
    #    no single default-directory constant in this script, so every
    #    directory from _default_embed_dirs() is probed in order.
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across the known directories
    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    norm_target = _normalize_name(Path(path_or_stem).stem)
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # max() returns the first of equally scored candidates, i.e. the earliest
    # one in directory/filename order.
    best_score, best_path = max(scored, key=lambda pair: pair[0])
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for each entry of *ids*, in the same order.

    The JSON file must contain a non-empty list whose shape is judged from its
    first element:
    - objects with a 'comment' field: each id is looked up through the objects'
      'id' field (objects without an 'id' are ignored);
    - plain strings: each id is treated as a 0-based position in the list.

    Ids that cannot be matched (unknown id, non-integer or out-of-range
    position) yield an empty string, so the result always has one entry per id.

    Args:
        ids: Ids aligned with the embedding rows.
        comments_json_path: Path to the source comments JSON.

    Returns:
        One comment string per id.

    Raises:
        FileNotFoundError: If comments_json_path is None.
        ValueError: If the JSON is empty, not a list, of an unsupported shape,
            or a list of objects none of which carries an 'id'.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not data:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]
    if isinstance(first, dict) and 'comment' in first:
        # Build mapping from id -> comment; later duplicates overwrite earlier ones.
        comments_by_id: Dict[Any, str] = {}
        for item in data:
            cid = item.get('id')
            if cid is not None:
                comments_by_id[cid] = item.get('comment', "")
        if not comments_by_id:
            raise ValueError("Cannot map ids to comments: ids present but JSON lacks 'id' fields; and data is not a list of strings for positional mapping.")
        return [comments_by_id.get(cid, "") for cid in ids]

    if not isinstance(first, str):
        raise ValueError("Unsupported JSON format for comments.")

    # Positional fallback: ids are expected to be 0..N-1 indices.
    resolved_comments: List[str] = []
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            position = -1
        # Explicit bounds check: a negative index must not wrap around to the
        # end of the list and return an unrelated comment.
        resolved_comments.append(data[position] if 0 <= position < len(data) else "")
    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Cluster ``embeddings`` into ``num_clusters`` groups.

    Fits MiniBatchKMeans when the module-level USE_MINIBATCH_KMEANS flag is
    truthy and plain KMeans otherwise, logging the choice and the fit time.

    Returns:
        ``(labels, centers, model)``: the label assigned to each row, the
        fitted cluster centres, and the fitted estimator itself.
    """
    t_begin = time.time()

    if not USE_MINIBATCH_KMEANS:
        print(f"[KMeans] Using KMeans: k={num_clusters}")
        estimator = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    else:
        print(f"[KMeans] Using MiniBatchKMeans: k={num_clusters}, batch_size={MINIBATCH_SIZE}")
        minibatch_options = dict(
            n_clusters=num_clusters,
            random_state=random_state,
            batch_size=MINIBATCH_SIZE,
            n_init=3,
            reassignment_ratio=0.01,
        )
        estimator = MiniBatchKMeans(**minibatch_options)

    assignments = estimator.fit_predict(embeddings)
    centroids = estimator.cluster_centers_
    elapsed = time.time() - t_begin
    print(f"[KMeans] Fitted in {elapsed:.2f}s")
    return assignments, centroids, estimator


def choose_k_via_silhouette(
    embeddings: np.ndarray,
    k_min: int = K_MIN,
    k_max: int = K_MAX,
    random_state: int = 42,
) -> Tuple[int, List[int], List[float]]:
    """Select k maximizing silhouette score with cosine distance.

    Every candidate k is fitted on the full data. The silhouette score is
    computed on at most SIL_SUBSAMPLE rows because it needs pairwise
    distances. A candidate whose fit or scoring fails is reported and skipped
    rather than aborting the scan.

    Args:
        embeddings: Row-wise vectors to cluster.
        k_min: Smallest k to try (clamped to at least 2 and at most n - 1).
        k_max: Largest k to try (also capped by ``sqrt(n) + 5`` and n - 1).
        random_state: Seed for the clustering models and the subsample.

    Returns (selected_k, ks_scanned, silhouette_scores). With two or fewer
    samples the result is ``(1, [1], [0.0])``. When no candidate could be
    scored, k falls back to NUM_CLUSTERS capped at the sample count.
    """
    import math

    n_samples = int(embeddings.shape[0])
    if n_samples <= 2:
        return 1, [1], [0.0]

    # Adaptive k upper bound ~ sqrt(n) + 5
    k_min_eff = max(2, min(k_min, n_samples - 1))
    k_max_eff = max(k_min_eff, min(int(math.sqrt(n_samples)) + 5, k_max, n_samples - 1))
    print(f"[Silhouette] Scanning k in [{k_min_eff}..{k_max_eff}] over n={n_samples} points")

    ks: List[int] = []
    scores: List[float] = []
    best_k = None
    best_score = -1.0

    # Subsample to speed up silhouette (O(N^2) distances)
    rng = np.random.default_rng(seed=random_state)
    if n_samples > SIL_SUBSAMPLE:
        sample_indices = rng.choice(n_samples, size=SIL_SUBSAMPLE, replace=False)
        X_sil = embeddings[sample_indices]
        print(f"[Silhouette] Using subsample size={len(sample_indices)} for scoring")
    else:
        sample_indices = np.arange(n_samples)
        X_sil = embeddings
        print(f"[Silhouette] Using full data for scoring (n={n_samples})")

    for k in range(k_min_eff, k_max_eff + 1):
        t0 = time.time()
        try:
            if USE_MINIBATCH_KMEANS:
                model = MiniBatchKMeans(
                    n_clusters=k,
                    random_state=random_state,
                    batch_size=MINIBATCH_SIZE,
                    n_init=3,
                    reassignment_ratio=0.01,
                )
            else:
                model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
            labels_full = model.fit_predict(embeddings)
            # Skip degenerate fits that put every point in a single cluster
            if len(set(labels_full.tolist())) < 2:
                print(f"[Silhouette] k={k} skipped: fewer than 2 distinct labels")
                continue
            labels_sub = labels_full[sample_indices]
            score = float(silhouette_score(X_sil, labels_sub, metric="cosine"))
        except Exception as exc:
            # Best-effort scan: keep going, but say why this k was dropped
            print(f"[Silhouette] k={k} failed: {exc}")
            continue

        ks.append(k)
        scores.append(score)
        took = time.time() - t0
        print(f"[Silhouette] k={k:<3} score={score:>.4f} time={took:.2f}s")
        if score > best_score:
            best_score = score
            best_k = k

    if best_k is None:
        fallback_k = max(1, min(NUM_CLUSTERS, n_samples))
        return fallback_k, [fallback_k], [0.0]
    return int(best_k), ks, scores


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its centre.

    Returns ``(members, closest, farthest)``, all row indices into
    ``embeddings``: every row labelled ``cluster_index``, then up to
    ``num_closest`` of them by increasing Euclidean distance to the centre,
    then up to ``num_farthest`` of them by decreasing distance. An empty
    cluster yields three empty arrays.
    """
    member_idx = np.nonzero(labels == cluster_index)[0]
    n_members = member_idx.size
    if n_members == 0:
        no_closest = np.array([], dtype=int)
        no_farthest = np.array([], dtype=int)
        return member_idx, no_closest, no_farthest

    # Rank members by Euclidean distance to their cluster centre
    offsets = embeddings[member_idx] - centers[cluster_index]
    ranking = np.argsort(np.linalg.norm(offsets, axis=1))

    n_near = min(num_closest, n_members)
    n_far = min(num_farthest, n_members)
    nearest = member_idx[ranking[:n_near]]
    most_distant = member_idx[ranking[::-1][:n_far]]
    return member_idx, nearest, most_distant


def main():
    """Run the silhouette-based clustering pipeline end to end.

    Steps: resolve and load the embeddings ``.npz`` and the comments JSON,
    drop duplicate embedding rows, L2-normalize (and optionally PCA-reduce)
    the vectors, choose k by silhouette score, fit the final clustering, then
    write one nearest and one farthest JSON file per cluster plus a summary
    JSON into a ``deduplicated2`` directory beside the embeddings directory.
    """
    # Resolve path robustly (supports exact path, filename, or fuzzy stem)
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)

    embeddings, ids, _meta = load_npz_embeddings(npz_path)

    # Resolve comments JSON path
    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)

    # Deduplicate embeddings (preserve first occurrence), keeping ids and comments aligned
    # Use void view trick for row-wise uniqueness
    print("[Dedup] Starting row-wise deduplication...")
    t0 = time.time()
    embeddings_view = embeddings.view(np.void)
    _, unique_indices = np.unique(embeddings_view, axis=0, return_index=True)
    # np.unique returns first-occurrence indices in sorted-value order;
    # sorting them restores the original row order.
    unique_indices = np.sort(unique_indices)

    original_count = int(embeddings.shape[0])
    embeddings_dedup = embeddings[unique_indices]
    ids_dedup = ids[unique_indices]
    comments_dedup = [comments[i] for i in unique_indices]

    unique_count = int(embeddings_dedup.shape[0])
    duplicate_count = original_count - unique_count
    duplicate_percentage = (duplicate_count / original_count) * 100 if original_count else 0.0

    if duplicate_count > 0:
        print(f"[Dedup] Removed {duplicate_count} duplicates ({duplicate_percentage:.1f}%) in {time.time() - t0:.2f}s")
    else:
        print(f"[Dedup] No duplicates found ({time.time() - t0:.2f}s)")

    # Normalize and optionally reduce dimensionality
    print("[Preprocess] L2-normalizing vectors...")
    t0 = time.time()
    embeddings_proc = l2_normalize(embeddings_dedup, norm="l2")
    print(f"[Preprocess] Normalized in {time.time() - t0:.2f}s")

    if PCA_DIM and PCA_DIM > 0 and embeddings_proc.shape[1] > PCA_DIM:
        print(f"[Preprocess] Applying PCA to {PCA_DIM} dims from {embeddings_proc.shape[1]}...")
        t0 = time.time()
        embeddings_proc = PCA(n_components=PCA_DIM, random_state=42).fit_transform(embeddings_proc)
        print(f"[Preprocess] PCA completed in {time.time() - t0:.2f}s; new shape={embeddings_proc.shape}")
    else:
        print("[Preprocess] PCA disabled or not needed")

    # Choose K automatically via silhouette score (cosine). Fallback to NUM_CLUSTERS if needed
    try:
        selected_k, ks_scanned, sil_scores = choose_k_via_silhouette(embeddings_proc, K_MIN, K_MAX)
    except Exception as exc:
        # Keep the pipeline running, but report why the automatic choice was abandoned
        print(f"[Select-K] Silhouette selection failed: {exc}; falling back to NUM_CLUSTERS")
        selected_k, ks_scanned, sil_scores = max(1, min(NUM_CLUSTERS, int(embeddings_dedup.shape[0]))), [], []

    print(f"[Select-K] Selected k={selected_k}")
    labels, centers, _model = compute_cluster_members(
        embeddings_proc, num_clusters=selected_k, random_state=42
    )

    # Save under <project dir>/deduplicated2, a sibling of the embeddings directory
    embeddings_dir = Path(npz_path).parent
    project_dir = embeddings_dir.parent
    out_dir_path = project_dir / "deduplicated2"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = Path(npz_path).stem.replace("_embeddings", "")

    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points_original": int(original_count),
        "num_points_deduplicated": int(unique_count),
        "duplicates_removed": int(duplicate_count),
        "duplicate_percentage": float(duplicate_percentage),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(selected_k),
        "silhouette_scan_ks": ks_scanned,
        "silhouette_scores": sil_scores,
        "per_cluster_counts": {},
        "outputs": [],
    }

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        """Turn deduplicated row indices into JSON-ready index/id/comment records."""
        return [
            {
                "index": int(idx),
                # NumPy scalars are not JSON serializable; unwrap them when possible
                "id": ids_dedup[idx].item() if hasattr(ids_dedup[idx], 'item') else ids_dedup[idx],
                "comment": comments_dedup[idx],
            }
            for idx in indices.tolist()
        ]

    for k in range(selected_k):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings_proc, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )

        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # Write per-cluster files
        closest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{selected_k}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_kmeans{selected_k}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )

        with open(closest_path, "w", encoding="utf-8") as f:
            json.dump(closest_payload, f, ensure_ascii=False, indent=2)
        with open(farthest_path, "w", encoding="utf-8") as f:
            json.dump(farthest_payload, f, ensure_ascii=False, indent=2)

        summary["per_cluster_counts"][str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        summary["outputs"].append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    # Write summary
    summary_path = os.path.join(out_dir, f"{base_name}_kmeans{selected_k}_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\nClustering complete.")
    print(f"Summary written to: {summary_path}")
    for entry in summary["outputs"]:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Entry point: run the pipeline only when this code is executed directly
# (``__name__`` is ``"__main__"``), not when it is imported as a module.
if __name__ == "__main__":
    main()




In [None]:
import os
import re
import json
import difflib
import time
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize as l2_normalize


"""
Colab-ready script using ELBOW METHOD for faster clustering:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Use elbow method to automatically select optimal k
3) Run KMeans clustering
4) For each cluster, extract:
   - the NUM_CLOSEST comments closest to the centroid (100 as configured below)
   - the NUM_FARTHEST comments farthest from the centroid (100 as configured below)
5) Save results as JSON files

Notes:
- Elbow method is faster than silhouette (no pairwise distance computation)
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory. If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings or a list of objects with
  fields {'id': ..., 'comment': ...}. When 'id' is missing, positional index is used.

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the two paths below as needed) ====
# Embeddings archive. main() passes this to resolve_npz_path(), which accepts an
# exact path, a bare .npz filename, or a stem to fuzzy-match.
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/Mockingbird_Eminem.npz"

# Comments JSON. If None (or empty), main() tries to infer the path from the
# .npz filename via infer_comments_json_path().
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/Mockingbird_Eminem.json"

# Retrieval sizes: how many comments to save per cluster, nearest to and
# farthest from the centroid (capped at the cluster's size).
NUM_CLOSEST = 100
NUM_FARTHEST = 100

# Performance and selection configuration
K_MIN = 2  # smallest k scanned by the elbow search
K_MAX = 30  # upper bound; the scanned range is also capped by sqrt(n) + 5 and n - 1
USE_MINIBATCH_KMEANS = True  # True -> MiniBatchKMeans, False -> KMeans
MINIBATCH_SIZE = 4096  # batch_size passed to MiniBatchKMeans
PCA_DIM = 0  # set 0 to disable PCA; otherwise used only when smaller than the embedding dimension


def _maybe_mount_drive():
    try:
        if drive is not None and not os.path.isdir("/content/drive/MyDrive") and not os.path.isdir("/content/drive/My Drive"):
            drive.mount('/content/drive')
    except Exception:
        pass


def _default_embed_dirs() -> List[str]:
    """Return possible embeddings directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/embeddings",
        "/content/drive/My Drive/youtube_embeddings_project/embeddings",
        "./embeddings",
    ]


def _default_comment_dirs() -> List[str]:
    """Return possible comments directories (handles MyDrive/My Drive and local)."""
    return [
        "/content/drive/MyDrive/youtube_embeddings_project/comments",
        "/content/drive/My Drive/youtube_embeddings_project/comments",
        "./comments",
    ]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the ``'embeddings'`` and ``'ids'`` arrays from an ``.npz`` archive.

    Args:
        npz_path: Path of the ``.npz`` file.

    Returns:
        ``(embeddings, ids, meta)`` where ``meta`` is ``{"keys": [...]}``,
        listing every array name stored in the archive.

    Raises:
        KeyError: If the archive lacks ``'embeddings'`` or ``'ids'``.
    """
    # NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which
    # can execute arbitrary code. Only load archives from a trusted source.
    # The context manager closes the underlying file handle once the arrays
    # have been read into memory.
    with np.load(npz_path, allow_pickle=True) as data:
        if 'embeddings' not in data or 'ids' not in data:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']
        meta = {"keys": list(data.keys())}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Guess which comments JSON belongs to the given embeddings ``.npz``.

    The base name is the ``.npz`` stem with a trailing ``_embeddings`` removed
    (``XYZ_embeddings.npz`` -> ``XYZ.json``). Lookup order:

    1) ``<base>.json`` in the same directory as the ``.npz``
    2) ``<base>.json`` in one of the known comments directories
    3) the best fuzzy match, by normalized stem, among every ``.json`` in the
       known comments directories, accepted only with a score of at least 0.5

    Returns the matching path, or ``None`` when nothing qualifies.
    """
    npz_file = Path(npz_path)
    base = npz_file.stem
    if base.endswith("_embeddings"):
        base = base[:-len("_embeddings")]
    json_name = f"{base}.json"

    # 1) Next to the .npz itself
    sibling = npz_file.parent / json_name
    if sibling.exists():
        return str(sibling)

    # 2) Exact filename in a known comments directory
    _maybe_mount_drive()
    comment_dirs = _default_comment_dirs()
    for directory in comment_dirs:
        exact = Path(directory) / json_name
        if exact.exists():
            return str(exact)

    # 3) Fuzzy match over every JSON found in those directories
    wanted = _normalize_name(base)
    json_files: List[str] = [
        found
        for directory in comment_dirs
        for found in sorted(glob(os.path.join(directory, "*.json")))
    ]
    if not json_files:
        return None

    def _similarity(json_path: str) -> float:
        candidate_norm = _normalize_name(Path(json_path).stem)
        # Containment of the normalized target counts as a perfect match
        if wanted and wanted in candidate_norm:
            return 1.0
        return difflib.SequenceMatcher(None, wanted, candidate_norm).ratio()

    ranked = [(_similarity(json_path), json_path) for json_path in json_files]
    # max() keeps the earliest of equally scored candidates
    top_score, top_path = max(ranked, key=lambda pair: pair[0])
    return top_path if top_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    s = name.lower()
    s = re.sub(r"[^a-z0-9]+", "", s)
    return s


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path or a song stem.

    Strategy:
    1) If the path exists, return it. If not, try to mount Drive and check
       once more, since Drive-backed paths only appear after mounting.
    2) If it ends with ``.npz``, look for that filename in each known
       embeddings directory.
    3) Fuzzy search the ``.npz`` files of those directories using normalized
       containment and difflib; the best match needs a score of at least 0.4.

    The known directories are those of ``_default_embed_dirs()``. A
    module-level ``DEFAULT_EMBED_DIR``, when one is defined, is searched first.

    Raises:
        FileNotFoundError: If no ``.npz`` file exists in any searched
            directory, or the best fuzzy match scores below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # Try to ensure Drive is mounted before giving up on the exact path
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    searched_dirs: List[str] = list(_default_embed_dirs())
    # DEFAULT_EMBED_DIR is optional: honor it when some other cell defines it,
    # instead of failing with a NameError when it does not exist.
    legacy_dir = globals().get("DEFAULT_EMBED_DIR")
    if legacy_dir and legacy_dir not in searched_dirs:
        searched_dirs.insert(0, legacy_dir)

    # 2) Try as filename inside the known directories
    if path_or_stem.endswith(".npz"):
        filename = os.path.basename(path_or_stem)
        for d in searched_dirs:
            candidate = os.path.join(d, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across known directories
    stem = Path(path_or_stem).stem
    norm_target = _normalize_name(stem)

    all_candidates: List[str] = []
    for d in searched_dirs:
        all_candidates.extend(sorted(glob(os.path.join(d, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    # Score candidates
    scored: List[Tuple[float, str]] = []
    for c in all_candidates:
        cnorm = _normalize_name(Path(c).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, c))

    # max() keeps the earliest of equally scored candidates
    best_score, best_path = max(scored, key=lambda x: x[0])
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for every entry of ``ids``, in the same order.

    The JSON file must hold a non-empty list. Its first element decides the
    format: a list of strings, or a list of objects with a ``'comment'`` field.
    Objects are matched through their ``'id'`` field. For a list of strings,
    or when no object carries an ``'id'``, every entry of ``ids`` is treated as
    a 0-based position in the list.

    Args:
        ids: Identifiers (or positional indices) to resolve.
        comments_json_path: Path of the comments JSON file.

    Returns:
        One comment per entry of ``ids``; ``""`` where nothing matches.

    Raises:
        FileNotFoundError: If ``comments_json_path`` is ``None``.
        ValueError: If the JSON is not a non-empty list, or its elements are
            neither strings nor objects with a ``'comment'`` field.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or len(data) == 0:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]
    comments_by_id: Dict[Any, str] = {}
    if isinstance(first, dict) and 'comment' in first:
        # Keep the comments in file order so ids can fall back to positions
        # when the objects carry no 'id' at all.
        positional = [
            item.get('comment', "") if isinstance(item, dict) else ""
            for item in data
        ]
        for item in data:
            if isinstance(item, dict) and item.get('id') is not None:
                comments_by_id[item['id']] = item.get('comment', "")
    elif isinstance(first, str):
        positional = data
    else:
        raise ValueError("Unsupported JSON format for comments.")

    if comments_by_id:
        return [comments_by_id.get(cid, "") for cid in ids]

    # Positional fallback: assume ids are 0..N-1 indices
    resolved_comments: List[str] = []
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError):
            resolved_comments.append("")
            continue
        # Reject negative positions: Python would silently wrap them around
        # to the end of the list and return an unrelated comment.
        if 0 <= position < len(positional):
            resolved_comments.append(positional[position])
        else:
            resolved_comments.append("")

    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Cluster ``embeddings`` into ``num_clusters`` groups.

    Fits MiniBatchKMeans when the module-level USE_MINIBATCH_KMEANS flag is
    truthy and plain KMeans otherwise, logging the choice and the fit time.

    Returns:
        ``(labels, centers, model)``: the label assigned to each row, the
        fitted cluster centres, and the fitted estimator itself.
    """
    t_begin = time.time()

    if not USE_MINIBATCH_KMEANS:
        print(f"[KMeans] Using KMeans: k={num_clusters}")
        estimator = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    else:
        print(f"[KMeans] Using MiniBatchKMeans: k={num_clusters}, batch_size={MINIBATCH_SIZE}")
        minibatch_options = dict(
            n_clusters=num_clusters,
            random_state=random_state,
            batch_size=MINIBATCH_SIZE,
            n_init=3,
            reassignment_ratio=0.01,
        )
        estimator = MiniBatchKMeans(**minibatch_options)

    assignments = estimator.fit_predict(embeddings)
    centroids = estimator.cluster_centers_
    elapsed = time.time() - t_begin
    print(f"[KMeans] Fitted in {elapsed:.2f}s")
    return assignments, centroids, estimator


def _compute_inertias_over_k(
    embeddings: np.ndarray,
    k_min: int,
    k_max: int,
    random_state: int = 42,
) -> Tuple[List[int], List[float]]:
    """Compute inertias for different k values (faster than silhouette)."""
    n_samples = int(embeddings.shape[0])
    if n_samples <= 1:
        return [1], [0.0]

    # Adaptive k upper bound ~ sqrt(n) + 5
    import math
    k_min_eff = max(2, min(k_min, n_samples - 1))
    k_max_eff = max(k_min_eff, min(int(math.sqrt(n_samples)) + 5, k_max, n_samples - 1))
    print(f"[Elbow] Scanning k in [{k_min_eff}..{k_max_eff}] over n={n_samples} points")

    ks: List[int] = []
    inertias: List[float] = []

    for k in range(k_min_eff, k_max_eff + 1):
        try:
            t0 = time.time()
            if USE_MINIBATCH_KMEANS:
                model = MiniBatchKMeans(
                    n_clusters=k,
                    random_state=random_state,
                    batch_size=MINIBATCH_SIZE,
                    n_init=3,
                    reassignment_ratio=0.01,
                )
            else:
                model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
            model.fit(embeddings)
            ks.append(k)
            inertias.append(float(model.inertia_))
            took = time.time() - t0
            print(f"[Elbow] k={k:<3} inertia={model.inertia_:>12.2f} time={took:.2f}s")
        except Exception as e:
            print(f"[Elbow] k={k} failed: {e}")
            continue

    if not ks:
        return [1], [0.0]
    return ks, inertias


def _select_k_by_elbow_distance(ks: List[int], inertias: List[float]) -> int:
    """Select k using maximum distance to line from first to last inertia point."""
    if len(ks) <= 1:
        return ks[0]

    # Distance from each point to the line between first and last
    x1, y1 = ks[0], inertias[0]
    x2, y2 = ks[-1], inertias[-1]
    denom = ((x2 - x1) ** 2 + (y2 - y1) ** 2) ** 0.5 or 1.0

    best_k = ks[0]
    best_dist = -1.0

    for x0, y0 in zip(ks, inertias):
        # Perpendicular distance formula
        num = abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1)
        dist = num / denom
        if dist > best_dist:
            best_dist = dist
            best_k = x0

    print(f"[Elbow] Selected k={best_k} (distance={best_dist:.2f})")
    return int(best_k)


def choose_k_via_elbow(
    embeddings: np.ndarray,
    k_min: int = K_MIN,
    k_max: int = K_MAX,
    random_state: int = 42,
) -> Tuple[int, List[int], List[float]]:
    """Choose k with the elbow method (faster than silhouette).

    Returns ``(selected_k, ks_scanned, inertias)``; with at most one sample
    the result is ``(1, [1], [0.0])``.
    """
    if int(embeddings.shape[0]) <= 1:
        return 1, [1], [0.0]

    scanned_ks, scanned_inertias = _compute_inertias_over_k(embeddings, k_min, k_max, random_state)
    elbow_k = _select_k_by_elbow_distance(scanned_ks, scanned_inertias)
    return elbow_k, scanned_ks, scanned_inertias


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster nearest to and farthest from its centre.

    Returns ``(members, closest, farthest)``, all row indices into
    ``embeddings``: every row labelled ``cluster_index``, then up to
    ``num_closest`` of them by increasing Euclidean distance to the centre,
    then up to ``num_farthest`` of them by decreasing distance. An empty
    cluster yields three empty arrays.
    """
    member_idx = np.nonzero(labels == cluster_index)[0]
    n_members = member_idx.size
    if n_members == 0:
        no_closest = np.array([], dtype=int)
        no_farthest = np.array([], dtype=int)
        return member_idx, no_closest, no_farthest

    # Rank members by Euclidean distance to their cluster centre
    offsets = embeddings[member_idx] - centers[cluster_index]
    ranking = np.argsort(np.linalg.norm(offsets, axis=1))

    n_near = min(num_closest, n_members)
    n_far = min(num_farthest, n_members)
    nearest = member_idx[ranking[:n_near]]
    most_distant = member_idx[ranking[::-1][:n_far]]
    return member_idx, nearest, most_distant


def main():
    """Run the elbow-method clustering pipeline end to end.

    Steps: resolve and load the embeddings ``.npz`` and the comments JSON,
    drop duplicate embedding rows, L2-normalize (and optionally PCA-reduce)
    the vectors, choose k with the elbow method, fit the final clustering,
    then write one nearest and one farthest JSON file per cluster plus a
    summary JSON into a ``deduplicated5`` directory beside the embeddings
    directory.
    """
    print("🚀 Starting clustering with ELBOW METHOD (faster than silhouette)")
    print("=" * 60)

    # Resolve path robustly (supports exact path, filename, or fuzzy stem)
    print("[Path] Resolving embeddings path...")
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    print(f"[Path] Found: {npz_path}")

    print("[Load] Loading embeddings and comments...")
    embeddings, ids, _meta = load_npz_embeddings(npz_path)
    print(f"[Load] Loaded {embeddings.shape[0]:,} embeddings of dimension {embeddings.shape[1]}")

    # Resolve comments JSON path
    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)
    print(f"[Load] Loaded {len(comments):,} comments")

    # Deduplicate embeddings (preserve first occurrence), keeping ids and comments aligned
    print("[Dedup] Starting row-wise deduplication...")
    t0 = time.time()
    embeddings_view = embeddings.view(np.void)
    _, unique_indices = np.unique(embeddings_view, axis=0, return_index=True)
    # np.unique returns first-occurrence indices in sorted-value order;
    # sorting them restores the original row order.
    unique_indices = np.sort(unique_indices)

    original_count = int(embeddings.shape[0])
    embeddings_dedup = embeddings[unique_indices]
    ids_dedup = ids[unique_indices]
    comments_dedup = [comments[i] for i in unique_indices]

    unique_count = int(embeddings_dedup.shape[0])
    duplicate_count = original_count - unique_count
    duplicate_percentage = (duplicate_count / original_count) * 100 if original_count else 0.0

    if duplicate_count > 0:
        print(f"[Dedup] Removed {duplicate_count} duplicates ({duplicate_percentage:.1f}%) in {time.time() - t0:.2f}s")
    else:
        print(f"[Dedup] No duplicates found ({time.time() - t0:.2f}s)")

    # Normalize and optionally reduce dimensionality
    print("[Preprocess] L2-normalizing vectors...")
    t0 = time.time()
    embeddings_proc = l2_normalize(embeddings_dedup, norm="l2")
    print(f"[Preprocess] Normalized in {time.time() - t0:.2f}s")

    if PCA_DIM and PCA_DIM > 0 and embeddings_proc.shape[1] > PCA_DIM:
        print(f"[Preprocess] Applying PCA to {PCA_DIM} dims from {embeddings_proc.shape[1]}...")
        t0 = time.time()
        embeddings_proc = PCA(n_components=PCA_DIM, random_state=42).fit_transform(embeddings_proc)
        print(f"[Preprocess] PCA completed in {time.time() - t0:.2f}s; new shape={embeddings_proc.shape}")
    else:
        print("[Preprocess] PCA disabled or not needed")

    # Choose K automatically via elbow method (faster than silhouette)
    print("\n[Elbow] Starting k selection via elbow method...")
    try:
        selected_k, ks_scanned, inertias = choose_k_via_elbow(embeddings_proc, K_MIN, K_MAX)
    except Exception as e:
        # Keep the pipeline running with a default of 10 clusters (fewer when
        # there are fewer points).
        print(f"[Elbow] Error in elbow method: {e}")
        selected_k = max(1, min(10, int(embeddings_proc.shape[0])))
        ks_scanned, inertias = [selected_k], [0.0]

    print(f"[Select-K] Selected k={selected_k}")

    # Run final clustering
    print(f"\n[Cluster] Running final clustering with k={selected_k}...")
    labels, centers, _model = compute_cluster_members(
        embeddings_proc, num_clusters=selected_k, random_state=42
    )

    # Save under <project dir>/deduplicated5, a sibling of the embeddings directory
    embeddings_dir = Path(npz_path).parent
    project_dir = embeddings_dir.parent
    out_dir_path = project_dir / "deduplicated5"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = Path(npz_path).stem.replace("_embeddings", "")

    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points_original": int(original_count),
        "num_points_deduplicated": int(unique_count),
        "duplicates_removed": int(duplicate_count),
        "duplicate_percentage": float(duplicate_percentage),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(selected_k),
        "elbow_scan_ks": ks_scanned,
        "elbow_inertias": inertias,
        "per_cluster_counts": {},
        "outputs": [],
    }

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        """Turn deduplicated row indices into JSON-ready index/id/comment records."""
        return [
            {
                "index": int(idx),
                # NumPy scalars are not JSON serializable; unwrap them when possible
                "id": ids_dedup[idx].item() if hasattr(ids_dedup[idx], 'item') else ids_dedup[idx],
                "comment": comments_dedup[idx],
            }
            for idx in indices.tolist()
        ]

    print("\n[Output] Writing cluster results...")
    for k in range(selected_k):
        members, closest, farthest = select_nearest_and_farthest(
            embeddings_proc, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )

        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # Write per-cluster files
        closest_path = os.path.join(
            out_dir, f"{base_name}_elbow_kmeans{selected_k}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_elbow_kmeans{selected_k}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )

        with open(closest_path, "w", encoding="utf-8") as f:
            json.dump(closest_payload, f, ensure_ascii=False, indent=2)
        with open(farthest_path, "w", encoding="utf-8") as f:
            json.dump(farthest_payload, f, ensure_ascii=False, indent=2)

        summary["per_cluster_counts"][str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        summary["outputs"].append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    # Write summary
    summary_path = os.path.join(out_dir, f"{base_name}_elbow_kmeans{selected_k}_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print("✅ ELBOW METHOD CLUSTERING COMPLETE")
    print("=" * 60)
    print(f"📊 Clustered {unique_count:,} deduplicated embeddings into {selected_k} clusters")
    print(f"📁 Summary written to: {summary_path}")
    print(f"📂 Output directory: {out_dir}")
    for entry in summary["outputs"]:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Entry point: run the pipeline only when this code is executed directly
# (``__name__`` is ``"__main__"``), not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import os
import re
import json
import difflib
import time
from glob import glob
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Optional: Google Colab drive mounting
try:
    from google.colab import drive  # type: ignore
except Exception:  # pragma: no cover
    drive = None  # type: ignore

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize as l2_normalize


"""
Colab-ready script with FIXED K=3 for maximum speed:
1) Load embeddings from an .npz file (expects keys: 'embeddings', 'ids')
2) Use fixed k=3 (no k selection needed - fastest option)
3) Run KMeans clustering
4) For each cluster, extract:
   - the NUM_CLOSEST comments closest to the centroid (1000 as configured below)
   - the NUM_FARTHEST comments farthest from the centroid (1000 as configured below)
5) Save results as JSON files

Notes:
- Fixed k=3 eliminates k selection overhead - fastest possible clustering
- Actual comments are not stored inside the .npz in this project. This script will
  try to find the source JSON (same stem as the .npz, without the '_embeddings' suffix)
  in the same directory. If it is elsewhere, set COMMENTS_JSON_PATH below.
- The source JSON can be either a list of strings or a list of objects with
  fields {'id': ..., 'comment': ...}. When 'id' is missing, positional index is used.

Usage in Colab:
- Just run this file content in a cell (or upload as a .py and run it). No CLI needed.
"""


# ==== USER CONFIG (edit the two paths below as needed) ====
# Path of the embeddings .npz archive to cluster.
EMBED_NPZ_PATH = \
    "/content/drive/My Drive/youtube_embeddings_project/embeddings/Without_Me_Eminem_embeddings.npz"

# If None, the script will try to infer the JSON path from the .npz filename.
# NOTE(review): infer_comments_json_path() strips the "_embeddings" suffix when
# guessing the JSON name, so a comments file named "..._embeddings.json" looks
# unusual - confirm this path points at the intended file.
COMMENTS_JSON_PATH: Optional[str] = \
    "/content/drive/My Drive/youtube_embeddings_project/comments/Without_Me_Eminem_embeddings.json"

# Fixed clustering configuration
FIXED_K = 3  # Fixed number of clusters
# Retrieval sizes: presumably the number of comments saved per cluster, nearest
# to and farthest from the centroid - TODO confirm against this cell's main().
NUM_CLOSEST = 1000
NUM_FARTHEST = 1000

# Performance configuration
# NOTE(review): this cell imports only KMeans from sklearn.cluster, so the flag
# must stay False unless MiniBatchKMeans is imported as well - confirm.
USE_MINIBATCH_KMEANS = False  # Use regular KMeans for better results


def _maybe_mount_drive():
    """Mount Google Drive at /content/drive if Colab is available and it is not mounted yet.

    This is a best-effort helper: it does nothing when the Colab `drive`
    module is unavailable or when either Drive folder spelling already exists,
    and it never raises. A failed mount is reported on stdout so that a later
    "file not found" error can be traced back to its cause.
    """
    try:
        if drive is None:
            return
        drive_roots = ("/content/drive/MyDrive", "/content/drive/My Drive")
        if any(os.path.isdir(root) for root in drive_roots):
            return  # already mounted
        drive.mount('/content/drive')
    except Exception as exc:  # deliberate best-effort boundary: report, don't crash
        print(f"[Drive] Warning: could not mount Google Drive: {exc}")


def _default_embed_dirs() -> List[str]:
    """List the directories searched for embeddings .npz files, in priority order.

    Both spellings of the Colab Drive root ("MyDrive" and "My Drive") are
    included, followed by a local ./embeddings fallback.
    """
    drive_compact = "/content/drive/MyDrive/youtube_embeddings_project/embeddings"
    drive_spaced = "/content/drive/My Drive/youtube_embeddings_project/embeddings"
    local_fallback = "./embeddings"
    return [drive_compact, drive_spaced, local_fallback]


def _default_comment_dirs() -> List[str]:
    """List the directories searched for comments JSON files, in priority order.

    Both spellings of the Colab Drive root ("MyDrive" and "My Drive") are
    included, followed by a local ./comments fallback.
    """
    drive_compact = "/content/drive/MyDrive/youtube_embeddings_project/comments"
    drive_spaced = "/content/drive/My Drive/youtube_embeddings_project/comments"
    local_fallback = "./comments"
    return [drive_compact, drive_spaced, local_fallback]


def load_npz_embeddings(npz_path: str) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
    """Load the 'embeddings' and 'ids' arrays from an .npz archive.

    Args:
        npz_path: Path to the .npz file.

    Returns:
        A tuple (embeddings, ids, meta) where meta is {"keys": [...]} listing
        every array name stored in the archive.

    Raises:
        KeyError: if the archive lacks an 'embeddings' or an 'ids' array.
    """
    # SECURITY: allow_pickle=True lets NumPy unpickle object arrays, which can
    # execute arbitrary code. Only load archives from a trusted source.
    # The context manager closes the underlying file handle once the arrays
    # have been read into memory.
    with np.load(npz_path, allow_pickle=True) as data:
        keys = list(data.keys())
        if 'embeddings' not in keys or 'ids' not in keys:
            raise KeyError(".npz must contain 'embeddings' and 'ids' arrays")

        embeddings: np.ndarray = data['embeddings']
        ids: np.ndarray = data['ids']

    meta = {"keys": keys}
    return embeddings, ids, meta


def infer_comments_json_path(npz_path: str) -> Optional[str]:
    """Locate the comments JSON that belongs to an embeddings .npz file.

    The expected JSON name is the .npz stem with a trailing "_embeddings"
    removed, plus ".json". Lookup proceeds in three stages and stops at the
    first hit:

    1) next to the .npz file itself;
    2) in each directory from _default_comment_dirs() (after a best-effort
       Drive mount), by exact filename;
    3) among all *.json files in those directories, by fuzzy name match:
       normalized containment scores 1.0, otherwise the difflib ratio is
       used, and the best candidate is accepted at a score of 0.5 or more.

    Returns the chosen path as a string, or None when nothing qualifies.
    """
    npz_file = Path(npz_path)

    # XYZ_embeddings.npz is expected to pair with XYZ.json
    base = npz_file.stem
    if base.endswith("_embeddings"):
        base = base[:-len("_embeddings")]
    json_name = f"{base}.json"

    # Stage 1: sibling of the .npz
    sibling = npz_file.parent / json_name
    if sibling.exists():
        return str(sibling)

    # Stage 2: exact filename in the known comments directories
    _maybe_mount_drive()
    search_dirs = _default_comment_dirs()
    for directory in search_dirs:
        exact = Path(directory) / json_name
        if exact.exists():
            return str(exact)

    # Stage 3: fuzzy match over every JSON in the known comments directories
    wanted = _normalize_name(base)
    json_files: List[str] = [
        found
        for directory in search_dirs
        for found in sorted(glob(os.path.join(directory, "*.json")))
    ]
    if not json_files:
        return None

    ranked: List[Tuple[float, str]] = []
    for json_file in json_files:
        normalized = _normalize_name(Path(json_file).stem)
        if wanted and wanted in normalized:
            ranked.append((1.0, json_file))
        else:
            ratio = difflib.SequenceMatcher(None, wanted, normalized).ratio()
            ranked.append((ratio, json_file))

    # max() returns the first entry with the top score, i.e. ties keep search order
    top_score, top_path = max(ranked, key=lambda pair: pair[0])
    return top_path if top_score >= 0.5 else None


def _normalize_name(name: str) -> str:
    """Lowercase *name* and drop every character that is not a-z or 0-9."""
    return re.sub(r"[^a-z0-9]+", "", name.lower())


def resolve_npz_path(path_or_stem: str) -> str:
    """Resolve an embeddings .npz path from an exact path, a filename, or a song stem.

    Strategy:
    1) If the path exists as given, return it. If it does not, try a
       best-effort Drive mount and check once more.
    2) If it ends with ".npz", look for its basename in every known
       embeddings directory.
    3) Fuzzy-match the normalized stem against all .npz files in the known
       embeddings directories (normalized containment scores 1.0, otherwise
       the difflib ratio is used).

    Args:
        path_or_stem: Exact path, bare filename, or approximate song name.

    Returns:
        The resolved path to an existing .npz file.

    Raises:
        FileNotFoundError: if no .npz files exist in the known directories or
            the best fuzzy match scores below 0.4.
    """
    # 1) Direct hit
    if os.path.isfile(path_or_stem):
        return path_or_stem

    # The path may live on a Drive that is not mounted yet: mount, then retry.
    _maybe_mount_drive()
    if os.path.isfile(path_or_stem):
        return path_or_stem

    known_dirs = _default_embed_dirs()

    # 2) Try as a filename inside each known directory
    if path_or_stem.endswith(".npz"):
        filename_dirs = list(known_dirs)
        # DEFAULT_EMBED_DIR is not defined in this script; honor it only if
        # some other part of the file provides it, instead of raising NameError.
        legacy_dir = globals().get("DEFAULT_EMBED_DIR")
        if isinstance(legacy_dir, str) and legacy_dir not in filename_dirs:
            filename_dirs.insert(0, legacy_dir)

        filename = os.path.basename(path_or_stem)
        for directory in filename_dirs:
            candidate = os.path.join(directory, filename)
            if os.path.isfile(candidate):
                return candidate

    # 3) Fuzzy search by stem across known directories
    norm_target = _normalize_name(Path(path_or_stem).stem)

    all_candidates: List[str] = []
    searched_dirs: List[str] = []
    for directory in known_dirs:
        searched_dirs.append(directory)
        all_candidates.extend(sorted(glob(os.path.join(directory, "*.npz"))))

    if not all_candidates:
        raise FileNotFoundError(
            "No .npz files found in any of: " + ", ".join(searched_dirs)
        )

    # Score candidates: containment of the normalized name is a perfect match
    scored: List[Tuple[float, str]] = []
    for candidate in all_candidates:
        cnorm = _normalize_name(Path(candidate).stem)
        if norm_target and norm_target in cnorm:
            score = 1.0
        else:
            score = difflib.SequenceMatcher(None, norm_target, cnorm).ratio()
        scored.append((score, candidate))

    # Stable sort: among equal scores the earlier directory/file wins
    scored.sort(key=lambda pair: pair[0], reverse=True)
    best_score, best_path = scored[0]
    if best_score < 0.4:
        raise FileNotFoundError(
            f"Could not resolve embeddings .npz for '{path_or_stem}'. Best match score={best_score:.2f} from dirs: "
            + ", ".join(searched_dirs)
        )
    return best_path


def load_comments(ids: np.ndarray, comments_json_path: Optional[str]) -> List[str]:
    """Return the comment text for every entry of *ids*, in the same order.

    Supported JSON layouts (decided by the first element of the list):
    - a list of objects with a 'comment' field and, optionally, an 'id' field;
    - a list of plain strings.

    Mapping rules:
    - If at least one object carries an 'id', comments are looked up by id.
      An id is matched as-is first and then by its string form, so integer
      ids in the .npz still match string ids in the JSON (and vice versa).
    - Otherwise each id is treated as a 0-based position in the JSON list.
    - Ids that cannot be matched yield an empty string.

    Args:
        ids: Ids stored alongside the embeddings.
        comments_json_path: Path to the comments JSON file.

    Returns:
        One comment per id.

    Raises:
        FileNotFoundError: if comments_json_path is None.
        ValueError: if the JSON is not a non-empty list or has an
            unsupported element type.
    """
    if comments_json_path is None:
        raise FileNotFoundError(
            "Could not determine source comments JSON. Set COMMENTS_JSON_PATH explicitly.")

    with open(comments_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not data:
        raise ValueError("Comments JSON is empty or invalid.")

    first = data[0]
    if isinstance(first, dict) and 'comment' in first:
        has_records = True
    elif isinstance(first, str):
        has_records = False
    else:
        raise ValueError("Unsupported JSON format for comments.")

    # Build mapping from id -> comment (only objects that actually carry an id)
    comments_by_id: Dict[Any, str] = {}
    if has_records:
        for item in data:
            if isinstance(item, dict) and item.get('id') is not None:
                comments_by_id[item['id']] = item.get('comment', "")

    resolved_comments: List[str] = []

    if comments_by_id:
        # Secondary index by string form, used only when the exact key misses
        comments_by_text_id = {str(key): text for key, text in comments_by_id.items()}
        for cid in ids:
            # NumPy scalars are converted to plain Python values first
            key = cid.item() if isinstance(cid, np.generic) else cid
            if key in comments_by_id:
                resolved_comments.append(comments_by_id[key])
            else:
                resolved_comments.append(comments_by_text_id.get(str(key), ""))
        return resolved_comments

    # Positional fallback: ids are 0-based indices into the JSON list. This
    # covers plain string lists as well as objects that have no 'id' field.
    total = len(data)
    for cid in ids:
        try:
            position = int(cid)
        except (TypeError, ValueError, OverflowError):
            resolved_comments.append("")
            continue

        # Reject negatives explicitly: Python indexing would wrap them around
        if not 0 <= position < total:
            resolved_comments.append("")
            continue

        item = data[position]
        if isinstance(item, str):
            resolved_comments.append(item)
        elif isinstance(item, dict):
            resolved_comments.append(item.get('comment', ""))
        else:
            resolved_comments.append("")

    return resolved_comments


def compute_cluster_members(
    embeddings: np.ndarray,
    num_clusters: int,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, KMeans]:
    """Fit KMeans on *embeddings* and report how long the fit took.

    Args:
        embeddings: Row vectors to cluster.
        num_clusters: Number of clusters requested from KMeans.
        random_state: Seed forwarded to KMeans.

    Returns:
        A tuple (labels, centers, model): the per-row cluster assignment, the
        fitted cluster centers, and the fitted KMeans instance.
    """
    fit_started = time.time()
    print(f"[KMeans] Using KMeans: k={num_clusters}")

    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    assignments = model.fit_predict(embeddings)
    fitted_centers = model.cluster_centers_

    elapsed = time.time() - fit_started
    print(f"[KMeans] Fitted in {elapsed:.2f}s")
    return assignments, fitted_centers, model


def select_nearest_and_farthest(
    embeddings: np.ndarray,
    labels: np.ndarray,
    centers: np.ndarray,
    cluster_index: int,
    num_closest: int,
    num_farthest: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Pick the members of one cluster that lie nearest to and farthest from its center.

    Distances are Euclidean, measured from each member row of *embeddings* to
    centers[cluster_index].

    Returns:
        A tuple (members, closest, farthest) of row-index arrays into
        *embeddings*: all rows labelled cluster_index, the nearest ones in
        ascending distance order, and the farthest ones in descending distance
        order. Each selection is capped at the cluster size; for an empty
        cluster both selections are empty.
    """
    member_rows = np.nonzero(labels == cluster_index)[0]
    if member_rows.size == 0:
        no_closest = np.array([], dtype=int)
        no_farthest = np.array([], dtype=int)
        return member_rows, no_closest, no_farthest

    offsets = embeddings[member_rows] - centers[cluster_index]
    distances = np.linalg.norm(offsets, axis=1)

    near_to_far = np.argsort(distances)
    far_to_near = near_to_far[::-1]

    keep_near = min(num_closest, member_rows.size)
    keep_far = min(num_farthest, member_rows.size)

    nearest_rows = member_rows[near_to_far[:keep_near]]
    farthest_rows = member_rows[far_to_near[:keep_far]]
    return member_rows, nearest_rows, farthest_rows


def main():
    """Run the fixed-k clustering pipeline end to end and write JSON results.

    Steps, in order:
    1) resolve and load the embeddings .npz and the matching comments;
    2) drop duplicate embedding rows (first occurrence kept) while keeping
       ids and comments aligned;
    3) L2-normalize the rows and fit KMeans with FIXED_K clusters (fewer when
       there are not enough unique rows);
    4) for every cluster, write the nearest and farthest comments as JSON
       files into "<project>/deduplicated10", plus one summary JSON.

    Raises:
        ValueError: if the embeddings array is empty or not 2-D.
        FileNotFoundError: propagated from path resolution or comment loading.
    """
    print(f"🚀 Starting clustering with FIXED K={FIXED_K} (maximum speed)")
    print("=" * 60)

    # Resolve path robustly (supports exact path, filename, or fuzzy stem)
    print("[Path] Resolving embeddings path...")
    npz_path = resolve_npz_path(EMBED_NPZ_PATH)
    print(f"[Path] Found: {npz_path}")

    print("[Load] Loading embeddings and comments...")
    embeddings, ids, _meta = load_npz_embeddings(npz_path)
    # Fail early with a clear message instead of an obscure error further down
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            f"Expected a non-empty 2-D embeddings array, got shape {embeddings.shape}"
        )
    print(f"[Load] Loaded {embeddings.shape[0]:,} embeddings of dimension {embeddings.shape[1]}")

    # Resolve comments JSON path (explicit config wins over inference)
    comments_path = COMMENTS_JSON_PATH or infer_comments_json_path(npz_path)
    comments = load_comments(ids, comments_path)
    print(f"[Load] Loaded {len(comments):,} comments")

    # Deduplicate embeddings (preserve first occurrence), keeping ids and comments aligned
    print("[Dedup] Starting row-wise deduplication...")
    t0 = time.time()
    embeddings_view = embeddings.view(np.void)
    _, unique_indices = np.unique(embeddings_view, axis=0, return_index=True)
    # np.unique orders by value; sorting the indices restores original row order
    unique_indices = np.sort(unique_indices)

    original_count = int(embeddings.shape[0])
    embeddings_dedup = embeddings[unique_indices]
    ids_dedup = ids[unique_indices]
    comments_dedup = [comments[i] for i in unique_indices]

    unique_count = int(embeddings_dedup.shape[0])
    duplicate_count = original_count - unique_count
    duplicate_percentage = (duplicate_count / original_count) * 100 if original_count else 0.0

    if duplicate_count > 0:
        print(f"[Dedup] Removed {duplicate_count} duplicates ({duplicate_percentage:.1f}%) in {time.time() - t0:.2f}s")
    else:
        print(f"[Dedup] No duplicates found ({time.time() - t0:.2f}s)")

    # Normalize vectors (improves cosine-like behavior for Euclidean KMeans)
    print("[Preprocess] L2-normalizing vectors...")
    t0 = time.time()
    embeddings_proc = l2_normalize(embeddings_dedup, norm="l2")
    print(f"[Preprocess] Normalized in {time.time() - t0:.2f}s")
    print(f"[Preprocess] Using full {embeddings_proc.shape[1]} dimensions (no PCA)")

    # Use the fixed k (no k selection needed - fastest option)
    selected_k = FIXED_K
    print(f"[Config] Using fixed k={selected_k} (no k selection overhead)")

    # Ensure we have enough data for k clusters
    if unique_count < selected_k:
        print(f"[Warning] Only {unique_count} unique embeddings, reducing k from {selected_k} to {unique_count}")
        selected_k = unique_count

    # Run clustering directly
    print(f"\n[Cluster] Running clustering with fixed k={selected_k}...")
    labels, centers, _kmeans = compute_cluster_members(
        embeddings_proc, num_clusters=selected_k, random_state=42
    )

    # Verify the number of clusters actually created. The label values that
    # really occur are kept, so that a populated cluster is never skipped even
    # if the labels turn out not to be the contiguous range 0..k-1.
    cluster_labels = [int(label) for label in np.unique(labels)]
    actual_clusters = len(cluster_labels)
    print(f"[Cluster] Created {actual_clusters} clusters (expected {selected_k})")
    if actual_clusters != selected_k:
        print(f"[Warning] Expected {selected_k} clusters but got {actual_clusters}")
        selected_k = actual_clusters

    # Save under <project>/deduplicated10, where <project> is the parent of the embeddings dir
    embeddings_dir = Path(npz_path).parent
    project_dir = embeddings_dir.parent
    out_dir_path = project_dir / "deduplicated10"
    out_dir_path.mkdir(parents=True, exist_ok=True)
    out_dir = str(out_dir_path)
    base_name = Path(npz_path).stem.replace("_embeddings", "")

    summary: Dict[str, Any] = {
        "npz_path": npz_path,
        "comments_json_path": comments_path,
        "num_points_original": int(original_count),
        "num_points_deduplicated": int(unique_count),
        "duplicates_removed": int(duplicate_count),
        "duplicate_percentage": float(duplicate_percentage),
        "embedding_dim": int(embeddings.shape[1]),
        "num_clusters": int(selected_k),
        "clustering_method": "fixed_k",
        "per_cluster_counts": {},
        "outputs": [],
    }

    def pack(indices: np.ndarray) -> List[Dict[str, Any]]:
        """Turn deduplicated row indices into JSON-serializable records."""
        return [
            {
                "index": int(idx),
                # NumPy scalars are not JSON-serializable; unwrap them
                "id": ids_dedup[idx].item() if hasattr(ids_dedup[idx], 'item') else ids_dedup[idx],
                "comment": comments_dedup[idx],
            }
            for idx in indices.tolist()
        ]

    print(f"\n[Output] Writing cluster results...")
    for k in cluster_labels:
        members, closest, farthest = select_nearest_and_farthest(
            embeddings_proc, labels, centers, k, NUM_CLOSEST, NUM_FARTHEST
        )

        print(f"[Cluster {k}] Members: {members.size}, Closest: {len(closest)}, Farthest: {len(farthest)}")

        # Prepare outputs
        closest_payload = pack(closest)
        farthest_payload = pack(farthest)

        # Write per-cluster files (the saved count is part of the filename)
        closest_path = os.path.join(
            out_dir, f"{base_name}_fixed_k{selected_k}_cluster_{k}_nearest_{len(closest_payload)}.json"
        )
        farthest_path = os.path.join(
            out_dir, f"{base_name}_fixed_k{selected_k}_cluster_{k}_farthest_{len(farthest_payload)}.json"
        )

        with open(closest_path, "w", encoding="utf-8") as f:
            json.dump(closest_payload, f, ensure_ascii=False, indent=2)
        with open(farthest_path, "w", encoding="utf-8") as f:
            json.dump(farthest_payload, f, ensure_ascii=False, indent=2)

        summary["per_cluster_counts"][str(k)] = {
            "members": int(members.size),
            "closest_saved": int(len(closest_payload)),
            "farthest_saved": int(len(farthest_payload)),
        }
        summary["outputs"].append({
            "cluster": k,
            "closest_path": closest_path,
            "farthest_path": farthest_path,
        })

    # Write summary
    summary_path = os.path.join(out_dir, f"{base_name}_fixed_k{selected_k}_summary.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print(f"✅ FIXED K={FIXED_K} CLUSTERING COMPLETE")
    print("=" * 60)
    print(f"📊 Clustered {unique_count:,} deduplicated embeddings into {selected_k} clusters")
    print(f"⚡ Used fixed k={selected_k} (no k selection overhead)")
    print(f"📁 Summary written to: {summary_path}")
    print(f"📂 Output directory: {out_dir}")
    for entry in summary["outputs"]:
        print(f"Cluster {entry['cluster']:>2}:\n  nearest -> {entry['closest_path']}\n  farthest -> {entry['farthest_path']}")


# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
