# Step 4: Correlate Category Embeddings

This notebook correlates category-level average embeddings between two embedding files (e.g., bv_clip and things_clip).

## Overview

This step:
1. Loads category average embeddings from two sources
2. Finds matching categories between the two sets
3. Computes correlations (Pearson, Spearman, Kendall) and cosine similarity for each category
4. Reports summary statistics and top/bottom categories

## Prerequisites

This step requires:
- Output from Step 2 (e.g., bv_clip and things_clip category average embeddings)

## Setup and Imports

In [14]:
# Numerical / tabular libraries.
# NOTE(review): pandas is not used by any cell visible in this notebook — confirm before removing.
import numpy as np
import pandas as pd
from pathlib import Path
# Per-category similarity measures used by the correlation cell.
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# Silences every Python warning for the rest of the kernel session, including
# the numerical warnings scipy/numpy would otherwise emit from the correlation cell.
warnings.filterwarnings('ignore')

print("All imports successful!")

All imports successful!


## Configuration

**Please update the paths below according to your setup:**

In [None]:
# ============================================================================
# CONFIGURATION - UPDATE THESE PATHS FOR YOUR SETUP
# ============================================================================

# Input .npz files holding the category embeddings to compare (see Prerequisites).
# The correlation cell reads an 'embeddings' array from each file plus a
# category-name array stored under 'categories', 'category', 'labels' or 'label'.
AVG_CAT_EMB_PATH1 = "./bv_dinov3_rdm_results_26/category_average_embeddings.npz"  # embeddings file 1 (.npz file, not a directory)
AVG_CAT_EMB_PATH2 = "../../data/things_dino_embeddings.npz"  # embeddings file 2 (.npz file, not a directory)

# Output location for the plain-text correlation report
OUTPUT_DIR = "./correlation_results_12102025"  # Directory to save correlation results (created if missing)
OUTPUT_FILENAME = 'bv_things_dino_category_embeddings_correlations.txt'  # Report filename inside OUTPUT_DIR

print("Configuration loaded. Please review and update paths as needed.")

Configuration loaded. Please review and update paths as needed.


## Correlate Category Embeddings

In [16]:
# Keys tried, in order, when looking for the category names inside an .npz file.
CATEGORY_KEY_CANDIDATES = ('categories', 'category', 'labels', 'label')
# Rank/linear correlations are only computed when at least this many finite
# dimensions remain after dropping NaN/Inf entries.
MIN_VALUES_FOR_CORRELATION = 3
# Number of categories listed in the "top" and "bottom" rankings.
TOP_N = 10
RULE = "=" * 60


def _load_category_embeddings(npz_path, label):
    """Load the embedding matrix and category names from one .npz file.

    Parameters
    ----------
    npz_path : Path
        Location of the .npz archive.
    label : int or str
        Identifier used only in the progress messages ("embeddings 1", ...).

    Returns
    -------
    (embeddings, categories) : (np.ndarray, list of str)
        The array stored under 'embeddings' and the category names converted
        to ``str``, taken from the first key in CATEGORY_KEY_CANDIDATES that
        exists in the archive.

    Raises
    ------
    KeyError
        If the archive has no 'embeddings' key or no category/label key.
    ValueError
        If the embeddings are not 2-D or the number of rows does not equal
        the number of category names (rows could not be matched to names).
    """
    print(f"Loading embeddings {label} from {npz_path}...")
    # np.load on an .npz returns an NpzFile that holds the file open; the
    # context manager closes it once the arrays have been read into memory.
    with np.load(npz_path) as data:
        available_keys = list(data.keys())
        if 'embeddings' not in available_keys:
            raise KeyError(f"No 'embeddings' key found in {npz_path}. Available keys: {available_keys}")
        embeddings = data['embeddings']

        categories = None
        for key_name in CATEGORY_KEY_CANDIDATES:
            if key_name in available_keys:
                categories = [str(cat) for cat in data[key_name]]
                print(f"  Found categories under key '{key_name}'")
                break

    if categories is None:
        raise KeyError(f"No category/label key found in {npz_path}. Available keys: {available_keys}")
    if embeddings.ndim != 2:
        raise ValueError(
            f"Expected a 2-D embeddings array in {npz_path}, got shape {embeddings.shape}"
        )
    if len(categories) != embeddings.shape[0]:
        raise ValueError(
            f"{npz_path}: {len(categories)} category names but {embeddings.shape[0]} embedding rows; "
            "rows cannot be matched to categories"
        )

    # Duplicate names are not an error, but only one row per name can be used.
    n_duplicates = len(categories) - len(set(categories))
    if n_duplicates:
        print(f"  Warning: {n_duplicates} duplicate category name(s); the last row of each duplicated name is used")

    print(f"  Categories: {len(categories)}, Embedding dim: {embeddings.shape[1]}")
    return embeddings, categories


def _correlate_vectors(vec1, vec2):
    """Compare two embedding vectors dimension by dimension.

    Positions where either vector is NaN/Inf are dropped first. Returns a dict
    with 'pearson_r', 'spearman_r', 'kendall_r' and 'cosine_similarity';
    entries are NaN when too few finite dimensions remain.
    """
    finite = np.isfinite(vec1) & np.isfinite(vec2)
    clean1 = vec1[finite]
    clean2 = vec2[finite]

    if len(clean1) >= MIN_VALUES_FOR_CORRELATION:
        pearson_r, _ = pearsonr(clean1, clean2)
        spearman_r, _ = spearmanr(clean1, clean2)
        kendall_r, _ = kendalltau(clean1, clean2)
    else:
        pearson_r = spearman_r = kendall_r = np.nan

    if len(clean1) > 0:
        cosine_sim = cosine_similarity(clean1.reshape(1, -1), clean2.reshape(1, -1))[0, 0]
    else:
        cosine_sim = np.nan

    return {
        'pearson_r': pearson_r,
        'spearman_r': spearman_r,
        'kendall_r': kendall_r,
        'cosine_similarity': cosine_sim,
    }


def _summary_stat_lines(title, values):
    """Return the report lines (title, mean, std, median, min, max) for one metric.

    An empty ``values`` list yields NaN for every statistic instead of raising,
    since np.nanmin/np.nanmax fail on zero-size input.
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        mean = std = median = minimum = maximum = np.nan
    else:
        mean = np.nanmean(values)
        std = np.nanstd(values)
        median = np.nanmedian(values)
        minimum = np.nanmin(values)
        maximum = np.nanmax(values)
    return [
        f"{title}:",
        f"  Mean:   {mean:.6f}",
        f"  Std:    {std:.6f}",
        f"  Median: {median:.6f}",
        f"  Min:    {minimum:.6f}",
        f"  Max:    {maximum:.6f}",
    ]


def _format_ranked_row(rank, result, detailed):
    """Format one ranking line; ``detailed`` adds Spearman and Kendall (file report)."""
    prefix = f"  {rank:2d}. {result['category']:<30} "
    if detailed:
        return prefix + (
            f"pearson_r={result['pearson_r']:.6f}, spearman_r={result['spearman_r']:.6f}, "
            f"kendall_r={result['kendall_r']:.6f}, cosine={result['cosine_similarity']:.6f}"
        )
    return prefix + f"r={result['pearson_r']:.6f}, cos={result['cosine_similarity']:.6f}"


print(RULE)
print("CORRELATING CATEGORY EMBEDDINGS")
print(RULE)

embeddings1_path = Path(AVG_CAT_EMB_PATH1)
embeddings2_path = Path(AVG_CAT_EMB_PATH2)

# Report every missing input at once, naming the config variable to fix.
missing_inputs = [
    (config_name, input_path)
    for config_name, input_path in (
        ("AVG_CAT_EMB_PATH1", embeddings1_path),
        ("AVG_CAT_EMB_PATH2", embeddings2_path),
    )
    if not input_path.exists()
]

if missing_inputs:
    for config_name, input_path in missing_inputs:
        print(
            f"Error: {input_path} not found. Check {config_name} in the configuration cell "
            "and make sure Step 2 has been run."
        )
else:
    embeddings1, categories1 = _load_category_embeddings(embeddings1_path, 1)
    embeddings2, categories2 = _load_category_embeddings(embeddings2_path, 2)

    # Check embedding dimensions
    # NOTE(review): truncating to the shared leading dimensions only makes sense if
    # both files come from the same embedding space — confirm before trusting results.
    if embeddings1.shape[1] != embeddings2.shape[1]:
        print(f"Warning: Embedding dimensions differ: {embeddings1.shape[1]} vs {embeddings2.shape[1]}")
        min_dim = min(embeddings1.shape[1], embeddings2.shape[1])
        embeddings1 = embeddings1[:, :min_dim]
        embeddings2 = embeddings2[:, :min_dim]
        print(f"  Using first {min_dim} dimensions")

    # Find matching categories (sorted for a deterministic processing order)
    categories1_set = set(categories1)
    categories2_set = set(categories2)
    matching_categories = sorted(categories1_set & categories2_set)

    print(f"\nMatching categories: {len(matching_categories)}")

    if len(matching_categories) == 0:
        print("Error: No matching categories found!")
    else:
        # Category name -> row index; for duplicated names the last row wins.
        cat_to_idx1 = {cat: idx for idx, cat in enumerate(categories1)}
        cat_to_idx2 = {cat: idx for idx, cat in enumerate(categories2)}

        # One result dict per matching category.
        per_category_results = []
        for cat in matching_categories:
            scores = _correlate_vectors(embeddings1[cat_to_idx1[cat]], embeddings2[cat_to_idx2[cat]])
            per_category_results.append({'category': cat, **scores})

        # Per-metric value lists with undefined (NaN) scores left out.
        all_pearson_rs = [r['pearson_r'] for r in per_category_results if not np.isnan(r['pearson_r'])]
        all_spearman_rs = [r['spearman_r'] for r in per_category_results if not np.isnan(r['spearman_r'])]
        all_kendall_rs = [r['kendall_r'] for r in per_category_results if not np.isnan(r['kendall_r'])]
        all_cosine_sims = [r['cosine_similarity'] for r in per_category_results if not np.isnan(r['cosine_similarity'])]

        # The same summary blocks feed both the console output and the saved report.
        summary_blocks = [
            _summary_stat_lines("Pearson Correlation", all_pearson_rs),
            _summary_stat_lines("Spearman Correlation", all_spearman_rs),
            _summary_stat_lines("Kendall Correlation", all_kendall_rs),
            _summary_stat_lines("Cosine Similarity", all_cosine_sims),
        ]

        # Summary statistics
        print("\n" + RULE)
        print("SUMMARY STATISTICS")
        print(RULE)
        print(f"Categories analyzed: {len(matching_categories)}")
        for block in summary_blocks:
            print("\n" + "\n".join(block))

        # Top and bottom categories; categories with undefined Pearson r sort last.
        sorted_results = sorted(
            per_category_results,
            key=lambda x: x['pearson_r'] if not np.isnan(x['pearson_r']) else -np.inf,
            reverse=True,
        )
        top_ranked = list(enumerate(sorted_results[:TOP_N], 1))
        bottom_slice = sorted_results[-TOP_N:]
        # Start rank is derived from the actual slice length so ranks stay
        # positive and correct when fewer than TOP_N categories matched.
        bottom_ranked = list(enumerate(bottom_slice, len(sorted_results) - len(bottom_slice) + 1))

        print(f"\n\nTop {TOP_N} categories by Pearson correlation:")
        for rank, result in top_ranked:
            print(_format_ranked_row(rank, result, detailed=False))

        print(f"\nBottom {TOP_N} categories by Pearson correlation:")
        for rank, result in bottom_ranked:
            print(_format_ranked_row(rank, result, detailed=False))

        # Save results to output directory
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        results_file = output_dir / OUTPUT_FILENAME

        report_lines = [
            RULE,
            "CATEGORY EMBEDDING CORRELATION RESULTS",
            RULE,
            "",
            f"Embeddings 1: {embeddings1_path}",
            f"Embeddings 2: {embeddings2_path}",
            "",
            f"Categories 1: {len(categories1)}, Embedding dim: {embeddings1.shape[1]}",
            f"Categories 2: {len(categories2)}, Embedding dim: {embeddings2.shape[1]}",
            f"Matching categories: {len(matching_categories)}",
            "",
            RULE,
            "SUMMARY STATISTICS",
            RULE,
            f"Categories analyzed: {len(matching_categories)}",
            "",
        ]
        for block in summary_blocks:
            report_lines.extend(block)
            report_lines.append("")
        report_lines += [RULE, f"TOP {TOP_N} CATEGORIES BY PEARSON CORRELATION", RULE]
        report_lines += [_format_ranked_row(rank, result, detailed=True) for rank, result in top_ranked]
        report_lines += ["", RULE, f"BOTTOM {TOP_N} CATEGORIES BY PEARSON CORRELATION", RULE]
        report_lines += [_format_ranked_row(rank, result, detailed=True) for rank, result in bottom_ranked]

        # Explicit encoding so non-ASCII category names are written consistently
        # regardless of the platform default.
        with open(results_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")

        print(f"\nResults saved to: {results_file}")


CORRELATING CATEGORY EMBEDDINGS
Error: bv_dino_rdm_results_26/category_average_embeddings.npz not found. Please run Step 1 first.
