The code below was run on node14-ccn2cluster.stanford.edu.

In [None]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict
from vislearnlabpy.embeddings.embedding_store import EmbeddingStore

Extract THINGS CLIP embeddings

In [None]:
# Build the EmbeddingStore from a saved doc file, given as an absolute path.
# NOTE(review): the file name suggests normalized, filtered CLIP image
# embeddings — confirm against how this .docs file was produced.
embedding_store = EmbeddingStore.from_doc("/ccn2/dataset/babyview/outputs_20250312/things_bv_overlapping_categories_corrected/embeddings/image_embeddings/clip_image_embeddings_doc_normalized_filtered-by-clip-26.docs")

def process_embeddings(embedding_store, output_prefix='things_clip_embeddings'):
    """
    Average embeddings by text class and export to NPZ and CSV.

    Items are grouped by their ``text`` attribute and the ``embedding``
    vectors of each group are averaged element-wise. The result is written
    to the current working directory as ``<output_prefix>.npz`` (arrays
    ``embeddings`` and ``labels``) and ``<output_prefix>.csv`` (a ``text``
    column followed by one column per embedding dimension). Rows follow the
    order in which each text is first seen.

    Args:
        embedding_store: Iterable of items exposing ``text`` and
            ``embedding`` attributes. It is traversed exactly once, so a
            generator or any other non-sliceable iterable is accepted.
        output_prefix: Base name (without extension) of the two output
            files. The default reproduces the original file names.

    Returns:
        A tuple ``(averaged_embeddings, num_classes)`` where
        ``averaged_embeddings`` maps each text to its mean float32 embedding
        and ``num_classes`` is the number of distinct texts.

    Raises:
        ValueError: If the embeddings within one class do not share a shape.
    """
    preview_size = 5
    preview_texts = []
    skipped_count = 0
    text_to_embeddings = defaultdict(list)

    # Single pass: gather the preview and the groups together so the input
    # only needs to support iteration (no slicing, no second traversal).
    for item in embedding_store:
        if len(preview_texts) < preview_size:
            preview_texts.append(item.text)
        if item.embedding is None:
            # np.array(None, dtype=float32) would silently become a NaN
            # scalar and corrupt the class average, so skip such items.
            skipped_count += 1
            continue
        vector = np.asarray(item.embedding, dtype=np.float32)
        text_to_embeddings[item.text].append(vector)

    print(preview_texts)  # Print first 5 texts for verification
    if skipped_count:
        print(f"Skipped {skipped_count} item(s) with no embedding")

    # Average embeddings for each unique text; np.stack rejects mismatched
    # shapes instead of building a ragged object array.
    averaged_embeddings = {
        text: np.mean(np.stack(vectors), axis=0)
        for text, vectors in text_to_embeddings.items()
    }
    class_names = list(averaged_embeddings)
    embeddings_matrix = np.array(list(averaged_embeddings.values()))

    # Print number of classes
    num_classes = len(class_names)
    print(f"Number of unique classes: {num_classes}")

    # Save to NPZ file
    npz_name = f"{output_prefix}.npz"
    np.savez(npz_name,
             embeddings=embeddings_matrix,
             labels=np.array(class_names))
    print(f"Saved {npz_name}")

    # Save to CSV file
    csv_name = f"{output_prefix}.csv"
    df = pd.DataFrame(embeddings_matrix)
    df.insert(0, 'text', class_names)
    df.to_csv(csv_name, index=False)
    print(f"Saved {csv_name}")

    return averaged_embeddings, num_classes

# Run the per-text averaging on the store's EmbeddingList; as a side effect this
# writes things_clip_embeddings.npz and things_clip_embeddings.csv to the
# current working directory.
averaged_embeddings, num_classes = process_embeddings(embedding_store.EmbeddingList)

Extract THINGS DINO embeddings

In [None]:
# Root directory passed to process_folder_embeddings, which treats each
# immediate sub-folder as one class and each file inside it as one embedding.
dino_things_path = "/ccn2/dataset/babyview/outputs_20250312/image_embeddings/things_bv_overlapping_categories_corrected/facebook_dinov3-vitb16-pretrain-lvd1689m"
# Process embeddings from folders
def process_folder_embeddings(dino_things_path, output_prefix='things_dino_embeddings'):
    """
    Average embeddings from each folder and export to NPZ and CSV.

    Every immediate sub-folder of ``dino_things_path`` is treated as one
    class labelled with the folder name. Each file in a sub-folder is loaded
    with ``np.load`` and the loaded arrays are averaged element-wise.
    Folders and files are visited in sorted name order because
    ``os.listdir`` returns entries in arbitrary order; sorting makes the row
    order of the saved files reproducible across runs and filesystems.

    The result is written to the current working directory as
    ``<output_prefix>.npz`` (arrays ``embeddings`` and ``labels``) and
    ``<output_prefix>.csv`` (a ``label`` column followed by one column per
    embedding dimension).

    Args:
        dino_things_path: Directory whose sub-folders each hold the
            embedding files of one class. Non-directory entries are ignored.
        output_prefix: Base name (without extension) of the two output
            files. The default reproduces the original file names.

    Returns:
        A tuple ``(embeddings_matrix, labels_array, num_classes)``: the
        stacked per-folder mean embeddings, the matching folder names, and
        the number of folders that contributed at least one embedding.
    """
    all_embeddings = []
    all_labels = []

    for folder in sorted(os.listdir(dino_things_path)):
        folder_path = os.path.join(dino_things_path, folder)
        if not os.path.isdir(folder_path):
            continue

        folder_embeddings = []
        for file_name in sorted(os.listdir(folder_path)):
            embedding_file_path = os.path.join(folder_path, file_name)
            try:
                embedding = np.load(embedding_file_path)
            except Exception as e:
                # Deliberately broad: loading is best-effort, so an
                # unreadable file is reported and skipped, not fatal.
                print(f"Error loading {embedding_file_path}: {e}")
            else:
                folder_embeddings.append(embedding)

        # A folder with no loadable embeddings contributes no class.
        if not folder_embeddings:
            continue

        avg_embedding = np.mean(np.array(folder_embeddings), axis=0)
        print(f"Processed folder: {folder}, Average embedding shape: {avg_embedding.shape}, Total embeddings: {len(folder_embeddings)}")

        all_embeddings.append(avg_embedding)
        all_labels.append(folder)

    # Convert to numpy arrays
    embeddings_matrix = np.array(all_embeddings)
    labels_array = np.array(all_labels)

    # Print number of classes
    num_classes = len(all_labels)
    print(f"\nNumber of unique classes: {num_classes}")

    # Save to NPZ file
    npz_name = f"{output_prefix}.npz"
    np.savez(npz_name,
             embeddings=embeddings_matrix,
             labels=labels_array)
    print(f"Saved {npz_name}")

    # Save to CSV file
    csv_name = f"{output_prefix}.csv"
    df = pd.DataFrame(embeddings_matrix)
    df.insert(0, 'label', all_labels)
    df.to_csv(csv_name, index=False)
    print(f"Saved {csv_name}")

    return embeddings_matrix, labels_array, num_classes

# Run the per-folder averaging; as a side effect this writes
# things_dino_embeddings.npz and things_dino_embeddings.csv to the current
# working directory. Note that num_classes is rebound here, replacing the value
# assigned by the earlier process_embeddings call.
embeddings_matrix, labels_array, num_classes = process_folder_embeddings(dino_things_path)