In [None]:
import sys
import os
import traceback
import time
import numpy as np
from collections import defaultdict
import h5py
import json
import torch
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.multiprocessing as mp
import warnings
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Silence every warning category for the whole process so library warnings do
# not clutter the progress output. NOTE(review): this also hides deprecation
# warnings, which can make version-related problems harder to spot.
warnings.filterwarnings("ignore")

def check_gpu_availability():
    """Probe CUDA and report which device string to use.

    Returns:
        ``"cuda"`` when CUDA is reported as available and a small test tensor
        can be placed on the GPU; ``"cpu"`` when CUDA is unavailable or the
        probe raises.
    """
    try:
        if not torch.cuda.is_available():
            print("CUDA is not available")
            return "cpu"

        gpu_total = torch.cuda.device_count()
        print(f"CUDA is available with {gpu_total} GPU(s)")

        # Allocate (and immediately release) a tiny tensor to prove that the
        # GPU is actually usable, not merely visible.
        probe = torch.tensor([1.0]).cuda()
        del probe
        torch.cuda.empty_cache()
        return "cuda"
    except Exception as e:
        print(f"GPU test failed: {e}")
        print("Falling back to CPU")
        return "cpu"

# Run configuration (module-level constants read by the functions below).
BATCH_SIZE = 1  # DataLoader batch size; original note says Qwen2.5-VL needs batch size 1 for vision inputs — TODO confirm
PART_SIZE = 250  # Number of images written to each output HDF5 part file
NUM_WORKERS = 0  # DataLoader worker processes; 0 keeps loading in the main process
IMAGE_DIR = "/root/akhil_workspace/coco_val2014"  # Directory scanned for *.jpg inputs
OUTPUT_DIR = "/root/akhil_workspace/qwen_model/extracted_embeddings_coco2014"  # Destination of the part files
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"  # Can also use "Qwen/Qwen2.5-VL-3B-Instruct"

# Test mode: when True only the first 2 image IDs are processed and the run
# stops after the first part file.
TEST_MODE = True  # Set to False for full processing

# Global model handles; main() assigns them and the extraction helpers read
# them as module globals.
processor = None
model = None
device = None

def get_model_architecture_info(model):
    """Collect layer-count and hidden-size facts from a loaded model.

    Args:
        model: Object that may expose ``model.config`` (language side) and
            ``visual`` / ``visual.config`` (vision side).

    Returns:
        A dict that always contains ``has_vision_tower``. ``language_layers``
        and ``language_hidden_size`` are present when ``model.model.config``
        exists; ``vision_layers`` and ``vision_hidden_size`` are present when
        ``model.visual.config`` exists. Missing config attributes are stored
        as ``None``.
    """
    info = {}

    # Language side: read from model.model.config when it is reachable.
    language_model = getattr(model, 'model', None)
    if hasattr(language_model, 'config'):
        lm_config = language_model.config
        info['language_layers'] = getattr(lm_config, 'num_hidden_layers', None)
        info['language_hidden_size'] = getattr(lm_config, 'hidden_size', None)

    # Vision side: the mere presence of a `visual` attribute counts as a tower.
    info['has_vision_tower'] = hasattr(model, 'visual')
    if info['has_vision_tower'] and hasattr(model.visual, 'config'):
        vis_config = model.visual.config
        info['vision_layers'] = getattr(vis_config, 'num_hidden_layers', None)
        info['vision_hidden_size'] = getattr(vis_config, 'hidden_size', None)

    return info

def get_layer_indices(total_layers):
    """Pick the layer indices 0, n//4, n//2, 3n//4 and n-1.

    Args:
        total_layers: Number of layers ``n``.

    Returns:
        A sorted list of unique indices. When ``n`` is below 5, every index
        in ``range(n)`` is returned instead.
    """
    if total_layers < 5:
        return list(range(total_layers))

    # A set collapses any coinciding quarter points before sorting.
    quarter_points = {
        0,
        total_layers // 4,
        total_layers // 2,
        (3 * total_layers) // 4,
        total_layers - 1,
    }
    return sorted(quarter_points)

def detect_and_configure_layers(model):
    """Inspect the model and decide which layers to sample.

    Args:
        model: Loaded model passed on to ``get_model_architecture_info``.

    Returns:
        A dict with the keys ``model_architecture``, ``total_layers``,
        ``selected_layers``, ``has_vision_tower`` and ``model_name``.
    """
    print("🔍 Detecting Qwen2.5-VL model architecture...")

    arch_info = get_model_architecture_info(model)
    print(f"Architecture info: {arch_info}")

    layer_total = arch_info.get('language_layers')
    if layer_total is None:
        # The layer count could not be read from the config; fall back to a
        # hard-coded default chosen from the model name.
        layer_total = 36 if "3B" in MODEL_NAME else 28
        print(f"Could not determine layer count. Using default: {layer_total}")

    layer_picks = get_layer_indices(layer_total)
    vision_tower_present = arch_info.get('has_vision_tower', False)

    config = {
        'model_architecture': arch_info,
        'total_layers': layer_total,
        'selected_layers': layer_picks,
        'has_vision_tower': vision_tower_present,
        'model_name': MODEL_NAME
    }

    print(f"📋 Configuration:")
    print(f"   Total language layers: {layer_total}")
    print(f"   Selected layers: {layer_picks}")
    print(f"   Has vision tower: {config['has_vision_tower']}")

    return config

def load_image_ids():
    """List the image IDs (file names without extension) found in IMAGE_DIR.

    Only entries whose name ends with ``.jpg`` are considered. In TEST_MODE
    the sorted list is cut down to its first two entries.

    Returns:
        A sorted list of ID strings, or an empty list when the directory is
        missing or listing it raises.
    """
    try:
        if not os.path.exists(IMAGE_DIR):
            print(f"Error: Image directory {IMAGE_DIR} not found")
            return []

        stems = []
        for entry in os.listdir(IMAGE_DIR):
            if entry.endswith('.jpg'):
                stems.append(os.path.splitext(entry)[0])
        stems.sort()

        image_ids = stems
        if TEST_MODE:
            image_ids = stems[:2]
            print(f"🧪 TEST MODE: Processing only {len(image_ids)} images")

        print(f"Found {len(image_ids)} images in {IMAGE_DIR}")
        return image_ids
    except Exception as e:
        print(f"Error loading image IDs: {e}")
        return []

def safe_tensor_to_numpy(tensor, name="tensor"):
    """Convert a tensor into a nested Python list via NumPy.

    Despite the name, the return value is a list (``ndarray.tolist()``), not
    an ndarray.

    Args:
        tensor: Tensor to convert; it is moved to the CPU first.
        name: Label used in the error message if conversion fails.

    Returns:
        The tensor contents as a (nested) list. Half-precision inputs
        (bfloat16 / float16) are upcast to float32 before conversion. If the
        conversion raises, an error is printed and a list of zeros with the
        same shape is returned instead.
    """
    try:
        host_tensor = tensor.cpu()

        # Half-precision tensors are upcast before the NumPy conversion.
        if host_tensor.dtype in (torch.bfloat16, torch.float16):
            host_tensor = host_tensor.to(torch.float32)

        as_array = host_tensor.numpy()
        return as_array.tolist()

    except Exception as e:
        print(f"Error converting {name} to numpy: {e}")
        placeholder = torch.zeros_like(tensor).float()
        return placeholder.cpu().numpy().tolist()

def find_token_positions(input_ids, processor, vision_token_count):
    """Choose the "after image" and "end of query" token positions.

    The positions are derived purely from the sequence length and the
    supplied vision-token count; the token IDs themselves are not inspected.
    NOTE(review): this presumes the image tokens sit at the start of the
    sequence — confirm against the chat template actually in use.

    Args:
        input_ids: Batched token IDs; only the length of ``input_ids[0]`` is
            used.
        processor: Unused; kept so existing callers keep working.
        vision_token_count: Estimated number of vision tokens.

    Returns:
        ``(image_end_pos, query_end_pos)``. Both are valid indices into the
        sequence. For sequences of three or more tokens ``image_end_pos``
        lies in ``[1, len - 2]`` and ``query_end_pos`` is ``len - 1``.

    Raises:
        ValueError: If the sequence is empty, since no valid index exists.
    """
    # len() works without copying the ids to the host.
    sequence_length = len(input_ids[0])
    if sequence_length == 0:
        raise ValueError("Cannot locate token positions in an empty sequence")

    last_index = sequence_length - 1

    # Query end is the last token before generation.
    query_end_pos = last_index

    # Keep the image end at least 10 tokens away from the sequence end, then
    # force it into [1, len - 2].
    image_end_pos = min(vision_token_count, sequence_length - 10)
    image_end_pos = max(1, min(image_end_pos, sequence_length - 2))

    # For sequences shorter than 3 tokens the range above is empty; clamp so
    # the result is still a valid index instead of pointing past the end.
    image_end_pos = max(0, min(image_end_pos, last_index - 1))

    return image_end_pos, query_end_pos

def extract_vision_embeddings_from_sequence(hidden_states, vision_token_count):
    """Mean-pool the leading tokens of the last hidden-state entry.

    Args:
        hidden_states: Indexable collection of hidden-state tensors; only the
            last entry is used (assumes shape (batch, seq, hidden) — TODO
            confirm).
        vision_token_count: Number of leading tokens to pool over.

    Returns:
        The pooled vector as a list, or ``None`` if anything raises.
    """
    try:
        last_layer_states = hidden_states[-1]

        # First batch element, first `vision_token_count` positions, averaged
        # over the token axis.
        pooled = last_layer_states[0, :vision_token_count, :].mean(dim=0)

        return safe_tensor_to_numpy(pooled, "vision_embeddings_from_sequence")

    except Exception as e:
        print(f"Warning: Could not extract vision embeddings from sequence: {e}")
        return None

def estimate_vision_token_count(input_ids, sequence_length):
    """Heuristically estimate how many vision tokens a sequence contains.

    The estimate is one quarter of the sequence length, raised to at least
    200 and capped at 800, and never larger than the sequence itself.
    NOTE(review): this is a guess based only on the length; the token IDs are
    not inspected, so the real number of image tokens may differ.

    Args:
        input_ids: Unused; kept so existing callers keep working.
        sequence_length: Total number of tokens in the sequence.

    Returns:
        The estimated vision-token count, in ``[0, sequence_length]``.
    """
    # max(200, n // 4) is a floor of 200, not a ceiling of n // 4; the hard
    # cap is 800.
    heuristic = min(max(200, sequence_length // 4), 800)

    # The floor of 200 used to be returned even for sequences shorter than
    # 200 tokens; an estimate cannot exceed the sequence length.
    return max(0, min(heuristic, sequence_length))

def process_image_with_qwen2_5vl(image_path):
    """Load one image and build the model inputs for a describe-image prompt.

    Reads the module-level ``processor`` and ``model`` globals.

    Args:
        image_path: Path of the image file to open.

    Returns:
        ``(inputs, text, messages)`` where ``inputs`` is the processor output
        moved to ``model.device``, ``text`` is the rendered chat template and
        ``messages`` is the chat message list. On any error the exception is
        printed and ``(None, None, None)`` is returned.
    """
    try:
        # convert() yields a new, fully loaded image, so the source file can
        # be closed as soon as the conversion is done.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Create messages
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]

        # Render the prompt text; the generation prompt is appended so the
        # sequence ends where the model's answer would begin.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        inputs = inputs.to(model.device)

        return inputs, text, messages

    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None, None, None

def extract_targeted_embeddings(image_id, image_path, selected_layers):
    """Extract the targeted embeddings and a reference caption for one image.

    Runs one forward pass with ``output_hidden_states=True`` and, from the
    returned hidden states, collects (1) a mean-pooled "vision" vector taken
    from the last hidden-state entry and (2) the hidden vectors at two token
    positions for every requested layer. A separate ``model.generate`` call
    (``max_new_tokens=128``) then produces a caption.

    Args:
        image_id: Identifier copied into the result and used in log messages.
        image_path: Path handed to ``process_image_with_qwen2_5vl``.
        selected_layers: Indices into the ``hidden_states`` tuple to sample.

    Returns:
        A dict with the keys ``image_id``, ``generated_caption``,
        ``vision_embeddings``, ``pre_generation`` and ``token_positions``, or
        ``None`` when preprocessing fails or any exception is raised.
    """
    # Only `model` and `processor` are read below; `device` is declared but
    # not used in this function.
    global model, processor, device

    try:
        # Process image (`text` and `messages` are returned but not used here)
        inputs, text, messages = process_image_with_qwen2_5vl(image_path)
        if inputs is None:
            return None

        embeddings_data = {}

        # Forward pass to get all hidden states
        with torch.no_grad():
            model_outputs = model(**inputs, output_hidden_states=True)

            # If no hidden states come back, embeddings_data stays empty and
            # the result below falls back to None / {} for the embedding keys.
            if hasattr(model_outputs, 'hidden_states') and model_outputs.hidden_states:
                hidden_states = model_outputs.hidden_states
                # NOTE(review): Hugging Face models usually return the
                # embedding output as entry 0, giving num_layers + 1 entries —
                # confirm that `selected_layers` is meant to index this tuple.
                total_available_layers = len(hidden_states)

                # Get sequence info
                sequence_length = len(inputs['input_ids'][0])

                # Estimate vision token count (length-based heuristic; the
                # token IDs are not inspected)
                vision_token_count = estimate_vision_token_count(inputs['input_ids'], sequence_length)

                # 1. Extract Vision Embeddings from the sequence
                vision_embeddings = extract_vision_embeddings_from_sequence(hidden_states, vision_token_count)
                embeddings_data['vision_embeddings'] = vision_embeddings

                # 2. Extract Pre-generation Embeddings at Specific Layers and Positions
                # Validate selected layers (only the upper bound is checked;
                # negative indices pass through)
                valid_layers = [layer for layer in selected_layers if layer < total_available_layers]
                if len(valid_layers) != len(selected_layers):
                    invalid_layers = [layer for layer in selected_layers if layer >= total_available_layers]
                    print(f"   ⚠️ Skipping invalid layers {invalid_layers} (max: {total_available_layers-1})")

                # Find critical token positions
                image_end_pos, query_end_pos = find_token_positions(inputs['input_ids'], processor, vision_token_count)

                # Extract embeddings from selected layers at critical positions
                pre_generation_data = {}

                for layer_idx in valid_layers:
                    layer_hidden = hidden_states[layer_idx]

                    # Extract embeddings at the two critical positions
                    # (first batch element only)
                    after_image_emb = layer_hidden[0, image_end_pos, :]
                    end_query_emb = layer_hidden[0, query_end_pos, :]

                    pre_generation_data[f'layer_{layer_idx}'] = {
                        'after_image_embeddings': safe_tensor_to_numpy(after_image_emb, f"layer_{layer_idx}_after_image"),
                        'end_query_embeddings': safe_tensor_to_numpy(end_query_emb, f"layer_{layer_idx}_end_query")
                    }

                embeddings_data['pre_generation'] = pre_generation_data

                # Store position info for reference
                embeddings_data['token_positions'] = {
                    'image_end_position': int(image_end_pos),
                    'query_end_position': int(query_end_pos),
                    'sequence_length': int(sequence_length),
                    'estimated_vision_tokens': int(vision_token_count)
                }

        # Generate caption for reference
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=128)
            # Drop the prompt tokens so that only newly generated tokens are decoded
            generated_ids_trimmed = [
                out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            generated_caption = output_text[0] if output_text else ""

        # Clear GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return {
            'image_id': image_id,
            'generated_caption': generated_caption,
            'vision_embeddings': embeddings_data.get('vision_embeddings'),
            'pre_generation': embeddings_data.get('pre_generation', {}),
            'token_positions': embeddings_data.get('token_positions', {})
        }

    except Exception as e:
        # Best effort: the failure is logged and the image is skipped by the caller.
        print(f"Error extracting embeddings for {image_id}: {str(e)}")
        return None

def write_embeddings_hdf5(f, image_id, result):
    """Store one image's extraction result as a group in an open HDF5 file.

    Layout written under ``f[image_id]``: ``image_id`` and
    ``generated_caption`` (UTF-8 bytes), an optional ``token_positions``
    group, an optional ``vision_embeddings`` dataset, and a
    ``pre_generation`` group with one sub-group per layer.

    Args:
        f: Open, writable HDF5 file (or group) object.
        image_id: Name of the group to create.
        result: Dict as returned by ``extract_targeted_embeddings``.

    Returns:
        ``True`` on success; ``False`` (after printing the error) if any
        step raises.
    """
    array_storage = {'chunks': True, 'compression': 'gzip'}

    try:
        image_group = f.create_group(image_id)

        # Basic metadata, stored as UTF-8 encoded bytes.
        for text_field in ('image_id', 'generated_caption'):
            image_group.create_dataset(text_field, data=result[text_field].encode('utf-8'))

        # Token positions, one scalar dataset per entry.
        positions = result['token_positions']
        if positions:
            positions_group = image_group.create_group('token_positions')
            for field, number in positions.items():
                positions_group.create_dataset(field, data=number)

        # Pooled vision vector.
        pooled_vision = result['vision_embeddings']
        if pooled_vision is not None:
            image_group.create_dataset(
                'vision_embeddings',
                data=np.array(pooled_vision, dtype=np.float32),
                **array_storage,
            )

        # Per-layer embeddings at the two sampled positions.
        layers_group = image_group.create_group('pre_generation')
        for layer_name, layer_data in result['pre_generation'].items():
            layer_group = layers_group.create_group(layer_name)
            for field in ('after_image_embeddings', 'end_query_embeddings'):
                if field in layer_data:
                    layer_group.create_dataset(
                        field,
                        data=np.array(layer_data[field], dtype=np.float32),
                        **array_storage,
                    )

        return True
    except Exception as e:
        print(f"Error writing embeddings for image {image_id}: {str(e)}")
        return False

class ImageDataset(Dataset):
    """Dataset yielding ``(image_id, image_path)`` pairs for JPEG files.

    No image data is loaded here; each item only resolves the expected file
    path and reports whether the file exists.
    """

    def __init__(self, image_ids, image_dir):
        """Remember the IDs to serve and the directory that holds the files."""
        self.image_dir = image_dir
        self.image_ids = image_ids

    def __len__(self):
        """Return the number of image IDs."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return ``(image_id, path)``; ``path`` is ``None`` if the file is missing."""
        image_id = self.image_ids[idx]
        candidate = os.path.join(self.image_dir, f"{image_id}.jpg")
        return image_id, (candidate if os.path.exists(candidate) else None)

def collate_fn(batch):
    """Split a batch of ``(image_id, image_path)`` pairs into two lists.

    Pairs whose path is ``None`` are dropped.

    Returns:
        ``(image_ids, image_paths)`` — two parallel lists, both empty when no
        pair has a path.
    """
    kept_ids = []
    kept_paths = []

    for img_id, img_path in batch:
        if img_path is None:
            continue
        kept_ids.append(img_id)
        kept_paths.append(img_path)

    return kept_ids, kept_paths

def process_batch(batch_data, selected_layers):
    """Run embedding extraction for every image in a collated batch.

    Args:
        batch_data: ``(image_ids, image_paths)`` as produced by ``collate_fn``.
        selected_layers: Layer indices forwarded to
            ``extract_targeted_embeddings``.

    Returns:
        A list of ``(image_id, result)`` tuples for the images that yielded a
        truthy result; images that fail or raise are logged and left out.
    """
    ids, paths = batch_data
    collected = []

    if not (ids and paths):
        return collected

    for current_id, current_path in zip(ids, paths):
        try:
            extracted = extract_targeted_embeddings(current_id, current_path, selected_layers)
        except Exception as e:
            print(f"Error processing image {current_id}: {e}")
        else:
            if extracted:
                collected.append((current_id, extracted))

    return collected

def main():
    """Run the full extraction pipeline and write the HDF5 part files.

    Steps: probe the GPU, load the model and processor into the module-level
    globals, choose the layers to sample, list the image IDs, then process
    the images in parts of ``PART_SIZE`` and write one HDF5 file per part
    into ``OUTPUT_DIR``. In TEST_MODE the run stops after the first part.

    Raises:
        Exception: Any error that escapes the per-image / per-batch handlers
            is printed with its traceback and re-raised.
    """
    global processor, model, device

    try:
        print("🚀 Starting Qwen2.5-VL Targeted Embeddings Extraction")
        print("=" * 60)
        print(f"📊 Settings: {PART_SIZE} images per H5 file")
        if TEST_MODE:
            print("🧪 RUNNING IN TEST MODE - Processing only 2 images")
        print("=" * 60)

        # Check GPU and load model
        device = check_gpu_availability()

        print("📥 Loading Qwen2.5-VL model and processor...")
        print(f"Model: {MODEL_NAME}")

        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            torch_dtype="auto",
            device_map="auto"
        )

        processor = AutoProcessor.from_pretrained(MODEL_NAME)

        # From here on `device` holds the model's own device object rather
        # than the string returned by the GPU probe.
        device = model.device
        model_dtype = next(model.parameters()).dtype
        print(f"🖥️  Model loaded on device: {device}")
        print(f"🔢 Model dtype: {model_dtype}")

        model.eval()

        # Configure layers
        config = detect_and_configure_layers(model)
        selected_layers = config['selected_layers']

        # Load image IDs
        image_ids = load_image_ids()
        if not image_ids:
            print("No image IDs found. Exiting.")
            return

        print(f"Found {len(image_ids)} image IDs")

        # In test mode a part never needs to be larger than the test set.
        part_size = min(PART_SIZE, len(image_ids)) if TEST_MODE else PART_SIZE

        # Ceiling division: the last part may be shorter than the others.
        total_parts = (len(image_ids) + part_size - 1) // part_size

        if TEST_MODE:
            print(f"🧪 TEST MODE: Processing {len(image_ids)} images in {total_parts} part(s)")

        # The output directory is the same for every part; create it once.
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        total_saved = 0

        # Overall progress bar
        overall_pbar = tqdm(total=len(image_ids), desc="Overall Progress", position=0)

        try:
            for part in range(total_parts):
                start_idx = part * part_size
                end_idx = min((part + 1) * part_size, len(image_ids))
                current_batch = image_ids[start_idx:end_idx]

                # Output file
                current_output_path = os.path.join(OUTPUT_DIR, f"qwen2_5_vl_targeted_embeddings_part{part+1}.h5")

                print(f"\n📦 Processing part {part+1}/{total_parts}")
                print(f"🖼️  Processing images {start_idx+1} to {end_idx}")
                print(f"💾 Output file: {current_output_path}")

                # Create dataset and dataloader
                dataset = ImageDataset(current_batch, IMAGE_DIR)
                dataloader = DataLoader(
                    dataset,
                    batch_size=BATCH_SIZE,
                    num_workers=NUM_WORKERS,
                    shuffle=False,
                    collate_fn=collate_fn
                )

                # Process images with progress bar
                part_pbar = tqdm(total=len(current_batch), desc=f"Part {part+1}", position=1, leave=False)
                saved_count = 0

                try:
                    with h5py.File(current_output_path, 'w') as f:
                        # Store configuration
                        f.attrs['model_name'] = MODEL_NAME
                        f.attrs['model_config'] = json.dumps(config, default=str)
                        f.attrs['extraction_type'] = 'targeted_embeddings_fixed'
                        f.attrs['selected_layers'] = json.dumps(selected_layers)

                        for batch_idx, batch_data in enumerate(dataloader):
                            try:
                                batch_results = process_batch(batch_data, selected_layers)

                                for image_id, result in batch_results:
                                    try:
                                        if write_embeddings_hdf5(f, image_id, result):
                                            saved_count += 1
                                    except Exception as e:
                                        print(f"Error saving {image_id}: {e}")

                                # Update progress bars
                                batch_size = len(batch_data[0]) if batch_data[0] else 0
                                part_pbar.update(batch_size)
                                overall_pbar.update(batch_size)

                                # GPU memory cleanup
                                if torch.cuda.is_available() and batch_idx % 10 == 0:
                                    torch.cuda.empty_cache()

                            except Exception as e:
                                print(f"Error processing batch {batch_idx + 1}: {e}")
                                continue
                finally:
                    # Close the per-part bar even if writing the file fails.
                    part_pbar.close()

                total_saved += saved_count
                print(f"✅ Part {part+1} completed: {saved_count} images saved")

                if TEST_MODE:
                    if saved_count:
                        print("🧪 TEST MODE: Successfully processed test images!")
                    break

                # Memory cleanup between parts
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
        finally:
            overall_pbar.close()

        if TEST_MODE:
            # Per-image failures are swallowed further down, so success is
            # only claimed when something was actually written.
            if total_saved > 0:
                print("\n🎉 TEST MODE COMPLETED SUCCESSFULLY!")
                print("✅ Targeted embeddings extracted without errors")
                print("✅ Ready for full dataset processing (set TEST_MODE = False)")
            else:
                print("\n❌ TEST MODE FAILED: no embeddings were saved (see errors above)")
        else:
            print("\n🎉 All parts processing complete!")

    except Exception as e:
        print(f"❌ Error in main function: {e}")
        traceback.print_exc()
        raise

if __name__ == "__main__":
    # Force the 'spawn' start method for torch.multiprocessing. A RuntimeError
    # from set_start_method is deliberately ignored so the run continues with
    # whatever start method is already in effect.
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass

    main()

In [None]:
import sys
import os
import traceback
import time
import numpy as np
from collections import defaultdict
import h5py
import json
import torch
from PIL import Image
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.multiprocessing as mp
import warnings
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Silence every warning category for the whole process so library warnings do
# not clutter the progress output. NOTE(review): this also hides deprecation
# warnings, which can make version-related problems harder to spot.
warnings.filterwarnings("ignore")

def check_gpu_availability():
    """Probe CUDA and report which device string to use.

    Returns:
        ``"cuda"`` when CUDA is reported as available and a small test tensor
        can be placed on the GPU; ``"cpu"`` when CUDA is unavailable or the
        probe raises.
    """
    try:
        if not torch.cuda.is_available():
            print("CUDA is not available")
            return "cpu"

        gpu_total = torch.cuda.device_count()
        print(f"CUDA is available with {gpu_total} GPU(s)")

        # Allocate (and immediately release) a tiny tensor to prove that the
        # GPU is actually usable, not merely visible.
        probe = torch.tensor([1.0]).cuda()
        del probe
        torch.cuda.empty_cache()
        return "cuda"
    except Exception as e:
        print(f"GPU test failed: {e}")
        print("Falling back to CPU")
        return "cpu"

# Run configuration (module-level constants read by the functions below).
BATCH_SIZE = 1  # Batch size setting; original note says Qwen2.5-VL needs batch size 1 for vision inputs — TODO confirm
PART_SIZE = 250  # Images per output part file
NUM_WORKERS = 0  # Worker-process setting; original note: multiprocessing disabled for Qwen2.5-VL
IMAGE_DIR = "/root/akhil_workspace/coco_val2014"  # Directory scanned for *.jpg inputs
OUTPUT_DIR = "/root/akhil_workspace/qwen_model/extracted_embeddings_coco2014"  # Output location
MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"  # Can also use "Qwen/Qwen2.5-VL-3B-Instruct"

# Test mode: when True, load_image_ids() keeps only the first 2 image IDs.
TEST_MODE = False  # False = process every image; True = quick 2-image test

# Global model handles, initialised to None here and read as module globals
# by the extraction helpers.
processor = None
model = None
device = None

def get_model_architecture_info(model):
    """Collect layer-count and hidden-size facts from a loaded model.

    Args:
        model: Object that may expose ``model.config`` (language side) and
            ``visual`` / ``visual.config`` (vision side).

    Returns:
        A dict that always contains ``has_vision_tower``. ``language_layers``
        and ``language_hidden_size`` are present when ``model.model.config``
        exists; ``vision_layers`` and ``vision_hidden_size`` are present when
        ``model.visual.config`` exists. Missing config attributes are stored
        as ``None``.
    """
    info = {}

    # Language side: read from model.model.config when it is reachable.
    language_model = getattr(model, 'model', None)
    if hasattr(language_model, 'config'):
        lm_config = language_model.config
        info['language_layers'] = getattr(lm_config, 'num_hidden_layers', None)
        info['language_hidden_size'] = getattr(lm_config, 'hidden_size', None)

    # Vision side: the mere presence of a `visual` attribute counts as a tower.
    info['has_vision_tower'] = hasattr(model, 'visual')
    if info['has_vision_tower'] and hasattr(model.visual, 'config'):
        vis_config = model.visual.config
        info['vision_layers'] = getattr(vis_config, 'num_hidden_layers', None)
        info['vision_hidden_size'] = getattr(vis_config, 'hidden_size', None)

    return info

def get_layer_indices(total_layers):
    """Pick the layer indices 0, n//4, n//2, 3n//4 and n-1.

    Args:
        total_layers: Number of layers ``n``.

    Returns:
        A sorted list of unique indices. When ``n`` is below 5, every index
        in ``range(n)`` is returned instead.
    """
    if total_layers < 5:
        return list(range(total_layers))

    # A set collapses any coinciding quarter points before sorting.
    quarter_points = {
        0,
        total_layers // 4,
        total_layers // 2,
        (3 * total_layers) // 4,
        total_layers - 1,
    }
    return sorted(quarter_points)

def detect_and_configure_layers(model):
    """Inspect the model and decide which layers to sample.

    Args:
        model: Loaded model passed on to ``get_model_architecture_info``.

    Returns:
        A dict with the keys ``model_architecture``, ``total_layers``,
        ``selected_layers``, ``has_vision_tower`` and ``model_name``.
    """
    print("🔍 Detecting Qwen2.5-VL model architecture...")

    arch_info = get_model_architecture_info(model)
    print(f"Architecture info: {arch_info}")

    layer_total = arch_info.get('language_layers')
    if layer_total is None:
        # The layer count could not be read from the config; fall back to a
        # hard-coded default chosen from the model name.
        layer_total = 36 if "3B" in MODEL_NAME else 28
        print(f"Could not determine layer count. Using default: {layer_total}")

    layer_picks = get_layer_indices(layer_total)
    vision_tower_present = arch_info.get('has_vision_tower', False)

    config = {
        'model_architecture': arch_info,
        'total_layers': layer_total,
        'selected_layers': layer_picks,
        'has_vision_tower': vision_tower_present,
        'model_name': MODEL_NAME
    }

    print(f"📋 Configuration:")
    print(f"   Total language layers: {layer_total}")
    print(f"   Selected layers: {layer_picks}")
    print(f"   Has vision tower: {config['has_vision_tower']}")

    return config

def load_image_ids():
    """List the image IDs (file names without extension) found in IMAGE_DIR.

    Only entries whose name ends with ``.jpg`` are considered. In TEST_MODE
    the sorted list is cut down to its first two entries.

    Returns:
        A sorted list of ID strings, or an empty list when the directory is
        missing or listing it raises.
    """
    try:
        if not os.path.exists(IMAGE_DIR):
            print(f"Error: Image directory {IMAGE_DIR} not found")
            return []

        stems = []
        for entry in os.listdir(IMAGE_DIR):
            if entry.endswith('.jpg'):
                stems.append(os.path.splitext(entry)[0])
        stems.sort()

        image_ids = stems
        if TEST_MODE:
            image_ids = stems[:2]
            print(f"🧪 TEST MODE: Processing only {len(image_ids)} images")

        print(f"Found {len(image_ids)} images in {IMAGE_DIR}")
        return image_ids
    except Exception as e:
        print(f"Error loading image IDs: {e}")
        return []

def safe_tensor_to_numpy(tensor, name="tensor"):
    """Convert a tensor into a nested Python list via NumPy.

    Despite the name, the return value is a list (``ndarray.tolist()``), not
    an ndarray. The dtype upcast happens before the tensor is moved to the
    CPU.

    Args:
        tensor: Tensor to convert.
        name: Label used in the error message if conversion fails.

    Returns:
        The tensor contents as a (nested) list. Half-precision inputs
        (bfloat16 / float16) are upcast to float32 before conversion. If the
        conversion raises, an error is printed and a list of zeros with the
        same shape is returned instead.
    """
    try:
        converted = tensor
        if converted.dtype in (torch.bfloat16, torch.float16):
            converted = converted.to(torch.float32)

        # One host transfer, after any dtype change.
        host_array = converted.cpu().numpy()
        return host_array.tolist()

    except Exception as e:
        print(f"Error converting {name} to numpy: {e}")
        placeholder = torch.zeros_like(tensor).float()
        return placeholder.cpu().numpy().tolist()

def find_token_positions(input_ids, processor, vision_token_count):
    """Choose the "after image" and "end of query" token positions.

    The positions are derived purely from the sequence length and the
    supplied vision-token count; the token IDs themselves are not inspected.
    NOTE(review): this presumes the image tokens sit at the start of the
    sequence — confirm against the chat template actually in use.

    Args:
        input_ids: Batched token IDs; only the length of ``input_ids[0]`` is
            used.
        processor: Unused; kept so existing callers keep working.
        vision_token_count: Estimated number of vision tokens.

    Returns:
        ``(image_end_pos, query_end_pos)``. Both are valid indices into the
        sequence. For sequences of three or more tokens ``image_end_pos``
        lies in ``[1, len - 2]`` and ``query_end_pos`` is ``len - 1``.

    Raises:
        ValueError: If the sequence is empty, since no valid index exists.
    """
    # len() works without copying the ids to the host.
    sequence_length = len(input_ids[0])
    if sequence_length == 0:
        raise ValueError("Cannot locate token positions in an empty sequence")

    last_index = sequence_length - 1

    # Query end is the last token before generation.
    query_end_pos = last_index

    # Keep the image end at least 10 tokens away from the sequence end, then
    # force it into [1, len - 2].
    image_end_pos = min(vision_token_count, sequence_length - 10)
    image_end_pos = max(1, min(image_end_pos, sequence_length - 2))

    # For sequences shorter than 3 tokens the range above is empty; clamp so
    # the result is still a valid index instead of pointing past the end.
    image_end_pos = max(0, min(image_end_pos, last_index - 1))

    return image_end_pos, query_end_pos

def extract_vision_embeddings_from_sequence(hidden_states, vision_token_count):
    """Mean-pool the leading tokens of the last hidden-state entry.

    Slicing and pooling are done on the tensor as given; conversion to a list
    happens only at the end, inside ``safe_tensor_to_numpy``.

    Args:
        hidden_states: Indexable collection of hidden-state tensors; only the
            last entry is used (assumes shape (batch, seq, hidden) — TODO
            confirm).
        vision_token_count: Number of leading tokens to pool over.

    Returns:
        The pooled vector as a list, or ``None`` if anything raises.
    """
    try:
        last_layer_states = hidden_states[-1]

        # First batch element, first `vision_token_count` positions, averaged
        # over the token axis.
        pooled = last_layer_states[0, :vision_token_count, :].mean(dim=0)

        return safe_tensor_to_numpy(pooled, "vision_embeddings_from_sequence")

    except Exception as e:
        print(f"Warning: Could not extract vision embeddings from sequence: {e}")
        return None

def estimate_vision_token_count(input_ids, sequence_length):
    """Heuristically estimate how many vision tokens a sequence contains.

    The estimate is one quarter of the sequence length, raised to at least
    200 and capped at 800, and never larger than the sequence itself.
    NOTE(review): this is a guess based only on the length; the token IDs are
    not inspected, so the real number of image tokens may differ.

    Args:
        input_ids: Unused; kept so existing callers keep working.
        sequence_length: Total number of tokens in the sequence.

    Returns:
        The estimated vision-token count, in ``[0, sequence_length]``.
    """
    # max(200, n // 4) is a floor of 200, not a ceiling of n // 4; the hard
    # cap is 800.
    heuristic = min(max(200, sequence_length // 4), 800)

    # The floor of 200 used to be returned even for sequences shorter than
    # 200 tokens; an estimate cannot exceed the sequence length.
    return max(0, min(heuristic, sequence_length))

def process_image_with_qwen2_5vl(image_path):
    """Load one image and build the model inputs for a caption prompt.

    Reads the module-level ``processor`` and ``model`` globals.

    Args:
        image_path: Path of the image file to open.

    Returns:
        ``(inputs, text, messages)`` where ``inputs`` is the processor output
        moved to ``model.device``, ``text`` is the rendered chat template and
        ``messages`` is the chat message list. On any error the exception is
        printed and ``(None, None, None)`` is returned.
    """
    try:
        # convert() yields a new, fully loaded image, so the source file can
        # be closed as soon as the conversion is done.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")

        # Create messages
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": image,
                    },
                    {"type": "text", "text": "Caption this image."},
                ],
            }
        ]

        # Render the prompt text; the generation prompt is appended so the
        # sequence ends where the model's answer would begin.
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        inputs = inputs.to(model.device)

        return inputs, text, messages

    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None, None, None

def extract_targeted_embeddings(image_id, image_path, selected_layers):
    """Run one image through the model and collect the targeted embeddings.

    Two model calls are made: a forward pass with ``output_hidden_states=True``
    from which the embeddings are taken, and a ``generate`` call (greedy, at
    most 64 new tokens) that produces the caption.

    Args:
        image_id: Identifier stored in the result and used in error messages.
        image_path: Path of the image to process.
        selected_layers: Indices into the model's hidden-state tuple; indices
            not below the number of available hidden states are dropped.

    Returns:
        dict | None: A dict with keys ``image_id``, ``generated_caption``,
        ``vision_embeddings``, ``pre_generation`` and ``token_positions``;
        ``None`` if the image could not be prepared or any exception occurred
        (the error is printed).
    """
    global model, processor, device

    try:
        inputs, text, messages = process_image_with_qwen2_5vl(image_path)
        if inputs is None:
            return None

        collected = {}

        # Forward pass: hidden states of every layer in one go.
        with torch.no_grad():
            # NOTE(review): `device` is compared with the string "cuda", but
            # main() stores `model.device` in it - confirm whether this
            # autocast branch is ever actually taken.
            if device == "cuda":
                with torch.cuda.amp.autocast():
                    forward_out = model(**inputs, output_hidden_states=True)
            else:
                forward_out = model(**inputs, output_hidden_states=True)

            all_hidden = getattr(forward_out, 'hidden_states', None)
            if all_hidden:
                layer_count = len(all_hidden)
                seq_len = inputs['input_ids'].shape[1]

                n_vision = estimate_vision_token_count(inputs['input_ids'], seq_len)

                # 1. Vision embedding derived from the hidden-state sequence.
                collected['vision_embeddings'] = extract_vision_embeddings_from_sequence(
                    all_hidden, n_vision
                )

                # 2. Per-layer embeddings at the two positions of interest.
                usable_layers = [idx for idx in selected_layers if idx < layer_count]

                image_end_pos, query_end_pos = find_token_positions(
                    inputs['input_ids'], processor, n_vision
                )

                per_layer = {}
                for idx in usable_layers:
                    states = all_hidden[idx]

                    # Slice both positions before converting anything.
                    at_image_end = states[0, image_end_pos, :]
                    at_query_end = states[0, query_end_pos, :]

                    per_layer[f'layer_{idx}'] = {
                        'after_image_embeddings': safe_tensor_to_numpy(
                            at_image_end, f"layer_{idx}_after_image"
                        ),
                        'end_query_embeddings': safe_tensor_to_numpy(
                            at_query_end, f"layer_{idx}_end_query"
                        ),
                    }

                collected['pre_generation'] = per_layer

                collected['token_positions'] = {
                    'image_end_position': int(image_end_pos),
                    'query_end_position': int(query_end_pos),
                    'sequence_length': int(seq_len),
                    'estimated_vision_tokens': int(n_vision)
                }

        # Caption generation: a second, independent model call.
        with torch.no_grad():
            if device == "cuda":
                with torch.cuda.amp.autocast():
                    generated_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
            else:
                generated_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)

            # Drop the prompt tokens so only newly generated tokens are decoded.
            new_token_ids = [
                full_ids[len(prompt_ids):]
                for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
            ]
            decoded = processor.batch_decode(
                new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            generated_caption = decoded[0] if decoded else ""

        return {
            'image_id': image_id,
            'generated_caption': generated_caption,
            'vision_embeddings': collected.get('vision_embeddings'),
            'pre_generation': collected.get('pre_generation', {}),
            'token_positions': collected.get('token_positions', {})
        }

    except Exception as e:
        print(f"Error extracting embeddings for {image_id}: {str(e)}")
        return None

def write_embeddings_hdf5(f, image_id, result):
    """Write one image's extraction result into an open HDF5 file.

    Layout created under the group named ``image_id``::

        image_id, generated_caption      UTF-8 encoded byte strings
        token_positions/<key>            one dataset per entry (if non-empty)
        vision_embeddings                float32, gzip (if not None)
        pre_generation/<layer>/
            after_image_embeddings       float32, gzip (if present)
            end_query_embeddings         float32, gzip (if present)

    If writing fails after the image group has been created, the partially
    written group is removed again so the file never contains a
    half-populated record for an image that is reported as not saved. A
    group that already existed before the call is left untouched.

    Args:
        f: Open, writable h5py file (or group) to write into.
        image_id: Name of the group to create for this image.
        result: Dict with the keys ``image_id``, ``generated_caption``,
            ``token_positions``, ``vision_embeddings`` and ``pre_generation``.

    Returns:
        bool: True on success, False if anything failed (the error is printed).
    """

    def _write_float32(parent, name, values):
        # Embedding arrays are stored as chunked, gzip-compressed float32.
        parent.create_dataset(name,
                              data=np.array(values, dtype=np.float32),
                              chunks=True,
                              compression='gzip')

    group = None  # stays None if create_group itself fails (e.g. duplicate id)
    try:
        group = f.create_group(image_id)

        # Basic metadata
        group.create_dataset('image_id', data=result['image_id'].encode('utf-8'))
        group.create_dataset('generated_caption', data=result['generated_caption'].encode('utf-8'))

        # Token positions info
        if result['token_positions']:
            pos_group = group.create_group('token_positions')
            for key, value in result['token_positions'].items():
                pos_group.create_dataset(key, data=value)

        # Vision embeddings
        if result['vision_embeddings'] is not None:
            _write_float32(group, 'vision_embeddings', result['vision_embeddings'])

        # Pre-generation embeddings, one sub-group per layer
        pre_gen_group = group.create_group('pre_generation')
        for layer_name, layer_data in result['pre_generation'].items():
            layer_group = pre_gen_group.create_group(layer_name)
            for dataset_name in ('after_image_embeddings', 'end_query_embeddings'):
                if dataset_name in layer_data:
                    _write_float32(layer_group, dataset_name, layer_data[dataset_name])

        return True
    except Exception as e:
        print(f"Error writing embeddings for image {image_id}: {str(e)}")
        # Only remove what this call created; never delete a pre-existing group.
        if group is not None:
            try:
                del f[image_id]
            except Exception as cleanup_error:
                print(f"Warning: could not remove partial group for image {image_id}: {cleanup_error}")
        return False

class ImageDataset(Dataset):
    """Dataset yielding ``(image_id, image_path)`` pairs.

    The path is ``<image_dir>/<image_id>.jpg``. When no such file exists the
    path element is ``None`` so that the caller can filter the item out.
    """

    def __init__(self, image_ids, image_dir):
        self.image_ids = image_ids
        self.image_dir = image_dir

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        candidate = os.path.join(self.image_dir, f"{image_id}.jpg")
        return image_id, (candidate if os.path.exists(candidate) else None)

def collate_fn(batch):
    """Split a batch of ``(image_id, image_path)`` pairs into two lists.

    Pairs whose path is ``None`` are dropped. Returns ``(image_ids,
    image_paths)``; both lists are empty when nothing survives the filter.
    """
    image_ids = []
    image_paths = []

    for img_id, img_path in batch:
        if img_path is None:
            continue
        image_ids.append(img_id)
        image_paths.append(img_path)

    return image_ids, image_paths

def process_batch(batch_data, selected_layers):
    """Extract embeddings for every image of one collated batch.

    Args:
        batch_data: ``(image_ids, image_paths)`` pair of equally ordered lists.
        selected_layers: Layer indices forwarded to the extraction routine.

    Returns:
        list: ``(image_id, result)`` tuples for the images whose extraction
        returned a truthy result. Images that raise are reported and skipped.
    """
    image_ids, image_paths = batch_data

    if not (image_ids and image_paths):
        return []

    collected = []
    for image_id, image_path in zip(image_ids, image_paths):
        try:
            extraction = extract_targeted_embeddings(image_id, image_path, selected_layers)
        except Exception as e:
            print(f"Error processing image {image_id}: {e}")
            continue

        if extraction:
            collected.append((image_id, extraction))

    return collected

def main():
    """Load the model and extract targeted embeddings for all images, part by part.

    Steps:
      1. Load ``MODEL_NAME`` and its processor into the module globals
         ``model`` / ``processor`` and store ``model.device`` in ``device``.
      2. Obtain the layer configuration from ``detect_and_configure_layers``
         and the image ids from ``load_image_ids``.
      3. Split the ids into parts of ``PART_SIZE`` images and write each part
         to ``embeddings_part<N>.h5`` inside ``OUTPUT_DIR``. Each file is
         opened in ``'w'`` mode, so an existing part file is overwritten.

    In ``TEST_MODE`` only the first part is processed.

    Raises:
        Exception: Any exception escaping the pipeline is printed together
            with its traceback and then re-raised.
    """
    global processor, model, device

    try:
        print("🚀 Starting Qwen2.5-VL Targeted Embeddings Extraction")
        print("=" * 60)
        print(f"📊 Settings: {PART_SIZE} images per H5 file")
        if TEST_MODE:
            # NOTE(review): nothing in this function limits the run to 2
            # images; presumably load_image_ids() does - verify.
            print("🧪 RUNNING IN TEST MODE - Processing only 2 images")
        print("=" * 60)

        # Check GPU and load model
        device = check_gpu_availability()

        print("📥 Loading Qwen2.5-VL model and processor...")
        print(f"Model: {MODEL_NAME}")

        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            torch_dtype="auto",
            device_map="auto"
        )

        processor = AutoProcessor.from_pretrained(MODEL_NAME)

        # From here on `device` holds model.device rather than the value
        # returned by check_gpu_availability().
        device = model.device
        model_dtype = next(model.parameters()).dtype
        print(f"🖥️  Model loaded on device: {device}")
        print(f"🔢 Model dtype: {model_dtype}")

        model.eval()

        # Configure layers
        config = detect_and_configure_layers(model)
        selected_layers = config['selected_layers']

        # Load image IDs
        image_ids = load_image_ids()
        if not image_ids:
            print("No image IDs found. Exiting.")
            return

        print(f"Found {len(image_ids)} image IDs")

        # In test mode the part size is shrunk to the number of ids when
        # fewer than PART_SIZE ids were loaded.
        if TEST_MODE:
            PART_SIZE_ACTUAL = min(PART_SIZE, len(image_ids))
        else:
            PART_SIZE_ACTUAL = PART_SIZE

        # Process in parts (ceiling division)
        total_parts = (len(image_ids) + PART_SIZE_ACTUAL - 1) // PART_SIZE_ACTUAL

        if TEST_MODE:
            print(f"🧪 TEST MODE: Processing {len(image_ids)} images in {total_parts} part(s)")

        # Overall progress bar
        overall_pbar = tqdm(total=len(image_ids), desc="Overall Progress", position=0)

        for part in range(total_parts):
            start_idx = part * PART_SIZE_ACTUAL
            end_idx = min((part + 1) * PART_SIZE_ACTUAL, len(image_ids))
            current_batch = image_ids[start_idx:end_idx]

            # Create output directory
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            # Output file (part numbers are 1-based)
            current_output_path = os.path.join(OUTPUT_DIR, f"embeddings_part{part+1}.h5")

            print(f"\n📦 Processing part {part+1}/{total_parts}")
            print(f"🖼️  Processing images {start_idx+1} to {end_idx}")
            print(f"💾 Output file: {current_output_path}")

            # Create dataset and dataloader
            dataset = ImageDataset(current_batch, IMAGE_DIR)
            dataloader = DataLoader(
                dataset,
                batch_size=BATCH_SIZE,
                num_workers=NUM_WORKERS,
                shuffle=False,
                collate_fn=collate_fn
            )

            # Process images with progress bar
            part_pbar = tqdm(total=len(current_batch), desc=f"Part {part+1}", position=1, leave=False)

            with h5py.File(current_output_path, 'w') as f:
                # Store configuration as file-level attributes
                f.attrs['model_name'] = MODEL_NAME
                f.attrs['model_config'] = json.dumps(config, default=str)
                f.attrs['extraction_type'] = 'targeted_embeddings_fixed'
                f.attrs['selected_layers'] = json.dumps(selected_layers)

                saved_count = 0

                for batch_idx, batch_data in enumerate(dataloader):
                    try:
                        batch_results = process_batch(batch_data, selected_layers)

                        for image_id, result in batch_results:
                            try:
                                if write_embeddings_hdf5(f, image_id, result):
                                    saved_count += 1
                            except Exception as e:
                                print(f"Error saving {image_id}: {e}")

                        # Update progress bars by the number of ids the
                        # collate function kept; images it dropped (missing
                        # files) therefore do not advance the bars.
                        batch_size = len(batch_data[0]) if batch_data[0] else 0
                        part_pbar.update(batch_size)
                        overall_pbar.update(batch_size)

                        # GPU memory cleanup every 10th batch
                        if torch.cuda.is_available() and batch_idx % 10 == 0:
                            torch.cuda.empty_cache()

                    except Exception as e:
                        # A failing batch is reported and skipped; the part continues.
                        print(f"Error processing batch {batch_idx + 1}: {e}")
                        continue

            part_pbar.close()

            print(f"✅ Part {part+1} completed: {saved_count} images saved")

            if TEST_MODE:
                # Test mode stops after the first part.
                print("🧪 TEST MODE: Successfully processed test images!")
                break

            # Memory cleanup between parts
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        overall_pbar.close()

        if TEST_MODE:
            print("\n🎉 TEST MODE COMPLETED SUCCESSFULLY!")
            print("✅ Targeted embeddings extracted without errors")
            print("✅ Ready for full dataset processing (set TEST_MODE = False)")
        else:
            print("\n🎉 All parts processing complete!")

    except Exception as e:
        print(f"❌ Error in main function: {e}")
        traceback.print_exc()
        raise

# Script entry point for the embedding-extraction cell.
if __name__ == "__main__":
    # Request the 'spawn' multiprocessing start method. A RuntimeError raised
    # while setting it is deliberately ignored so the run can continue with
    # whatever start method is already in effect.
    try:
        mp.set_start_method('spawn', force=True)
    except RuntimeError:
        pass

    main()

In [None]:
import h5py
import json
import numpy as np
import os
import sys
from datetime import datetime

def convert_to_json_safe(value):
    """Convert a value read from HDF5/numpy into something ``json`` can dump.

    Conversions performed:
      * ``bytes`` (including ``np.bytes_``) -> ``str`` decoded as UTF-8
      * ``np.ndarray`` -> nested Python lists; for byte-string and object
        arrays the contained bytes are decoded as well
      * any numpy scalar (``np.integer``, ``np.floating``, ``np.bool_``,
        ``np.str_``, ...) -> the matching Python scalar via ``.item()``
      * ``list`` -> list with every element converted recursively

    Anything else is returned unchanged.

    Args:
        value: The value to convert.

    Returns:
        The converted value.

    Raises:
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')

    if isinstance(value, np.ndarray):
        as_python = value.tolist()
        # tolist() leaves raw bytes inside byte-string/object arrays (and
        # returns a bare bytes object for 0-d arrays); decode them as well.
        if value.dtype.kind in ('S', 'O'):
            return convert_to_json_safe(as_python)
        return as_python

    if isinstance(value, np.generic):
        # Covers np.bool_ and friends, which json cannot serialize directly.
        return value.item()

    if isinstance(value, list):
        return [convert_to_json_safe(element) for element in value]

    return value

def extract_embeddings_from_group(group):
    """Recursively turn an HDF5 group into a nested dict of JSON-safe values.

    Datasets are read in full and passed through ``convert_to_json_safe``;
    sub-groups are converted by recursion. Members that are neither a dataset
    nor a group are left out of the result.
    """
    extracted = {}

    for name in group.keys():
        node = group[name]

        if isinstance(node, h5py.Dataset):
            # Leaf: read the whole dataset and make it serializable.
            extracted[name] = convert_to_json_safe(node[()])
        elif isinstance(node, h5py.Group):
            # Container: descend one level.
            extracted[name] = extract_embeddings_from_group(node)

    return extracted

def _print_first_image_summary(sample):
    """Print the embedding sizes and token positions of one image's record."""
    if 'vision_embeddings' in sample:
        vision_shape = len(sample['vision_embeddings'])
        print(f"      • Vision embeddings: {vision_shape} dimensions")

    if 'pre_generation' in sample:
        layers = list(sample['pre_generation'].keys())
        print(f"      • Pre-generation layers: {len(layers)} ({layers})")

        if layers:
            first_layer = sample['pre_generation'][layers[0]]
            labelled_keys = (
                ('after_image_embeddings', "After image embeddings"),
                ('end_query_embeddings', "End query embeddings"),
            )
            for dataset_key, label in labelled_keys:
                if dataset_key in first_layer:
                    print(f"        - {label}: {len(first_layer[dataset_key])} dimensions")

    if 'token_positions' in sample:
        print(f"      • Token positions:")
        for key, value in sample['token_positions'].items():
            print(f"        - {key}: {value}")


def extract_qwen_embeddings_to_json(h5_file_path):
    """Dump every image group of an embeddings H5 file into one JSON file.

    The JSON file is written next to the input as
    ``<input without extension>_embeddings.json`` and contains the keys
    ``extraction_info``, ``file_attributes`` and ``embeddings``.

    Args:
        h5_file_path: Path of the H5 file to convert.

    Returns:
        str | None: Path of the JSON file on success; ``None`` if the input
        file does not exist or any error occurred (a message is printed).
    """
    if not os.path.exists(h5_file_path):
        print(f"❌ Error: File not found - {h5_file_path}")
        return

    # Output sits beside the input, with a fixed suffix.
    output_file = f"{os.path.splitext(h5_file_path)[0]}_embeddings.json"

    print(f"🔍 Processing: {h5_file_path}")
    print(f"💾 Output: {output_file}")

    try:
        with h5py.File(h5_file_path, 'r') as f:

            print(f"📊 Analyzing H5 file structure...")

            # File-level attributes (model info, config, etc.)
            file_attrs = {
                attr_name: convert_to_json_safe(f.attrs[attr_name])
                for attr_name in f.attrs.keys()
            }

            # Every top-level key is one image.
            image_ids = list(f.keys())
            image_total = len(image_ids)
            print(f"📸 Found {image_total} images in the file")

            all_embeddings = {}
            for position, image_id in enumerate(image_ids, start=1):
                if (position - 1) % 10 == 0:  # progress line every 10 images
                    print(f"   Processing image {position}/{image_total}: {image_id}")
                all_embeddings[image_id] = extract_embeddings_from_group(f[image_id])

            extraction_info = {
                'source_file': h5_file_path,
                'extraction_date': datetime.now().isoformat(),
                'total_images': image_total,
                'embedding_type': 'qwen2_5_vl_targeted_embeddings'
            }
            result = {
                'extraction_info': extraction_info,
                'file_attributes': file_attrs,
                'embeddings': all_embeddings
            }

            print(f"💾 Saving embeddings to JSON...")
            with open(output_file, 'w') as json_file:
                json.dump(result, json_file, indent=2)

            print(f"✅ Success! Extracted embeddings to {output_file}")

            # File sizes in MiB for the summary.
            bytes_per_mb = 1024 * 1024
            h5_size = os.path.getsize(h5_file_path) / 1024 / 1024
            json_size = os.path.getsize(output_file) / 1024 / 1024
            del bytes_per_mb

            print(f"\n📊 Summary:")
            print(f"   📁 H5 file size: {h5_size:.2f} MB")
            print(f"   📁 JSON file size: {json_size:.2f} MB")
            print(f"   📸 Total images: {image_total}")
            print(f"   🧠 Embeddings per image:")

            # The first image serves as the example of the per-image layout.
            if image_ids:
                _print_first_image_summary(all_embeddings[image_ids[0]])

            return output_file

    except Exception as e:
        print(f"❌ Error processing H5 file: {e}")
        return None

def inspect_embeddings_structure(json_file_path):
    """Print a short overview of an extracted-embeddings JSON file.

    Shows the extraction info, the file attributes (string values longer
    than 100 characters are truncated), the number of images, the layout of
    the first image's record and the file size. Prints a message and returns
    if the file does not exist; any other error is printed, not raised.
    """
    if not os.path.exists(json_file_path):
        print(f"❌ JSON file not found: {json_file_path}")
        return

    print(f"\n🔍 Inspecting embeddings structure: {json_file_path}")
    print("=" * 60)

    try:
        with open(json_file_path, 'r') as handle:
            data = json.load(handle)

        # Extraction info
        if 'extraction_info' in data:
            info = data['extraction_info']
            print(f"📋 Extraction Info:")
            for name, val in info.items():
                print(f"   {name}: {val}")

        # File attributes, with long strings shortened for display
        if 'file_attributes' in data:
            attrs = data['file_attributes']
            print(f"\n📋 File Attributes:")
            for name, val in attrs.items():
                too_long = isinstance(val, str) and len(val) > 100
                shown = f"{val[:100]}..." if too_long else val
                print(f"   {name}: {shown}")

        # Embeddings overview
        if 'embeddings' in data:
            embeddings = data['embeddings']
            print(f"\n🧠 Embeddings Structure:")
            print(f"   Total images: {len(embeddings)}")

            if embeddings:
                # The first record stands in for all of them.
                sample_id = next(iter(embeddings.keys()))
                sample = embeddings[sample_id]

                print(f"   Sample image ({sample_id}):")
                for field, content in sample.items():
                    is_list = isinstance(content, list)
                    is_dict = isinstance(content, dict)

                    if field == 'vision_embeddings' and is_list:
                        print(f"     • {field}: {len(content)} dimensions")
                    elif field == 'pre_generation' and is_dict:
                        print(f"     • {field}: {len(content)} layers")
                        for layer_name, layer_data in content.items():
                            print(f"       - {layer_name}: {list(layer_data.keys())}")
                    elif field == 'token_positions' and is_dict:
                        print(f"     • {field}: {list(content.keys())}")
                    else:
                        print(f"     • {field}: {type(content).__name__}")

        size_mb = os.path.getsize(json_file_path) / 1024 / 1024
        print(f"\n📁 File size: {size_mb:.2f} MB")

    except Exception as e:
        print(f"❌ Error inspecting JSON file: {e}")

def main():
    """Command-line entry point: convert one H5 file to JSON and inspect it.

    Expects the H5 file path as the first command-line argument. Without it,
    a usage message is printed and nothing else happens.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Qwen2.5-VL Embeddings Extractor",
            "=" * 40,
            "\nUsage:",
            "  python h5_to_json_embeddings.py <h5_file_path>",
            "\nExample:",
            "  python h5_to_json_embeddings.py qwen2_5_vl_targeted_embeddings_part1.h5",
            "\nThis will create:",
            "  qwen2_5_vl_targeted_embeddings_part1_embeddings.json",
        )
        for line in usage_lines:
            print(line)
        return

    h5_file_path = cli_args[0]

    # Convert first; inspect only when a JSON file was actually produced.
    output_json = extract_qwen_embeddings_to_json(h5_file_path)
    if not output_json:
        return

    inspect_embeddings_structure(output_json)
    print(f"\n🎉 Extraction completed successfully!")
    print(f"📁 Output file: {output_json}")

# Run the H5-to-JSON command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

In [None]:
import sys, os
import traceback
import json
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from collections import defaultdict, Counter
import requests
import zipfile
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch.nn.functional as F

print("Starting Qwen2.5-VL Caption Scoring Pipeline...")

# Fetch the NLTK resources used by the scoring code (quietly).
print("Downloading NLTK data...")
for _nltk_resource in ('wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Directory layout below the workspace root
BASE_DIR = "/root/akhil_workspace/qwen_model"
CAPTIONS_DIR = os.path.join(BASE_DIR, "extracted_captions")
ANALYSIS_OUTPUT_DIR = os.path.join(BASE_DIR, "factual_analysis")
COCO_DATA_DIR = os.path.join(BASE_DIR, "coco_annotations")
EVALUATION_OUTPUT_DIR = os.path.join(BASE_DIR, "evaluation_results")

# Input and output files
QWEN_CAPTIONS_PATH = "/root/akhil_workspace/qwen_model/extracted_captions/qwen2_captions.json"
COCO_ANNOTATIONS_PATH = os.path.join(COCO_DATA_DIR, "captions_val2014.json")
OUTPUT_CSV_PATH = os.path.join(EVALUATION_OUTPUT_DIR, "qwen2_comprehensive_scores.csv")
SUMMARY_PATH = os.path.join(EVALUATION_OUTPUT_DIR, "qwen2_scoring_summary.json")

# Make sure every directory that is written to exists.
for _output_dir in (ANALYSIS_OUTPUT_DIR, EVALUATION_OUTPUT_DIR, COCO_DATA_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# BERT encoder used for the factual-accuracy score (inference only)
print("Loading BERT model...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
bert_model.eval()

# CHAIR metric configuration: the 80 COCO object categories, kept both as a
# sorted list and as a set for membership tests.
MSCOCO_OBJECTS_80 = sorted([
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
])
MSCOCO_OBJECT_SET = set(MSCOCO_OBJECTS_80)

def download_coco_annotations():
    """Download and unpack the COCO 2014 validation caption annotations.

    Nothing is downloaded when ``COCO_ANNOTATIONS_PATH`` already exists.
    Otherwise the annotations archive is streamed into ``COCO_DATA_DIR``,
    extracted there, and ``annotations/captions_val2014.json`` is moved to
    ``COCO_ANNOTATIONS_PATH``. The downloaded zip file is removed afterwards,
    whether or not the download succeeded.

    Returns:
        bool: True only if ``COCO_ANNOTATIONS_PATH`` exists when the function
        returns; False on any error (the error is printed) or when the
        captions file is not found after extraction.
    """
    if os.path.exists(COCO_ANNOTATIONS_PATH):
        print("✅ COCO annotations already exist")
        return True

    print("📥 Downloading COCO 2014 validation annotations...")

    # COCO 2014 annotations URL
    annotations_url = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
    zip_path = os.path.join(COCO_DATA_DIR, "annotations_trainval2014.zip")

    try:
        print("   Downloading annotations zip file...")
        # timeout=(connect, read): without it a stalled connection would hang
        # forever. The context manager releases the connection when done.
        with requests.get(annotations_url, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            total_size = int(response.headers.get('content-length', 0))
            with open(zip_path, 'wb') as f, tqdm(
                desc="Downloading",
                total=total_size,
                unit='B',
                unit_scale=True
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

        print("   Extracting annotations...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(COCO_DATA_DIR)

        # Move the captions file to the expected location
        extracted_captions = os.path.join(COCO_DATA_DIR, "annotations", "captions_val2014.json")
        if os.path.exists(extracted_captions):
            os.replace(extracted_captions, COCO_ANNOTATIONS_PATH)

        # Report success only when the file callers will open is really there.
        if not os.path.exists(COCO_ANNOTATIONS_PATH):
            print("❌ Error downloading COCO annotations: captions_val2014.json not found in archive")
            return False

        print("✅ COCO annotations downloaded and extracted successfully")
        return True

    except Exception as e:
        print(f"❌ Error downloading COCO annotations: {e}")
        return False

    finally:
        # Clean up the archive, including a partial one left by a failure.
        if os.path.exists(zip_path):
            try:
                os.remove(zip_path)
            except OSError as cleanup_error:
                print(f"Warning: could not remove {zip_path}: {cleanup_error}")

def normalize_image_id(image_id):
    """Return the part of the image id that follows its last underscore.

    The id is converted to ``str`` first; an id without any underscore is
    returned as that string unchanged.
    """
    # rpartition yields ('', '', text) when there is no underscore, so the
    # last element is the right answer in both cases.
    return str(image_id).rpartition('_')[2]

def load_coco_ground_truth():
    """Load the COCO reference captions, grouped by normalized image id.

    If the annotations file is missing, a download is attempted first.

    Returns:
        dict: Maps each normalized image id (the numeric id zero-padded to
        12 digits, then passed through ``normalize_image_id``) to the list of
        its stripped reference captions.

    Raises:
        FileNotFoundError: If the annotations are missing and the download fails.
        Exception: Any error while reading or processing the file is printed
            and re-raised.
    """
    print("📚 Loading COCO ground truth captions...")

    if not os.path.exists(COCO_ANNOTATIONS_PATH):
        print("COCO annotations not found. Attempting to download...")
        downloaded = download_coco_annotations()
        if not downloaded:
            raise FileNotFoundError(f"Could not download COCO annotations")

    try:
        with open(COCO_ANNOTATIONS_PATH, 'r') as handle:
            coco_data = json.load(handle)

        # Group the captions of each image under its normalized id.
        captions_by_image = defaultdict(list)
        for entry in coco_data['annotations']:
            padded_id = str(entry['image_id']).zfill(12)
            caption_text = entry['caption'].strip()
            captions_by_image[normalize_image_id(padded_id)].append(caption_text)

        captions_per_image = [len(caps) for caps in captions_by_image.values()]
        print(f"   Loaded {len(captions_by_image)} images with ground truth captions")
        print(f"   Average captions per image: {np.mean(captions_per_image):.1f}")
        print(f"   Sample normalized IDs: {list(captions_by_image.keys())[:5]}")

        return dict(captions_by_image)

    except Exception as e:
        print(f"❌ Error loading COCO ground truth: {e}")
        raise

def load_qwen_captions():
    """Load the generated captions, keyed by normalized image id.

    The JSON file may hold the captions either at top level or under a
    ``'captions'`` key. Each entry may be a plain string or a dict with a
    ``'generated_caption'`` field; entries of any other form are reported
    and skipped.

    Returns:
        dict: Normalized image id -> generated caption.

    Raises:
        FileNotFoundError: If ``QWEN_CAPTIONS_PATH`` does not exist.
        Exception: Any error while reading or processing the file is printed
            and re-raised.
    """
    print("🤖 Loading Qwen2.5-VL generated captions...")

    if not os.path.exists(QWEN_CAPTIONS_PATH):
        raise FileNotFoundError(f"Qwen captions not found: {QWEN_CAPTIONS_PATH}")

    try:
        with open(QWEN_CAPTIONS_PATH, 'r') as handle:
            qwen_data = json.load(handle)

        # Accept both the wrapped and the flat file layout.
        captions_dict = qwen_data['captions'] if 'captions' in qwen_data else qwen_data

        qwen_captions = {}
        for image_id, caption_data in captions_dict.items():
            normalized_id = normalize_image_id(image_id)

            if isinstance(caption_data, str):
                qwen_captions[normalized_id] = caption_data
            elif isinstance(caption_data, dict) and 'generated_caption' in caption_data:
                qwen_captions[normalized_id] = caption_data['generated_caption']
            else:
                print(f"Warning: Unexpected format for image {image_id}")

        print(f"   Loaded {len(qwen_captions)} Qwen2.5-VL generated captions")
        print(f"   Sample normalized IDs: {list(qwen_captions.keys())[:5]}")

        return qwen_captions

    except Exception as e:
        print(f"❌ Error loading Qwen captions: {e}")
        raise

def cosine_similarity_torch(tensor1, tensor2):
    """Return the cosine similarity of two vectors as a Python float.

    Each input gets a leading batch axis, is L2-normalized along the feature
    axis, and the similarity is the product of the two normalized rows.
    """
    unit_a = F.normalize(tensor1[None], p=2, dim=1)
    unit_b = F.normalize(tensor2[None], p=2, dim=1)
    return torch.mm(unit_a, unit_b.t()).item()

def get_bert_embedding(text):
    """Return the BERT embedding of ``text`` taken at the first token position.

    The text is converted to ``str`` and stripped; tokenization truncates at
    512 tokens. Returns ``None`` for empty text or when any error occurs
    (the error is printed).
    """
    try:
        text = str(text).strip()
        if not text:
            return None

        with torch.no_grad():
            encoded = bert_tokenizer(
                text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)

            # Keep only the first position of the last hidden layer.
            first_token_states = bert_model(**encoded).last_hidden_state[:, 0, :]
            return first_token_states.squeeze(0)

    except Exception as e:
        print(f"Error getting BERT embedding for text '{text[:50]}...': {e}")
        return None

def compute_factual_score(generated_caption, ground_truth_captions):
    """Score a caption by its best semantic match among the reference captions.

    The score is the maximum cosine similarity between the BERT embedding of
    the generated caption and that of each reference caption. ``0.0`` is
    returned when either input is empty, when no embedding could be computed,
    or when any error occurs (the error is printed).
    """
    try:
        if not (generated_caption and ground_truth_captions):
            return 0.0

        candidate_vec = get_bert_embedding(generated_caption)
        if candidate_vec is None:
            return 0.0

        scores = []
        for reference in ground_truth_captions:
            reference_vec = get_bert_embedding(reference)
            if reference_vec is None:
                # References that cannot be embedded are ignored.
                continue
            scores.append(float(cosine_similarity_torch(candidate_vec, reference_vec)))

        if not scores:
            return 0.0
        return float(max(scores))

    except Exception as e:
        print(f"Error computing factual score: {e}")
        return 0.0

def extract_mscoco_objects_from_caption(caption_text, object_set=None):
    """Return the set of object categories mentioned in a caption.

    The caption is lower-cased and split into words, then scanned left to
    right with longest-match-first phrase lookup. This allows multi-word
    categories such as "traffic light", "hot dog" or "teddy bear" to be
    recognized; matching word by word could never find them. Words consumed
    by a multi-word match are not matched again on their own, so "hot dog"
    is not additionally counted as "dog".

    Args:
        caption_text: The caption to scan. An empty or ``None`` caption
            yields an empty set.
        object_set: Optional collection of lower-case category names to look
            for. Defaults to the module-level ``MSCOCO_OBJECT_SET``.

    Returns:
        set: The category names found in the caption.
    """
    if object_set is None:
        object_set = MSCOCO_OBJECT_SET

    if not caption_text:
        return set()

    words = re.findall(r'\b\w+\b', str(caption_text).lower())

    # Longest category name, in words, bounds the phrase window.
    longest_phrase = max((len(name.split()) for name in object_set), default=1)

    mentioned_objects = set()
    position = 0
    word_count = len(words)
    while position < word_count:
        window = min(longest_phrase, word_count - position)
        # Try the longest phrase starting here first, then shorter ones.
        for span in range(window, 0, -1):
            phrase = ' '.join(words[position:position + span])
            if phrase in object_set:
                mentioned_objects.add(phrase)
                position += span
                break
        else:
            position += 1

    return mentioned_objects

def calculate_chair_metrics_individual(generated_caption, ground_truth_captions):
    """Compare the objects named in a caption with those named in its references.

    Returns:
        dict with ``mentioned_objects``, ``hallucinated_objects`` and
        ``ground_truth_objects`` (lists built from sets, so their order is
        not defined) and the boolean ``has_hallucination``.
    """
    # Union of every object named by any reference caption.
    reference_objects = set()
    for reference in ground_truth_captions:
        reference_objects |= extract_mscoco_objects_from_caption(reference)

    caption_objects = extract_mscoco_objects_from_caption(generated_caption)

    # An object counts as hallucinated when no reference caption names it.
    unsupported_objects = caption_objects.difference(reference_objects)

    return {
        'mentioned_objects': list(caption_objects),
        'hallucinated_objects': list(unsupported_objects),
        'ground_truth_objects': list(reference_objects),
        'has_hallucination': bool(unsupported_objects)
    }

def calculate_standard_metrics_individual(generated_caption, ground_truth_captions):
    """Calculate BLEU-1, BLEU-4 and a simple ROUGE-L for a single image.

    Captions are lower-cased and split on whitespace. BLEU is computed with
    ``sentence_bleu`` against all references at once (method1 smoothing);
    ROUGE-L is the best LCS-based F1 over the individual references.

    Args:
        generated_caption: The caption to score.
        ground_truth_captions: Iterable of reference captions.

    Returns:
        dict with keys ``'BLEU-1'``, ``'BLEU-4'`` and ``'ROUGE-L'``. A BLEU
        value falls back to ``0.0`` when its computation raises.
    """
    smoothing = SmoothingFunction()

    # Tokenize once; both BLEU and ROUGE-L use the same tokens.
    gen_tokens = generated_caption.lower().split()
    gt_tokens_list = [gt.lower().split() for gt in ground_truth_captions]

    def safe_bleu(weights):
        # BLEU is best-effort, but catch Exception rather than everything so
        # that KeyboardInterrupt / SystemExit still reach the caller.
        try:
            return sentence_bleu(gt_tokens_list, gen_tokens,
                                 weights=weights,
                                 smoothing_function=smoothing.method1)
        except Exception:
            return 0.0

    def lcs_length(x, y):
        # Longest-common-subsequence DP; only the previous row is kept.
        previous = [0] * (len(y) + 1)
        for x_item in x:
            current = [0]
            for j, y_item in enumerate(y, start=1):
                if x_item == y_item:
                    current.append(previous[j - 1] + 1)
                else:
                    current.append(max(previous[j], current[j - 1]))
            previous = current
        return previous[-1]

    def simple_rouge_l(candidate_words, reference_words):
        if not candidate_words or not reference_words:
            return 0.0

        lcs_len = lcs_length(candidate_words, reference_words)
        if lcs_len == 0:
            return 0.0

        # lcs_len > 0 here, so precision + recall is strictly positive.
        precision = lcs_len / len(candidate_words)
        recall = lcs_len / len(reference_words)
        return 2 * precision * recall / (precision + recall)

    bleu_1 = safe_bleu((1.0, 0, 0, 0))
    bleu_4 = safe_bleu((0.25, 0.25, 0.25, 0.25))

    # ROUGE-L: best score over the individual references.
    rouge_l = max(
        (simple_rouge_l(gen_tokens, gt_tokens) for gt_tokens in gt_tokens_list),
        default=0.0,
    )

    return {
        'BLEU-1': bleu_1,
        'BLEU-4': bleu_4,
        'ROUGE-L': rouge_l
    }

def analyze_captions():
    """Score every image that has both a Qwen caption and ground-truth captions.

    For each common image ID this computes the factual score, the per-image
    CHAIR object lists and BLEU-1 / BLEU-4 / ROUGE-L. Images whose analysis
    raises are reported and skipped. Object counts and
    ``chair_regression_score`` (hallucinated / mentioned objects, 0 when no
    object is mentioned) are then derived. The per-image table is written to
    ``OUTPUT_CSV_PATH`` and the aggregate statistics to ``SUMMARY_PATH``.

    Returns:
        tuple: ``(df, stats)``. ``(None, None)`` is returned when there are no
        common images or when no image could be analyzed, so callers can
        always unpack the result.
    """
    print("🔬 Starting comprehensive caption analysis...")

    # Load data
    ground_truth_captions = load_coco_ground_truth()
    qwen_captions = load_qwen_captions()

    # Find common images using normalized IDs
    common_images = set(ground_truth_captions.keys()) & set(qwen_captions.keys())
    print(f"📊 Found {len(common_images)} images with both Qwen and ground truth captions")

    if len(common_images) == 0:
        print("❌ No common images found even after normalization! Check image ID formats.")
        print(f"Sample Qwen normalized IDs: {list(qwen_captions.keys())[:5]}")
        print(f"Sample GT normalized IDs: {list(ground_truth_captions.keys())[:5]}")
        # Return a pair (not a bare None) because the caller unpacks two values.
        return None, None

    # Analyze each image
    results = []

    print("🧮 Computing comprehensive scores...")
    for image_id in tqdm(sorted(common_images), desc="Analyzing images"):
        try:
            generated_caption = qwen_captions[image_id]
            gt_captions = ground_truth_captions[image_id]

            factual_score = compute_factual_score(generated_caption, gt_captions)
            chair_metrics = calculate_chair_metrics_individual(generated_caption, gt_captions)
            standard_metrics = calculate_standard_metrics_individual(generated_caption, gt_captions)

            results.append({
                'image_id': image_id,
                'generated_caption': generated_caption,
                'ground_truth_captions': '; '.join(gt_captions),
                'factual_score': factual_score,
                'has_hallucination': chair_metrics['has_hallucination'],
                'mentioned_objects': '; '.join(chair_metrics['mentioned_objects']),
                'hallucinated_objects': '; '.join(chair_metrics['hallucinated_objects']),
                'ground_truth_objects': '; '.join(chair_metrics['ground_truth_objects']),
                'BLEU-1': standard_metrics['BLEU-1'],
                'BLEU-4': standard_metrics['BLEU-4'],
                'ROUGE-L': standard_metrics['ROUGE-L']
            })

        except Exception as e:
            print(f"Error analyzing image {image_id}: {e}")
            continue

    if not results:
        # An empty DataFrame has no columns, so the column math below would
        # raise KeyError; stop here instead.
        print("❌ No images could be analyzed; nothing was saved.")
        return None, None

    # Convert to DataFrame
    df = pd.DataFrame(results)

    # Add the requested additional columns
    print("📊 Adding additional computed columns...")

    def count_items(joined):
        # Object lists were joined with '; ', so an empty string means zero.
        return len(joined.split(";")) if pd.notna(joined) and joined.strip() else 0

    for source_column in ("hallucinated_objects", "mentioned_objects", "ground_truth_objects"):
        df[f"{source_column}_count"] = df[source_column].apply(count_items)

    df["chair_regression_score"] = np.where(
        df["mentioned_objects_count"] == 0,
        0,
        df["hallucinated_objects_count"] / df["mentioned_objects_count"]
    )

    # Compute overall statistics
    stats = {
        'total_images': len(df),
        'average_factual_score': float(df['factual_score'].mean()),
        'hallucination_rate': float(df['has_hallucination'].mean()),
        'average_chair_regression_score': float(df['chair_regression_score'].mean()),
        'average_bleu_1': float(df['BLEU-1'].mean()),
        'average_bleu_4': float(df['BLEU-4'].mean()),
        'average_rouge_l': float(df['ROUGE-L'].mean()),
        'average_mentioned_objects': float(df['mentioned_objects_count'].mean()),
        'average_hallucinated_objects': float(df['hallucinated_objects_count'].mean()),
        'average_ground_truth_objects': float(df['ground_truth_objects_count'].mean())
    }

    # Save CSV file
    print(f"💾 Saving comprehensive scores to CSV...")
    df.to_csv(OUTPUT_CSV_PATH, index=False)

    # Save summary statistics
    summary_data = {
        'analysis_date': pd.Timestamp.now().isoformat(),
        'model_type': 'Qwen2.5-VL',
        'total_images_analyzed': len(df),
        'statistics': stats,
        'column_descriptions': {
            'factual_score': 'BERT-based semantic similarity with ground truth (0-1)',
            'has_hallucination': 'Boolean indicating presence of hallucinated objects',
            'chair_regression_score': 'Ratio of hallucinated to mentioned objects (0-1)',
            'mentioned_objects_count': 'Number of COCO objects mentioned in caption',
            'hallucinated_objects_count': 'Number of COCO objects mentioned but not in ground truth',
            'ground_truth_objects_count': 'Number of COCO objects in ground truth captions',
            'BLEU-1': 'BLEU-1 score (0-1)',
            'BLEU-4': 'BLEU-4 score (0-1)',
            'ROUGE-L': 'ROUGE-L F1 score (0-1)'
        }
    }

    with open(SUMMARY_PATH, 'w') as f:
        json.dump(summary_data, f, indent=2)

    return df, stats

def print_analysis_summary(df, stats):
    """Print the aggregate statistics and the extreme examples to stdout.

    Args:
        df: Per-image results; the ``factual_score``,
            ``chair_regression_score``, ``generated_caption`` and
            ``hallucinated_objects_count`` columns are read.
        stats: Mapping of aggregate statistic names to numbers.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("📊 QWEN2.5-VL COMPREHENSIVE ANALYSIS SUMMARY")
    print(divider)

    print(f"🔢 Total Images Analyzed: {stats['total_images']}")
    print(f"📈 Average Factual Score: {stats['average_factual_score']:.4f}")
    rate = stats['hallucination_rate']
    print(f"🚨 Hallucination Rate: {rate:.4f} ({rate*100:.1f}%)")
    print(f"📊 Average CHAIR Regression Score: {stats['average_chair_regression_score']:.4f}")

    print("\n📝 Standard Captioning Metrics:")
    caption_metrics = (
        ("BLEU-1", 'average_bleu_1'),
        ("BLEU-4", 'average_bleu_4'),
        ("ROUGE-L", 'average_rouge_l'),
    )
    for label, key in caption_metrics:
        print(f"   {label}: {stats[key]:.4f}")

    print("\n🔍 Object Detection Analysis:")
    object_metrics = (
        ("Mentioned", 'average_mentioned_objects'),
        ("Hallucinated", 'average_hallucinated_objects'),
        ("Ground Truth", 'average_ground_truth_objects'),
    )
    for label, key in object_metrics:
        print(f"   Average {label} Objects: {stats[key]:.2f}")

    def show_examples(heading, select):
        # ``select`` is df.nlargest or df.nsmallest; three rows are listed.
        print(heading)
        chosen = select(3, 'factual_score')
        for rank, (_, record) in enumerate(chosen.iterrows(), start=1):
            print(f"   {rank}. Factual Score: {record['factual_score']:.4f}, CHAIR: {record['chair_regression_score']:.4f}")
            print(f"      Generated: '{record['generated_caption'][:100]}...'")
            print(f"      Hallucinated Objects: {record['hallucinated_objects_count']}")

    show_examples("\n🏆 Best Examples (Highest Factual Score):", df.nlargest)
    show_examples("\n🔍 Worst Examples (Lowest Factual Score):", df.nsmallest)

def main():
    """Run the caption scoring pipeline and report where the outputs went.

    Stops early when the Qwen captions file is missing or when the analysis
    produced nothing to report. ``KeyboardInterrupt`` and any other exception
    are reported on stdout instead of being propagated.
    """
    try:
        print("🚀 Starting Qwen2.5-VL Comprehensive Scoring Pipeline")
        print("=" * 60)

        # Check if Qwen captions exist
        if not os.path.exists(QWEN_CAPTIONS_PATH):
            print(f"❌ Qwen captions file not found: {QWEN_CAPTIONS_PATH}")
            print("   Please run the caption extraction script first!")
            return

        # Run analysis. Guard against a None result so that unpacking never
        # raises a misleading TypeError.
        outcome = analyze_captions()
        if outcome is None:
            return
        df, stats = outcome

        if df is None or len(df) == 0:
            print("\n⚠️  No images were analyzed; no summary to report")
            return

        # Print summary
        print_analysis_summary(df, stats)

        print(f"\n📁 Output Files:")
        print(f"   Comprehensive CSV: {OUTPUT_CSV_PATH}")
        print(f"   Summary Statistics: {SUMMARY_PATH}")

        print(f"\n📊 CSV File Contains {len(df)} rows with columns:")
        for col in df.columns:
            print(f"   • {col}")

        print(f"\n✨ Analysis completed successfully!")

    except KeyboardInterrupt:
        print(f"\n⚠️  Analysis interrupted by user")
    except Exception as e:
        print(f"\n❌ Error in main: {e}")
        traceback.print_exc()

# Entry point: run the scoring pipeline when this code is executed directly.
if __name__ == "__main__":
    main()

In [None]:
import os
import json
import h5py
import pandas as pd
import numpy as np
from tqdm import tqdm
import gc

def normalize_image_id(image_id):
    """Normalize an image ID so that H5 keys and CSV IDs can be matched.

    Rules:
      * Numbers (Python or NumPy scalars) become their integer value,
        zero-padded to 12 digits.
      * Strings containing ``_`` are reduced to the part after the last
        underscore.
      * A resulting string made only of ASCII digits is zero-padded to 12
        digits, so ``42``, ``"42"`` and ``"img_42"`` all give the same key.
      * Anything else is returned as ``str(image_id)``.

    Args:
        image_id: An H5 group name or a value from the CSV ``image_id`` column.

    Returns:
        str: The normalized ID.
    """
    # np.integer is listed explicitly: NumPy integer scalars are not
    # instances of the builtin int.
    if isinstance(image_id, (int, float, np.integer, np.floating)):
        try:
            return str(int(image_id)).zfill(12)
        except (ValueError, OverflowError):
            # NaN / infinity cannot be converted to an integer ID.
            return str(image_id)

    if isinstance(image_id, str):
        token = image_id.split('_')[-1] if '_' in image_id else image_id
        if token.isascii() and token.isdigit():
            return token.zfill(12)
        return token

    return str(image_id)

def load_scores_data(csv_path):
    """Read the scores CSV and index its rows by normalized image ID.

    Args:
        csv_path: Path of the scores CSV; it must have an ``image_id`` column.

    Returns:
        dict: normalized ID -> dict of caption and score fields. A field whose
        column is absent gets the default listed below. When two rows share a
        normalized ID, the later row wins.
    """
    print("📊 Loading scores data...")

    table = pd.read_csv(csv_path)
    print(f"   Loaded {len(table)} rows")

    # Same normalization as the H5 side, so keys can be compared directly.
    table['norm_id'] = table['image_id'].apply(normalize_image_id)

    # (column, default used when the column is missing)
    field_defaults = (
        ('generated_caption', ''),
        ('ground_truth_captions', ''),
        ('factual_score', 0.0),
        ('has_hallucination', False),
        ('BLEU-1', 0.0),
        ('BLEU-4', 0.0),
        ('ROUGE-L', 0.0),
        ('chair_regression_score', 0.0),
    )

    lookup = {
        record['norm_id']: {
            field: record.get(field, fallback)
            for field, fallback in field_defaults
        }
        for _, record in table.iterrows()
    }

    print(f"   Created lookup dict for {len(lookup)} images")
    return lookup

def discover_embedding_structure(embeddings_dir):
    """Inspect one H5 file and list the embedding datasets it offers.

    Only the first ``.h5`` file returned by ``os.listdir`` and the first group
    inside it are examined.

    Returns:
        dict: config name -> list of keys leading from an image group to the
        embedding. Empty when the directory holds no ``.h5`` file.
    """
    print("🔍 Discovering embedding structure...")

    candidates = [name for name in os.listdir(embeddings_dir) if name.endswith('.h5')]
    if len(candidates) == 0:
        return {}

    discovered = {}
    first_path = os.path.join(embeddings_dir, candidates[0])

    with h5py.File(first_path, 'r') as handle:
        first_key = list(handle.keys())[0]
        group = handle[first_key]

        print(f"   Sample image: {first_key}")
        print(f"   Available keys: {list(group.keys())}")

        # Vision embeddings sit directly under the image group.
        if 'vision_embeddings' in group:
            discovered['vision_embeddings'] = ['vision_embeddings']
            print(f"   ✅ Found vision_embeddings: {group['vision_embeddings'].shape}")

        # Pre-generation embeddings are nested as layer -> embedding type.
        if 'pre_generation' in group:
            stage = group['pre_generation']
            layer_names = list(stage.keys())
            print(f"   📥 Pre-generation layers: {layer_names}")

            for layer_name in layer_names:
                kinds = list(stage[layer_name].keys())
                print(f"     {layer_name}: {kinds}")

                discovered.update(
                    (
                        f"pre_generation_{layer_name}_{kind.replace('_embeddings', '')}",
                        ['pre_generation', layer_name, kind],
                    )
                    for kind in kinds
                )

    print(f"   📋 Discovered {len(discovered)} embedding types")
    return discovered

def extract_embedding(img_group, path):
    """Follow ``path`` through nested groups and return the value as an array.

    Args:
        img_group: Mapping-like root (for example an H5 group).
        path: Sequence of keys to follow, outermost first.

    Returns:
        numpy.ndarray of the object reached, or ``None`` when a key is
        missing, the object has no ``shape`` attribute, or any error occurs.
    """
    try:
        node = img_group
        for key in path:
            if key not in node:
                return None
            node = node[key]

        # Only array-like objects (those exposing ``shape``) are converted.
        return np.array(node) if hasattr(node, 'shape') else None

    except Exception:
        return None

def process_single_config(config_name, path, embeddings_dir, scores_dict, output_dir, chunk_size=1000):
    """Build one CSV dataset that pairs an embedding type with caption scores.

    Every ``.h5`` file in ``embeddings_dir`` is read in sorted order. For each
    image group the embedding at ``path`` is extracted; when the normalized
    image ID is present in ``scores_dict`` a row is queued. Rows are flushed
    to ``qwen_dataset_<config_name>.csv`` in ``output_dir`` every
    ``chunk_size`` rows to bound memory use.

    Args:
        config_name: Name used in the output file name and in messages.
        path: Keys leading from an image group to the embedding.
        embeddings_dir: Directory containing the ``.h5`` files.
        scores_dict: Normalized image ID -> dict of caption/score fields.
        output_dir: Directory that receives the CSV and its summary.
        chunk_size: Number of queued rows that triggers a write.

    Returns:
        bool: ``True`` when at least one row was matched, ``False`` otherwise.
    """
    print(f"\n🔨 Processing: {config_name}")
    print(f"   Path: {' -> '.join(path)}")

    # Sorted so files are processed in a stable order.
    h5_files = sorted(f for f in os.listdir(embeddings_dir) if f.endswith('.h5'))

    output_file = os.path.join(output_dir, f"qwen_dataset_{config_name}.csv")

    pending_rows = []
    processed_count = 0
    matched_count = 0
    skipped_count = 0
    first_error = None
    # Explicit flag: the first successful write truncates the file and emits
    # the header; every later write appends. This does not depend on the
    # counters staying in sync with the rows actually queued.
    first_chunk = True

    def flush():
        nonlocal pending_rows, first_chunk
        save_chunk(pending_rows, output_file, is_first=first_chunk)
        first_chunk = False
        pending_rows = []
        gc.collect()  # Release the just-written rows promptly

    for h5_file in tqdm(h5_files, desc=f"Processing {config_name}"):
        file_path = os.path.join(embeddings_dir, h5_file)

        try:
            with h5py.File(file_path, 'r') as f:
                for image_id in list(f.keys()):
                    try:
                        embedding = extract_embedding(f[image_id], path)
                        if embedding is None:
                            continue
                        processed_count += 1

                        scores = scores_dict.get(normalize_image_id(image_id))
                        if scores is None:
                            continue

                        row = {
                            'image_id': image_id,
                            'corresponding_embedding': embedding.tolist(),
                            'generated_caption': scores['generated_caption'],
                            'ground_truth_captions': scores['ground_truth_captions'],
                            'factual_score': scores['factual_score'],
                            'has_hallucination': scores['has_hallucination'],
                            'BLEU-1': scores['BLEU-1'],
                            'BLEU-4': scores['BLEU-4'],
                            'ROUGE-L': scores['ROUGE-L'],
                            'chair_regression_score': scores['chair_regression_score']
                        }

                        # Count the match only once the row really exists.
                        pending_rows.append(row)
                        matched_count += 1

                        if len(pending_rows) >= chunk_size:
                            flush()

                    except Exception as e:
                        # Best effort per image, but keep track of failures
                        # instead of dropping them silently.
                        skipped_count += 1
                        if first_error is None:
                            first_error = f"{image_id}: {e}"
                        continue

        except Exception as e:
            print(f"   Error processing {h5_file}: {e}")
            continue

    # Save remaining rows
    if pending_rows:
        flush()

    print(f"   ✅ Processed {processed_count} embeddings, matched {matched_count}")
    if skipped_count:
        print(f"   ⚠️  {skipped_count} image(s) raised errors (first: {first_error})")

    if matched_count > 0:
        print(f"   💾 Saved to: {output_file}")

        # Create summary
        create_summary(output_file, config_name, path, matched_count)
        return True
    else:
        print(f"   ❌ No matches found for {config_name}")
        return False

def save_chunk(rows, output_file, is_first=True):
    """Write ``rows`` to ``output_file`` as CSV.

    The first chunk overwrites the file and includes the header row; later
    chunks are appended without a header.
    """
    write_mode = 'a'
    if is_first:
        write_mode = 'w'

    pd.DataFrame(rows).to_csv(
        output_file,
        mode=write_mode,
        header=is_first,
        index=False,
    )

def create_summary(csv_file, config_name, path, total_samples):
    """Write a ``*_summary.json`` file next to a dataset CSV.

    The embedding dimension is taken from the first data row. Score
    statistics are added only when ``total_samples`` is below 10000; larger
    datasets get a short note instead. Any failure is reported as a warning
    and no exception is propagated.

    Args:
        csv_file: Path of the dataset CSV.
        config_name: Embedding configuration name (used in ``dataset_name``).
        path: Keys that locate the embedding inside an image group.
        total_samples: Number of rows written to the CSV.
    """
    # Local import: only this function needs it.
    import ast

    try:
        # Read just one cell to get the embedding dimension.
        sample_df = pd.read_csv(csv_file, nrows=1, usecols=['corresponding_embedding'])
        # literal_eval parses the stored list without executing arbitrary
        # expressions that might appear in the file.
        embedding_dim = len(ast.literal_eval(sample_df['corresponding_embedding'].iloc[0]))

        summary = {
            'dataset_name': f"qwen_dataset_{config_name}",
            'total_samples': total_samples,
            'embedding_dimension': embedding_dim,
            'embedding_path': path,
        }

        if total_samples < 10000:
            # Load only the score columns; the embedding column dominates the
            # file size and is not needed for these statistics.
            stats_df = pd.read_csv(
                csv_file,
                usecols=['factual_score', 'has_hallucination', 'BLEU-4', 'ROUGE-L'],
            )
            summary['statistics'] = {
                'factual_score': {
                    'mean': float(stats_df['factual_score'].mean()),
                    'std': float(stats_df['factual_score'].std()),
                    'min': float(stats_df['factual_score'].min()),
                    'max': float(stats_df['factual_score'].max())
                },
                'hallucination_rate': float(stats_df['has_hallucination'].mean()),
                'avg_bleu4': float(stats_df['BLEU-4'].mean()),
                'avg_rouge_l': float(stats_df['ROUGE-L'].mean())
            }
        else:
            summary['note'] = 'Statistics not computed for large dataset to save memory'

        # splitext touches only the final extension; str.replace would also
        # rewrite '.csv' inside directory names, and would overwrite the CSV
        # itself if the file name had no '.csv' at all.
        summary_file = os.path.splitext(csv_file)[0] + '_summary.json'
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)

    except Exception as e:
        print(f"   Warning: Could not create summary: {e}")

def main():
    """Create one CSV dataset per embedding type found in the H5 files.

    Verifies that the embeddings directory and the scores CSV exist, loads
    the scores, discovers the available embedding configurations and
    processes each one, counting successes and failures. Progress and the
    final tally are printed to stdout.
    """
    # Paths
    embeddings_dir = "/root/akhil_workspace/qwen_model/extracted_embeddings_coco2014"
    scores_csv_path = "/root/akhil_workspace/qwen_model/evaluation_results/qwen2_comprehensive_scores.csv"
    output_dir = "/root/akhil_workspace/qwen_model/comprehensive_datasets"

    rule = "=" * 50
    banner = (
        "🚀 Simple Qwen Dataset Creation",
        rule,
        "Features:",
        "  • Memory-efficient chunked processing",
        "  • No complex GPU operations",
        "  • Robust error handling",
        "  • Saves datasets incrementally",
        rule,
    )
    for line in banner:
        print(line)

    # Both inputs must exist before any work starts.
    required_inputs = (
        ("Embeddings directory", embeddings_dir),
        ("Scores CSV", scores_csv_path),
    )
    for label, location in required_inputs:
        if not os.path.exists(location):
            print(f"❌ {label} not found: {location}")
            return

    os.makedirs(output_dir, exist_ok=True)

    scores_dict = load_scores_data(scores_csv_path)
    configs = discover_embedding_structure(embeddings_dir)

    if not configs:
        print("❌ No embedding configurations found!")
        return

    print(f"\n🎯 Will create {len(configs)} datasets:")
    for config_name in configs:
        print(f"   • {config_name}")

    successful = 0
    failed = 0

    for config_name, path in configs.items():
        try:
            succeeded = process_single_config(
                config_name, path, embeddings_dir, scores_dict, output_dir
            )
        except Exception as err:
            print(f"   ❌ Failed {config_name}: {err}")
            succeeded = False

        if succeeded:
            successful += 1
        else:
            failed += 1

        # Force cleanup after each config
        gc.collect()

    print("\n🎉 Dataset Creation Complete!")
    print(f"   ✅ Successful: {successful}")
    print(f"   ❌ Failed: {failed}")
    print(f"   📁 Output directory: {output_dir}")

    if successful > 0:
        contents = (
            "\n📋 Each dataset contains:",
            "   • image_id, corresponding_embedding, generated_caption",
            "   • ground_truth_captions, factual_score, has_hallucination",
            "   • BLEU-1, BLEU-4, ROUGE-L, chair_regression_score",
        )
        for line in contents:
            print(line)

# Entry point: build the datasets when this code is executed directly.
if __name__ == "__main__":
    main()

In [None]:
# 300 and 500_dataset_creation Missing