# MECH_INTERP_PHYSICS_REASONING - Kaggle Evaluation

This notebook allows you to run the PaliGemma evaluation on Kaggle with GPU support.

## ⚠️ IMPORTANT: Setup GPU Runtime First!

Before running this notebook:
1. In the Kaggle Notebook editor, go to `Settings` (right sidebar).
2. Under `Accelerator`, select `GPU` (e.g., T4 x2, P100).
3. Ensure your Code and Data Kaggle Datasets are added via `File` -> `Add or upload data`.

## 1. Check GPU Availability

In [None]:
# Report the PyTorch build and the CUDA device (if any) visible to this kernel.
import torch

cuda_ready = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if not cuda_ready:
    print("⚠️ No GPU detected or CUDA not available! Please enable GPU in Notebook settings.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.1f} GB")

## 2. Setup Project Environment

This notebook assumes that your project code (including this notebook, `scripts/`, `src/`, and `base_eval_config.yaml`) is in a Kaggle Dataset named `mech-interp-project-code`, and that your data (`test_frames/`, `miscellaneous/`) is in a Kaggle Dataset named `mech-interp-clevrer-data`.

In [None]:
import os

# --- IMPORTANT: Adjust these paths if your Kaggle dataset names are different ---
KAGGLE_CODE_DATASET_PATH = '/kaggle/input/mech-interp-project-code/' # Root of your code dataset
KAGGLE_DATA_DATASET_PATH = '/kaggle/input/mech-interp-clevrer-data/' # Root of your data dataset
# ---

# Change working directory to the root of your project code
if os.path.exists(KAGGLE_CODE_DATASET_PATH):
    os.chdir(KAGGLE_CODE_DATASET_PATH)
    print(f"Changed working directory to: {os.getcwd()}")
else:
    print(f"ERROR: Code dataset path not found: {KAGGLE_CODE_DATASET_PATH}")
    print("Please ensure you've added your code dataset and the path is correct.")

print("\nCurrent directory listing:")
!ls -la

# Define project root for scripts (should be the current KAGGLE_CODE_DATASET_PATH)
os.environ['HOME_DIR'] = os.getcwd() 
print(f"HOME_DIR environment variable set to: {os.environ['HOME_DIR']}")

## 3. Install Dependencies

In [None]:
# Install required packages
!pip install -q torch torchvision transformers>=4.36.0
!pip install -q peft accelerate bitsandbytes
!pip install -q Pillow numpy pyyaml
!pip install -q wandb pytz # wandb and pytz might not be strictly needed for eval but kept from original

# Verify installations
import transformers
import peft
print(f"✓ Transformers version: {transformers.__version__}")
print(f"✓ PEFT version: {peft.__version__}")
print("✓ Key packages queryable after installation attempt!")

## 4. Check and Update Configuration for Kaggle

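This step loads `base_eval_config.yaml` from your code dataset, points its data paths at your Kaggle data dataset, sets `test_size`, and saves the updated copy to `/kaggle/working/` (the original file under `/kaggle/input/` is left unchanged).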

In [None]:
import yaml
import os
import json

config_filename = "base_eval_config.yaml"
# Config path is now relative to KAGGLE_CODE_DATASET_PATH (current working directory)
config_path_abs = os.path.join(os.getcwd(), config_filename)

if os.path.exists(config_path_abs):
    print(f"Found config file: {config_path_abs}")
    # Read current config
    with open(config_path_abs, 'r') as f:
        config = yaml.safe_load(f)

    print(f"Original test_size: {config['data_config'].get('test_size', 'Not set')}")
    print(f"Original data_path: {config['data_config'].get('data_path', 'Not set')}")
    print(f"Original json_path: {config['data_config'].get('json_path', 'Not set')}")

    # Update data paths to point to the Kaggle data dataset
    # KAGGLE_DATA_DATASET_PATH was defined in cell "2. Setup Project Environment"
    config['data_config']['data_path'] = os.path.join(KAGGLE_DATA_DATASET_PATH, 'test_frames')
    config['data_config']['json_path'] = os.path.join(KAGGLE_DATA_DATASET_PATH, 'miscellaneous/validation.json')

    # Update test_size: evaluate on all samples in validation.json or a fixed number
    # Option 1: Evaluate on all samples from validation.json
    abs_validation_json_path = config['data_config']['json_path']
    if os.path.exists(abs_validation_json_path):
        with open(abs_validation_json_path, 'r') as ann_file:
            num_samples = len(json.load(ann_file))
        config['data_config']['test_size'] = num_samples 
        print(f"✓ Set test_size to evaluate all {num_samples} samples from {abs_validation_json_path}.")
    else:
        print(f"⚠️ Validation JSON not found at {abs_validation_json_path}. test_size not updated dynamically. Ensure KAGGLE_DATA_DATASET_PATH is correct.")
        # Fallback or keep original if file not found, e.g.:
        # config['data_config']['test_size'] = config['data_config'].get('test_size', 100) # or a default

    # Option 2: Use a fixed test_size (e.g., 100, as in original Colab)
    # Uncomment below and comment out Option 1 if you prefer a fixed size
    # current_test_size = config['data_config'].get('test_size', 0)
    # if current_test_size < 10: # Or whatever threshold you prefer
    #    config['data_config']['test_size'] = 100
    #    print(f"✓ Updated test_size to: {config['data_config']['test_size']}")
    # else:
    #    print(f"✓ test_size is already: {config['data_config']['test_size']}")

    # Save the updated config back to the same path (within the code dataset)
    # Note: This modifies the config file in the /kaggle/input (read-only) if not careful.
    # For Kaggle, it's better to load this config in the script and use these updated paths directly,
    # OR copy the config to /kaggle/working and modify it there.
    # The eval_kaggle.py script is designed to read these paths from the config dictionary passed to it.
    # So, we just need to ensure the script gets this modified 'config' dictionary.
    # The current eval_kaggle.py reads the config file itself. So we DO need to write it.
    # Let's write it to /kaggle/working/ for this run.

    KAGGLE_WORKING_DIR = "/kaggle/working/"
    os.makedirs(KAGGLE_WORKING_DIR, exist_ok=True)
    updated_config_path_in_working = os.path.join(KAGGLE_WORKING_DIR, config_filename)

    with open(updated_config_path_in_working, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    
    print(f"✓ Updated data_path to: {config['data_config']['data_path']}")
    print(f"✓ Updated json_path to: {config['data_config']['json_path']}")
    print(f"✓ Updated config saved to: {updated_config_path_in_working}")
    print("The evaluation script will need to use this updated config file.")

else:
    print(f"❌ Config file not found at {config_path_abs}. Ensure it's in your code dataset and CWD is correct.")
    print("Available files in CWD:")
    !ls -la *.yaml

## 5. Test Data Availability

Verify that the data paths (now pointing to your Kaggle data dataset) are accessible.

In [None]:
import os

# Relies on KAGGLE_DATA_DATASET_PATH from cell "2. Setup Project Environment".

print("Checking data directories based on KAGGLE_DATA_DATASET_PATH...")
data_dirs_to_check = [
    os.path.join(KAGGLE_DATA_DATASET_PATH, subdir)
    for subdir in ('test_frames', 'miscellaneous')
]
for dir_path in data_dirs_to_check:
    # isdir() is already False for paths that do not exist.
    if not os.path.isdir(dir_path):
        print(f"❌ {dir_path}: NOT FOUND or not a directory. Check KAGGLE_DATA_DATASET_PATH and dataset contents.")
        continue
    count = len(os.listdir(dir_path))
    print(f"✓ {dir_path}: Exists with {count} items")

print("\nChecking annotation files...")
ann_files_to_check = [
    os.path.join(KAGGLE_DATA_DATASET_PATH, rel_path)
    for rel_path in (
        'miscellaneous/validation.json',
        'miscellaneous/train.json',  # Optional, for completeness
    )
]
for ann_file_path in ann_files_to_check:
    # isfile() is already False for paths that do not exist.
    if not os.path.isfile(ann_file_path):
        print(f"❌ {ann_file_path}: NOT FOUND or not a file. Check KAGGLE_DATA_DATASET_PATH and dataset contents.")
        continue
    print(f"✓ {ann_file_path}: EXISTS")

## 6. Run Evaluation

The `scripts/eval_kaggle.py` (which should be in your code dataset) will be executed.
It will use the `base_eval_config.yaml` that we've updated and saved to `/kaggle/working/`.

In [None]:
# Ensure the current working directory is still the project root from the code dataset
# os.getcwd() should be KAGGLE_CODE_DATASET_PATH
print(f"Current working directory for script execution: {os.getcwd()}")

# The eval_kaggle.py script needs to be told to use the config from /kaggle/working/
# We need to modify eval_kaggle.py to accept a config path argument, or adjust its logic.
# For now, let's assume eval_kaggle.py is modified to look for config in /kaggle/working/ if a specific env var is set, or passed as arg.
# The current eval_kaggle.py joins HOME_DIR with 'base_eval_config.yaml'.
# So, the command should be run from a place where HOME_DIR is /kaggle/working/ if we want it to pick up the modified config directly,
# OR we pass the config path as an argument to eval_kaggle.py.

# Let's adjust eval_kaggle.py to accept --config_file argument.
# (This change would need to be made to the script itself, then re-upload to dataset or modify here if written by notebook)
# For now, the script uses os.path.join(HOME_DIR, "base_eval_config.yaml").
# HOME_DIR is set to KAGGLE_CODE_DATASET_PATH.
# So, the script will try to load /kaggle/input/mech-interp-project-code/base_eval_config.yaml (original one).

# To use the MODIFIED config, we need to ensure the script loads it from /kaggle/working/base_eval_config.yaml
# Simplest way without changing script args: copy the script to /kaggle/working, cd there, and run.

KAGGLE_WORKING_DIR = "/kaggle/working/"
SCRIPT_NAME = "eval_kaggle.py"
SOURCE_SCRIPT_PATH = os.path.join(os.environ['HOME_DIR'], "scripts", SCRIPT_NAME)
DEST_SCRIPT_PATH = os.path.join(KAGGLE_WORKING_DIR, SCRIPT_NAME) # Script in /kaggle/working/

if os.path.exists(SOURCE_SCRIPT_PATH):
    import shutil
    shutil.copy(SOURCE_SCRIPT_PATH, DEST_SCRIPT_PATH)
    print(f"Copied {SCRIPT_NAME} to {DEST_SCRIPT_PATH}")

    # The config file 'base_eval_config.yaml' is already in KAGGLE_WORKING_DIR.
    # The script eval_kaggle.py, when run from KAGGLE_WORKING_DIR and if HOME_DIR is set to KAGGLE_WORKING_DIR,
    # will pick up config from KAGGLE_WORKING_DIR/base_eval_config.yaml.

    print("Running evaluation script from /kaggle/working/...")
    # Temporarily change CWD and HOME_DIR for the script execution context
    original_cwd = os.getcwd()
    original_home_dir_env = os.environ.get('HOME_DIR')

    os.chdir(KAGGLE_WORKING_DIR)
    os.environ['HOME_DIR'] = KAGGLE_WORKING_DIR # So script finds config in /kaggle/working/
    print(f"Temporarily changed CWD to: {os.getcwd()}")
    print(f"Temporarily changed HOME_DIR to: {os.environ['HOME_DIR']}")

    !python {SCRIPT_NAME} --base 

    # Restore original CWD and HOME_DIR
    os.chdir(original_cwd)
    if original_home_dir_env is not None:
        os.environ['HOME_DIR'] = original_home_dir_env
    else:
        del os.environ['HOME_DIR'] # if it wasn't set before
    print(f"Restored CWD to: {os.getcwd()}")
    print(f"Restored HOME_DIR to: {os.environ.get('HOME_DIR')}")
else:
    print(f"ERROR: Source script {SOURCE_SCRIPT_PATH} not found.")

# If you have a checkpoint to evaluate (e.g., in your code dataset under 'artifacts/my_checkpoint'):
# CHECKPOINT_RELATIVE_PATH = "artifacts/my_checkpoint" # Relative to KAGGLE_CODE_DATASET_PATH
# !python {SCRIPT_NAME} {CHECKPOINT_RELATIVE_PATH} # This would need script to handle HOME_DIR correctly for checkpoint path too.
# The current script expects checkpoint_dir to be relative to HOME_DIR. So if HOME_DIR is /kaggle/working, this won't find it in /kaggle/input.
# For checkpoint evaluation, eval_kaggle.py might need more robust path handling for checkpoint_dir if it's not in HOME_DIR.

## 7. View Results

Results are saved in `/kaggle/working/artifacts/BASE/eval_<timestamp>/`.

In [None]:
import glob
import json
import os

KAGGLE_WORKING_DIR = "/kaggle/working/"
RESULTS_PARENT_DIR = os.path.join(KAGGLE_WORKING_DIR, "artifacts", "BASE")

if not os.path.exists(RESULTS_PARENT_DIR):
    print(f"Results parent directory {RESULTS_PARENT_DIR} not found. Run the evaluation first!")
else:
    result_dirs = glob.glob(os.path.join(RESULTS_PARENT_DIR, "eval_*"))
    if not result_dirs:
        print(f"No 'eval_*' directories found in {RESULTS_PARENT_DIR}. Run the evaluation first!")
    else:
        # Pick the run directory with the greatest ctime as the latest one.
        latest_dir = max(result_dirs, key=os.path.getctime)
        print(f"Latest results directory: {latest_dir}")

        # --- Progressive summary (plain text) ---
        summary_file = os.path.join(latest_dir, "eval_summary_progressive.txt")
        if not os.path.exists(summary_file):
            print(f"Summary file not found: {summary_file}")
        else:
            banner = "=" * 50
            with open(summary_file, 'r') as f:
                print("\n" + banner)
                print("EVALUATION SUMMARY (PROGRESSIVE)")
                print(banner)
                print(f.read())

        # --- Progressive per-sample details (JSON) ---
        details_file = os.path.join(latest_dir, "eval_details_progressive.json")
        if not os.path.exists(details_file):
            print(f"Detailed results file not found: {details_file}")
        else:
            with open(details_file, 'r') as f:
                details = json.load(f)

            print(f"\nTotal samples in detailed results: {len(details)}")

            # Preview the first three records; token ids are shown only for misses.
            print("\nSample predictions from detailed file (first 3):")
            print("-" * 50)
            for sample_no, result in enumerate(details[:3], start=1):
                print(f"\nSample {sample_no}:")
                print(f"  Video: {result['video_filename']}")
                print(f"  Question Type: {result['question_type']}")
                is_correct = result['correct']
                print(f"  Correct: {'✓' if is_correct else '✗'}")
                if not is_correct:
                    print(f"  Predicted tokens (first 10): {result['predicted_token_ids'][:10]}...")
                    print(f"  Expected tokens (first 10): {result['label_token_ids'][:10]}...")

## 8. Persisting Results

To save the contents of `/kaggle/working/` (including your results, logs, and the modified config):
1. Click on `Save Version` in the Kaggle notebook editor (top right).
2. Choose `Save & Run All (Commit)` or `Quick Save`.
3. After the version is saved, you can find the output files in the "Data" tab of your notebook's viewer page.

## Troubleshooting

### Out of Memory (OOM) Error
If you encounter OOM errors, reduce `eval_batch_size` in `/kaggle/working/base_eval_config.yaml` (the copy written by cell 4) — the cell below halves it for you — and then re-run the evaluation cell (cell 6).

In [None]:
# Example: halve eval_batch_size in the working-copy config after an OOM.
# Run this cell MANUALLY if you hit OOM, then re-run cell 6 (Run Evaluation).
import os

import yaml

KAGGLE_WORKING_DIR = "/kaggle/working/"
config_filename = "base_eval_config.yaml"
oom_config_path = os.path.join(KAGGLE_WORKING_DIR, config_filename)

if not os.path.exists(oom_config_path):
    print(f"Config file {oom_config_path} not found. Ensure cell 4 has run successfully.")
else:
    with open(oom_config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Halve the current value (4 is assumed when the key is absent), never going below 1.
    train_section = config['model_train']
    halved = train_section.get('eval_batch_size', 4) // 2
    new_batch_size = max(halved, 1)
    train_section['eval_batch_size'] = new_batch_size

    with open(oom_config_path, 'w') as f:
        yaml.dump(config, f)

    print(f"✓ Batch size in {oom_config_path} reduced to {new_batch_size}. Re-run evaluation cell.")

In [None]:
# Solution 2: Clear GPU memory (less effective if model is already large)
import gc

import torch

# Collect garbage BEFORE emptying the cache: empty_cache() only releases cached blocks
# that no tensor occupies, so tensors that are only freed by the garbage collector
# (e.g. those kept alive by reference cycles) must be collected first.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("✓ GPU memory cleared (if CUDA was available) and Python garbage collected.")

### Debug Single Sample
To debug with a single sample, you might need to temporarily modify `scripts/eval_kaggle.py` or create a small test script. Ensure paths point to your Kaggle data dataset.

In [None]:
import os
import sys
import traceback

# Make the project's `src` package importable.
# os.environ['HOME_DIR'] should be /kaggle/input/mech-interp-project-code/
project_root = os.environ['HOME_DIR']
if project_root not in sys.path:
    # Guarded so that re-running this cell does not keep growing sys.path.
    sys.path.insert(0, project_root)

from src.data import ClevrerDataset  # Assuming ClevrerDataset is in src/data.py

# KAGGLE_DATA_DATASET_PATH was defined in cell "2. Setup Project Environment"
debug_frames_root = os.path.join(KAGGLE_DATA_DATASET_PATH, "test_frames")
debug_json_path = os.path.join(KAGGLE_DATA_DATASET_PATH, "miscellaneous/validation.json")

if os.path.exists(debug_frames_root) and os.path.exists(debug_json_path):
    try:
        dataset = ClevrerDataset(
            frames_root=debug_frames_root,
            json_path=debug_json_path,
            question_type="descriptive",  # or any specific type, or 'all'
            shuffle=False  # Keep shuffle False for consistent debugging
        )

        if len(dataset) > 0:
            sample = dataset[0]  # Get the first sample
            print(f"Sample question: {sample['question']}")
            print(f"Expected answer: {sample['answer']}")
            print(f"Question type: {sample['question_type']}")
            print(f"Number of frames: {len(sample['frames']) if 'frames' in sample and sample['frames'] is not None else 'N/A or 0'}")
        else:
            print("Dataset is empty after initialization!")
    except Exception as e:
        print(f"Error initializing or accessing dataset for debug: {e}")
        # This is a debugging cell: show where the failure happened, not just the message.
        traceback.print_exc()
else:
    print("Debug data paths not found. Check KAGGLE_DATA_DATASET_PATH.")
    print(f"Frames root checked: {debug_frames_root}")
    print(f"JSON path checked: {debug_json_path}")