In [1]:
# CELL 1: Setup & Installation
# ================================================================

print("="*70)
print("  SETTING UP ENVIRONMENT")
print("="*70)

!pip install -q transformers torch torchvision pillow nltk rouge-score huggingface-hub

import torch
print(f"\n‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {torch.cuda.is_available()}")
print(f"‚úÖ GPU count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"‚úÖ GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"   Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.1f} GB")

  SETTING UP ENVIRONMENT

‚úÖ PyTorch version: 2.8.0+cu126
‚úÖ CUDA available: True
‚úÖ GPU count: 2
‚úÖ GPU 0: Tesla T4
   Memory: 14.7 GB
‚úÖ GPU 1: Tesla T4
   Memory: 14.7 GB


In [2]:
#==================== CELL 3: Checking Dataset=====================================
# Locates the directory that holds the NIH metadata CSV, lists what is in it,
# and publishes the result as DATASET_PATH for the cells below.
from pathlib import Path
import os

print("\n" + "="*70)
print("  Locating NIH Dataset")
print("="*70)

# File whose presence marks the dataset root directory
METADATA_CSV = 'Data_Entry_2017.csv'

# Common paths where NIH dataset might be
possible_paths = [
    '/kaggle/input/nih-chest-xrays/data/versions/3',
    '/kaggle/input/data',
    '/kaggle/input/chest-xray-dataset',
    '/kaggle/input/nih-chest-xray-dataset',
]


def _find_dir_containing(filename, search_roots):
    """Return the first directory under any of `search_roots` that contains `filename`.

    Roots are tried in the given order and each one is walked recursively.
    Roots that do not exist are skipped. Returns None when nothing matches.
    """
    for search_root in search_roots:
        if not os.path.exists(search_root):
            continue
        for root, _dirs, files in os.walk(search_root):
            if filename in files:
                return root
    return None


# Try the known locations first, then fall back to scanning all attached inputs
dataset_path = _find_dir_containing(METADATA_CSV, possible_paths)

if not dataset_path:
    print("Searching for dataset...")
    dataset_path = _find_dir_containing(METADATA_CSV, ['/kaggle/input'])

if dataset_path:
    print(f"‚úÖ Dataset found at: {dataset_path}")

    # List contents (sorted so the listing is stable between runs)
    print(f"\nDataset contents:")
    for item in sorted(os.listdir(dataset_path)):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            # Count images recursively without materialising the path lists
            count = sum(
                1
                for pattern in ('*.png', '*.jpg')
                for _ in Path(item_path).rglob(pattern)
            )
            print(f"  {item}/: {count} images")
        elif item.endswith('.csv') or item.endswith('.txt'):
            print(f"  {item}")
else:
    print("‚ùå Dataset not found!")
    print("\nPlease add NIH Chest X-ray dataset:")
    print("  1. Click 'Add Data' (right panel)")
    print("  2. Search 'NIH Chest X-ray'")
    print("  3. Add to notebook")
    print("  4. Restart kernel")
    raise FileNotFoundError("NIH dataset not found")

DATASET_PATH = dataset_path


  Locating NIH Dataset
‚úÖ Dataset found at: /kaggle/input/data

Dataset contents:
  images_003/: 10000 images
  images_012/: 7121 images
  BBox_List_2017.csv
  images_009/: 10000 images
  images_008/: 10000 images
  images_007/: 10000 images
  test_list.txt
  images_010/: 10000 images
  images_002/: 10000 images
  images_011/: 10000 images
  Data_Entry_2017.csv
  images_001/: 4999 images
  train_val_list.txt
  images_005/: 10000 images
  images_004/: 10000 images
  images_006/: 10000 images


In [6]:
# CELL 2: Configuration
# ================================================================
# Every tunable value for the evaluation run lives in this cell.

print("\n" + "="*70, "  CONFIGURATION", "="*70, sep="\n")

# --- Model: update these for your own checkpoint ---
HUGGINGFACE_MODEL_ID = "anassaifi8912/chestxray-blip-report-generator"  # Your HF model
HF_TOKEN = None  # Set to "hf_xxxxx" if model is private, or None if public

# --- Inference settings ---
BATCH_SIZE = 12     # Large batch for T4 X2
NUM_WORKERS = 4     # Parallel loading
USE_FP16 = True     # Faster inference

# --- Locations ---
# NOTE(review): TEST_DATASET_PATH is only printed in this cell, while the
# preparation cell reads config['dataset_path'] -- confirm which is intended.
TEST_DATASET_PATH = "/kaggle/input/nih-chest-xrays"  # NIH dataset
TEST_JSON = "/kaggle/working/test_data.json"  # If you already have test JSON
OUTPUT_DIR = "/kaggle/working/test_results"  # Where predictions and metrics are written

# --- Dataset preparation ---
config = {
    'dataset_path': DATASET_PATH,
    'output_dir': '/kaggle/working',
    'sample_size': None,  # None = use all data, or set to number like 10000 for testing
}

print(
    f"\nüìã Settings:",
    f"  Model: {HUGGINGFACE_MODEL_ID}",
    f"  Test data: {TEST_DATASET_PATH}",
    f"  Batch size: {BATCH_SIZE}",
    f"  FP16: {USE_FP16}",
    sep="\n",
)


  CONFIGURATION

üìã Settings:
  Model: anassaifi8912/chestxray-blip-report-generator
  Test data: /kaggle/input/nih-chest-xrays
  Batch size: 12
  FP16: True


In [7]:
# ==================== CELL 5: Test Data Preparation ONLY ====================
# Section banner: blank line, rule, title, rule -- emitted with a single call.
print("\n" + "="*70, "  Preparing TEST Dataset Only", "="*70, sep="\n")

import pandas as pd
import shutil
from tqdm.notebook import tqdm
import random
from pathlib import Path
import os

class NIHTestDataPreparator:
    """Prepare ONLY test data from NIH dataset for final evaluation.

    Builds a template-based reference report for every image named in
    ``test_list.txt``, links (or copies) the image into ``<output_path>/images``
    and writes ``<output_path>/test_data.csv`` with ``image_path`` and
    ``report`` columns.

    Args:
        dataset_path: Directory holding ``Data_Entry_2017.csv``,
            ``test_list.txt`` and the ``images_XXX/images`` folders.
        output_path: Directory to write into; created, with parents, if missing.
        seed: Optional seed for template selection. ``None`` (the default)
            keeps using the global ``random`` state; an integer makes the
            generated reports reproducible.
    """

    # Per the original author, these mirror the templates used for training data.
    REPORT_TEMPLATES = {
        'No Finding': [
            "Normal chest radiograph. No acute cardiopulmonary abnormality. The heart size is normal. The lungs are clear.",
            "The heart size and mediastinal contours are normal. The lungs are clear. No pleural effusion or pneumothorax.",
        ],
        'Atelectasis': [
            "Atelectasis present. Otherwise lungs are clear. No pleural effusion or pneumothorax.",
        ],
        'Cardiomegaly': [
            "Cardiomegaly is present. The lungs are clear. No acute pulmonary abnormality.",
        ],
        'Effusion': [
            "Pleural effusion noted. Otherwise clear lung fields. No pneumothorax.",
        ],
        'Infiltration': [
            "Infiltrate present, possibly representing infection. Clinical correlation recommended.",
        ],
        'Mass': [
            "Pulmonary mass identified. Recommend CT for further evaluation.",
        ],
        'Nodule': [
            "Pulmonary nodule noted. Follow-up imaging recommended.",
        ],
        'Pneumonia': [
            "Consolidation consistent with pneumonia. Clinical correlation recommended.",
        ],
        'Pneumothorax': [
            "Pneumothorax present. Clinical correlation recommended.",
        ],
        'Consolidation': [
            "Consolidation present. Clinical correlation recommended.",
        ],
        'Edema': [
            "Pulmonary edema with prominent interstitial markings. Cardiomegaly present.",
        ],
        'Emphysema': [
            "Emphysematous changes. Hyperinflation present. Heart size normal.",
        ],
        'Fibrosis': [
            "Pulmonary fibrosis with reticular opacities. No acute process.",
        ],
        'Pleural_Thickening': [
            "Pleural thickening. No acute abnormality. Lungs otherwise clear.",
        ],
        'Hernia': [
            "Hiatal hernia present. Otherwise unremarkable chest radiograph.",
        ]
    }

    def __init__(self, dataset_path, output_path, seed=None):
        self.dataset_path = Path(dataset_path)
        self.output_path = Path(output_path)
        # parents=True so a nested output directory does not have to pre-exist
        self.output_path.mkdir(parents=True, exist_ok=True)
        # A private generator when seeded; otherwise the shared `random` module
        self._rng = random.Random(seed) if seed is not None else random

    def generate_report(self, findings, view='PA'):
        """Build the reference report text for one image.

        Args:
            findings: '|'-separated finding labels; NaN or 'No Finding' is
                treated as a normal study.
            view: View position placed at the start of the report.

        Returns:
            "<view> chest radiograph." followed by one template sentence group
            per recognised finding and an "Impression: ..." sentence. Labels
            without a template are ignored.
        """
        if pd.isna(findings) or findings == 'No Finding':
            findings_list = ['No Finding']
        else:
            findings_list = findings.split('|')

        parts = [f"{view} chest radiograph."]

        for finding in findings_list:
            if finding in self.REPORT_TEMPLATES:
                parts.append(self._rng.choice(self.REPORT_TEMPLATES[finding]))

        report = ' '.join(parts)
        impression = self._generate_impression(findings_list)
        report += f" Impression: {impression}"

        return report

    def _generate_impression(self, findings):
        """Return the impression sentence for a list of finding labels.

        Recognised labels are joined with ', ' (underscores become spaces);
        an all-unrecognised list yields "See findings above.".
        """
        if findings == ['No Finding']:
            return "No acute cardiopulmonary abnormality."
        impressions = [f.replace('_', ' ') for f in findings if f in self.REPORT_TEMPLATES]
        return ', '.join(impressions) + '.' if impressions else "See findings above."

    def find_image(self, image_name):
        """Return the path of `image_name` inside images_001..images_012, or None."""
        for i in range(1, 13):
            image_path = self.dataset_path / f'images_{i:03d}' / 'images' / image_name
            if image_path.exists():
                return image_path
        return None

    def prepare_test_only(self, sample_size=None):
        """Write the test images and ``test_data.csv`` under ``output_path``.

        Args:
            sample_size: When truthy, a random subset of at most this many test
                rows is used (fixed ``random_state=42``); otherwise all rows.

        Returns:
            None. The records are persisted to ``test_data.csv``; nothing is
            returned, so a notebook cell ending with this call echoes no output.
        """
        print("\nLoading metadata...")
        df = pd.read_csv(self.dataset_path / 'Data_Entry_2017.csv')

        # Load test split (blank lines are ignored)
        with open(self.dataset_path / 'test_list.txt', 'r') as f:
            test_imgs = {line.strip() for line in f if line.strip()}

        # Get test data
        test_df = df[df['Image Index'].isin(test_imgs)]

        if sample_size:
            test_df = test_df.sample(n=min(sample_size, len(test_df)), random_state=42)
            print(f"Using {len(test_df)} test samples")
        else:
            print(f"Using all {len(test_df)} test samples")

        print(f"\nData split:")
        print(f"  Test: {len(test_df)}")

        # Create output directory
        images_dir = self.output_path / 'images'
        images_dir.mkdir(exist_ok=True)

        # Process test data
        print(f"\nProcessing test...")

        data = []
        missing = 0
        for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
            # Generate report
            report = self.generate_report(row['Finding Labels'], row['View Position'])

            # Find image
            img_name = row['Image Index']
            src = self.find_image(img_name)

            if src is None:
                # Keep going, but remember the gap so it can be reported below
                missing += 1
                continue

            # Prefer a symlink to save space; copy when linking is not possible
            dst = images_dir / img_name
            if not dst.exists():
                try:
                    os.symlink(src, dst)
                except (OSError, NotImplementedError):
                    shutil.copy2(src, dst)

            data.append({'image_path': img_name, 'report': report})

        if missing:
            print(f"  Skipped {missing} test images not found under {self.dataset_path}")

        # Save CSV
        pd.DataFrame(data).to_csv(self.output_path / 'test_data.csv', index=False)
        print(f"  ‚úì Saved {len(data)} test samples")

        print(f"\n‚úÖ Test dataset prepared in {self.output_path}")


# Build the test set once, taking locations and sample size from `config`
preparator = NIHTestDataPreparator(
    dataset_path=config['dataset_path'],
    output_path=config['output_dir'],
)
preparator.prepare_test_only(sample_size=config['sample_size'])


  Preparing TEST Dataset Only

Loading metadata...
Using all 25596 test samples

Data split:
  Test: 25596

Processing test...


  0%|          | 0/25596 [00:00<?, ?it/s]

  ‚úì Saved 25596 test samples

‚úÖ Test dataset prepared in /kaggle/working


In [8]:
# CELL 3: Download Model from HuggingFace
# ================================================================
# Loads the BLIP processor and model, moves the model to the available device
# and switches it to eval mode.

print("\n" + "="*70)
print("  DOWNLOADING MODEL FROM HUGGINGFACE")
print("="*70)

from transformers import BlipProcessor, BlipForConditionalGeneration
from huggingface_hub import login

# Login if needed
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("‚úÖ Logged in to HuggingFace")

# Download model
print(f"\nüì• Downloading: {HUGGINGFACE_MODEL_ID}")

processor = BlipProcessor.from_pretrained(
    HUGGINGFACE_MODEL_ID,
    token=HF_TOKEN
)
print("‚úÖ Processor loaded")

# Setup device first: the weight dtype depends on it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Half-precision weights are only requested when a CUDA device is present;
# on a CPU-only kernel the model is loaded in float32 instead.
model_dtype = torch.float16 if (USE_FP16 and device.type == "cuda") else torch.float32

# NOTE: the transformers build used for the recorded run warns that
# `torch_dtype` is deprecated in favour of `dtype`; the older keyword is kept
# so this cell also works with earlier releases.
model = BlipForConditionalGeneration.from_pretrained(
    HUGGINGFACE_MODEL_ID,
    torch_dtype=model_dtype,
    token=HF_TOKEN
)
print("‚úÖ Model loaded")

# Use DataParallel for T4 X2
# NOTE(review): the inference cell calls generate() on `model.module`, i.e. on
# the unwrapped model, so this wrapper does not spread generation across GPUs.
if torch.cuda.device_count() > 1:
    print(f"\nüöÄ Using {torch.cuda.device_count()} GPUs (DataParallel)")
    model = torch.nn.DataParallel(model)

model = model.to(device)
model.eval()

print(f"\n‚úÖ Model ready on {device}")

# Print model info (parameter count in millions)
total_params = sum(p.numel() for p in model.parameters()) / 1e6
print(f"üìä Model parameters: {total_params:.1f}M")


  DOWNLOADING MODEL FROM HUGGINGFACE


2026-01-19 05:29:19.325237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768800559.518724      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768800559.574805      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768800560.034504      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768800560.034544      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768800560.034547      55 computation_placer.cc:177] computation placer alr


üì• Downloading: anassaifi8912/chestxray-blip-report-generator


preprocessor_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ Processor loaded


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

‚úÖ Model loaded

üöÄ Using 2 GPUs (DataParallel)

‚úÖ Model ready on cuda
üìä Model parameters: 224.0M


In [9]:
# LOAD TEST DATA
# ================================================================
# Reads the CSV written by the preparation cell (image file name + reference report).

print("\nüìÇ Loading test data...")

import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from pathlib import Path
import torch


# Read from the directory the preparation cell wrote to, so that changing
# config['output_dir'] does not leave this cell pointing at a stale location.
test_csv = Path(config['output_dir']) / "test_data.csv"
test_df = pd.read_csv(test_csv)

print(f"‚úÖ Loaded {len(test_df)} test samples")

# Dataset class
class TestDataset(Dataset):
    """Serve preprocessed test images together with their reference reports.

    Args:
        df: DataFrame with an ``image_path`` column (file name relative to
            ``images_dir``) and a ``report`` column (reference text).
        images_dir: Directory that contains the image files.
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``
            whose result exposes a ``'pixel_values'`` entry.
    """

    def __init__(self, df, images_dir, processor):
        self.df = df
        self.images_dir = Path(images_dir)
        self.processor = processor

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``pixel_values``, ``reference`` and ``image_id`` for row `idx`.

        An image that cannot be read is replaced by a black 384x384 RGB image
        and a RuntimeWarning is emitted, so the substitution is visible
        instead of silently affecting the metrics.
        """
        row = self.df.iloc[idx]

        image_path = self.images_dir / row['image_path']
        try:
            # The context manager closes the file once the pixels are converted
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')
        except Exception as err:
            import warnings
            warnings.warn(
                f"Could not read {image_path} ({err!r}); substituting a black image",
                RuntimeWarning,
            )
            image = Image.new('RGB', (384, 384), color='black')

        inputs = self.processor(images=image, return_tensors="pt")

        return {
            # squeeze(0) drops the leading dimension of size 1 added by the processor call
            'pixel_values': inputs['pixel_values'].squeeze(0),
            'reference': row['report'],
            'image_id': row['image_path']
        }

# Create dataset and dataloader
# Images are read from the `images` folder under config['output_dir'], the
# directory the preparation cell populated.
test_dataset = TestDataset(
    test_df,
    Path(config['output_dir']) / "images",
    processor
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,            # keep CSV order so predictions line up with image ids
    num_workers=NUM_WORKERS,  # taken from the configuration cell instead of a literal
    pin_memory=True
)

print(f"‚úÖ DataLoader ready: {len(test_loader)} batches")



üìÇ Loading test data...
‚úÖ Loaded 25596 test samples
‚úÖ DataLoader ready: 2133 batches


In [10]:
#==================Cell 8: Always ON Display ==================================
from IPython.display import Javascript, display

# Keep session alive by simulating activity
def keep_alive():
    """Display a JavaScript snippet that clicks the page body every 60000 ms."""
    click_script = '''
        function KeepClicking(){
            console.log("Keeping session alive...");
            document.querySelector('body').click();
        }
        setInterval(KeepClicking, 60000); // Click every 60 seconds
    '''
    display(Javascript(click_script))


keep_alive()

<IPython.core.display.Javascript object>

In [18]:
# ================================================================
# CELL 5: Generate Predictions (WITH CHECKPOINT SAVING)
# ================================================================
# Runs beam-search generation over the whole test loader, collecting the
# decoded prediction, reference report and image id of every sample.

# All necessary imports
import time
import json
from contextlib import nullcontext
from pathlib import Path
from tqdm import tqdm
import torch

# Fix deprecation warning - use new autocast API
try:
    from torch.amp import autocast
except ImportError:
    from torch.cuda.amp import autocast

print("\n" + "="*70)
print("  GENERATING PREDICTIONS")
print("="*70)

# Create output directory if it doesn't exist (done once; reused below)
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(exist_ok=True, parents=True)

predictions = []
references = []
image_ids = []

start_time = time.time()
checkpoint_every = 100  # Save every 100 batches

# Decoding settings, defined once so both precision modes use the same values
generation_kwargs = dict(max_length=128, num_beams=5, early_stopping=True)

print(f"\nüöÄ Starting inference...")
print(f"‚ö° Checkpoints will be saved every {checkpoint_every} batches")

# FIX for DataParallel - extract the actual model
model_to_use = model.module if hasattr(model, 'module') else model

# Autocast is only entered when FP16 is requested and the device is CUDA
use_autocast = USE_FP16 and device.type == 'cuda'

with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(test_loader, desc="Testing")):
        pixel_values = batch['pixel_values'].to(device)

        # Generate predictions
        with (autocast('cuda') if use_autocast else nullcontext()):
            outputs = model_to_use.generate(pixel_values=pixel_values, **generation_kwargs)

        # Decode predictions; references and ids come straight from the batch
        predictions.extend(
            processor.decode(sequence, skip_special_tokens=True) for sequence in outputs
        )
        references.extend(batch['reference'])
        image_ids.extend(batch['image_id'])

        # Save checkpoint every N batches.
        # Each checkpoint holds everything collected so far, not just the
        # latest N batches.
        if (batch_idx + 1) % checkpoint_every == 0:
            checkpoint_file = output_dir / f"predictions_checkpoint_{batch_idx+1}.json"

            with open(checkpoint_file, 'w') as f:
                json.dump({
                    'predictions': predictions,
                    'references': references,
                    'image_ids': image_ids,
                    'batches_processed': batch_idx + 1,
                    'samples_processed': len(predictions)
                }, f, indent=2)
            print(f"\n  üíæ Checkpoint saved: {checkpoint_file.name}")

total_time = time.time() - start_time

print(f"\n‚úÖ Predictions complete!")
print(f"  Total: {len(predictions)} samples")
print(f"  Time: {total_time:.1f}s ({total_time/60:.1f} min)")
print(f"  Speed: {len(predictions)/total_time:.1f} samples/sec")

# Save final predictions immediately
predictions_file = output_dir / "predictions_final.json"

with open(predictions_file, 'w') as f:
    json.dump({
        'predictions': predictions,
        'references': references,
        'image_ids': image_ids,
        'total_samples': len(predictions),
        'inference_time': total_time,
        'inference_speed': len(predictions)/total_time
    }, f, indent=2)

print(f"\nüíæ Final predictions saved: {predictions_file}")
print(f"üìÅ Location: {predictions_file}")
print(f"üìÅ Location: {predictions_file}")


  GENERATING PREDICTIONS

üöÄ Starting inference...
‚ö° Checkpoints will be saved every 100 batches


Testing:   5%|‚ñç         | 100/2133 [08:15<2:48:11,  4.96s/it]


  üíæ Checkpoint saved: predictions_checkpoint_100.json


Testing:   9%|‚ñâ         | 200/2133 [16:31<2:40:25,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_200.json


Testing:  14%|‚ñà‚ñç        | 300/2133 [24:48<2:31:44,  4.97s/it]


  üíæ Checkpoint saved: predictions_checkpoint_300.json


Testing:  19%|‚ñà‚ñâ        | 400/2133 [33:04<2:23:43,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_400.json


Testing:  23%|‚ñà‚ñà‚ñé       | 500/2133 [41:19<2:15:34,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_500.json


Testing:  28%|‚ñà‚ñà‚ñä       | 600/2133 [49:34<2:07:24,  4.99s/it]


  üíæ Checkpoint saved: predictions_checkpoint_600.json


Testing:  33%|‚ñà‚ñà‚ñà‚ñé      | 700/2133 [57:51<1:58:57,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_700.json


Testing:  38%|‚ñà‚ñà‚ñà‚ñä      | 800/2133 [1:06:08<1:50:28,  4.97s/it]


  üíæ Checkpoint saved: predictions_checkpoint_800.json


Testing:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 900/2133 [1:14:24<1:42:00,  4.96s/it]


  üíæ Checkpoint saved: predictions_checkpoint_900.json


Testing:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 1000/2133 [1:22:41<1:33:56,  4.97s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1000.json


Testing:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 1100/2133 [1:30:59<1:25:57,  4.99s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1100.json


Testing:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 1200/2133 [1:39:16<1:17:42,  5.00s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1200.json


Testing:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 1300/2133 [1:47:32<1:09:01,  4.97s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1300.json


Testing:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 1400/2133 [1:55:48<1:01:14,  5.01s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1400.json


Testing:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 1500/2133 [2:04:05<52:27,  4.97s/it]  


  üíæ Checkpoint saved: predictions_checkpoint_1500.json


Testing:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 1600/2133 [2:12:21<44:12,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1600.json


Testing:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 1700/2133 [2:20:31<35:37,  4.94s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1700.json


Testing:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 1800/2133 [2:28:48<27:43,  5.00s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1800.json


Testing:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 1900/2133 [2:37:05<19:20,  4.98s/it]


  üíæ Checkpoint saved: predictions_checkpoint_1900.json


Testing:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 2000/2133 [2:45:20<11:03,  4.99s/it]


  üíæ Checkpoint saved: predictions_checkpoint_2000.json


Testing:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 2100/2133 [2:53:33<02:45,  5.01s/it]


  üíæ Checkpoint saved: predictions_checkpoint_2100.json


Testing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2133/2133 [2:56:18<00:00,  4.96s/it]


‚úÖ Predictions complete!
  Total: 25596 samples
  Time: 10578.7s (176.3 min)
  Speed: 2.4 samples/sec

üíæ Final predictions saved: /kaggle/working/test_results/predictions_final.json
üìÅ Location: /kaggle/working/test_results/predictions_final.json





In [20]:
# CELL 6: Calculate Metrics
# ================================================================
print("\n" + "="*70)
print("  CALCULATING METRICS")
print("="*70)

import numpy as np
# Install NLTK
!pip install -q nltk rouge-score

import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# BLEU
print("\nüìä BLEU...")
smoothing = SmoothingFunction().method1
refs_tok = [[ref.lower().split()] for ref in references]
preds_tok = [pred.lower().split() for pred in predictions]

bleu_1 = corpus_bleu(refs_tok, preds_tok, weights=(1,0,0,0), smoothing_function=smoothing)
bleu_2 = corpus_bleu(refs_tok, preds_tok, weights=(0.5,0.5,0,0), smoothing_function=smoothing)
bleu_3 = corpus_bleu(refs_tok, preds_tok, weights=(0.33,0.33,0.33,0), smoothing_function=smoothing)
bleu_4 = corpus_bleu(refs_tok, preds_tok, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)

# METEOR
print("üìä METEOR...")
meteor_scores = []
for ref, pred in zip(references, predictions):
    try:
        score = meteor_score([ref.lower().split()], pred.lower().split())
        meteor_scores.append(score)
    except:
        meteor_scores.append(0.0)
meteor_avg = np.mean(meteor_scores)

# ROUGE-L
print("üìä ROUGE-L...")
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(references, predictions)]
rouge_avg = np.mean(rouge_scores)

# Clinical Accuracy
print("üìä Clinical Accuracy...")
clinical_terms = ['atelectasis', 'cardiomegaly', 'effusion', 'infiltration', 'mass', 
                  'nodule', 'pneumonia', 'pneumothorax', 'consolidation', 'edema',
                  'emphysema', 'fibrosis', 'pleural', 'thickening', 'hernia', 'normal']

correct = 0
for ref, pred in zip(references, predictions):
    ref_lower, pred_lower = ref.lower(), pred.lower()
    ref_terms = [t for t in clinical_terms if t in ref_lower]
    pred_terms = [t for t in clinical_terms if t in pred_lower]
    
    if not ref_terms:
        if not pred_terms:
            correct += 1
    else:
        matching = len(set(ref_terms) & set(pred_terms))
        if matching / len(ref_terms) >= 0.5:
            correct += 1

clinical_acc = correct / len(references)

metrics = {
    'BLEU-1': float(bleu_1),
    'BLEU-2': float(bleu_2),
    'BLEU-3': float(bleu_3),
    'BLEU-4': float(bleu_4),
    'METEOR': float(meteor_avg),
    'ROUGE-L': float(rouge_avg),
    'Clinical-Accuracy': float(clinical_acc)
}

print("\n‚úÖ Metrics calculated")


  CALCULATING METRICS

üìä BLEU...
üìä METEOR...
üìä ROUGE-L...
üìä Clinical Accuracy...

‚úÖ Metrics calculated


In [21]:
# CELL 7: Display & Save Results
# ================================================================
# Prints the metric table with a coarse interpretation, then writes the
# metrics, all predictions, a 20-sample preview and a summary report to OUTPUT_DIR.
print("\n" + "="*70, "  FINAL TEST RESULTS", "="*70, sep="\n")

print(f"\nüìà Metrics:")
for metric, value in metrics.items():
    print(f"  {metric:20s}: {value:.4f}")

# Interpretation: pick one message per metric from fixed thresholds
print(f"\nüí° Performance:")
bleu4_message = (
    "  ‚úÖ Excellent BLEU-4" if metrics['BLEU-4'] > 0.30
    else "  ‚úÖ Good BLEU-4" if metrics['BLEU-4'] > 0.20
    else "  ‚ö†Ô∏è BLEU-4 needs improvement"
)
print(bleu4_message)

clinical_message = (
    "  ‚úÖ Excellent clinical accuracy" if metrics['Clinical-Accuracy'] > 0.70
    else "  ‚úÖ Good clinical accuracy" if metrics['Clinical-Accuracy'] > 0.60
    else "  ‚ö†Ô∏è Clinical accuracy needs improvement"
)
print(clinical_message)

# Save metrics
metrics_file = Path(OUTPUT_DIR) / "test_metrics.json"
with open(metrics_file, 'w') as f:
    json.dump(metrics, f, indent=2)
print(f"\nüíæ Metrics saved: {metrics_file}")

# Save detailed predictions: one record per test image
detailed_file = Path(OUTPUT_DIR) / "test_predictions_detailed.json"
results_detail = [
    {'image_id': image_id, 'reference': reference, 'prediction': prediction}
    for image_id, reference, prediction in zip(image_ids, references, predictions)
]

with open(detailed_file, 'w') as f:
    json.dump(results_detail, f, indent=2)
print(f"üíæ Detailed predictions: {detailed_file}")

# Save sample predictions: the first 20 records as readable text
sample_file = Path(OUTPUT_DIR) / "sample_predictions.txt"
with open(sample_file, 'w') as f:
    f.writelines([
        "="*70 + "\n",
        "TEST SET - SAMPLE PREDICTIONS\n",
        "="*70 + "\n\n",
    ])

    for i, result in enumerate(results_detail[:20]):
        f.writelines([
            f"Sample {i+1}:\n",
            f"Image: {result['image_id']}\n\n",
            f"Reference:\n{result['reference']}\n\n",
            f"Prediction:\n{result['prediction']}\n",
            "\n" + "-"*70 + "\n\n",
        ])

print(f"üíæ Sample predictions: {sample_file}")

# Create summary report
report_file = Path(OUTPUT_DIR) / "test_report.txt"
with open(report_file, 'w') as f:
    f.writelines([
        "="*70 + "\n",
        "BLIP CHEST X-RAY MODEL - TEST EVALUATION REPORT\n",
        "="*70 + "\n\n",
        f"Test samples: {len(predictions)}\n",
        f"Inference time: {total_time:.1f}s ({total_time/60:.1f} min)\n",
        f"Inference speed: {len(predictions)/total_time:.1f} samples/sec\n",
        f"Device: {torch.cuda.device_count()} x GPU\n\n",
        "METRICS:\n",
        "-"*70 + "\n",
    ])
    for metric, value in metrics.items():
        f.write(f"{metric:20s}: {value:.4f}\n")
    f.writelines([
        "\n" + "="*70 + "\n",
        "FINAL EVALUATION - These are your official test results\n",
        "="*70 + "\n",
    ])

print(f"üíæ Test report: {report_file}")

print("\n" + "="*70, "  ‚úÖ EVALUATION COMPLETE!", "="*70, sep="\n")

# Closing summary of where everything was written
print(
    f"\nüìÅ All files saved to: {OUTPUT_DIR}",
    f"\nüìÑ Files created:",
    f"  1. test_metrics.json - All metric scores",
    f"  2. test_predictions_detailed.json - Every prediction",
    f"  3. sample_predictions.txt - 20 examples for review",
    f"  4. test_report.txt - Comprehensive summary",
    f"  5. predictions_final.json - Raw predictions data",
    f"  6. predictions_checkpoint_*.json - Checkpoints (if any)",
    sep="\n",
)

print(f"\n‚ú® Test evaluation successful!")


  FINAL TEST RESULTS

üìà Metrics:
  BLEU-1              : 0.1019
  BLEU-2              : 0.0692
  BLEU-3              : 0.0341
  BLEU-4              : 0.0189
  METEOR              : 0.1692
  ROUGE-L             : 0.1803
  Clinical-Accuracy   : 0.3159

üí° Performance:
  ‚ö†Ô∏è BLEU-4 needs improvement
  ‚ö†Ô∏è Clinical accuracy needs improvement

üíæ Metrics saved: /kaggle/working/test_results/test_metrics.json
üíæ Detailed predictions: /kaggle/working/test_results/test_predictions_detailed.json
üíæ Sample predictions: /kaggle/working/test_results/sample_predictions.txt
üíæ Test report: /kaggle/working/test_results/test_report.txt

  ‚úÖ EVALUATION COMPLETE!

üìÅ All files saved to: /kaggle/working/test_results

üìÑ Files created:
  1. test_metrics.json - All metric scores
  2. test_predictions_detailed.json - Every prediction
  3. sample_predictions.txt - 20 examples for review
  4. test_report.txt - Comprehensive summary
  5. predictions_final.json - Raw predictions data
  

In [22]:
pip install kaggle


Note: you may need to restart the kernel to use updated packages.


In [26]:
# Recursively pack the results directory into a single zip archive.
# Absolute paths are passed, so entries are stored under kaggle/working/test_results/.
!zip -r /kaggle/working/test_results.zip /kaggle/working/test_results


  adding: kaggle/working/test_results/ (stored 0%)
  adding: kaggle/working/test_results/predictions_checkpoint_400.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_900.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_1500.json (deflated 97%)
  adding: kaggle/working/test_results/sample_predictions.txt (deflated 92%)
  adding: kaggle/working/test_results/predictions_checkpoint_1800.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_300.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_1100.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_200.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_1300.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_2000.json (deflated 97%)
  adding: kaggle/working/test_results/predictions_checkpoint_500.json (deflated 97%)
  adding: kaggle/w

In [27]:
# List the archive with a human-readable size to confirm it exists.
!ls -lh /kaggle/working/test_results.zip


-rw-r--r-- 1 root root 6.5M Jan 19 09:09 /kaggle/working/test_results.zip
