In [1]:
# ==================== CELL 1: Check Environment ====================
import os
import torch
from pathlib import Path


print("="*70)
print("  Kaggle BLIP Training Environment Check")
print("="*70)

# Check GPU
print(f"\nPyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs: {gpu_count}")
    for i in range(gpu_count):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")
else:
    print("‚ö†Ô∏è WARNING: No GPU detected!")
    print("Go to Settings ‚Üí Accelerator ‚Üí GPU T4 x2")

# Check Kaggle paths
print(f"\nKaggle Paths:")
print(f"  Working Dir: {os.getcwd()}")
print(f"  Input Dir: /kaggle/input/")
print(f"  Output Dir: /kaggle/working/")



  Kaggle BLIP Training Environment Check

PyTorch Version: 2.8.0+cu126
CUDA Available: True
Number of GPUs: 2
  GPU 0: Tesla T4
  Memory: 15.8 GB
  GPU 1: Tesla T4
  Memory: 15.8 GB

Kaggle Paths:
  Working Dir: /kaggle/working
  Input Dir: /kaggle/input/
  Output Dir: /kaggle/working/


In [2]:
# ==================== CELL 2: Install Dependencies ====================
print("\nInstalling dependencies...")

# Kaggle has most packages, just need a few
!pip install -q transformers==4.35.0 accelerate==0.24.0
!pip install -q rouge-score nltk

import nltk
nltk.download('punkt', quiet=True)

print("‚úÖ Dependencies installed!")


Installing dependencies...
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m123.1/123.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m7.9/7.9 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m261.0/261.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.8/3.8 MB[0m [31m96.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m295.0/29

In [3]:
#==================== CELL 3: Checking Dataset=====================================
from pathlib import Path

print("\n" + "="*70)
print("  Locating NIH Dataset")
print("="*70)

# Common paths where NIH dataset might be
possible_paths = [
    '/kaggle/input/nih-chest-xrays/data/versions/3',
    '/kaggle/input/data',
    '/kaggle/input/chest-xray-dataset',
    '/kaggle/input/nih-chest-xray-dataset',
]

# Search for Data_Entry_2017.csv
dataset_path = None
for path in possible_paths:
    if os.path.exists(path):
        # Check if it has the required files
        csv_file = None
        for root, dirs, files in os.walk(path):
            if 'Data_Entry_2017.csv' in files:
                dataset_path = root
                csv_file = os.path.join(root, 'Data_Entry_2017.csv')
                break
        if dataset_path:
            break

# If not found, search everywhere in /kaggle/input
if not dataset_path:
    print("Searching for dataset...")
    for root, dirs, files in os.walk('/kaggle/input'):
        if 'Data_Entry_2017.csv' in files:
            dataset_path = root
            break

if dataset_path:
    print(f"‚úÖ Dataset found at: {dataset_path}")
    
    # List contents
    print(f"\nDataset contents:")
    for item in os.listdir(dataset_path):
        item_path = os.path.join(dataset_path, item)
        if os.path.isdir(item_path):
            count = len(list(Path(item_path).rglob('*.png'))) + len(list(Path(item_path).rglob('*.jpg')))
            print(f"  {item}/: {count} images")
        elif item.endswith('.csv') or item.endswith('.txt'):
            print(f"  {item}")
else:
    print("‚ùå Dataset not found!")
    print("\nPlease add NIH Chest X-ray dataset:")
    print("  1. Click 'Add Data' (right panel)")
    print("  2. Search 'NIH Chest X-ray'")
    print("  3. Add to notebook")
    print("  4. Restart kernel")
    raise FileNotFoundError("NIH dataset not found")

DATASET_PATH = dataset_path


  Locating NIH Dataset
‚úÖ Dataset found at: /kaggle/input/data

Dataset contents:
  images_003/: 10000 images
  images_012/: 7121 images
  BBox_List_2017.csv
  images_009/: 10000 images
  images_008/: 10000 images
  images_007/: 10000 images
  test_list.txt
  images_010/: 10000 images
  images_002/: 10000 images
  images_011/: 10000 images
  Data_Entry_2017.csv
  images_001/: 4999 images
  train_val_list.txt
  images_005/: 10000 images
  images_004/: 10000 images
  images_006/: 10000 images


In [4]:
# ==================== CELL 4: Configuration ====================
import json

# Training configuration optimized for Kaggle T4 x2
config = {
    # Dataset
    'dataset_path': DATASET_PATH,
    'output_dir': '/kaggle/working',
    'sample_size': None,  # None = use all data, or set to number like 10000 for testing
    
    # Model
    'model_name': 'Salesforce/blip-image-captioning-base',
    'max_length': 256,
    'image_size': 384,
    
    # Training
    'batch_size': 12,  # Per GPU (total 48 with 2 GPUs)
    'num_epochs': 3,  # Reduced for 9-hour limit of kaggle gpu usage limit
    'learning_rate': 5e-5,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'gradient_accumulation_steps': 1,
    'max_grad_norm': 1.0,
    
    # GPU settings
    'use_multi_gpu': True,  # Use both GPUs if available
    'mixed_precision': True,  # FP16 training
    
    # Checkpointing (important for 9-hour limit!)
    'save_every_steps': 300,
    'patience': 3,
    'eval_every_epochs': 1,
    
    # Hardware
    'num_workers': 4,
    'pin_memory': True,
}

# Save config
with open('/kaggle/working/training_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("‚úÖ Configuration created!")
print(f"\nKey settings:")
print(f"  Dataset: {config['dataset_path']}")
print(f"  Batch size: {config['batch_size']} per GPU")
print(f"  Total batch: {config['batch_size'] * (2 if config['use_multi_gpu'] else 1)}")
print(f"  Epochs: {config['num_epochs']}")
print(f"  Sample size: {'ALL' if config['sample_size'] is None else config['sample_size']}")


‚úÖ Configuration created!

Key settings:
  Dataset: /kaggle/input/data
  Batch size: 12 per GPU
  Total batch: 24
  Epochs: 3
  Sample size: ALL


In [5]:
# ==================== CELL 5: Data Preparation ====================
print("\n" + "="*70)
print("  Preparing Dataset")
print("="*70)

import pandas as pd
import shutil
from tqdm.notebook import tqdm
import random

class NIHDatasetPreparator:
    """Prepare NIH dataset for BLIP training"""
    
    REPORT_TEMPLATES = {
        'No Finding': [
            "Normal chest radiograph. No acute cardiopulmonary abnormality. The heart size is normal. The lungs are clear.",
            "The heart size and mediastinal contours are normal. The lungs are clear. No pleural effusion or pneumothorax.",
        ],
        'Atelectasis': [
            "Atelectasis present. Otherwise lungs are clear. No pleural effusion or pneumothorax.",
        ],
        'Cardiomegaly': [
            "Cardiomegaly is present. The lungs are clear. No acute pulmonary abnormality.",
        ],
        'Effusion': [
            "Pleural effusion noted. Otherwise clear lung fields. No pneumothorax.",
        ],
        'Infiltration': [
            "Infiltrate present, possibly representing infection. Clinical correlation recommended.",
        ],
        'Mass': [
            "Pulmonary mass identified. Recommend CT for further evaluation.",
        ],
        'Nodule': [
            "Pulmonary nodule noted. Follow-up imaging recommended.",
        ],
        'Pneumonia': [
            "Consolidation consistent with pneumonia. Clinical correlation recommended.",
        ],
        'Pneumothorax': [
            "Pneumothorax present. Clinical correlation recommended.",
        ],
        'Consolidation': [
            "Consolidation present. Clinical correlation recommended.",
        ],
        'Edema': [
            "Pulmonary edema with prominent interstitial markings. Cardiomegaly present.",
        ],
        'Emphysema': [
            "Emphysematous changes. Hyperinflation present. Heart size normal.",
        ],
        'Fibrosis': [
            "Pulmonary fibrosis with reticular opacities. No acute process.",
        ],
        'Pleural_Thickening': [
            "Pleural thickening. No acute abnormality. Lungs otherwise clear.",
        ],
        'Hernia': [
            "Hiatal hernia present. Otherwise unremarkable chest radiograph.",
        ]
    }
    
    def __init__(self, dataset_path, output_path):
        self.dataset_path = Path(dataset_path)
        self.output_path = Path(output_path)
        self.output_path.mkdir(exist_ok=True)
        
    def generate_report(self, findings, view='PA'):
        """Generate report from findings"""
        if pd.isna(findings) or findings == 'No Finding':
            findings_list = ['No Finding']
        else:
            findings_list = findings.split('|')
        
        parts = [f"{view} chest radiograph."]
        
        for finding in findings_list:
            if finding in self.REPORT_TEMPLATES:
                template = random.choice(self.REPORT_TEMPLATES[finding])
                parts.append(template)
        
        report = ' '.join(parts)
        impression = self._generate_impression(findings_list)
        report += f" Impression: {impression}"
        
        return report
    
    def _generate_impression(self, findings):
        if findings == ['No Finding']:
            return "No acute cardiopulmonary abnormality."
        impressions = [f.replace('_', ' ') for f in findings if f in self.REPORT_TEMPLATES]
        return ', '.join(impressions) + '.' if impressions else "See findings above."
    
    def find_image(self, image_name):
        """Find image in dataset folders"""
        for i in range(1, 13):
            folder = self.dataset_path / f'images_{i:03d}' / 'images'
            image_path = folder / image_name
            if image_path.exists():
                return image_path
        return None
    
    def prepare(self, sample_size=None):
        """Prepare dataset"""
        print("\nLoading metadata...")
        df = pd.read_csv(self.dataset_path / 'Data_Entry_2017.csv')
        
        if sample_size:
            df = df.sample(n=min(sample_size, len(df)), random_state=42)
            print(f"Using {len(df)} samples")
        else:
            print(f"Using all {len(df)} samples")
        
        # Load splits
        with open(self.dataset_path / 'train_val_list.txt', 'r') as f:
            train_val_imgs = set(line.strip() for line in f)
        with open(self.dataset_path / 'test_list.txt', 'r') as f:
            test_imgs = set(line.strip() for line in f)
        
        # Split data
        train_val_df = df[df['Image Index'].isin(train_val_imgs)]
        test_df = df[df['Image Index'].isin(test_imgs)]
        
        from sklearn.model_selection import train_test_split
        train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)
        
        print(f"\nData split:")
        print(f"  Train: {len(train_df)}")
        print(f"  Val: {len(val_df)}")
        print(f"  Test: {len(test_df)}")
        
        # Create output directory
        (self.output_path / 'images').mkdir(exist_ok=True)
        
        # Process each split
        for split_name, split_df in [('train', train_df), ('val', val_df), ('test', test_df)]:
            print(f"\nProcessing {split_name}...")
            
            data = []
            for _, row in tqdm(split_df.iterrows(), total=len(split_df)):
                # Generate report
                report = self.generate_report(row['Finding Labels'], row['View Position'])
                
                # Find image
                img_name = row['Image Index']
                src = self.find_image(img_name)
                
                if src is None:
                    continue
                
                # Copy image (symlink to save space)
                dst = self.output_path / 'images' / img_name
                if not dst.exists():
                    # Use symlink instead of copy to save space!
                    try:
                        os.symlink(src, dst)
                    except:
                        shutil.copy2(src, dst)
                
                data.append({'image_path': img_name, 'report': report})
            
            # Save CSV
            pd.DataFrame(data).to_csv(self.output_path / f'{split_name}_data.csv', index=False)
            print(f"  ‚úì Saved {len(data)} samples")
        
        print(f"\n‚úÖ Dataset prepared in {self.output_path}")

# Prepare dataset
preparator = NIHDatasetPreparator(config['dataset_path'], config['output_dir'])
preparator.prepare(sample_size=config['sample_size'])



  Preparing Dataset

Loading metadata...
Using all 112120 samples

Data split:
  Train: 69219
  Val: 17305
  Test: 25596

Processing train...


  0%|          | 0/69219 [00:00<?, ?it/s]

  ‚úì Saved 69219 samples

Processing val...


  0%|          | 0/17305 [00:00<?, ?it/s]

  ‚úì Saved 17305 samples

Processing test...


  0%|          | 0/25596 [00:00<?, ?it/s]

  ‚úì Saved 25596 samples

‚úÖ Dataset prepared in /kaggle/working


In [6]:
# ==================== CELL 6: Dataset and Model Classes ====================
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

class XrayDataset(Dataset):
    def __init__(self, csv_file, image_dir, processor, max_length=256):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.processor = processor
        self.max_length = max_length
        print(f"Loaded {len(self.data)} samples")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row['image_path'])
        
        try:
            image = Image.open(image_path).convert('RGB')
        except:
            image = Image.new('RGB', (384, 384), color='gray')
        
        report = str(row['report'])
        
        encoding = self.processor(
            images=image,
            text=report,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
        )
        return {k: v.squeeze(0) for k, v in encoding.items()}

print("‚úÖ Dataset class defined")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‚úÖ Dataset class defined


  _torch_pytree._register_pytree_node(


In [10]:
# ==================== CELL 7: Training Setup ====================
print("\n" + "="*70)
print("  Setting Up Training")
print("="*70)

# Load model
LOCAL_MODEL_PATH = "/kaggle/input/blip-model/blip-base"

# In training setup
processor = BlipProcessor.from_pretrained(LOCAL_MODEL_PATH)
model = BlipForConditionalGeneration.from_pretrained(
    LOCAL_MODEL_PATH,
    local_files_only=True
)
# Multi-GPU setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if config['use_multi_gpu'] and torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
    model = torch.nn.DataParallel(model)
    
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters()) / 1e6
print(f"‚úÖ Model loaded: {total_params:.1f}M parameters")

# Create datasets
print("\nCreating dataloaders...")
train_dataset = XrayDataset(
    f"{config['output_dir']}/train_data.csv",
    f"{config['output_dir']}/images",
    processor,
    config['max_length']
)
val_dataset = XrayDataset(
    f"{config['output_dir']}/val_data.csv",
    f"{config['output_dir']}/images",
    processor,
    config['max_length']
)

train_loader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=config['num_workers'],
    pin_memory=config['pin_memory']
)
val_loader = DataLoader(
    val_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=config['num_workers'],
    pin_memory=config['pin_memory']
)

print(f"‚úì Train: {len(train_dataset)} samples ({len(train_loader)} batches)")
print(f"‚úì Val: {len(val_dataset)} samples ({len(val_loader)} batches)")

# Optimizer
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup  

optimizer = AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay']
)

num_training_steps = len(train_loader) * config['num_epochs']
num_warmup_steps = int(num_training_steps * config['warmup_ratio'])

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

print(f"\n‚úÖ Training setup complete!")
print(f"  Total steps: {num_training_steps}")
print(f"  Warmup steps: {num_warmup_steps}")

2026-01-16 08:11:18.833310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768551079.350073      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768551079.495058      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768551080.724784      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768551080.724829      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768551080.724832      55 computation_placer.cc:177] computation placer alr


  Setting Up Training


55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.


Using 2 GPUs with DataParallel
‚úÖ Model loaded: 247.4M parameters

Creating dataloaders...
Loaded 69219 samples
Loaded 17305 samples
‚úì Train: 69219 samples (5769 batches)
‚úì Val: 17305 samples (1443 batches)

‚úÖ Training setup complete!
  Total steps: 17307
  Warmup steps: 1730


In [7]:
#==================Cell 8: Always ON Display ==================================
from IPython.display import Javascript, display

# Keep session alive by simulating activity
def keep_alive():
    display(Javascript('''
        function KeepClicking(){
            console.log("Keeping session alive...");
            document.querySelector('body').click();
        }
        setInterval(KeepClicking, 60000); // Click every 60 seconds
    '''))

keep_alive()

<IPython.core.display.Javascript object>

In [12]:
# ==================== CELL 9: Training Loop ====================
import time

print("\n" + "="*70)
print("  TRAINING STARTED")
print("="*70)

best_val_loss = float('inf')
patience_counter = 0
global_step = 0

# Create checkpoint directory
checkpoint_dir = Path('/kaggle/working/checkpoints')
checkpoint_dir.mkdir(exist_ok=True)

for epoch in range(1, config['num_epochs'] + 1):
    print(f"\nEpoch {epoch}/{config['num_epochs']}")
    print("-"*70)
    
    # Training
    model.train()
    train_loss = 0
    start_time = time.time()
    
    progress = tqdm(train_loader, desc="Training")
    for batch_idx, batch in enumerate(progress):
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        
        outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=input_ids)
        
        # Handle DataParallel output
        loss = outputs.loss.mean() if hasattr(outputs.loss, 'mean') else outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])
        optimizer.step()
        scheduler.step()
        
        train_loss += loss.item()
        global_step += 1
        
        progress.set_postfix({'loss': f'{loss.item():.4f}', 'lr': f'{scheduler.get_last_lr()[0]:.2e}'})
        
        # Save checkpoint periodically
        if global_step % config['save_every_steps'] == 0:
            checkpoint_path = checkpoint_dir / f'checkpoint_step_{global_step}.pt'
           # Save only essential state (smaller file)
            torch.save({
                'epoch': epoch,
                'global_step': global_step,
                'model_state_dict': model.module.state_dict() if hasattr(model, 'module') else model.state_dict(),
                'best_val_loss': best_val_loss,
            }, checkpoint_path)
            print(f"\n  üíæ Checkpoint saved: step {global_step}")
            # Keep only last 2 checkpoints to save space
            checkpoints = sorted(checkpoint_dir.glob('checkpoint_step_*.pt'))
            if len(checkpoints) > 2:
                for old_ckpt in checkpoints[:-2]:
                    old_ckpt.unlink()

    
    
    avg_train_loss = train_loss / len(train_loader)
    epoch_time = time.time() - start_time
    
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Time: {epoch_time/60:.1f} min")
    
    # Validation
    if epoch % config['eval_every_epochs'] == 0:
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                pixel_values = batch['pixel_values'].to(device)
                input_ids = batch['input_ids'].to(device)
                
                outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=input_ids)
                loss = outputs.loss.mean() if hasattr(outputs.loss, 'mean') else outputs.loss
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_loader)
        print(f"  Val Loss: {avg_val_loss:.4f}")
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            
            best_model_dir = checkpoint_dir / 'best_model'
            best_model_dir.mkdir(exist_ok=True)
            
            # Save model (handle DataParallel)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(best_model_dir)
            processor.save_pretrained(best_model_dir)
            
            print(f"  üèÜ Best model saved! Val Loss: {best_val_loss:.4f}")
        else:
            patience_counter += 1
            print(f"  No improvement. Patience: {patience_counter}/{config['patience']}")
        
        # Early stopping
        if patience_counter >= config['patience']:
            print("\n‚ö†Ô∏è Early stopping!")
            break

print("\n" + "="*70)
print("  TRAINING COMPLETE!")
print("="*70)
print(f"Best Val Loss: {best_val_loss:.4f}")



  TRAINING STARTED

Epoch 1/3
----------------------------------------------------------------------


Training:   0%|          | 0/5769 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.



  üíæ Checkpoint saved: step 300

  üíæ Checkpoint saved: step 600

  üíæ Checkpoint saved: step 900

  üíæ Checkpoint saved: step 1200

  üíæ Checkpoint saved: step 1500

  üíæ Checkpoint saved: step 1800

  üíæ Checkpoint saved: step 2100

  üíæ Checkpoint saved: step 2400

  üíæ Checkpoint saved: step 2700

  üíæ Checkpoint saved: step 3000

  üíæ Checkpoint saved: step 3300

  üíæ Checkpoint saved: step 3600

  üíæ Checkpoint saved: step 3900

  üíæ Checkpoint saved: step 4200

  üíæ Checkpoint saved: step 4500

  üíæ Checkpoint saved: step 4800

  üíæ Checkpoint saved: step 5100

  üíæ Checkpoint saved: step 5400

  üíæ Checkpoint saved: step 5700
  Train Loss: 1.7498
  Time: 152.5 min


Validating:   0%|          | 0/1443 [00:00<?, ?it/s]

  Val Loss: 1.3668
  üèÜ Best model saved! Val Loss: 1.3668

Epoch 2/3
----------------------------------------------------------------------


Training:   0%|          | 0/5769 [00:00<?, ?it/s]




  üíæ Checkpoint saved: step 6000

  üíæ Checkpoint saved: step 6300

  üíæ Checkpoint saved: step 6600

  üíæ Checkpoint saved: step 6900

  üíæ Checkpoint saved: step 7200

  üíæ Checkpoint saved: step 7500

  üíæ Checkpoint saved: step 7800

  üíæ Checkpoint saved: step 8100

  üíæ Checkpoint saved: step 8400

  üíæ Checkpoint saved: step 8700

  üíæ Checkpoint saved: step 9000

  üíæ Checkpoint saved: step 9300

  üíæ Checkpoint saved: step 9600

  üíæ Checkpoint saved: step 9900

  üíæ Checkpoint saved: step 10200

  üíæ Checkpoint saved: step 10500

  üíæ Checkpoint saved: step 10800

  üíæ Checkpoint saved: step 11100

  üíæ Checkpoint saved: step 11400
  Train Loss: 1.3663
  Time: 152.9 min


Validating:   0%|          | 0/1443 [00:00<?, ?it/s]

  Val Loss: 1.3660
  üèÜ Best model saved! Val Loss: 1.3660

Epoch 3/3
----------------------------------------------------------------------


Training:   0%|          | 0/5769 [00:00<?, ?it/s]




  üíæ Checkpoint saved: step 11700

  üíæ Checkpoint saved: step 12000

  üíæ Checkpoint saved: step 12300

  üíæ Checkpoint saved: step 12600

  üíæ Checkpoint saved: step 12900

  üíæ Checkpoint saved: step 13200

  üíæ Checkpoint saved: step 13500

  üíæ Checkpoint saved: step 13800

  üíæ Checkpoint saved: step 14100

  üíæ Checkpoint saved: step 14400

  üíæ Checkpoint saved: step 14700

  üíæ Checkpoint saved: step 15000

  üíæ Checkpoint saved: step 15300

  üíæ Checkpoint saved: step 15600

  üíæ Checkpoint saved: step 15900

  üíæ Checkpoint saved: step 16200

  üíæ Checkpoint saved: step 16500

  üíæ Checkpoint saved: step 16800

  üíæ Checkpoint saved: step 17100
  Train Loss: 1.3654
  Time: 152.8 min


Validating:   0%|          | 0/1443 [00:00<?, ?it/s]

  Val Loss: 1.3657
  üèÜ Best model saved! Val Loss: 1.3657

  TRAINING COMPLETE!
Best Val Loss: 1.3657


In [37]:
# Check the best_model directory
best_model_dir = Path('/kaggle/working/checkpoints')
print("Contents of best_model/:")
for item in best_model_dir.iterdir():
    if item.is_file():
        size_mb = item.stat().st_size / (1024**2)
        print(f"  üìÑ {item.name} ({size_mb:.2f} MB)")
    else:
        print(f"  üìÅ {item.name}/")

Contents of best_model/:
  üìÅ best_model/
  üìÑ checkpoint_step_9900.pt (944.01 MB)
  üìÑ checkpoint_step_9600.pt (944.01 MB)


In [19]:
!zip -r /kaggle/working/best_model.zip /kaggle/working/checkpoints


  adding: kaggle/working/checkpoints/ (stored 0%)
  adding: kaggle/working/checkpoints/best_model/ (stored 0%)
  adding: kaggle/working/checkpoints/best_model/tokenizer_config.json (deflated 74%)
  adding: kaggle/working/checkpoints/best_model/tokenizer.json (deflated 71%)
  adding: kaggle/working/checkpoints/best_model/preprocessor_config.json (deflated 48%)
  adding: kaggle/working/checkpoints/best_model/generation_config.json (deflated 28%)
  adding: kaggle/working/checkpoints/best_model/config.json (deflated 52%)
  adding: kaggle/working/checkpoints/best_model/model.safetensors (deflated 7%)
  adding: kaggle/working/checkpoints/best_model/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/checkpoints/best_model/vocab.txt (deflated 53%)
  adding: kaggle/working/checkpoints/checkpoint_step_9900.pt (deflated 7%)
  adding: kaggle/working/checkpoints/checkpoint_step_9600.pt (deflated 7%)


In [33]:
!pip install -q huggingface_hub transformers safetensors


In [36]:
# Login to HuggingFace
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [39]:
from huggingface_hub import HfApi
from pathlib import Path

repo_id = "anassaifi8912/chestxray-blip-report-generator"
model_dir = Path("/kaggle/working/checkpoints/")

api = HfApi()

api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True
)

api.upload_folder(
    folder_path=model_dir,
    repo_id=repo_id,
    repo_type="model"
)

print("‚úÖ Model successfully uploaded to Hugging Face!")


checkpoint_step_9900.pt:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

checkpoint_step_9600.pt:   0%|          | 0.00/990M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ Model successfully uploaded to Hugging Face!


In [1]:
!pip install kaggle

