# Check and Read Data

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Read the caption tables of both competition splits with the same loader call.
train_captions, test_captions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/obss-intern-competition-2025/train.csv',
        '/kaggle/input/obss-intern-competition-2025/test.csv',
    )
)
# Directory that is joined with each training `image_id` to locate the image file.
img_dir = '/kaggle/input/obss-intern-competition-2025/train/train'

In [None]:
# Count the missing values in each column of the training table.
train_captions.isnull().sum()

# Clean Caption Data

In [None]:
import re
from collections import Counter


def remove_punc(text) -> str:
    """Drop every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def to_lower_case(text) -> str:
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered



def remove_numbers(text) -> str:
    """Delete every ASCII digit from ``text``; surrounding characters are left untouched."""
    ascii_digit = re.compile(r'[0-9]')
    return ascii_digit.sub('', text)

def remove_multiple_spaces(text) -> str:
    """Collapse runs of space characters into one space and trim both ends.

    Only the literal space character is collapsed; tabs and newlines inside
    the text are kept, although the final ``strip()`` removes any leading or
    trailing whitespace.
    """
    space_run = re.compile(r' +')
    collapsed = space_run.sub(' ', text)
    return collapsed.strip()



def clean_text(text) -> str:
    """Normalise a caption by running it through the cleaning steps in order.

    Steps: strip punctuation, lower-case, strip digits, collapse spaces.
    Stop-word removal (``remove_stopwords``) is currently disabled.
    """
    cleaning_steps = (
        remove_punc,
        to_lower_case,
        remove_numbers,
        remove_multiple_spaces,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [None]:
# Normalise every training caption in place (clean_text is applied element-wise).
train_captions['caption'] = train_captions['caption'].map(clean_text)

In [None]:
# Number of whitespace-separated words in each caption.
caption_lengths = train_captions['caption'].map(lambda text: len(str(text).split()))

length_summary = caption_lengths.describe()
print(length_summary)

# Histogram of the word counts.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(caption_lengths, bins=50, alpha=0.7, color='blue')
ax.set_title('Distribution of Explanation Lengths')
ax.set_xlabel('Word Count')
ax.set_ylabel('Number of Descriptions')
ax.grid(True)
plt.show()

In [None]:
import os
import torch
from torchvision import transforms
from PIL import Image
from textwrap import wrap

def read_image(path, img_size=224):
    """Load an image as RGB, resize it to a square and return it as a tensor.

    Args:
        path: Location of the image file.
        img_size: Side length (in pixels) of the square the image is resized to.

    Returns:
        The result of ``transforms.ToTensor()`` applied to the resized image.
    """
    to_square_tensor = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
    ])
    rgb_image = Image.open(path).convert("RGB")
    return to_square_tensor(rgb_image)

def display_images(temp_df, img_path):
    """Plot up to 15 images from ``temp_df`` on a 5x5 grid, titled with their captions.

    Args:
        temp_df: DataFrame with ``image_id`` and ``caption`` columns.
        img_path: Directory containing the image files; ``.jpg`` is appended
            to ids that do not already end with it.
    """
    frame = temp_df.reset_index(drop=True)
    shown = min(15, len(frame))

    plt.figure(figsize=(20, 20))
    for slot in range(shown):
        plt.subplot(5, 5, slot + 1)
        plt.subplots_adjust(hspace=0.9, wspace=0.5)

        filename = str(frame.image_id[slot])
        filename = filename if filename.endswith('.jpg') else filename + '.jpg'

        # read_image yields channel-first data; imshow wants channel-last.
        pixels = read_image(os.path.join(img_path, filename)).permute(1, 2, 0).numpy()
        plt.imshow(pixels)

        # Wrap long captions at 20 characters so titles do not overlap.
        plt.title("\n".join(wrap(frame.caption[slot], 20)))
        plt.axis("off")
    plt.show()


In [None]:
# Preview 15 random training images with their cleaned captions.
# The training image directory is `img_dir`; `img_path` is not defined at this
# point of the notebook, so passing it fails on a fresh kernel.
display_images(train_captions.sample(15), img_dir)

# Model Definitions

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.models import ResNet50_Weights
from PIL import Image
import math

class EncoderCNN(nn.Module):
    """ResNet-50 based image encoder that emits a grid of embedding vectors.

    Args:
        embed_size: Channel size of the produced feature vectors.
        dropout_p: Dropout probability applied after the 1x1 projection.
    """

    def __init__(self, embed_size, dropout_p=0.5):
        super().__init__()

        # Pretrained backbone with its last two child modules removed
        # (presumably the global pooling and classifier head - confirm against torchvision).
        backbone = models.resnet50(weights=ResNet50_Weights.DEFAULT)
        self.resnet = nn.Sequential(*list(backbone.children())[:-2])

        # Bring the spatial grid to a fixed 14x14 regardless of the input resolution.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((14, 14))

        # 1x1 convolution mapping the 2048 backbone channels to embed_size.
        self.conv_to_embed = nn.Conv2d(2048, embed_size, kernel_size=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_p)

        self.layer_norm = nn.LayerNorm(embed_size)

    def forward(self, images):
        """Encode ``images`` into channel-last features of shape (batch, 14, 14, embed_size)."""
        grid = self.adaptive_pool(self.resnet(images))
        projected = self.dropout(self.relu(self.conv_to_embed(grid)))

        # Move channels last so LayerNorm normalises over the embedding axis.
        channel_last = projected.permute(0, 2, 3, 1)
        return self.layer_norm(channel_last)



class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    Args:
        d_model: Size of the embedding dimension (even or odd).
        dropout_p: Dropout probability applied after the encoding is added.
        max_len: Number of positions for which the table is precomputed.
    """

    def __init__(self, d_model, dropout_p=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout_p)

        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # slicing keeps the assignment shape-compatible (no-op when d_model is even).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer of shape (1, max_len, d_model): it follows the module
        # across devices and is saved in the state dict, but is not a parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` (batch, seq_len, d_model) plus the encodings of its first seq_len positions, after dropout."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

class DecoderTransformer(nn.Module):
    """Transformer decoder that predicts caption tokens from encoded image features.

    Args:
        embed_size: Token embedding size; also the transformer ``d_model``.
        hidden_size: Width of the feed-forward sub-layer (``dim_feedforward``).
        vocab_size: Number of tokens in the vocabulary.
        num_layers: Number of stacked decoder layers.
        num_heads: Number of attention heads per layer.
        dropout_p: Dropout probability used throughout the decoder.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, num_heads, dropout_p):
        super(DecoderTransformer, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, dropout_p=dropout_p)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_size,
            dropout=dropout_p,
            batch_first=True
        )

        self.transformer_decoder = nn.TransformerDecoder(
            decoder_layer,
            num_layers=num_layers
        )

        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout_p)

        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) boolean causal mask.

        Entries strictly above the diagonal are True, i.e. position ``i`` is
        blocked from attending to any later position ``j > i``.
        """
        return torch.ones(sz, sz).triu(diagonal=1).bool()

    def forward(self, encoder_features, captions):
        """Compute next-token logits for every caption position (teacher forcing).

        Args:
            encoder_features: Encoder output whose last axis has size
                ``embed_size``; it is flattened to (batch, positions, embed_size).
            captions: Integer token ids of shape (batch, seq_len).

        Returns:
            ``(logits, None)`` where logits has shape (batch, seq_len, vocab_size).
            The second element is a placeholder kept for interface compatibility.
        """
        batch_size, seq_len = captions.size()

        memory = encoder_features.view(batch_size, -1, self.embed_size)

        tgt = self.embedding(captions)
        tgt = self.positional_encoding(tgt)

        tgt_mask = self._generate_square_subsequent_mask(seq_len).to(tgt.device)

        output = self.transformer_decoder(
            tgt,
            memory,
            tgt_mask=tgt_mask,
        )

        output = self.dropout(output)
        outputs = self.fc_out(output)

        return outputs, None

    def caption_image(self, image, vocabulary, encoder_cnn, max_length=26, device='cuda'):
        """Greedily decode a caption for a single image.

        Both this decoder and ``encoder_cnn`` are switched to eval mode and are
        left in eval mode afterwards.

        Args:
            image: Batch holding exactly one image, already on the model's device.
            vocabulary: Object exposing ``stoi`` (token -> id) and ``itos`` (id -> token).
            encoder_cnn: Encoder module producing the image features.
            max_length: Maximum number of decoding steps.
            device: Kept for backward compatibility only. Tokens are created on
                the device of the encoder output, so CPU inference works even
                when the ``'cuda'`` default is left untouched.

        Returns:
            ``(words, attention_maps)``: the generated words (without ``<SOS>``
            / ``<EOS>``) and an always-empty list kept as a placeholder.

        Raises:
            KeyError: If ``vocabulary`` has no ``<SOS>`` token.
        """
        self.eval()
        encoder_cnn.eval()

        with torch.no_grad():
            encoder_out = encoder_cnn(image)
            memory = encoder_out.view(1, -1, self.embed_size)

            # The generated sequence must live where the image features live.
            device = memory.device

            # Fail loudly instead of silently starting from index 1, which is
            # <UNK> in this notebook's Vocabulary.
            if "<SOS>" not in vocabulary.stoi:
                raise KeyError("vocabulary has no '<SOS>' token")
            sos_token_id = vocabulary.stoi["<SOS>"]

            generated_ids = [sos_token_id]
            result_caption = []
            attention_maps = []

            for _ in range(max_length):
                current_sequence_tensor = torch.tensor(generated_ids, device=device).unsqueeze(0)

                tgt_embed = self.embedding(current_sequence_tensor)
                tgt_embed = self.positional_encoding(tgt_embed)

                tgt_mask = self._generate_square_subsequent_mask(current_sequence_tensor.size(1)).to(device)

                decoder_output = self.transformer_decoder(
                    tgt_embed,
                    memory,
                    tgt_mask=tgt_mask
                )

                # Only the newest position is needed to choose the next token.
                last_token_output = decoder_output[:, -1, :]
                predicted_id = self.fc_out(last_token_output).argmax(1).item()
                predicted_word = vocabulary.itos[predicted_id]

                if predicted_word == "<EOS>":
                    break

                if predicted_word != "<SOS>":
                    result_caption.append(predicted_word)

                generated_ids.append(predicted_id)

        return result_caption, attention_maps

class ImageCaption(nn.Module):
    """End-to-end captioning model: CNN encoder followed by a transformer decoder.

    Args:
        embed_size: Feature/embedding size shared by the encoder and decoder.
        hidden_size: Feed-forward width of the decoder layers.
        vocab_size: Number of tokens the decoder can emit.
        num_layers: Number of decoder layers.
        num_heads: Number of attention heads per decoder layer.
        dropout: Dropout probability passed to both sub-modules.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, num_heads, dropout):
        super(ImageCaption, self).__init__()
        self.encoder_cnn = EncoderCNN(embed_size, dropout_p=dropout)
        self.decoder_transformer = DecoderTransformer(
            embed_size=embed_size,
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_heads=num_heads,
            dropout_p=dropout
        )

    def forward(self, images, captions):
        """Return ``(logits, None)`` for a batch of images and their input token ids."""
        features = self.encoder_cnn(images)
        outputs, _ = self.decoder_transformer(features, captions)
        return outputs, None

    def caption_image(self, image, vocabulary, max_length=26, device='cuda'):
        """Generate a caption for one image via the decoder's ``caption_image``.

        ``device`` is kept for backward compatibility only: the image's own
        device is forwarded to the decoder, because the generated tokens must
        sit next to the image features. Forwarding the ``'cuda'`` default
        would break inference for a CPU image.

        Returns:
            Whatever the decoder's ``caption_image`` returns.
        """
        return self.decoder_transformer.caption_image(
            image, vocabulary, self.encoder_cnn, max_length, image.device
        )


# Dataset Preparation

In [None]:
import os
import torch
from torch.utils.data import Dataset

from torchvision import transforms
from PIL import Image

# Training preprocessing, assembled from three stages.
# 1) Deterministic geometry: shorter side to 256, then a central 224x224 crop.
_train_geometry = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
]
# 2) Random augmentation: flip, small rotation, mild colour jitter, occasional blur.
_train_augmentation = [
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=5),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    transforms.RandomApply(
        [transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 1.0))],
        p=0.1,
    ),
]
# 3) Tensor conversion and per-channel normalisation
#    (presumably the ImageNet statistics of the pretrained backbone - confirm).
_train_to_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
train_transforms = transforms.Compose(_train_geometry + _train_augmentation + _train_to_tensor)


# Validation preprocessing: the same resize/crop/normalise steps as training,
# without any random augmentation.
_val_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
val_transforms = transforms.Compose(_val_steps)


class CaptionDataset(Dataset):
    """Dataset yielding ``(image, caption_ids)`` pairs from a caption table.

    Args:
        dataframe: Table with ``image_id`` and ``caption`` columns.
        img_dir: Directory containing ``<image_id>.jpg`` files.
        word2idx: Token -> id mapping; must contain ``<UNK>`` and ``<PAD>``.
        max_len: Fixed length of every returned id sequence.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, dataframe, img_dir, word2idx, max_len=50, transform=None):
        # Positional row access below relies on a clean 0..n-1 index.
        self.dataframe = dataframe.reset_index(drop=True)
        self.img_dir = img_dir
        self.word2idx = word2idx
        self.max_len = max_len
        self.transform = transform

    def __len__(self):
        """Number of caption rows."""
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Return the (optionally transformed) image and its padded token-id tensor."""
        record = self.dataframe.iloc[idx]

        picture = Image.open(
            os.path.join(self.img_dir, f"{record['image_id']}.jpg")
        ).convert("RGB")
        if self.transform is not None:
            picture = self.transform(picture)

        # Reserve two slots for <SOS>/<EOS>; longer captions are truncated.
        words = record['caption'].lower().split()[:self.max_len - 2]
        tokens = ['<SOS>', *words, '<EOS>']

        unknown_id = self.word2idx['<UNK>']
        token_ids = [self.word2idx.get(token, unknown_id) for token in tokens]

        # Right-pad to max_len (a non-positive pad count adds nothing).
        pad_count = self.max_len - len(token_ids)
        token_ids.extend([self.word2idx['<PAD>']] * pad_count)

        return picture, torch.tensor(token_ids)


class Vocabulary:
    """Bidirectional token <-> id mapping built from caption text.

    Ids 0-3 are reserved for ``<PAD>``, ``<UNK>``, ``<SOS>`` and ``<EOS>``.

    Args:
        freq_threshold: Minimum number of occurrences a word needs to be added.
    """

    def __init__(self, freq_threshold=5):
        specials = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']
        self.itos = dict(enumerate(specials))
        self.stoi = {token: position for position, token in self.itos.items()}
        self.freq_threshold = freq_threshold
        # Next free id to hand out.
        self.index = len(specials)

    def __len__(self):
        """Total number of tokens, special tokens included."""
        return len(self.itos)

    def build_vocabulary(self, caption_series):
        """Add every sufficiently frequent word of ``caption_series`` to the mapping.

        Words are lower-cased and split on whitespace; ids are assigned in
        order of first appearance.
        """
        tally = {}
        for caption in caption_series:
            for token in caption.lower().split():
                tally[token] = tally.get(token, 0) + 1

        frequent_new_tokens = [
            token
            for token, seen in tally.items()
            if seen >= self.freq_threshold and token not in self.stoi
        ]
        for token in frequent_new_tokens:
            self.stoi[token] = self.index
            self.itos[self.index] = token
            self.index += 1

# Model Training

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split 
from sklearn.model_selection import train_test_split 
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import time
import matplotlib.pyplot as plt
from tqdm import tqdm


# Reproducibility: seed PyTorch so weight initialisation, shuffling and dropout
# start from the same state on every run. The value matches the random_state
# used for the train/validation split.
seed = 42
torch.manual_seed(seed)

# Vocabulary built from the cleaned training captions (default frequency threshold).
vocab = Vocabulary()
vocab.build_vocabulary(train_captions['caption'])

# --- Model size ---
embed_size = 512            # token / image-feature embedding size (transformer d_model)
hidden_size = 2048          # feed-forward width inside each decoder layer
num_heads = 8               # attention heads per decoder layer
num_layers = 8              # number of stacked decoder layers
dropout = 0.4               # dropout probability for encoder and decoder

# --- Optimisation ---
batch_size = 64
num_epochs = 500            # upper bound; early stopping normally ends training sooner
learning_rate = 1e-4
#warmup_steps = 1000
weight_decay = 1e-3
label_smoothing = 0.05
max_norm = 1.0              # gradient-norm clipping threshold



# Hold out 10% of the captions for validation (fixed seed for a stable split).
train_df, val_df = train_test_split(train_captions, test_size=0.1, random_state=42)

# Training data is augmented; validation data only gets the deterministic pipeline.
train_dataset = CaptionDataset(
    dataframe=train_df,
    img_dir=img_dir,
    word2idx=vocab.stoi,
    transform=train_transforms,
)
val_dataset = CaptionDataset(
    dataframe=val_df,
    img_dir=img_dir,
    word2idx=vocab.stoi,
    transform=val_transforms,
)

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

print(f"Eğitim seti boyutu: {len(train_dataset)}")
print(f"Doğrulama seti boyutu: {len(val_dataset)}")


# Prefer the GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")
if use_cuda:
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory // 1024**3} GB")

# Build the captioning model from the hyperparameters and move it to the device.
model = ImageCaption(
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=len(vocab),
    num_layers=num_layers,
    num_heads=num_heads,
    dropout=dropout,
).to(device)

# Parameter statistics, gathered in a single pass over the parameters.
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
total_params = sum(count for count, _ in param_sizes)
trainable_params = sum(count for count, needs_grad in param_sizes if needs_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
# The size estimate assumes 4 bytes per parameter.
print(f"Model size: {total_params * 4 / (1024**2):.1f} MB")


# Token-level cross entropy; padded positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=vocab.stoi["<PAD>"],
    label_smoothing=label_smoothing
)
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
)

# Upper bound on optimisation steps (early stopping may end training sooner).
total_steps = len(train_loader) * num_epochs
#scheduler = CosineAnnealingWarmRestarts(
#    optimizer,
#    T_0=25,       # same as the previous suggestion
#    T_mult=1,     # same as the previous suggestion
#    eta_min=1e-6
#)
# Halve the learning rate once the validation loss has not improved for more
# than `patience` epochs. `verbose=True` is intentionally not passed: the
# argument is deprecated since PyTorch 2.2 (newer releases may reject it), and
# the training loop already prints the learning rate after every epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=2,
    factor=0.5,
    min_lr=1e-7
)

best_val_loss = float('inf')
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Per-epoch loss history and the sampled learning rates, used for the plots.
train_losses = []
val_losses = []
learning_rates = []
# Early stopping: number of consecutive non-improving epochs tolerated / seen so far.
patience = 10
counter = 0

print("Starting training...")
print(f"Total steps: {total_steps}")
print(f"Steps per epoch: {len(train_loader)}")
print("=" * 60)


def _current_lr():
    """Learning rate currently set on the optimizer's first parameter group.

    Read from the optimizer instead of ``scheduler.get_last_lr()``, which
    ``ReduceLROnPlateau`` only provides from PyTorch 2.2 onwards.
    """
    return optimizer.param_groups[0]['lr']


def _build_checkpoint(epoch_number, **metrics):
    """Assemble the state shared by the best-model and the periodic checkpoints.

    ``metrics`` carries the loss entries that differ between the two kinds.
    """
    return {
        'epoch': epoch_number,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        **metrics,
        # The Vocabulary object itself is pickled, so loading the checkpoint
        # needs the class definition and weights_only=False.
        'vocab': vocab,
        'hyperparameters': {
            'embed_size': embed_size,
            'hidden_size': hidden_size,
            'num_heads': num_heads,
            'num_layers': num_layers,
            'dropout': dropout,
            'learning_rate': learning_rate,
            'batch_size': batch_size,
            'num_epochs': num_epochs,
            'max_norm': max_norm,
            'weight_decay': weight_decay,
            'label_smoothing': label_smoothing
        }
    }


for epoch in range(num_epochs):
    # ----- Training pass -----
    model.train()
    total_train_loss = 0
    epoch_start_time = time.time()

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs} (Train)')

    for idx, (imgs, captions) in enumerate(pbar):
        imgs, captions = imgs.to(device), captions.to(device)

        # Teacher forcing: feed tokens [0, n-1) and predict tokens [1, n).
        inputs = captions[:, :-1]
        targets = captions[:, 1:]

        model_output = model(imgs, inputs)
        outputs = model_output[0] if isinstance(model_output, tuple) else model_output

        # Flatten to (batch * seq, vocab) and (batch * seq,) for the loss.
        outputs = outputs.reshape(-1, outputs.shape[2])
        targets = targets.reshape(-1)

        loss = criterion(outputs, targets)
        total_train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()

        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        optimizer.step()

        pbar.set_postfix({
            'Loss': f'{loss.item():.4f}',
            'LR': f'{_current_lr():.2e}',
            'Grad': f'{grad_norm:.2f}'
        })

        # Sample the learning rate every 100 batches for the schedule plot.
        if idx % 100 == 0:
            learning_rates.append(_current_lr())

    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ----- Validation pass -----
    model.eval()
    total_val_loss = 0
    val_pbar = tqdm(val_loader, desc=f'Epoch {epoch+1}/{num_epochs} (Validation)')

    with torch.no_grad():
        for val_imgs, val_captions in val_pbar:
            val_imgs, val_captions = val_imgs.to(device), val_captions.to(device)

            val_inputs = val_captions[:, :-1]
            val_targets = val_captions[:, 1:]

            val_model_output = model(val_imgs, val_inputs)
            val_outputs = (
                val_model_output[0]
                if isinstance(val_model_output, tuple)
                else val_model_output
            )

            val_outputs = val_outputs.reshape(-1, val_outputs.shape[2])
            val_targets = val_targets.reshape(-1)

            val_loss = criterion(val_outputs, val_targets)
            total_val_loss += val_loss.item()
            val_pbar.set_postfix({'Val Loss': f'{val_loss.item():.4f}'})

    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    # The epoch time covers both the training and the validation pass.
    epoch_time = time.time() - epoch_start_time
    # ReduceLROnPlateau is driven by the validation loss, once per epoch.
    scheduler.step(avg_val_loss)

    print(f"\nEpoch [{epoch+1}/{num_epochs}] Summary:")
    print(f"  Average Train Loss: {avg_train_loss:.4f}")
    print(f"  Average Validation Loss: {avg_val_loss:.4f}")
    print(f"  Time: {epoch_time:.1f}s")
    print(f"  Learning Rate: {_current_lr():.2e}")

    # ----- Best-model tracking and early stopping -----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0

        checkpoint = _build_checkpoint(epoch + 1, best_val_loss=best_val_loss)
        torch.save(checkpoint, os.path.join(checkpoint_dir, "best_model.pth"))
        print(f"✅ Best model saved with VALIDATION loss: {best_val_loss:.4f}")
    else:
        counter += 1
        print(f"Validation loss did not improve. Patience: {counter}/{patience}")
        if counter >= patience:
            print(f" Early stopping triggered after {epoch+1} epochs. Validation loss did not improve for {patience} consecutive epochs.")
            break

    # ----- Periodic checkpoint every 5 epochs -----
    if (epoch + 1) % 5 == 0:
        checkpoint = _build_checkpoint(
            epoch + 1,
            train_loss=avg_train_loss,
            val_loss=avg_val_loss,
            best_val_loss_so_far=best_val_loss,
        )
        torch.save(checkpoint, os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}.pth"))
        print(f"Checkpoint saved: epoch_{epoch+1} (Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f})")

    print("-" * 60)

print("Training completed!")
print(f"Final best validation loss achieved: {best_val_loss:.4f}")

# Two panels side by side: loss curves (left) and sampled learning rate (right).
fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(train_losses, label='Training Loss')
loss_ax.plot(val_losses, label='Validation Loss')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

# learning_rates holds one sample per 100 training batches.
lr_ax.plot(learning_rates)
lr_ax.set_title('Learning Rate Schedule')
lr_ax.set_xlabel('Step ')
lr_ax.set_ylabel('Learning Rate')
lr_ax.set_yscale('log')
lr_ax.grid(True)

fig.tight_layout()
fig.savefig(os.path.join(checkpoint_dir, 'training_validation_curves.png'))
plt.show()


In [None]:
import torch
import os 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

checkpoint_path = "/kaggle/working/checkpoints/best_model.pth"

try:
    # SECURITY: weights_only=False unpickles arbitrary Python objects (needed
    # here for the stored Vocabulary instance). Only load checkpoints that
    # this notebook produced itself.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    hparams = checkpoint['hyperparameters']
    vocab_size = len(checkpoint['vocab'])

    # Rebuild the architecture from the stored hyperparameters, then restore the weights.
    model = ImageCaption(
        embed_size=hparams['embed_size'],
        hidden_size=hparams['hidden_size'],
        vocab_size=vocab_size,
        num_layers=hparams['num_layers'],
        num_heads=hparams['num_heads'],
        dropout=hparams['dropout']
    )
    model = model.to(device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    vocab = checkpoint['vocab']
    print("model yüklendi.")

except FileNotFoundError:
    print(f"Hata: Checkpoint dosyası bulunamadı: {checkpoint_path}")
    # Re-raise so later cells cannot silently run with a stale or undefined model.
    raise
except Exception as e:
    print(f"Checkpoint yüklenirken bir hata oluştu: {e}")
    # Re-raise to keep the traceback and stop "Run All" at the failing cell.
    raise

# Test Results

In [None]:
import pickle

def generate_caption(image_path, model, vocab, transform, max_length=26):
    """Generate a caption string for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file.
        model: Captioning model exposing ``caption_image`` and ``parameters()``.
        vocab: Vocabulary passed through to ``model.caption_image``.
        transform: Callable turning the PIL image into a tensor; a batch axis
            is added in front of its result.
        max_length: Maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces.
    """
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` global or the 'cuda' default of caption_image.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    batch = transform(image).unsqueeze(0).to(target_device)

    with torch.no_grad():
        words, _ = model.caption_image(
            batch, vocab, max_length=max_length, device=target_device
        )

    return ' '.join(words)

# Inference must use exactly the preprocessing used for validation, so reuse
# that pipeline instead of keeping a second copy that can drift out of sync.
transform = val_transforms


# Directory containing the test images referenced by test.csv.
test_dir = '/kaggle/input/obss-intern-competition-2025/test/test'



In [None]:
# Tokens that must never appear in a submitted caption.
special_tokens = ['<UNK>', '<PAD>', '<SOS>', '<EOS>']

captions = []
for img_id in test_captions["image_id"]:
    img_path = os.path.join(test_dir, f"{img_id}.jpg")

    generated_words = generate_caption(img_path, model, vocab, transform).split()
    kept_words = [word for word in generated_words if word not in special_tokens]
    captions.append(' '.join(kept_words))

test_captions["caption"] = captions

#test_captions.to_csv("submission.csv", index=False)

In [None]:
# Show 15 random test images together with their generated captions.
display_images(temp_df=test_captions.sample(15), img_path=test_dir)

In [None]:
# Inspect the generated caption stored under index label 6.
test_captions["caption"][6]