In [2]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
! pip install arabic_reshaper

Collecting arabic_reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic_reshaper
Successfully installed arabic_reshaper-3.0.0


In [None]:
# ==========================================
# üöÄ FINAL HYBRID MASTER CELL: ViT + AraGPT2
# Tweak: Added Dropout & Regularization to match Old Code
# ==========================================
import os
import re
import json
import time
import torch
import pandas as pd
import numpy as np
import random
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import evaluate
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# üõ†Ô∏è IMPORTS (Old & New Mixed)
from torchvision import transforms
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from torch.optim import AdamW
import arabic_reshaper
from bidi.algorithm import get_display

# 0. CONFIGURATION
# ---------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Using Device: {device}")

# üö® PATHS
TXT_FILE_PATH = '/kaggle/input/datasets/samahfetouh/arabic-flickr8k-dataset/captions.txt'
IMG_ROOT_DIR = '/kaggle/input/datasets/samahfetouh/arabic-flickr8k-dataset/Images'

# Config
ENCODER_CHECKPOINT = "google/vit-base-patch16-224-in21k"
DECODER_CHECKPOINT = "aubmindlab/aragpt2-base"
OUTPUT_DIR = "./flickr8k_hybrid_model"
LOG_FILE = "training_log.csv"

# üß† HYPERPARAMETERS (Matched to Old Code)
MAX_LENGTH = 32         
BATCH_SIZE = 32         
FINE_TUNE_AT_EPOCH = 8  # üîí The "Old Code" Magic Number
TOTAL_EPOCHS = 20       
DROPOUT_RATE = 0.3      # ‚¨ÜÔ∏è Increased from 0.1 to match Old Code's robustness

# üõ†Ô∏è HELPER: ARABIC NORMALIZATION
def normalize_arabic(text):
    text = re.sub(r"[ÿ•ÿ£ÿ¢ÿß]", "ÿß", text)
    text = re.sub(r"ÿ©", "Ÿá", text)
    text = re.sub(r"Ÿâ", "Ÿä", text)
    text = re.sub(r"_", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()

# 1. ROBUST DATA LOADING
# ---------------------------------------------------------
def load_data_aggressive(txt_path, img_root):
    image_map = {}
    for root, dirs, files in os.walk(img_root):
        for f in files:
            if f.lower().endswith(('.jpg', '.jpeg', '.png')):
                key = os.path.splitext(f)[0].strip()
                image_map[key] = os.path.join(root, f)
    
    if not image_map: raise ValueError("‚ùå No images found!")
    
    data = []
    with open(txt_path, 'r', encoding='utf-8') as f: lines = f.readlines()
    
    for line in lines:
        line = line.strip()
        if len(line) < 5: continue
        parts = re.split(r'[,\t]', line, maxsplit=1)
        if len(parts) < 2: continue
        
        img_key = parts[0].strip().split('#')[0]
        img_key = os.path.splitext(img_key)[0].strip()
        caption = parts[1].strip()
        
        if img_key in image_map:
            data.append({'image_path': image_map[img_key], 'caption': caption})
            
    return pd.DataFrame(data)

print("‚è≥ Loading Data...")
full_df = load_data_aggressive(TXT_FILE_PATH, IMG_ROOT_DIR)
print(f"‚úÖ Loaded {len(full_df)} pairs.")

train_df, val_df = train_test_split(full_df, test_size=0.1, random_state=42)

# 2. DATASET WITH AUGMENTATION (The "Old Code" Secret)
# ---------------------------------------------------------
feature_extractor = ViTImageProcessor.from_pretrained(ENCODER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(DECODER_CHECKPOINT)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]'})

# üé® AUGMENTATION: Random Flips + Noise (Crucial for Small Datasets)
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5), # 50% chance to flip
    transforms.ColorJitter(brightness=0.1, contrast=0.1), 
    transforms.ToTensor()
])

val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

class Flickr8kDataset(Dataset):
    def __init__(self, df, tokenizer, transform):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = transform 

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['image_path']).convert("RGB")
        
        # Apply Torchvision Augmentation
        img_tensor = self.transform(image) 
        
        # Convert to HuggingFace Pixel Values
        # do_rescale=False because ToTensor() already scales to [0,1]
        pixel_values = feature_extractor(images=img_tensor, return_tensors="pt", do_rescale=False).pixel_values.squeeze()
        
        caption = normalize_arabic(row['caption'])
        raw_tokens = self.tokenizer(caption, add_special_tokens=False).input_ids
        if len(raw_tokens) > MAX_LENGTH - 2: raw_tokens = raw_tokens[:MAX_LENGTH - 2]
        final_tokens = [self.tokenizer.bos_token_id] + raw_tokens + [self.tokenizer.eos_token_id]
        
        padding_len = MAX_LENGTH - len(final_tokens)
        if padding_len > 0:
            final_tokens = final_tokens + [self.tokenizer.pad_token_id] * padding_len
            
        labels = torch.tensor(final_tokens)
        labels[labels == self.tokenizer.pad_token_id] = -100 
        
        return {"pixel_values": pixel_values, "labels": labels}

train_ds = Flickr8kDataset(train_df, tokenizer, train_transforms)
val_ds = Flickr8kDataset(val_df, tokenizer, val_transforms)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 3. MODEL INIT (With High Dropout)
# ---------------------------------------------------------
print("üèóÔ∏è Initializing Model...")
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(ENCODER_CHECKPOINT, DECODER_CHECKPOINT)
model.decoder.resize_token_embeddings(len(tokenizer))

# üõ†Ô∏è INJECTING DROPOUT (The "Old Code" Stability Fix)
# We force the model to use higher dropout to prevent memorization
model.config.decoder.activation_dropout = DROPOUT_RATE
model.config.decoder.attn_pdrop = DROPOUT_RATE
model.config.decoder.embd_pdrop = DROPOUT_RATE
print(f"üõ°Ô∏è Dropout set to {DROPOUT_RATE} (Matching Old Code)")

# Generation Config
model.config.num_beams = 3
model.config.max_length = MAX_LENGTH
model.config.early_stopping = True 
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id 
model.config.eos_token_id = tokenizer.eos_token_id
model.config.vocab_size = len(tokenizer)

model.to(device)

# 4. TRAINING ENGINE
# ---------------------------------------------------------
# CSV Logger
if not os.path.exists(LOG_FILE):
    with open(LOG_FILE, "w") as f: f.write("epoch,train_loss,val_bleu1,val_bleu4,saved\n")

def save_log(epoch, train_loss, bleu1, bleu4, saved):
    with open(LOG_FILE, "a") as f:
        f.write(f"{epoch},{train_loss:.4f},{bleu1:.2f},{bleu4:.2f},{saved}\n")

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5, weight_decay=0.01) # Added Weight Decay
best_bleu = 0.0

print(f"üöÄ Starting {TOTAL_EPOCHS} Epoch Training Scheme...")

for epoch in range(1, TOTAL_EPOCHS + 1):
    
    # --- PHASE SWITCHER ---
    if epoch == 1:
        print("‚ùÑÔ∏è PHASE 1: Encoder Frozen (Epochs 1-8)")
        for param in model.encoder.parameters(): param.requires_grad = False
        optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5, weight_decay=0.01)
        
    elif epoch == FINE_TUNE_AT_EPOCH:
        print("üîì PHASE 2: Unfreezing Encoder (Epoch 8+)")
        print("   -> Lowering LR to prevent catastrophic forgetting")
        for param in model.encoder.parameters(): param.requires_grad = True
        optimizer = AdamW([
            {'params': model.decoder.parameters(), 'lr': 5e-5, 'weight_decay': 0.01}, 
            {'params': model.encoder.parameters(), 'lr': 1e-5, 'weight_decay': 0.01}  
        ])

    # --- TRAIN ---
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{TOTAL_EPOCHS}")
    for batch in loop:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    
    avg_train_loss = total_loss / len(train_loader)

    # --- VALIDATE ---
    model.eval()
    image_map = {}
    for _, row in val_df.iterrows():
        if row['image_path'] not in image_map: image_map[row['image_path']] = []
        image_map[row['image_path']].append(row['caption'])
    
    eval_imgs = list(image_map.keys())[:300]
    references, hypotheses = [], []
    
    print("‚è≥ Validating...")
    with torch.no_grad():
        for img_path in tqdm(eval_imgs, desc="Eval", leave=False):
            try:
                image = Image.open(img_path).convert("RGB")
                img_tensor = val_transforms(image)
                pixel_values = feature_extractor(images=img_tensor, return_tensors="pt", do_rescale=False).pixel_values.to(device)
            except: continue
            
            gen_ids = model.generate(pixel_values, max_new_tokens=30)
            pred = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            hypotheses.append(normalize_arabic(pred).split())
            references.append([normalize_arabic(c).split() for c in image_map[img_path]])

    chencherry = SmoothingFunction()
    bleu4 = corpus_bleu(references, hypotheses, smoothing_function=chencherry.method4) * 100
    bleu1 = corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0), smoothing_function=chencherry.method4) * 100
    
    print(f"üìâ Epoch {epoch} Results:")
    print(f"   ‚Ä¢ Train Loss: {avg_train_loss:.4f}")
    print(f"   ‚Ä¢ BLEU-4:     {bleu4:.2f} (Target: >14.0)")
    
    # --- SAVE LOGIC ---
    saved_status = False
    if bleu4 > best_bleu:
        best_bleu = bleu4
        print(f"‚≠ê New Best Model! Saving to {OUTPUT_DIR}...")
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        feature_extractor.save_pretrained(OUTPUT_DIR)
        saved_status = True
        
    save_log(epoch, avg_train_loss, bleu1, bleu4, saved_status)

print(f"‚úÖ Training Complete. Best BLEU-4: {best_bleu:.2f}")
print(f"üìÑ Log saved to: {LOG_FILE}")

In [5]:
# ==========================================
# üöÄ FLICKR8K FINAL MASTER CELL
# Strategy: Aggressive Regularization to stop Overfitting
# ==========================================
import os
import re
import json
import torch
import pandas as pd
import numpy as np
import random
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import evaluate
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

from torchvision import transforms
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from torch.optim import AdamW
import arabic_reshaper
from bidi.algorithm import get_display

# 0. CONFIGURATION
# ---------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Using Device: {device}")

# üö® PATHS
TXT_FILE_PATH = '/kaggle/input/datasets/samahfetouh/arabic-flickr8k-dataset/captions.txt'
IMG_ROOT_DIR = '/kaggle/input/datasets/samahfetouh/arabic-flickr8k-dataset/Images'

ENCODER_CHECKPOINT = "google/vit-base-patch16-224-in21k"
DECODER_CHECKPOINT = "aubmindlab/aragpt2-base"
OUTPUT_DIR = "./flickr8k_regularized"

# üõ†Ô∏è TUNED HYPERPARAMETERS
MAX_LENGTH = 32         
BATCH_SIZE = 32         
EPOCHS = 15             
LEARNING_RATE = 2e-5    # Lower LR to prevent memorization
WEIGHT_DECAY = 0.05     # High decay to punish complexity
LABEL_SMOOTHING = 0.1   # Prevent model from being "too sure"

# 1. ROBUST DATA LOADER
# ---------------------------------------------------------
def normalize_arabic(text):
    text = re.sub(r"[ÿ•ÿ£ÿ¢ÿß]", "ÿß", text)
    text = re.sub(r"ÿ©", "Ÿá", text)
    text = re.sub(r"Ÿâ", "Ÿä", text)
    text = re.sub(r"_", " ", text)
    text = re.sub(r"[^\w\s]", "", text)
    return text.strip()

def load_data_aggressive(txt_path, img_root):
    image_map = {}
    for root, dirs, files in os.walk(img_root):
        for f in files:
            if f.lower().endswith(('.jpg', '.jpeg', '.png')):
                key = os.path.splitext(f)[0].strip()
                image_map[key] = os.path.join(root, f)
    
    data = []
    with open(txt_path, 'r', encoding='utf-8') as f: lines = f.readlines()
    for line in lines:
        line = line.strip()
        if len(line) < 5: continue
        parts = re.split(r'[,\t]', line, maxsplit=1)
        if len(parts) < 2: continue
        img_key = parts[0].strip().split('#')[0]
        img_key = os.path.splitext(img_key)[0].strip()
        if img_key in image_map:
            data.append({'image_path': image_map[img_key], 'caption': parts[1].strip()})
    return pd.DataFrame(data)

print("‚è≥ Loading Data...")
full_df = load_data_aggressive(TXT_FILE_PATH, IMG_ROOT_DIR)
train_df, val_df = train_test_split(full_df, test_size=0.1, random_state=42)
print(f"‚úÖ Train: {len(train_df)} | Val: {len(val_df)}")

# 2. DATASET (WITH STRONG AUGMENTATION)
# ---------------------------------------------------------
feature_extractor = ViTImageProcessor.from_pretrained(ENCODER_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(DECODER_CHECKPOINT)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]'})

# üé® AGGRESSIVE AUGMENTATION
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)), # Zoom in randomly
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10), # Slight rotation
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor()
])

val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

class Flickr8kDataset(Dataset):
    def __init__(self, df, tokenizer, transform):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = transform 

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row['image_path']).convert("RGB")
        img_tensor = self.transform(image)
        # do_rescale=False because ToTensor() already scales to [0,1]
        pixel_values = feature_extractor(images=img_tensor, return_tensors="pt", do_rescale=False).pixel_values.squeeze()
        
        caption = normalize_arabic(row['caption'])
        raw_tokens = self.tokenizer(caption, add_special_tokens=False).input_ids
        if len(raw_tokens) > MAX_LENGTH - 2: raw_tokens = raw_tokens[:MAX_LENGTH - 2]
        final_tokens = [self.tokenizer.bos_token_id] + raw_tokens + [self.tokenizer.eos_token_id]
        
        padding_len = MAX_LENGTH - len(final_tokens)
        if padding_len > 0:
            final_tokens = final_tokens + [self.tokenizer.pad_token_id] * padding_len
            
        labels = torch.tensor(final_tokens)
        labels[labels == self.tokenizer.pad_token_id] = -100 
        
        return {"pixel_values": pixel_values, "labels": labels}

train_ds = Flickr8kDataset(train_df, tokenizer, train_transforms)
val_ds = Flickr8kDataset(val_df, tokenizer, val_transforms)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 3. MODEL INIT (PARTIAL FREEZE)
# ---------------------------------------------------------
print("üèóÔ∏è Initializing Model...")
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(ENCODER_CHECKPOINT, DECODER_CHECKPOINT)
model.decoder.resize_token_embeddings(len(tokenizer))

model.config.num_beams = 4
model.config.max_length = MAX_LENGTH
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id 
model.config.eos_token_id = tokenizer.eos_token_id
model.config.vocab_size = len(tokenizer)

# üîí FREEZE STRATEGY
# 1. Freeze ViT (Encoder) entirely first
for param in model.encoder.parameters(): param.requires_grad = False
# 2. Freeze Bottom 6 Layers of AraGPT2 (Decoder) - Keep basic language skills
for i, block in enumerate(model.decoder.transformer.h):
    if i < 6: # AraGPT2-base has 12 layers, freeze first half
        for param in block.parameters(): param.requires_grad = False

print("‚ùÑÔ∏è Frozen: ViT Encoder + Bottom 6 Decoder Layers")

model.to(device)

# 4. TRAINING WITH LABEL SMOOTHING
# ---------------------------------------------------------
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# Custom Loss with Label Smoothing
criterion = torch.nn.CrossEntropyLoss(ignore_index=-100, label_smoothing=LABEL_SMOOTHING)

best_bleu = 0.0
history = []

print(f"üöÄ Starting Training ({EPOCHS} Epochs)...")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")
    
    for batch in loop:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass (get logits, not loss directly, so we can smooth)
        outputs = model(pixel_values=pixel_values, labels=labels)
        logits = outputs.logits
        
        # Reshape for Loss: (Batch * Seq, Vocab)
        loss = criterion(logits.view(-1, model.config.vocab_size), labels.view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    
    avg_loss = total_loss / len(train_loader)
    
    # VALIDATE (Every 2 epochs to save time, or every epoch)
    model.eval()
    image_map = {}
    for _, row in val_df.iterrows():
        if row['image_path'] not in image_map: image_map[row['image_path']] = []
        image_map[row['image_path']].append(row['caption'])
    
    eval_imgs = list(image_map.keys())[:300]
    references, hypotheses = [], []
    
    print("‚è≥ Validating...")
    with torch.no_grad():
        for img_path in tqdm(eval_imgs, desc="Eval", leave=False):
            try:
                image = Image.open(img_path).convert("RGB")
                img_tensor = val_transforms(image)
                pixel_values = feature_extractor(images=img_tensor, return_tensors="pt", do_rescale=False).pixel_values.to(device)
            except: continue
            
            gen_ids = model.generate(pixel_values, max_new_tokens=30)
            pred = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            hypotheses.append(normalize_arabic(pred).split())
            references.append([normalize_arabic(c).split() for c in image_map[img_path]])

    chencherry = SmoothingFunction()
    bleu1 = corpus_bleu(references, hypotheses, weights=(1.0, 0, 0, 0), smoothing_function=chencherry.method4) * 100
    bleu4 = corpus_bleu(references, hypotheses, smoothing_function=chencherry.method4) * 100
    
    print(f"üìâ Epoch {epoch}: Loss={avg_loss:.4f} | BLEU-1={bleu1:.2f} | BLEU-4={bleu4:.2f}")
    
    if bleu1 > best_bleu:
        best_bleu = bleu1
        print(f"‚≠ê Saving Best Model (BLEU-1: {best_bleu:.2f})")
        model.save_pretrained(OUTPUT_DIR)
        tokenizer.save_pretrained(OUTPUT_DIR)
        feature_extractor.save_pretrained(OUTPUT_DIR)

print("‚úÖ DONE!")

üöÄ Using Device: cuda
‚è≥ Loading Data...
‚úÖ Train: 21845 | Val: 2428


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

üèóÔ∏è Initializing Model...


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/553M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at aubmindlab/aragpt2-base and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'tr

‚ùÑÔ∏è Frozen: ViT Encoder + Bottom 6 Decoder Layers
üöÄ Starting Training (15 Epochs)...


Epoch 1/15:   0%|          | 0/683 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]



üìâ Epoch 1: Loss=6.9931 | BLEU-1=7.36 | BLEU-4=0.71
‚≠ê Saving Best Model (BLEU-1: 7.36)




Epoch 2/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 2: Loss=5.1785 | BLEU-1=14.27 | BLEU-4=1.24
‚≠ê Saving Best Model (BLEU-1: 14.27)


Epoch 3/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 3: Loss=4.6241 | BLEU-1=16.00 | BLEU-4=1.36
‚≠ê Saving Best Model (BLEU-1: 16.00)


Epoch 4/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 4: Loss=4.4148 | BLEU-1=17.93 | BLEU-4=2.62
‚≠ê Saving Best Model (BLEU-1: 17.93)


Epoch 5/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 5: Loss=4.2629 | BLEU-1=18.71 | BLEU-4=3.46
‚≠ê Saving Best Model (BLEU-1: 18.71)


Epoch 6/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 6: Loss=4.1382 | BLEU-1=20.86 | BLEU-4=3.82
‚≠ê Saving Best Model (BLEU-1: 20.86)


Epoch 7/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 7: Loss=4.0395 | BLEU-1=22.66 | BLEU-4=4.55
‚≠ê Saving Best Model (BLEU-1: 22.66)


Epoch 8/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 8: Loss=3.9588 | BLEU-1=22.53 | BLEU-4=4.90


Epoch 9/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 9: Loss=3.8900 | BLEU-1=22.66 | BLEU-4=4.60
‚≠ê Saving Best Model (BLEU-1: 22.66)


Epoch 10/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 10: Loss=3.8237 | BLEU-1=22.68 | BLEU-4=4.99
‚≠ê Saving Best Model (BLEU-1: 22.68)


Epoch 11/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 11: Loss=3.7701 | BLEU-1=23.25 | BLEU-4=5.36
‚≠ê Saving Best Model (BLEU-1: 23.25)


Epoch 12/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 12: Loss=3.7189 | BLEU-1=24.64 | BLEU-4=5.90
‚≠ê Saving Best Model (BLEU-1: 24.64)


Epoch 13/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 13: Loss=3.6703 | BLEU-1=24.92 | BLEU-4=5.94
‚≠ê Saving Best Model (BLEU-1: 24.92)


Epoch 14/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 14: Loss=3.6280 | BLEU-1=24.77 | BLEU-4=6.61


Epoch 15/15:   0%|          | 0/683 [00:00<?, ?it/s]

‚è≥ Validating...


Eval:   0%|          | 0/300 [00:00<?, ?it/s]

üìâ Epoch 15: Loss=3.5865 | BLEU-1=24.70 | BLEU-4=7.03
‚úÖ DONE!
