# Install requirements

In [None]:
!python --version

In [None]:
# T5-Base Question Generation Pipeline

# Install required libraries
!pip install pytorch_lightning scikit-learn matplotlib seaborn optuna sacrebleu rouge-score nltk==3.8.1 transformers>=4.41.0
!pip install --quiet sentencepiece
!pip install --quiet tqdm==4.57.0
!pip install --quiet evaluate openpyxl
!pip install ipywidgets

In [None]:
!pip show sentencepiece

# Import Libraries

In [None]:
# Import libraries
import os
import argparse
import copy
import pandas as pd
import numpy as np
import torch
import csv
import unicodedata
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.utils import shuffle
from datasets import load_dataset, concatenate_datasets
import matplotlib.pyplot as plt
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau
import optuna
import nltk
import sacrebleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import sentence_bleu

# Download NLTK resources
nltk.download('wordnet')
nltk.download('punkt')

# Set Seed

In [None]:
# Set seed for reproducibility
pl.seed_everything(42)

# Paths

In [None]:
# Mount Google Drive; all project folders below live on the mounted drive
from google.colab import drive
drive.mount('/content/drive')

# Project layout: each working folder is a direct child of PROJECT_PATH
ROOT_PATH = '/content/drive/My Drive/.Skripsi'
PROJECT_PATH = os.path.join(ROOT_PATH, 'HPO_Final')
DATA_PATH = os.path.join(PROJECT_PATH, 'dataset')
MODEL_PATH = os.path.join(PROJECT_PATH, 'model')
TOKENIZER_PATH = os.path.join(PROJECT_PATH, 'tokenizer')
LOG_PATH = os.path.join(PROJECT_PATH, 'log')
EVALUATION_PATH = os.path.join(PROJECT_PATH, 'evaluation')
OPTUNA_PATH = os.path.join(PROJECT_PATH, 'optuna')

# Create the project folder first, then each working folder, if missing
for _project_dir in (
    PROJECT_PATH,
    DATA_PATH,
    MODEL_PATH,
    TOKENIZER_PATH,
    LOG_PATH,
    EVALUATION_PATH,
    OPTUNA_PATH,
):
    os.makedirs(_project_dir, exist_ok=True)

In [None]:
# Show project folder contents
# os.listdir returns only the names directly inside PROJECT_PATH (not recursive)
contents = os.listdir(PROJECT_PATH)
print(f"Contents of {PROJECT_PATH}:")
for item in contents:
    print(item)

# Preprocessing

In [None]:
# Load Excel dataset and convert to CSV
xlsx_input_file = os.path.join(ROOT_PATH, "dataset_final.xlsx")
csv_output_file = os.path.join(ROOT_PATH, "dataset_final.csv")
df_xlsx = pd.read_excel(xlsx_input_file)
print(f"Original dataset shape: {df_xlsx.shape}")
print(f"Original columns: {list(df_xlsx.columns)}")

# Extract required columns. Stop here with a clear message when one is absent,
# instead of printing a notice and then failing inside the column selection.
required_columns = ['context', 'question', 'answers', 'question_type']
missing_columns = [col for col in required_columns if col not in df_xlsx.columns]
if missing_columns:
    print(f"Missing columns: {missing_columns}")
    raise KeyError(f"{xlsx_input_file} is missing required columns: {missing_columns}")
print("All required columns present")
df_extracted = df_xlsx[required_columns].copy()
print(f"Extracted dataset shape: {df_extracted.shape}")
print(f"Final columns: {list(df_extracted.columns)}")

# Check missing and empty values
for col in df_extracted.columns:
    missing_count = df_extracted[col].isna().sum()
    # astype(str) turns NaN into the text "nan", so NaN cells are not counted here
    empty_count = (df_extracted[col].astype(str).str.strip() == '').sum()
    print(f"  {col}: {missing_count} missing, {empty_count} empty strings")

# Save extracted CSV (every field quoted so embedded commas/newlines survive)
df_extracted.to_csv(csv_output_file, index=False, quoting=csv.QUOTE_ALL)
print(f"Saved extracted data to: {csv_output_file}")

# Show sample data (skipped for an empty sheet, where iloc[0] would raise)
print("\nSample of extracted data:")
if df_extracted.empty:
    print("Dataset is empty - no sample to show")
else:
    print("Context (first 100 chars):", str(df_extracted['context'].iloc[0])[:100] + "...")
    print("Question:", df_extracted['question'].iloc[0])
    print("Answer:", df_extracted['answers'].iloc[0])
    print("Question Type:", df_extracted['question_type'].iloc[0])

In [None]:
# Step 1: Clean excessive quotes
input_file_path = os.path.join(ROOT_PATH, "dataset_final.csv")
output_file_quoted_path = os.path.join(ROOT_PATH, "dataset_final_quoted.csv")
# dtype=str keeps every cell as text; keep_default_na=False keeps empty cells
# and tokens such as "NA" as strings instead of converting them to NaN
df = pd.read_csv(input_file_path, quotechar='"', dtype=str, keep_default_na=False)
print(f"Loaded dataset with shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Remove extra quotes
def remove_extra_quotes(text):
    """Return *text* without the double quotes at its start and end.

    Every leading and every trailing ``"`` character is dropped; quotes in
    the interior of the text are kept.
    """
    quote = '"'
    without_leading = text.lstrip(quote)
    return without_leading.rstrip(quote)

print("Removing excessive quotes...")
# Only the three text columns are cleaned; question_type is written unchanged
df["context"] = df["context"].apply(remove_extra_quotes)
df["answers"] = df["answers"].apply(remove_extra_quotes)
df["question"] = df["question"].apply(remove_extra_quotes)
# QUOTE_ALL wraps every field in quotes in the output file
df.to_csv(output_file_quoted_path, index=False, quoting=csv.QUOTE_ALL)
print(f"Saved quoted-cleaned file to: {output_file_quoted_path}")

In [None]:
# Step 2: Clean and normalize text
# Input is the file written by step 1; output is the final cleaned CSV
input_csv = output_file_quoted_path
output_csv = os.path.join(ROOT_PATH, "dataset_final_cleaned.csv")

# Normalize text
def clean_text(text):
    """Return *text* NFKC-normalized, with curly quotes straightened, trimmed.

    Values that ``pd.isna`` reports as missing become an empty string.
    """
    if pd.isna(text):
        return ""
    normalized = unicodedata.normalize("NFKC", text)
    # Map typographic quotes onto their plain ASCII counterparts
    for curly, straight in (("“", '"'), ("”", '"'), ("‘", "'"), ("’", "'")):
        normalized = normalized.replace(curly, straight)
    return normalized.strip()

# Columns normalized with clean_text; question_type is left as read
columns_to_clean = ["context", "question", "answers"]
# Read everything as text and keep empty cells as "" rather than NaN
df = pd.read_csv(input_csv, dtype=str, keep_default_na=False)
print(f"Processing {len(df)} rows for text normalization...")

for col in columns_to_clean:
    if col in df.columns:
        print(f"Cleaning column: {col}")
        df[col] = df[col].apply(clean_text)
    else:
        # A missing column is reported but does not stop the run
        print(f"WARNING: Column '{col}' not found")

df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)
print(f"Saved cleaned CSV to: {output_csv}")

# Show sample of cleaned data
# NOTE(review): iloc[0] assumes the file has at least one row
print("\nSample of cleaned data:")
print("Context (first 100 chars):", df['context'].iloc[0][:100] + "...")
print("Question:", df['question'].iloc[0])
print("Answer:", df['answers'].iloc[0])
print("Question Type:", df['question_type'].iloc[0])

# Load data

In [None]:
# Load cleaned dataset
dataset_path = os.path.join(ROOT_PATH, 'dataset_final_cleaned.csv')
# Read as text with keep_default_na=False, matching the preprocessing cells:
# with pandas defaults, cell values such as "NA", "null" or "None" (all valid
# answers or questions) would silently become NaN.
df = pd.read_csv(dataset_path, dtype=str, keep_default_na=False)

# Show dataset info
print(f"Dataset shape before stratified split: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(df.head())

In [None]:
# Show distribution before split
print("\nDistribution before split:")
print("Question Type distribution:")
# Row count per question_type, ordered by label; reused by later cells
question_type_dist = df['question_type'].value_counts().sort_index()
total_samples = len(df)
print("="*50)
for qtype, count in question_type_dist.items():
    percentage = (count / total_samples) * 100
    print(f"{qtype:8}: {count:4} samples ({percentage:5.2f}%)")
print("="*50)

# Minimum samples check for stratification
# The split cell compares this smallest class size against 2
min_samples = question_type_dist.min()
print(f"\nMinimum samples per question type: {min_samples}")

In [None]:
# `stratified` records which sampling mode was used. Later cells read it, so it
# is assigned on both paths (previously only the stratified branch set it).
stratified = min_samples >= 2

if not stratified:
    print("WARNING: Some question types have less than 2 samples. Cannot perform stratified sampling.")
    print("Falling back to regular random sampling.")
    # Regular random split into train, validation, and test sets (80:10:10)
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
else:
    print("All question types have sufficient samples. Proceeding with stratified sampling.")
    # Stratified split into train, validation, and test sets (80:10:10)
    train_df, temp_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['question_type']
    )
    # The held-out 20% is halved into validation and test. A class with fewer
    # than 2 rows in that 20% cannot be stratified, so check again here.
    if temp_df['question_type'].value_counts().min() < 2:
        print("WARNING: Some question types have less than 2 samples in the held-out set.")
        print("Splitting validation/test with regular random sampling.")
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    else:
        val_df, test_df = train_test_split(
            temp_df,
            test_size=0.5,
            random_state=42,
            stratify=temp_df['question_type']
        )

In [None]:
# DETAILED LOGGING OF QUESTION TYPE DISTRIBUTION AFTER SPLIT
# (BEFORE removing question_type column)
# Reads question_type_dist, total_samples and `stratified` from earlier cells.
print("\n" + "="*80)
print("DETAILED QUESTION TYPE DISTRIBUTION AFTER SPLIT")
print("="*80)

# Calculate distributions for each split
train_dist = train_df['question_type'].value_counts().sort_index()
val_dist = val_df['question_type'].value_counts().sort_index()
test_dist = test_df['question_type'].value_counts().sort_index()

# Get all unique question types
all_qtypes = sorted(df['question_type'].unique())

# Create summary table
print(f"{'Question Type':<12} {'Original':<12} {'Train':<12} {'Validation':<12} {'Test':<12}")
print(f"{'':12} {'Count (%)':<12} {'Count (%)':<12} {'Count (%)':<12} {'Count (%)':<12}")
print("-" * 72)

for qtype in all_qtypes:
    # .get(qtype, 0) covers a type that is absent from a given split
    orig_count = question_type_dist.get(qtype, 0)
    orig_pct = (orig_count / total_samples) * 100

    train_count = train_dist.get(qtype, 0)
    train_pct = (train_count / len(train_df)) * 100

    val_count = val_dist.get(qtype, 0)
    val_pct = (val_count / len(val_df)) * 100

    test_count = test_dist.get(qtype, 0)
    test_pct = (test_count / len(test_df)) * 100

    print(f"{qtype:<12} {orig_count:4}({orig_pct:5.1f}%) {train_count:4}({train_pct:5.1f}%) {val_count:4}({val_pct:5.1f}%) {test_count:4}({test_pct:5.1f}%)")

print("-" * 72)
print(f"{'TOTAL':<12} {total_samples:4}({100.0:5.1f}%) {len(train_df):4}({100.0:5.1f}%) {len(val_df):4}({100.0:5.1f}%) {len(test_df):4}({100.0:5.1f}%)")

# Verify stratification quality (only if stratified sampling was used)
# `stratified` comes from the split cell and must be defined before this runs
if stratified:
    print("\n" + "="*50)
    print("STRATIFICATION QUALITY CHECK")
    print("="*50)
    print("Checking if proportions are maintained across splits...")

    # Largest absolute gap, in percentage points, between a split's share of a
    # question type and that type's share in the full dataset
    max_deviation = 0
    for qtype in all_qtypes:
        if qtype in question_type_dist:
            orig_pct = (question_type_dist[qtype] / total_samples) * 100
            train_pct = (train_dist.get(qtype, 0) / len(train_df)) * 100
            val_pct = (val_dist.get(qtype, 0) / len(val_df)) * 100
            test_pct = (test_dist.get(qtype, 0) / len(test_df)) * 100

            train_dev = abs(orig_pct - train_pct)
            val_dev = abs(orig_pct - val_pct)
            test_dev = abs(orig_pct - test_pct)

            max_dev = max(train_dev, val_dev, test_dev)
            max_deviation = max(max_deviation, max_dev)

            print(f"{qtype}: Original {orig_pct:.1f}% | Deviations: Train ±{train_dev:.1f}%, Val ±{val_dev:.1f}%, Test ±{test_dev:.1f}%")

    print(f"\nMaximum deviation from original proportions: ±{max_deviation:.1f}%")
    # Verdict thresholds: below 2 points, below 5 points, otherwise a warning
    if max_deviation < 2.0:
        print(" EXCELLENT: Stratification maintained proportions very well")
    elif max_deviation < 5.0:
        print(" GOOD: Stratification maintained proportions adequately")
    else:
        print(" WARNING: Some proportions deviated significantly")

In [None]:
# Keep only required columns
core_columns = ['context', 'answers', 'question']
train_df = train_df[core_columns].copy()
val_df = val_df[core_columns].copy()
test_df = test_df[core_columns].copy()

# Shuffle the dataframes
train_df = shuffle(train_df, random_state=42)
val_df = shuffle(val_df, random_state=42)
test_df = shuffle(test_df, random_state=42)

# Save processed dataset
processed_dataset_path = os.path.join(DATA_PATH, 'processed_dataset.csv')
df[core_columns].to_csv(processed_dataset_path, index=False, quoting=csv.QUOTE_ALL)

# Save splits
train_df_path = os.path.join(DATA_PATH, 'train_df.csv')
val_df_path = os.path.join(DATA_PATH, 'val_df.csv')
test_df_path = os.path.join(DATA_PATH, 'test_df.csv')

train_df.to_csv(train_df_path, index=False, quoting=csv.QUOTE_ALL)
val_df.to_csv(val_df_path, index=False, quoting=csv.QUOTE_ALL)
test_df.to_csv(test_df_path, index=False, quoting=csv.QUOTE_ALL)

# Reload dataset as text. keep_default_na=False keeps values such as "NA",
# "null" or "None" as strings; pandas defaults would turn them into NaN.
train_df = pd.read_csv(train_df_path, dtype=str, keep_default_na=False)
val_df = pd.read_csv(val_df_path, dtype=str, keep_default_na=False)
test_df = pd.read_csv(test_df_path, dtype=str, keep_default_na=False)

# Count and show dataset distribution
total_len = len(train_df) + len(val_df) + len(test_df)
train_len = len(train_df)
val_len = len(val_df)
test_len = len(test_df)

train_percentage = (train_len / total_len) * 100
val_percentage = (val_len / total_len) * 100
test_percentage = (test_len / total_len) * 100

print(f"\n=== FINAL DATASET SPLIT SUMMARY ===")
print(f"Train length: {train_len} samples ({train_percentage:.2f}%)")
print(f"Validation length: {val_len} samples ({val_percentage:.2f}%)")
print(f"Test length: {test_len} samples ({test_percentage:.2f}%)")
print(f"Total: {total_len} samples")

# Treat a missing `stratified` flag as "not stratified" rather than raising.
# The messages now start with a real newline; "\S" and "\R" were invalid
# escapes that printed a literal backslash.
if globals().get('stratified', False):
    print("\nStratified random sampling completed successfully")
    print("Proportional distribution maintained across question_type")
else:
    print("\nRegular random sampling completed")
    print("Note: Stratification was not possible due to insufficient samples in some categories")

print(f"Final datasets contain only core columns: {core_columns}")
print(f"All split files saved to: {DATA_PATH}")

In [None]:
## Load T5-base pretrained model
# Use the GPU when one is visible to torch, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
TOKENIZER = T5Tokenizer.from_pretrained("t5-base")

print(f"Model loaded: T5-base")
print(f"Model parameters: {MODEL.num_parameters():,}")

In [None]:
# Analyze token lengths for each column
def analyze_lengths(df, tokenizer, column):
    """Print the maximum and average token count of ``df[column]``.

    Each value is converted with ``str`` and passed to ``tokenizer.encode``;
    the length of the returned sequence is what gets measured.

    Args:
        df: DataFrame holding the column to measure.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence.
        column: Name of the column in ``df``.
    """
    # Iterate the column directly: iterrows() builds a Series per row and
    # upcasts values when the frame mixes dtypes (an int 1 becomes "1.0").
    lengths = [len(tokenizer.encode(str(value))) for value in df[column]]
    if not lengths:
        # max() and the average are undefined for an empty column
        print(f"{column} - no rows to analyze")
        return
    print(f"{column} - Max length: {max(lengths)}, Avg length: {sum(lengths)/len(lengths):.2f}")

# Per-column token statistics, measured on the training split only
analyze_lengths(train_df, TOKENIZER, 'context')
analyze_lengths(train_df, TOKENIZER, 'answers')
analyze_lengths(train_df, TOKENIZER, 'question')

# Combine splits for sequence length analysis
full_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
input_seq_lengths, target_seq_lengths = [], []

# Token counts are taken on the same "context: ... answers: ..." and
# "question: ..." strings that QuestionGenerationDataset._build constructs
for _, row in full_df.iterrows():
    input_text = f"context: {row['context']} answers: {row['answers']}"
    input_tokens = TOKENIZER.encode(input_text, add_special_tokens=True)
    input_seq_lengths.append(len(input_tokens))

    target_text = f"question: {row['question']}"
    target_tokens = TOKENIZER.encode(target_text, add_special_tokens=True)
    target_seq_lengths.append(len(target_tokens))

# Compute percentiles for input sequence lengths
# int() truncates the interpolated percentile toward zero
input_max_len_90 = int(np.percentile(input_seq_lengths, 90))
input_max_len_95 = int(np.percentile(input_seq_lengths, 95))
input_max_len_99 = int(np.percentile(input_seq_lengths, 99))
print(f"Input Sequence Lengths (context + answers):")
print(f"90th percentile: {input_max_len_90}")
print(f"95th percentile: {input_max_len_95}")
print(f"99th percentile: {input_max_len_99}")
print(f"Max length: {max(input_seq_lengths)}")
print(f"Average length: {np.mean(input_seq_lengths):.2f}")

# Compute percentiles for target sequence lengths
target_max_len_90 = int(np.percentile(target_seq_lengths, 90))
target_max_len_95 = int(np.percentile(target_seq_lengths, 95))
target_max_len_99 = int(np.percentile(target_seq_lengths, 99))
print(f"\nTarget Sequence Lengths (question):")
print(f"90th percentile: {target_max_len_90}")
print(f"95th percentile: {target_max_len_95}")
print(f"99th percentile: {target_max_len_99}")
print(f"Max length: {max(target_seq_lengths)}")
print(f"Average length: {np.mean(target_seq_lengths):.2f}")

# Dataset Class

In [None]:
# Use max lengths for training
# The longest observed sequences across all three splits set the padded lengths
input_max_len = max(input_seq_lengths)
target_max_len = max(target_seq_lengths)
# NOTE(review): TOKENIZER was already loaded in the model cell; this reloads it
TOKENIZER = T5Tokenizer.from_pretrained("t5-base")

print(f"Using sequence lengths - Input: {input_max_len}, Target: {target_max_len}")
print(f"Device: {device}")

# Custom dataset for T5
class QuestionGenerationDataset(Dataset):
    """Tokenized question-generation samples loaded from a CSV file.

    Each CSV row becomes the input text ``context: <context> answers: <answers>``
    and the target text ``question: <question>``. Rows whose tokenized input is
    longer than ``max_len_inp`` are skipped and counted in ``skippedcount``;
    targets longer than ``max_len_out`` are truncated. All kept samples are
    padded to the maximum lengths and held in memory.

    Args:
        tokenizer: Tokenizer providing ``encode_plus`` and ``batch_encode_plus``.
        filepath: CSV file with ``context``, ``answers`` and ``question`` columns.
        max_len_inp: Padded input length in tokens. The default is bound to the
            notebook's ``input_max_len`` when the class is defined.
        max_len_out: Padded target length in tokens. The default is bound to
            the notebook's ``target_max_len`` when the class is defined.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=input_max_len, max_len_out=target_max_len):
        self.path = filepath
        self.passage_column = "context"
        self.answers = "answers"
        self.question = "question"
        # Read as text and keep literal values such as "NA", "null" or "None":
        # with pandas defaults they become NaN and get tokenized as "nan".
        self.data = pd.read_csv(self.path, dtype=str, keep_default_na=False)
        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs, self.targets = [], []
        self.skippedcount = 0
        self._build()

    def __len__(self):
        """Return the number of samples kept after length filtering."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one sample as a dict.

        Keys: ``source_ids``, ``source_mask``, ``target_ids``, ``target_mask``
        and ``labels`` (a copy of ``target_ids`` with padding set to -100).
        """
        source = self.inputs[index]
        target = self.targets[index]
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when its length is 1.
        source_ids = source["input_ids"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)
        src_mask = source["attention_mask"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        # Mask padding with -100 in the labels. Take the pad id from the
        # tokenizer instead of assuming 0; fall back to 0 if it defines none.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0
        labels = target_ids.clone()
        labels[labels == pad_id] = -100
        return {
            "source_ids": source_ids,
            "source_mask": src_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
            "labels": labels
        }

    def _build(self):
        """Tokenize every CSV row into ``self.inputs`` / ``self.targets``."""
        print(f"Building dataset from: {self.path}")
        print(f"Dataset shape: {self.data.shape}")
        print(f"Columns: {list(self.data.columns)}")

        for idx in tqdm(range(len(self.data)), desc="Processing samples"):
            passage = self.data.loc[idx, self.passage_column]
            answers = self.data.loc[idx, self.answers]
            target = self.data.loc[idx, self.question]

            input_ = f"context: {passage} answers: {answers}"
            target = f"question: {str(target)}"

            # Measure the untruncated input first so over-long rows are
            # dropped instead of being silently cut off.
            test_input_encoding = self.tokenizer.encode_plus(input_, truncation=False, return_tensors="pt")
            length_of_input_encoding = len(test_input_encoding['input_ids'][0])
            if length_of_input_encoding > self.max_len_input:
                self.skippedcount += 1
                continue

            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_],
                max_length=self.max_len_input,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )

            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target],
                max_length=self.max_len_output,
                padding='max_length',
                truncation=True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

        total_rows = len(self.data)
        # An empty CSV has no inclusion rate; report 0 instead of dividing by 0
        inclusion_rate = len(self.inputs) / total_rows * 100 if total_rows else 0.0
        print(f"  Dataset built successfully")
        print(f"  Total samples processed: {total_rows}")
        print(f"  Samples included: {len(self.inputs)}")
        print(f"  Samples skipped (too long): {self.skippedcount}")
        print(f"  Inclusion rate: {inclusion_rate:.2f}%")

# Build train, validation, and test datasets
# Each constructor reads its CSV and tokenizes every row immediately
train_dataset = QuestionGenerationDataset(TOKENIZER, train_df_path)
val_dataset   = QuestionGenerationDataset(TOKENIZER, val_df_path)
test_dataset  = QuestionGenerationDataset(TOKENIZER, test_df_path)

# Show dataset statistics
# These lengths count kept samples only (rows skipped as too long are excluded)
total_len = len(train_dataset) + len(val_dataset) + len(test_dataset)
train_len, val_len, test_len = len(train_dataset), len(val_dataset), len(test_dataset)
print(f"\n=== FINAL DATASET STATISTICS ===")
print(f"Train dataset length: {train_len}")
print(f"Validation dataset length: {val_len}")
print(f"Test dataset length: {test_len}")
print(f"Total dataset length: {total_len}")
print(f"Train percentage: {(train_len/total_len)*100:.2f}%")
print(f"Validation percentage: {(val_len/total_len)*100:.2f}%")
print(f"Test percentage: {(test_len/total_len)*100:.2f}%")

# Show skipped samples summary
total_skipped = train_dataset.skippedcount + val_dataset.skippedcount + test_dataset.skippedcount
# Row count of the split files before filtering, used as the skip-rate base
original_total = len(pd.read_csv(train_df_path)) + len(pd.read_csv(val_df_path)) + len(pd.read_csv(test_df_path))

if total_skipped > 0:
    print(f"\nSKIPPED SAMPLES SUMMARY:")
    print(f"Train skipped: {train_dataset.skippedcount}")
    print(f"Validation skipped: {val_dataset.skippedcount}")
    print(f"Test skipped: {test_dataset.skippedcount}")
    print(f"Total skipped: {total_skipped}")
    print(f"Skip rate: {total_skipped/original_total*100:.2f}%")
else:
    print(f"\nNo samples skipped - all data within sequence length limits")

print(f"\nDataset creation completed successfully")
print(f"Ready for training with sequence lengths: {input_max_len}/{target_max_len}")

# HPO

## HPO Data Analysis Functions

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from collections import defaultdict
import json
import os
from datetime import datetime
import optuna
import gc

# 1. Export basic trial data to CSV
def export_trials_to_csv(study, base_path=None):
    """Write ``study.trials_dataframe()`` to a timestamped CSV and return it.

    Args:
        study: Object exposing ``trials_dataframe()``.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        The DataFrame returned by ``study.trials_dataframe()``.
    """
    target_dir = base_path
    if target_dir is None:
        target_dir = globals().get('LOG_PATH', './')

    trials_frame = study.trials_dataframe()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = os.path.join(target_dir, f'optuna_trials_{stamp}.csv')
    trials_frame.to_csv(out_path, index=False)
    print(f"Trial data exported to CSV: {out_path}")
    return trials_frame

# 2. Create detailed TPE analysis
def create_detailed_tpe_analysis(study, base_path=None):
    """Export one row per trial describing how the search progressed.

    For every trial the row holds its number, objective value, negated value
    (``bleu_score``), state, duration, start/end time, its parameters
    (prefixed ``param_``) and user attributes. Relative to the COMPLETE trials
    that precede it, the row also says whether its value falls at or below
    their 25th percentile (``is_good_region``) and its rank percentile.
    ``best_bleu_so_far`` and ``is_improvement`` track the running best,
    including the current trial.

    Args:
        study: Optuna study to analyze.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        The DataFrame that was written to the timestamped CSV.
    """
    if base_path is None:
        base_path = LOG_PATH if 'LOG_PATH' in globals() else './'

    # Read the trial list once: Study.trials builds a new copy on each access,
    # so re-slicing it inside the loop made this function quadratic.
    trials = study.trials
    # Objective values of the COMPLETE trials seen so far, in trial order
    finished_values = []
    trials_data = []
    for trial in trials:
        value = trial.value
        trial_data = {
            'trial_number': trial.number,
            'objective_value': value,
            'bleu_score': -value if value is not None else None,
            'state': trial.state.name,
            'duration_seconds': trial.duration.total_seconds() if trial.duration else None,
            'datetime_start': trial.datetime_start,
            'datetime_complete': trial.datetime_complete
        }

        for param_name, param_value in trial.params.items():
            trial_data[f'param_{param_name}'] = param_value
        for attr_name, attr_value in trial.user_attrs.items():
            trial_data[attr_name] = attr_value

        # Compare against earlier completed trials only (none for the first one)
        if finished_values and value is not None:
            threshold = np.percentile(finished_values, 25)
            ranked = sorted(finished_values + [value])
            trial_data['is_good_region'] = value <= threshold
            trial_data['rank_percentile'] = (ranked.index(value) + 1) / len(ranked) * 100
        else:
            trial_data['is_good_region'] = None
            trial_data['rank_percentile'] = None

        # From here on the running statistics include the current trial
        if trial.state == optuna.trial.TrialState.COMPLETE and value is not None:
            finished_values.append(value)

        if finished_values:
            best_value = min(finished_values)
            trial_data['best_bleu_so_far'] = -best_value
            trial_data['is_improvement'] = value == best_value if value is not None else False
        else:
            trial_data['best_bleu_so_far'] = None
            trial_data['is_improvement'] = False

        trials_data.append(trial_data)

    detailed_df = pd.DataFrame(trials_data)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = os.path.join(base_path, f'detailed_tpe_analysis_{timestamp}.csv')
    detailed_df.to_csv(csv_filename, index=False)
    print(f"Detailed TPE analysis exported to CSV: {csv_filename}")
    return detailed_df

# 3. Export parameter importance analysis
def export_parameter_importance(study, base_path=None):
    """Write Optuna's parameter importances to a timestamped CSV.

    Args:
        study: Optuna study passed to ``get_param_importances``.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        A DataFrame with ``parameter`` and ``importance`` columns sorted by
        descending importance, or None when any step raised (the error is
        printed, not re-raised).
    """
    out_dir = globals().get('LOG_PATH', './') if base_path is None else base_path
    try:
        scores = optuna.importance.get_param_importances(study)
        rows = []
        for name, score in scores.items():
            rows.append({'parameter': name, 'importance': score})
        ranking = pd.DataFrame(rows).sort_values('importance', ascending=False)

        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = os.path.join(out_dir, f'parameter_importance_{stamp}.csv')
        ranking.to_csv(target, index=False)
        print(f"Parameter importance exported to CSV: {target}")
        return ranking
    except Exception as e:
        print(f"Could not calculate parameter importance: {e}")
        return None

# 4. Analyze TPE distributions per parameter
def analyze_tpe_distributions_per_param(study, param_name):
    """Summarize one parameter over the best quarter of trials versus the rest.

    COMPLETE trials with a value are sorted by ascending objective; the first
    ``max(1, n // 4)`` form the "good" group and the remainder the "poor" one.

    Args:
        study: Optuna study to read trials from.
        param_name: Parameter whose sampled values are summarized.

    Returns:
        A dict with counts and mean/std/min/max per group (None for a group
        without values), or None when fewer than two trials qualify.
    """
    complete = optuna.trial.TrialState.COMPLETE
    finished = [t for t in study.trials if t.state == complete and t.value is not None]
    if len(finished) < 2:
        return None

    ranked = sorted(finished, key=lambda t: t.value)
    cut = max(1, len(ranked) // 4)

    def values_of(group):
        # Trials that never sampled this parameter are left out
        return [t.params[param_name] for t in group if param_name in t.params]

    def describe(values):
        # (mean, std, min, max), or all None for an empty group
        if not values:
            return None, None, None, None
        return np.mean(values), np.std(values), np.min(values), np.max(values)

    good_values = values_of(ranked[:cut])
    poor_values = values_of(ranked[cut:])
    good_mean, good_std, good_min, good_max = describe(good_values)
    poor_mean, poor_std, poor_min, poor_max = describe(poor_values)

    return {
        'parameter': param_name,
        'total_trials': len(finished),
        'good_trials_count': len(good_values),
        'poor_trials_count': len(poor_values),
        'good_mean': good_mean,
        'good_std': good_std,
        'good_min': good_min,
        'good_max': good_max,
        'poor_mean': poor_mean,
        'poor_std': poor_std,
        'poor_min': poor_min,
        'poor_max': poor_max,
    }

def export_all_tpe_distributions(study, base_path=None):
    """Export the good/poor distribution summary of every parameter to CSV.

    Args:
        study: Optuna study to analyze.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        A DataFrame with one row per parameter (sorted by parameter name), or
        None when no parameter produced a summary.
    """
    if base_path is None:
        base_path = LOG_PATH if 'LOG_PATH' in globals() else './'

    all_params = set()
    for trial in study.trials:
        all_params.update(trial.params.keys())

    # Sort the names: set iteration order is not stable between runs, which
    # made the row order of the exported file vary.
    distribution_stats = []
    for param in sorted(all_params):
        stats = analyze_tpe_distributions_per_param(study, param)
        if stats:
            distribution_stats.append(stats)

    if not distribution_stats:
        print("No distribution analysis data available")
        return None

    dist_df = pd.DataFrame(distribution_stats)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = os.path.join(base_path, f'tpe_distributions_analysis_{timestamp}.csv')
    dist_df.to_csv(csv_filename, index=False)
    print(f"TPE distributions analysis exported to CSV: {csv_filename}")
    return dist_df

# 5. Generate comprehensive optimization report
def generate_optimization_report(study, base_path=None):
    """Write a plain-text report of the optimization run.

    The report lists trial counts, the best trial (when one exists), parameter
    importances, a good-versus-poor summary per parameter and the ten trials
    with the highest negated objective value.

    Args:
        study: Optuna study to report on.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        Path of the report file that was written.
    """
    if base_path is None:
        base_path = LOG_PATH if 'LOG_PATH' in globals() else './'

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(base_path, f'optimization_report_{timestamp}.txt')

    # Read the trial list once; Study.trials builds a new copy on each access
    trials = study.trials
    completed_trials = [t for t in trials if t.state == optuna.trial.TrialState.COMPLETE]
    failed_trials = [t for t in trials if t.state == optuna.trial.TrialState.FAIL]

    # best_trial raises ValueError while no trial has completed, so a plain
    # truthiness test on it cannot guard the section.
    try:
        best_trial = study.best_trial
    except ValueError:
        best_trial = None

    # Explicit encoding: parameter names and attributes may be non-ASCII
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("T5 QUESTION GENERATION - OPTUNA TPE OPTIMIZATION REPORT\n")
        f.write("=" * 70 + "\n\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Study Direction: {study.direction.name}\n")
        f.write(f"Total Trials: {len(trials)}\n")
        f.write(f"Completed Trials: {len(completed_trials)}\n")
        f.write(f"Failed Trials: {len(failed_trials)}\n\n")

        if best_trial is not None:
            f.write("BEST RESULT:\n")
            f.write(f"Best BLEU-4 Score: {-best_trial.value:.4f}%\n")
            f.write(f"Best Trial Number: {best_trial.number}\n")
            f.write("Best Hyperparameters:\n")
            for param, value in best_trial.params.items():
                f.write(f"  {param}: {value}\n")
            f.write("\nBest Trial Additional Metrics:\n")
            for attr_name, attr_value in best_trial.user_attrs.items():
                f.write(f"  {attr_name}: {attr_value}\n")
            f.write("\n")
        else:
            f.write("BEST RESULT: no completed trials\n\n")

        # Importance is best-effort: any failure is noted in the report
        try:
            importance = optuna.importance.get_param_importances(study)
            f.write("PARAMETER IMPORTANCE:\n")
            for param, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True):
                f.write(f"  {param}: {imp:.4f}\n")
            f.write("\n")
        except Exception:
            f.write("PARAMETER IMPORTANCE: Could not calculate\n\n")

        f.write("TPE ANALYSIS SUMMARY:\n")
        all_params = set()
        for trial in trials:
            all_params.update(trial.params.keys())
        # Sorted so the sections appear in the same order on every run
        for param in sorted(all_params):
            stats = analyze_tpe_distributions_per_param(study, param)
            if stats and stats['good_mean'] is not None and stats['poor_mean'] is not None:
                f.write(f"\n{param}:\n")
                f.write(f"  Good trials mean: {stats['good_mean']}\n")
                f.write(f"  Poor trials mean: {stats['poor_mean']}\n")
                f.write(f"  Difference: {abs(stats['good_mean'] - stats['poor_mean']):.4f}\n")
                f.write(f"  Good trials count: {stats['good_trials_count']}\n")
                f.write(f"  Poor trials count: {stats['poor_trials_count']}\n")

        f.write("\nTOP 10 TRIALS:\n")
        f.write("Rank | Trial | BLEU-4  | LR      | Batch | Weight Decay | Train Loss | Val Loss\n")
        f.write("-" * 80 + "\n")
        # Rank by the negated objective, highest first
        successful_trials = [(t, -t.value) for t in completed_trials if t.value is not None]
        successful_trials.sort(key=lambda x: x[1], reverse=True)
        for i, (trial, bleu) in enumerate(successful_trials[:10], 1):
            lr = trial.params.get('learning_rate', 'N/A')
            bs = trial.params.get('batch_size', 'N/A')
            wd = trial.params.get('weight_decay', 'N/A')
            train_loss = trial.user_attrs.get('final_train_loss', 'N/A')
            val_loss = trial.user_attrs.get('final_val_loss', 'N/A')
            f.write(f"{i:4d} | {trial.number:5d} | {bleu:6.2f}% | {lr} | {bs:5} | {wd} | {train_loss} | {val_loss}\n")
    print(f"Comprehensive optimization report saved to: {filename}")
    return filename

# 6. Export performance summary statistics
def export_performance_summary(study, base_path=None):
    """Export descriptive statistics of the trial scores to a timestamped CSV.

    One row summarizes the negated objective values ("BLEU-4 Score") of the
    COMPLETE trials; a second row is added for the ``meteor_score`` user
    attribute when at least one trial carries a non-None value.

    Args:
        study: Optuna study to summarize.
        base_path: Output directory. When None, the global ``LOG_PATH`` is
            used if it exists, otherwise the current directory.

    Returns:
        The summary DataFrame, or None when no trial has completed.
    """
    if base_path is None:
        base_path = globals().get('LOG_PATH', './')

    complete = optuna.trial.TrialState.COMPLETE
    finished = [t for t in study.trials if t.state == complete and t.value is not None]
    if not finished:
        print("No completed trials for performance summary")
        return None

    def summarize(label, scores):
        # One table row: count, spread and quartiles of the given scores
        return {
            'metric': label,
            'count': len(scores),
            'mean': np.mean(scores),
            'std': np.std(scores),
            'min': np.min(scores),
            'max': np.max(scores),
            'median': np.median(scores),
            'q25': np.percentile(scores, 25),
            'q75': np.percentile(scores, 75),
        }

    rows = [summarize('BLEU-4 Score', [-t.value for t in finished])]

    # Trials without the attribute, or with a None value, are ignored
    meteor_values = [t.user_attrs.get('meteor_score') for t in finished]
    meteor_values = [score for score in meteor_values if score is not None]
    if meteor_values:
        rows.append(summarize('METEOR Score', meteor_values))

    summary_df = pd.DataFrame(rows)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = os.path.join(base_path, f'performance_summary_{stamp}.csv')
    summary_df.to_csv(csv_filename, index=False)
    print(f"Performance summary exported to CSV: {csv_filename}")
    return summary_df

# 7. Create trial monitoring callback
def create_monitoring_callback(base_path=None, save_every=5):
    """Build an Optuna callback that reports progress and snapshots results.

    On every trial whose number is a positive multiple of ``save_every`` the
    returned callback prints the best BLEU-4 so far with its parameters, the
    mean and standard deviation of BLEU-4 over the finished trials among the
    last ``save_every`` trials, and writes ``study.trials_dataframe()`` to a
    timestamped CSV file in ``base_path``. Objective values are treated as
    negated BLEU-4 scores, so they are sign-flipped before printing.

    Args:
        base_path: Directory for the intermediate CSV files. When ``None``,
            the global ``LOG_PATH`` is used if defined, otherwise ``'./'``.
        save_every: Reporting interval in trials.

    Returns:
        A ``monitoring_callback(study, trial)`` function.
    """
    if base_path is None:
        base_path = LOG_PATH if 'LOG_PATH' in globals() else './'

    def monitoring_callback(study, trial):
        if trial.number % save_every != 0 or trial.number <= 0:
            return

        print(f"\n=== MONITORING UPDATE - Trial {trial.number} ===")

        # ``study.best_trial`` raises ValueError while no trial has completed
        # (for example when every trial so far was pruned or failed), so it
        # has to be probed rather than truth-tested.
        try:
            best_trial = study.best_trial
        except ValueError:
            best_trial = None
        if best_trial is not None:
            best_bleu = -best_trial.value
            print(f"Current Best BLEU-4: {best_bleu:.4f}%")
            print(f"Best Trial: {best_trial.number}")
            print(f"Best Parameters: {best_trial.params}")

        recent_trials = [
            t for t in study.trials[-save_every:]
            if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None
        ]
        if recent_trials:
            recent_bleus = [-t.value for t in recent_trials]
            print(f"Recent {len(recent_trials)} trials BLEU-4: {np.mean(recent_bleus):.4f}% ± {np.std(recent_bleus):.4f}%")

        # Saving the snapshot is best-effort: a failed write must not abort
        # the optimization run.
        try:
            intermediate_df = study.trials_dataframe()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(base_path, f'intermediate_results_trial_{trial.number}_{timestamp}.csv')
            intermediate_df.to_csv(filename, index=False)
            print(f"Intermediate results saved: {filename}")
        except Exception as e:
            print(f"Could not save intermediate results: {e}")
        print("=" * 50)

    return monitoring_callback

# 8. Master analysis function - run all analyses
def run_complete_hpo_analysis(study, base_path=None):
    """Run every HPO export/analysis step in order and collect the outputs.

    Args:
        study: The Optuna study handed to each analysis helper.
        base_path: Output directory passed to each helper. When ``None``, the
            global ``LOG_PATH`` is used if defined, otherwise ``'./'``.

    Returns:
        A dict with the keys ``trials_df``, ``detailed_df``,
        ``importance_df``, ``distributions_df``, ``summary_df`` and
        ``report_file``, or ``None`` if any step raised (the error and its
        traceback are printed).
    """
    if base_path is None:
        base_path = LOG_PATH if 'LOG_PATH' in globals() else './'

    banner = "=" * 70
    print("\n" + banner)
    print("RUNNING COMPLETE HPO ANALYSIS")
    print(banner)

    # (progress message, result key, step). The lambdas defer the helper
    # lookup until the step actually runs, inside the try block below.
    pipeline = (
        ("\n1. Exporting basic trial data...", 'trials_df',
         lambda: export_trials_to_csv(study, base_path)),
        ("\n2. Creating detailed TPE analysis...", 'detailed_df',
         lambda: create_detailed_tpe_analysis(study, base_path)),
        ("\n3. Calculating parameter importance...", 'importance_df',
         lambda: export_parameter_importance(study, base_path)),
        ("\n4. Analyzing TPE distributions...", 'distributions_df',
         lambda: export_all_tpe_distributions(study, base_path)),
        ("\n5. Generating comprehensive report...", 'report_file',
         lambda: generate_optimization_report(study, base_path)),
        ("\n6. Creating performance summary...", 'summary_df',
         lambda: export_performance_summary(study, base_path)),
    )
    result_keys = ('trials_df', 'detailed_df', 'importance_df',
                   'distributions_df', 'summary_df', 'report_file')

    try:
        outputs = {}
        for message, key, step in pipeline:
            print(message)
            outputs[key] = step()

        print("\n" + banner)
        print("COMPLETE HPO ANALYSIS FINISHED")
        print(banner)
        print(f"All files saved to: {base_path}")
        return {key: outputs[key] for key in result_keys}
    except Exception as e:
        print(f"Error during analysis: {e}")
        import traceback
        traceback.print_exc()
        return None

## HPO without Dropout Rate

In [None]:
# Hyperparameter Optimization Optuna - Clean Version

import torch
import gc
import optuna
import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from optuna.pruners import MedianPruner
from evaluate import load
import sacrebleu

# Best BLEU score seen across all trials so far. objective() declares this
# name `global` and raises it whenever a trial beats the current value;
# re-running this cell resets it to 0.0.
current_best_bleu = 0.0

def cleanup_memory(*objects):
    """Run the garbage collector, then release cached CUDA memory.

    Args:
        *objects: Objects the caller is finished with. They are accepted for
            call-site compatibility only: a function cannot unbind names in
            its caller's scope, so anything the caller still references stays
            alive. Callers that want the memory back must drop their own
            references (``del`` or rebinding) before calling this.
    """
    # Release this frame's references so they cannot keep the objects alive.
    del objects
    # Collect first: blocks freed by the collector only leave PyTorch's
    # caching allocator if empty_cache() runs afterwards.
    gc.collect()
    torch.cuda.empty_cache()

def compute_bleu_score_sacrebleu(references, predictions):
    """Return the corpus-level BLEU score computed by sacreBLEU.

    Args:
        references: List of reference strings.
        predictions: List of predicted strings.

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    # The whole reference list is passed as a single reference stream.
    reference_streams = [references]
    bleu_result = sacrebleu.corpus_bleu(predictions, reference_streams)
    return bleu_result.score

def compute_bleu_score_identical_to_baseline(references, predictions):
    """Compute BLEU (scaled to 0-100) with the Hugging Face ``evaluate`` metric.

    Args:
        references: One entry per prediction. Each entry may be a single
            reference string or an already-wrapped list of reference strings.
        predictions: List of predicted strings.

    Returns:
        ``result["bleu"] * 100``. If loading or computing the metric raises,
        the error is printed and the value of
        ``compute_bleu_score_manual(references, predictions)`` is returned
        instead.
    """
    try:
        # Load BLEU metric from Hugging Face evaluate
        bleu_metric = load("bleu")

        # Wrap bare strings only. Entries that are already lists (such as the
        # [[ref], ...] shape built in the training loop) must not be wrapped
        # a second time, or the metric receives triple-nested references.
        formatted_refs = [
            [ref] if isinstance(ref, str) else list(ref)
            for ref in references
        ]

        result = bleu_metric.compute(
            predictions=predictions,
            references=formatted_refs
        )

        return result["bleu"] * 100

    except Exception as e:
        # Deliberate best-effort: any failure falls back to the local scorer.
        print(f"Error computing BLEU with evaluate library: {e}")
        return compute_bleu_score_manual(references, predictions)

def compute_bleu_score_manual(references, predictions):
    """Average sentence-level BLEU-4 (0-100) over whitespace-tokenized pairs.

    Each prediction is scored against one reference using clipped n-gram
    precisions for n = 1..4, their geometric mean and a brevity penalty. A
    sentence scores 0 when the prediction is empty or any precision is zero.
    The per-sentence scores are then averaged, so this is not a corpus-level
    BLEU.

    Args:
        references: One entry per prediction; either a string or a list whose
            first element is the reference string.
        predictions: List of predicted strings.

    Returns:
        Mean sentence score times 100, or ``0.0`` when there are no pairs.
    """
    from collections import Counter
    import math

    max_order = 4

    def ngram_counts(tokens, order):
        return Counter(
            tuple(tokens[start:start + order])
            for start in range(len(tokens) - order + 1)
        )

    def sentence_score(reference, prediction):
        ref_tokens = reference.split()
        hyp_tokens = prediction.split()
        if not hyp_tokens:
            return 0.0

        log_precision_total = 0.0
        for order in range(1, max_order + 1):
            hyp_counts = ngram_counts(hyp_tokens, order)
            produced = sum(hyp_counts.values())
            # Counter intersection keeps min(count) per n-gram: clipped matches.
            matched = sum((hyp_counts & ngram_counts(ref_tokens, order)).values())
            # A zero precision at any order zeroes the geometric mean.
            if produced == 0 or matched == 0:
                return 0.0
            log_precision_total += math.log(matched / produced)

        brevity_penalty = min(1.0, math.exp(1 - len(ref_tokens) / len(hyp_tokens)))
        return brevity_penalty * math.exp(log_precision_total / max_order)

    sentence_scores = [
        sentence_score(entry[0] if isinstance(entry, list) else entry, hypothesis)
        for entry, hypothesis in zip(references, predictions)
    ]
    if not sentence_scores:
        return 0.0
    return np.mean(sentence_scores) * 100


def objective(trial):
    """Optuna objective: fine-tune ``t5-base`` with one hyperparameter set.

    Trains for up to 10 epochs. After every epoch it measures the validation
    loss and the corpus-level sacreBLEU of generated outputs on the validation
    set, and stops early once the validation loss has failed to improve for
    more than ``val_loss_patience`` epochs in a row.

    Depends on names defined elsewhere in the notebook:
    ``search_space_optimal``, ``train_dataset``, ``val_dataset``,
    ``target_max_len``, ``AdamW`` and, optionally, ``OPTUNA_PATH``.

    Side effects:
        * updates the global ``current_best_bleu``;
        * stores ``status``, ``final_train_loss``, ``final_val_loss``,
          ``epochs_completed`` and ``best_bleu_epoch`` as trial user attrs;
        * writes ``trial_<n>_results.csv`` (one row per epoch) to
          ``OPTUNA_PATH`` when that global exists.

    Args:
        trial: The Optuna trial supplying the hyperparameters.

    Returns:
        The negated best per-epoch BLEU of this trial (the study minimizes).
    """
    global current_best_bleu

    start_time = time.time()  # Start time of trial

    # Alternative fixed search space (three parameters), kept for reference:
    # learning_rate = trial.suggest_categorical('learning_rate', [2e-5, 3e-5, 4e-5, 5e-5])
    # batch_size = trial.suggest_categorical('batch_size', [4, 6, 8, 12])
    # weight_decay = trial.suggest_categorical('weight_decay', [1e-6, 1e-5, 5e-5, 1e-4])

    # Grid search: candidate values come from the sampler's search space.
    learning_rate = trial.suggest_categorical('learning_rate', search_space_optimal['learning_rate'])
    batch_size = trial.suggest_categorical('batch_size', search_space_optimal['batch_size'])
    weight_decay = trial.suggest_categorical('weight_decay', search_space_optimal['weight_decay'])

    # Log trial parameters
    print(f"\n=== Trial {trial.number} ===")
    print(f"LR: {learning_rate}, Batch: {batch_size}, WD: {weight_decay}")
    print(f"Current Best BLEU: {current_best_bleu:.4f}%")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model setup - standard T5 without dropout modification
    MODEL = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Optimizer setup
    OPTIMIZER = AdamW(MODEL.parameters(), lr=learning_rate, eps=1e-8, weight_decay=weight_decay)

    # Data loaders (train_dataset and val_dataset are notebook globals)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training tracking
    train_losses = []
    val_losses = []
    best_bleu_this_trial = 0.0
    best_val_loss = float('inf')
    trial_status = "Ongoing"
    trial_data = []

    # Early stopping on validation loss
    val_loss_patience_counter = 0
    val_loss_patience = 2

    # Training and validation loop
    for epoch in range(10):
        train_loss = 0
        val_loss = 0
        train_batch_count = 0
        val_batch_count = 0

        # TRAINING
        MODEL.train()
        for batch in tqdm(train_loader, desc=f'[Trial {trial.number}] Epoch {epoch+1} - Training'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            # Forward pass
            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )

            # Optimization
            OPTIMIZER.zero_grad()
            outputs.loss.backward()
            OPTIMIZER.step()

            train_loss += outputs.loss.item()
            train_batch_count += 1

        # VALIDATION
        MODEL.eval()
        for batch in tqdm(val_loader, desc=f'[Trial {trial.number}] Epoch {epoch+1} - Validation'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            with torch.no_grad():
                outputs = MODEL(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    decoder_attention_mask=decoder_attention_mask
                )

            val_loss += outputs.loss.item()
            val_batch_count += 1

        # Calculate losses
        avg_train_loss = train_loss / train_batch_count
        avg_val_loss = val_loss / val_batch_count
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # COMPUTE BLEU THIS EPOCH
        # Both lists hold plain strings, aligned index by index.
        val_refs_bleu, val_hyps_bleu = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['source_ids'].to(device)
                attention_mask = batch['source_mask'].to(device)

                preds = MODEL.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=target_max_len  # notebook global
                )
                val_hyps_bleu.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

                # Labels are only decoded, so they need no device transfer;
                # entries equal to -100 are dropped with one mask per row
                # instead of a per-element Python loop over the tensor.
                val_refs_bleu.extend(
                    tokenizer.decode(label[label != -100].tolist(), skip_special_tokens=True)
                    for label in batch['labels']
                )

        # Calculate BLEU using sacreBLEU
        epoch_bleu = compute_bleu_score_sacrebleu(val_refs_bleu, val_hyps_bleu)

        # Alternative: Use Hugging Face BLEU
        # epoch_bleu = compute_bleu_score_identical_to_baseline(val_refs_bleu, val_hyps_bleu)

        # Update BLEU tracking
        if epoch_bleu > best_bleu_this_trial:
            best_bleu_this_trial = epoch_bleu

        # Update global best BLEU
        if current_best_bleu < best_bleu_this_trial:
            current_best_bleu = best_bleu_this_trial

        print(f"[Trial {trial.number}] Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f} | BLEU = {epoch_bleu:.2f}% | Current Best BLEU = {current_best_bleu:.2f}%")

        # Store trial data for CSV logging
        trial_data.append({
            'Trial': trial.number,
            'Learning Rate': learning_rate,
            'Batch Size': batch_size,
            'Weight Decay': weight_decay,
            'Epoch': epoch + 1,
            'Train Loss': avg_train_loss,
            'Val Loss': avg_val_loss,
            'BLEU': epoch_bleu,
            'Status': trial_status,
            'GPU Memory (GB)': torch.cuda.max_memory_allocated()/1024**3 if torch.cuda.is_available() else 0
        })

        # PRUNING CHECK (disabled for grid search)
        # trial.report(-epoch_bleu, epoch)
        # if trial.should_prune():
        #     print(f"[Trial {trial.number}] PRUNED at epoch {epoch+1} based on BLEU")
        #     trial_status = "Pruned"
        #     trial.set_user_attr("status", trial_status)
        #     trial.set_user_attr("final_train_loss", f"{avg_train_loss:.6f}")
        #     trial.set_user_attr("final_val_loss", f"{avg_val_loss:.6f}")
        #     trial.set_user_attr("epochs_completed", epoch + 1)
        #     trial.set_user_attr("best_bleu_epoch", best_bleu_this_trial)

        #     end_time = time.time()
        #     duration = end_time - start_time
        #     formatted_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
        #     print(f"[Trial {trial.number}] Duration: {formatted_duration} ({duration:.2f} seconds)")

        #     cleanup_memory(MODEL, OPTIMIZER, train_loader, val_loader)
        #     raise optuna.exceptions.TrialPruned()

        # EARLY STOPPING based on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            val_loss_patience_counter = 0
        else:
            val_loss_patience_counter += 1
            print(f"Val Loss Patience Counter: {val_loss_patience_counter} / {val_loss_patience}")
            # NOTE(review): the strict ">" stops on the (patience + 1)-th
            # non-improving epoch in a row, although the message above reads
            # "counter / patience" - confirm whether ">=" was intended.
            if val_loss_patience_counter > val_loss_patience:
                print(f"[Trial {trial.number}] EARLY STOPPED at epoch {epoch+1}")
                trial_status = "Early Stopped"
                # User attrs, duration report and cleanup are all handled once
                # by the shared finalization below.
                break

    print(f"[Trial {trial.number}] Final BLEU-4: {best_bleu_this_trial:.4f}")
    print(f"[Trial {trial.number}] Final Train Loss: {train_losses[-1]:.6f}")
    print(f"[Trial {trial.number}] Final Val Loss: {val_losses[-1]:.6f}")

    # Store final trial results
    if trial_status == "Ongoing":
        trial_status = "Completed"

    trial.set_user_attr("status", trial_status)
    trial.set_user_attr("final_train_loss", f"{train_losses[-1]:.6f}")
    trial.set_user_attr("final_val_loss", f"{val_losses[-1]:.6f}")
    trial.set_user_attr("epochs_completed", len(train_losses))
    trial.set_user_attr("best_bleu_epoch", best_bleu_this_trial)

    # Rows are appended while the trial is still "Ongoing"; record the final
    # outcome on the last epoch's row so the CSV shows how the trial ended.
    trial_data[-1]['Status'] = trial_status

    # Save trial data to CSV when OPTUNA_PATH is defined
    if 'OPTUNA_PATH' in globals():
        trial_df = pd.DataFrame(trial_data)
        csv_file = os.path.join(OPTUNA_PATH, f'trial_{trial.number}_results.csv')
        trial_df.to_csv(csv_file, index=False)

    end_time = time.time()
    duration = end_time - start_time
    formatted_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
    print(f"[Trial {trial.number}] Duration: {formatted_duration} ({duration:.2f} seconds)")

    # Drop this frame's references first: while they are alive the model and
    # optimizer state cannot be reclaimed, so emptying the cache frees nothing.
    del MODEL, OPTIMIZER, train_loader, val_loader
    cleanup_memory()

    return -best_bleu_this_trial  # Negative for minimization


# MAIN EXECUTION
# Runs the grid-search study over search_space_optimal with objective(), then
# prints a ranking, per-parameter averages and a status breakdown, and saves
# the per-trial results to LOG_PATH when that global is defined.
from collections import defaultdict

print("="*60)
print("HPO STUDY STARTS NOW!!!")
print("="*60)

# Clear cached GPU memory before starting
torch.cuda.empty_cache()
gc.collect()

# Create Optuna study

# Grid search (brute force over every combination)
# search_space = {
#     "learning_rate": [2e-5, 3e-5, 4e-5, 5e-5],
#     "batch_size": [4, 6, 8, 12],
#     "weight_decay": [1e-6, 1e-5, 5e-5, 1e-4]
# }
# study = optuna.create_study(
#     direction="minimize",
#     sampler=optuna.samplers.GridSampler(search_space)
# )

# OPTIMAL NARROW GRIDSEARCH CONFIGURATION
# Total combinations: 4×3×3 = 36
# 100% Coverage
search_space_optimal = {
    "learning_rate": [2.8e-5, 3e-5, 3.2e-5, 3.5e-5],   # Fine-grain around best
    "batch_size": [6, 7, 8],                            # Only proven winners
    "weight_decay": [9e-5, 1e-4, 1.1e-4]               # Fine-tune around dominant
}

# One trial per grid point. Derived from the search space so the trial count
# stays correct when the candidate lists above are edited.
n_grid_points = int(np.prod([len(values) for values in search_space_optimal.values()]))

study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.GridSampler(search_space_optimal),
    study_name="QG_T5_GridSearch_Optimal"
)
study.optimize(objective, n_trials=n_grid_points)

# Study using the TPE sampler with a custom configuration
# study = optuna.create_study(
#     direction="minimize",
#     study_name="QG_T5_HPO_CLEAN",
#     sampler=optuna.samplers.TPESampler(
#         n_startup_trials=3,    # Reduced from default 10
#         n_ei_candidates=12     # Focus search efficiency
#     ),
#     pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# )

# Study over the 64-combination space using the TPE sampler with a custom configuration
# study = optuna.create_study(
#     direction="minimize",
#     study_name="QG_T5_TPE_64Search",
#     sampler=optuna.samplers.TPESampler(
#         # initial random trials before TPE starts building its probabilistic
#         # model of good and bad parameters
#         n_startup_trials=8,       # 12.5% of 64
#         # number of candidates considered when optimizing Expected Improvement (EI)
#         n_ei_candidates=16        # checks 25% of the combinations, fairly aggressive
#     ),
#     pruner=MedianPruner(n_startup_trials=8, n_warmup_steps=2)
# )
# study.optimize(objective, n_trials=32)  # ~50% coverage

# Study using the default TPE configuration (n_startup = 10, n_ei_cand = 24)
# study = optuna.create_study(
#     direction="minimize",
#     study_name="QG_T5_HPO_WITHOUT_DROPOUT",
#     pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2)
# )


# # Run optimization
# try:
#     study.optimize(objective, n_trials=20)
# except Exception as e:
#     print(f"Optuna optimization failed: {e}")
#     print("Continuing with available results...")

# RESULTS ANALYSIS
# study.best_trial raises ValueError when no trial has completed, even if
# study.trials is non-empty (for example when every trial failed), so probe it
# instead of relying on the trial list being non-empty.
try:
    best_trial = study.best_trial
except ValueError:
    best_trial = None

if best_trial is not None:
    best_bleu = -best_trial.value

    print("\n" + "="*60)
    print("HPO STUDY RESULTS")
    print("="*60)
    print(f"Best BLEU-4 Score: {best_bleu:.4f}")

    print(f"\nBest Hyperparameters:")
    for param in best_trial.params:
        print(f"• {param.replace('_', ' ').title()}: {best_trial.params[param]}")

    # Analyze all trials that produced a finite objective value
    successful_trials = [t for t in study.trials if t.value is not None and t.value != float('inf')]
    print(f"\nAll Trial Results:")
    print("Rank | BLEU-4  | LR      | BS | WD      | Status")
    print("-" * 50)

    # Sort by BLEU score (objective values are negated BLEU)
    trial_results = [(t, -t.value) for t in successful_trials]
    trial_results.sort(key=lambda x: x[1], reverse=True)

    for i, (trial, bleu) in enumerate(trial_results[:10], 1):  # Top 10
        lr = trial.params['learning_rate']
        bs = trial.params['batch_size']
        wd = trial.params['weight_decay']
        status = trial.user_attrs.get('status', 'Unknown')[:8]
        # One mantissa decimal is needed: with ".0e" the grid values 2.8e-5,
        # 3e-5 and 3.2e-5 would all be printed as "3e-05".
        print(f"{i:4d} | {bleu:6.2f}% | {lr:.1e} | {bs:2d} | {wd:.1e} | {status}")

    # Parameter effectiveness analysis
    print(f"\nParameter Effectiveness Analysis:")

    # Group BLEU scores by parameter value: param_stats[param][value] -> [bleu, ...]
    param_stats = defaultdict(lambda: defaultdict(list))

    for trial, bleu in trial_results:
        for param in ['learning_rate', 'batch_size', 'weight_decay']:
            value = trial.params[param]
            param_stats[param][value].append(bleu)

    print("\nAverage BLEU-4 by parameter value:")

    for param in ['learning_rate', 'batch_size', 'weight_decay']:
        print(f"\n{param.replace('_', ' ').title()}:")
        for val in sorted(param_stats[param].keys()):
            avg_bleu = np.mean(param_stats[param][val])
            n_trials = len(param_stats[param][val])
            if isinstance(val, float):
                print(f"  {val:.2e}: {avg_bleu:.2f}% ({n_trials} trials)")
            else:
                print(f"  {val:>7}: {avg_bleu:.2f}% ({n_trials} trials)")

    # Save detailed results
    results_data = []
    for trial, bleu in trial_results:
        results_data.append({
            'Trial': trial.number,
            'BLEU_4': bleu,
            'Learning_Rate': trial.params['learning_rate'],
            'Batch_Size': trial.params['batch_size'],
            'Weight_Decay': trial.params['weight_decay'],
            'Status': trial.user_attrs.get('status', 'Unknown'),
            'Final_Train_Loss': trial.user_attrs.get('final_train_loss', 0),
            'Final_Val_Loss': trial.user_attrs.get('final_val_loss', 0),
            'Epochs_Completed': trial.user_attrs.get('epochs_completed', 0)
        })

    results_df = pd.DataFrame(results_data)

    # Save results when LOG_PATH is defined
    if 'LOG_PATH' in globals():
        csv_path = os.path.join(LOG_PATH, f'hpo_study_postAnalysis_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv')
        results_df.to_csv(csv_path, index=False)
        print(f"\nDetailed results saved to: {csv_path}")

    print(f"\nSUMMARY:")
    print(f"• {len(successful_trials)}/{len(study.trials)} trials successful")
    print(f"• Best BLEU-4: {best_bleu:.4f}%")
    print(f"• Best hyperparameters:")
    for param, value in best_trial.params.items():
        print(f"  - {param}: {value}")

    # Trial status breakdown
    status_counts = defaultdict(int)
    for trial in study.trials:
        status = trial.user_attrs.get('status', 'Unknown')
        status_counts[status] += 1

    print(f"\nTrial Status Breakdown:")
    for status, count in status_counts.items():
        print(f"• {status}: {count} trials")

else:
    print("No trials completed successfully.")

print("\n" + "="*60)
print("HPO STUDY COMPLETED")
print("="*60)

# Final cleanup
torch.cuda.empty_cache()
gc.collect()

## HPO with Dropout Rate

In [None]:
# Hyperparameter Optimization with Optuna

import torch
import gc
import time
import optuna
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from optuna.pruners import MedianPruner
from evaluate import load
import sacrebleu
from IPython.display import display, Javascript

def auto_scroll_output():
    """Emit a JavaScript snippet that scrolls the last output area to its end.

    The script selects every element with the ``output_scroll`` class and, if
    any exist, sets the last one's ``scrollTop`` to its ``scrollHeight``.
    """
    scroll_script = '''
        var outCells = document.querySelectorAll('.output_scroll');
        if (outCells.length > 0) {
            var lastOut = outCells[outCells.length - 1];
            lastOut.scrollTop = lastOut.scrollHeight;
        }
    '''
    snippet = Javascript(scroll_script)
    display(snippet)

# Best BLEU score seen across all trials so far. objective() declares this
# name `global` and prints it at the start of each trial; re-running this
# cell resets it to 0.0.
current_best_bleu = 0.0

def cleanup_memory(*objects):
    """Run the garbage collector, then release cached CUDA memory.

    Args:
        *objects: Objects the caller is finished with. They are accepted for
            call-site compatibility only: a function cannot unbind names in
            its caller's scope, so anything the caller still references stays
            alive. Callers that want the memory back must drop their own
            references (``del`` or rebinding) before calling this.
    """
    # Release this frame's references so they cannot keep the objects alive.
    del objects
    # Collect first: blocks freed by the collector only leave PyTorch's
    # caching allocator if empty_cache() runs afterwards.
    gc.collect()
    torch.cuda.empty_cache()

def compute_bleu_score_sacrebleu(references, predictions):
    """Return the corpus-level BLEU score computed by sacreBLEU.

    Args:
        references: List of reference strings.
        predictions: List of predicted strings.

    Returns:
        The ``score`` attribute of the sacreBLEU result.
    """
    # The whole reference list is passed as a single reference stream.
    single_stream = [references]
    corpus_result = sacrebleu.corpus_bleu(predictions, single_stream)
    return corpus_result.score

def compute_bleu_score_identical_to_baseline(references, predictions):
    """Compute BLEU (scaled to 0-100) with the Hugging Face ``evaluate`` metric.

    Args:
        references: One entry per prediction. Each entry may be a single
            reference string or an already-wrapped list of reference strings.
        predictions: List of predicted strings.

    Returns:
        ``result["bleu"] * 100``. If loading or computing the metric raises,
        the error is printed and the value of
        ``compute_bleu_score_manual(references, predictions)`` is returned
        instead.
    """
    try:
        # Load BLEU metric from Hugging Face evaluate
        bleu_metric = load("bleu")

        # Wrap bare strings only. Entries that are already lists must not be
        # wrapped a second time, or the metric receives triple-nested
        # references.
        formatted_refs = [
            [ref] if isinstance(ref, str) else list(ref)
            for ref in references
        ]

        result = bleu_metric.compute(
            predictions=predictions,
            references=formatted_refs
        )

        return result["bleu"] * 100

    except Exception as e:
        # Deliberate best-effort: any failure falls back to the local scorer.
        print(f"Error computing BLEU with evaluate library: {e}")
        return compute_bleu_score_manual(references, predictions)

def compute_bleu_score_manual(references, predictions):
    """Average sentence-level BLEU-4 (0-100) over whitespace-tokenized pairs.

    Each prediction is scored against one reference using clipped n-gram
    precisions for n = 1..4, their geometric mean and a brevity penalty. A
    sentence scores 0 when the prediction is empty or any precision is zero.
    The per-sentence scores are then averaged, so this is not a corpus-level
    BLEU.

    Args:
        references: One entry per prediction; either a string or a list whose
            first element is the reference string.
        predictions: List of predicted strings.

    Returns:
        Mean sentence score times 100, or ``0.0`` when there are no pairs.
    """
    from collections import Counter
    import math

    orders = (1, 2, 3, 4)

    def count_ngrams(words, size):
        windows = (tuple(words[start:start + size]) for start in range(len(words) - size + 1))
        return Counter(windows)

    def score_pair(reference, prediction):
        ref_words = reference.split()
        hyp_words = prediction.split()
        if not hyp_words:
            return 0.0

        log_sum = 0.0
        for size in orders:
            hyp_counts = count_ngrams(hyp_words, size)
            produced = sum(hyp_counts.values())
            # Intersection keeps the smaller count per n-gram (clipping).
            clipped = sum((hyp_counts & count_ngrams(ref_words, size)).values())
            # Any zero precision makes the geometric mean zero.
            if not produced or not clipped:
                return 0.0
            log_sum += math.log(clipped / produced)

        length_penalty = min(1.0, math.exp(1 - len(ref_words) / len(hyp_words)))
        return length_penalty * math.exp(log_sum / len(orders))

    pair_scores = []
    for ref_entry, hypothesis in zip(references, predictions):
        reference = ref_entry[0] if isinstance(ref_entry, list) else ref_entry
        pair_scores.append(score_pair(reference, hypothesis))

    if not pair_scores:
        return 0.0
    return np.mean(pair_scores) * 100


def objective(trial):
    """Optuna objective: fine-tune T5-base for one hyperparameter combination.

    Samples learning rate, batch size, weight decay and dropout rate from the
    module-level ``search_space_conservative``, trains for up to 20 epochs on
    ``train_dataset`` and, after every epoch, computes the validation loss and
    a BLEU score (via ``compute_bleu_score_sacrebleu``) on ``val_dataset``.
    Training stops early once the validation loss has failed to improve for
    more than ``val_loss_patience`` epochs.

    Side effects:
        - updates the module-level ``current_best_bleu``;
        - stores status, final losses, epoch count and best BLEU as trial
          user attributes;
        - writes the per-epoch records to
          ``OPTUNA_PATH/trial_<number>_results.csv``;
        - calls ``cleanup_memory`` on the model, optimizer and loaders.

    Args:
        trial: The ``optuna`` trial supplying the hyperparameters.

    Returns:
        The negated best epoch BLEU of this trial (the study minimizes).
    """
    global current_best_bleu

    start_time = time.time()  # Start time of trial

    # Alternative search space kept for reference: HPO with dropout rate (TPE)
    # learning_rate = trial.suggest_categorical('learning_rate', [3e-5, 4e-5, 5e-5])
    # batch_size = trial.suggest_categorical('batch_size', [6, 8, 12])
    # weight_decay = trial.suggest_categorical('weight_decay', [1e-5, 5e-5, 1e-4])
    # dropout_rate = trial.suggest_categorical('dropout_rate', [0.1, 0.15, 0.2])

    # Active search space: HPO with dropout rate (grid search)
    learning_rate = trial.suggest_categorical('learning_rate', search_space_conservative['learning_rate'])
    batch_size = trial.suggest_categorical('batch_size', search_space_conservative['batch_size'])
    weight_decay = trial.suggest_categorical('weight_decay', search_space_conservative['weight_decay'])
    dropout_rate = trial.suggest_categorical('dropout_rate', search_space_conservative['dropout_rate'])

    # Alternative continuous search space kept for reference:
    # 1. DROPOUT RATE - narrowed to the optimal zone
    # dropout_rate = trial.suggest_float('dropout_rate', 0.10, 0.15)
    # 2. LEARNING RATE - high-precision search in the sweet spot
    # learning_rate = trial.suggest_float('learning_rate', 1.25e-05, 1.75e-05)
    # 3. BATCH SIZE - focus on the values that proved to work well
    # batch_size = trial.suggest_categorical('batch_size', [4, 6])
    # 4. WEIGHT DECAY - focus on the low range
    # weight_decay = trial.suggest_float('weight_decay', 5e-07, 1e-05, log=True)

    # Log trial parameters
    print(f"\n=== Trial {trial.number} ===")
    print(f"LR: {learning_rate}, Batch: {batch_size}, WD: {weight_decay}, Dropout: {dropout_rate}")
    print(f"Current Best BLEU: {current_best_bleu:.4f}%")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Fresh pretrained model per trial; only the dropout rate is overridden
    config = T5Config.from_pretrained("t5-base")
    config.dropout_rate = dropout_rate
    MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", config=config).to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Only LR and weight decay vary between trials
    OPTIMIZER = AdamW(MODEL.parameters(), lr=learning_rate, eps=1e-8, weight_decay=weight_decay)

    # Only the batch size varies between trials
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training tracking
    ongoing_status = "On going"
    train_losses = []
    val_losses = []
    best_bleu_this_trial = 0.0
    best_val_loss = float('inf')
    trial_status = ongoing_status  # Default until the trial finishes or stops early
    trial_data = []

    # Early stopping on validation loss
    # NOTE(review): the check below uses `>`, so training stops on the
    # (val_loss_patience + 1)-th consecutive non-improving epoch — confirm
    # whether `>=` was intended.
    val_loss_patience_counter = 0
    val_loss_patience = 3

    # Training and validation loop
    for epoch in range(20):  # raised from 10 to 20 because dropout is used
        train_loss = 0
        val_loss = 0
        train_batch_count = 0
        val_batch_count = 0

        # TRAINING
        MODEL.train()
        for batch in tqdm(train_loader, desc=f'[Trial {trial.number}] Epoch {epoch+1} - Training'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            # Forward pass
            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )

            # Optimization
            OPTIMIZER.zero_grad()
            outputs.loss.backward()
            OPTIMIZER.step()

            train_loss += outputs.loss.item()
            train_batch_count += 1

        # VALIDATION
        MODEL.eval()
        for batch in tqdm(val_loader, desc=f'[Trial {trial.number}] Epoch {epoch+1} - Validation'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            with torch.no_grad():
                outputs = MODEL(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    decoder_attention_mask=decoder_attention_mask
                )

            val_loss += outputs.loss.item()
            val_batch_count += 1

        # Calculate losses
        avg_train_loss = train_loss / train_batch_count
        avg_val_loss = val_loss / val_batch_count
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        # COMPUTE BLEU THIS EPOCH (model is still in eval mode)
        val_refs_bleu, val_hyps_bleu = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['source_ids'].to(device)
                attention_mask = batch['source_mask'].to(device)
                labels = batch['labels'].to(device)

                preds = MODEL.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=target_max_len
                )

                decoded_preds = [tokenizer.decode(g, skip_special_tokens=True) for g in preds]
                # -100 entries are dropped from the labels before decoding
                decoded_labels = [tokenizer.decode([t for t in label if t != -100], skip_special_tokens=True) for label in labels]

                val_hyps_bleu.extend(decoded_preds)
                val_refs_bleu.extend([[ref] for ref in decoded_labels])

        # sacreBLEU on one flat reference string per hypothesis
        flattened_refs = [ref[0] if isinstance(ref, list) else ref for ref in val_refs_bleu]
        epoch_bleu = compute_bleu_score_sacrebleu(flattened_refs, val_hyps_bleu)

        # Alternative: Hugging Face / NLTK BLEU
        # epoch_bleu = compute_bleu_score_identical_to_baseline(val_refs_bleu, val_hyps_bleu)

        # Update BLEU tracking
        if epoch_bleu > best_bleu_this_trial:
            best_bleu_this_trial = epoch_bleu

        if current_best_bleu < best_bleu_this_trial:
            current_best_bleu = best_bleu_this_trial

        print(f"[Trial {trial.number}] Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f} | Val Loss = {avg_val_loss:.4f} | BLEU = {epoch_bleu:.2f}% | Current Best BLEU = {current_best_bleu:.2f}%")

        auto_scroll_output()  # auto-scroll the notebook output cell

        # Record this epoch for the per-trial CSV
        trial_data.append({
            'Trial': trial.number,
            'Learning Rate': learning_rate,
            'Batch Size': batch_size,
            'Weight Decay': weight_decay,
            'Dropout Rate': dropout_rate,
            'Epoch': epoch + 1,
            'Train Loss': avg_train_loss,
            'Val Loss': avg_val_loss,
            'BLEU': epoch_bleu,
            'Status': trial_status,
            'GPU Memory (GB)': torch.cuda.max_memory_allocated()/1024**3
        })

        # PRUNING (disabled)
        # trial.report(-epoch_bleu, epoch)
        # if trial.should_prune():
        #     print(f"[Trial {trial.number}] PRUNED at epoch {epoch+1} based on BLEU")
        #     trial_status = "Pruned"
        #     trial.set_user_attr("status", trial_status)
        #     trial.set_user_attr("final_train_loss", f"{avg_train_loss:.6f}")
        #     trial.set_user_attr("final_val_loss", f"{avg_val_loss:.6f}")
        #     trial.set_user_attr("epochs_completed", epoch + 1)
        #     trial.set_user_attr("best_bleu_epoch", best_bleu_this_trial)

        #     end_time = time.time()
        #     duration = end_time - start_time
        #     formatted_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
        #     print(f"[Trial {trial.number}] Duration: {formatted_duration} ({duration:.2f} seconds)")

        #     cleanup_memory(MODEL, OPTIMIZER, train_loader, val_loader)
        #     raise optuna.exceptions.TrialPruned()

        # EARLY STOPPING
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            val_loss_patience_counter = 0
        else:
            val_loss_patience_counter += 1
            print(f"Val Loss Patience Counter: {val_loss_patience_counter} / {val_loss_patience}")
            if val_loss_patience_counter > val_loss_patience:
                print(f"[Trial {trial.number}] EARLY STOPPED at epoch {epoch+1}")
                # User attributes and cleanup are handled once, after the
                # loop, for both the early-stopped and the full-length path.
                trial_status = "Early Stopped"
                break

    print(f"[Trial {trial.number}] Final BLEU-4: {best_bleu_this_trial:.4f}")
    print(f"[Trial {trial.number}] Final Train Loss: {train_losses[-1]:.6f}")
    print(f"[Trial {trial.number}] Final Val Loss: {val_losses[-1]:.6f}")

    # A trial that was never stopped early ran all epochs
    if trial_status == ongoing_status:
        trial_status = "Completed"

    # Store results for analysis
    trial.set_user_attr("status", trial_status)
    trial.set_user_attr("final_train_loss", f"{train_losses[-1]:.6f}")
    trial.set_user_attr("final_val_loss", f"{val_losses[-1]:.6f}")
    trial.set_user_attr("epochs_completed", len(train_losses))
    trial.set_user_attr("best_bleu_epoch", best_bleu_this_trial)

    # Save trial data to CSV (append without header if the file already exists)
    trial_df = pd.DataFrame(trial_data)
    csv_file = os.path.join(OPTUNA_PATH, f'trial_{trial.number}_results.csv')
    if os.path.exists(csv_file):
        trial_df.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        trial_df.to_csv(csv_file, index=False)

    end_time = time.time()
    duration = end_time - start_time
    formatted_duration = time.strftime("%H:%M:%S", time.gmtime(duration))
    print(f"[Trial {trial.number}] Duration: {formatted_duration} ({duration:.2f} seconds)")

    cleanup_memory(MODEL, OPTIMIZER, train_loader, val_loader)

    return -best_bleu_this_trial  # Negative for minimization

# MAIN EXECUTION
print("="*60)
print("HPO STUDY starts now!!!")
print("="*60)

# Release cached GPU memory and collect garbage before the study starts
torch.cuda.empty_cache()
gc.collect()

# TPE (alternative study configuration, kept for reference)
# study = optuna.create_study(
#     direction="minimize",
#     study_name="QG_T5_TPE_WithDropout",
#     sampler=optuna.samplers.TPESampler(
#         # initial random trials before TPE starts building its probabilistic model of good and bad parameters
#         n_startup_trials=12,      # 15% of 81 combinations
#         # number of candidates considered in the Expected Improvement (EI) optimization
#         n_ei_candidates=20        # ~25% of 81 combinations, balanced exploration
#     ),
#     pruner=MedianPruner(n_startup_trials=10, n_warmup_steps=5)
# )

# Conservative grid search around the proven high-performance zone
search_space_conservative = {
    "learning_rate": [4.5e-5, 5e-5, 5.5e-5],        # Tight around LR=5e-05 winner
    "batch_size": [6, 7, 8],                         # Focus on proven performers
    "weight_decay": [1e-5, 2e-5],                    # Lower WD preferred with dropout
    "dropout_rate": [0.10, 0.125, 0.15]             # Optimal dropout range only
}

# GridSearch Study Configuration
study = optuna.create_study(
    direction="minimize",
    study_name="QG_T5_GridSearch_DropoutOptimal",
    sampler=optuna.samplers.GridSampler(search_space_conservative),
    pruner=optuna.pruners.NopPruner()               # No pruning for an exhaustive search
)

# Derive the trial budget from the grid itself so it cannot drift out of sync
# when the search space is edited (currently 3×3×2×3 = 54 combinations).
n_grid_trials = 1
for candidate_values in search_space_conservative.values():
    n_grid_trials *= len(candidate_values)

study.optimize(objective, n_trials=n_grid_trials)

# try:
#     study.optimize(objective, n_trials=48)  # ~60% coverage
# except Exception as e:
#     print(f"Optuna optimization failed: {e}")
#     print("Continuing with available results...")

# RESULTS ANALYSIS
# study.best_trial raises ValueError when no trial has completed, so a
# non-empty study.trials list is not a sufficient guard.
has_completed_trial = any(
    t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
)

if has_completed_trial:
    best_trial = study.best_trial
    best_bleu = -best_trial.value  # objective is the negated BLEU

    print("\n" + "="*60)
    print("HPO STUDY RESULTS")
    print("="*60)
    print(f"Best BLEU-4 Score: {best_bleu:.4f}")

    print(f"\nBest Hyperparameters:")
    for param in best_trial.params:
        print(f"• {param.replace('_', ' ').title()}: {best_trial.params[param]}")

    # Analyze all trials that produced a finite objective value
    successful_trials = [t for t in study.trials if t.value is not None and t.value != float('inf')]
    print(f"\nAll Trial Results:")
    print(f"{'Rank':>4} | {'BLEU-4':<7} | {'LR':<7} | BS | {'WD':<7} | DO")
    print("-" * 55)

    # Sort by BLEU score, best first
    trial_results = [(t, -t.value) for t in successful_trials]
    trial_results.sort(key=lambda x: x[1], reverse=True)

    for i, (trial, bleu) in enumerate(trial_results[:10], 1):  # Top 10
        lr = trial.params['learning_rate']
        bs = trial.params['batch_size']
        wd = trial.params['weight_decay']
        do = trial.params['dropout_rate']
        # One mantissa decimal / three decimals keep grid values such as
        # 4.5e-05, 5.5e-05 and 0.125 distinguishable in the table.
        print(f"{i:4d} | {bleu:6.2f}% | {lr:.1e} | {bs:2d} | {wd:.1e} | {do:.3f}")

    # Parameter effectiveness analysis
    print(f"\nParameter Effectiveness Analysis:")

    # Group BLEU scores by parameter value
    from collections import defaultdict

    param_stats = defaultdict(lambda: defaultdict(list))

    for trial, bleu in trial_results:
        for param in ['learning_rate', 'batch_size', 'weight_decay', 'dropout_rate']:
            value = trial.params[param]
            param_stats[param][value].append(bleu)

    print("\nAverage BLEU-4 by parameter value:")

    for param in ['learning_rate', 'batch_size', 'weight_decay', 'dropout_rate']:
        print(f"\n{param.replace('_', ' ').title()}:")
        for val in sorted(param_stats[param].keys()):
            avg_bleu = np.mean(param_stats[param][val])
            n_trials = len(param_stats[param][val])
            if isinstance(val, float):
                print(f"  {val:.2e}: {avg_bleu:.2f}% ({n_trials} trials)")
            else:
                print(f"  {val:>7}: {avg_bleu:.2f}% ({n_trials} trials)")

    # Save results
    results_data = []
    for trial, bleu in trial_results:
        results_data.append({
            'Trial': trial.number,
            'BLEU_4': bleu,
            'Learning_Rate': trial.params['learning_rate'],
            'Batch_Size': trial.params['batch_size'],
            'Weight_Decay': trial.params['weight_decay'],
            'Dropout_Rate': trial.params['dropout_rate'],
            'Status': trial.user_attrs.get('status', 'Unknown'),
            'Final_Train_Loss': trial.user_attrs.get('final_train_loss', 0),
            'Final_Val_Loss': trial.user_attrs.get('final_val_loss', 0),
            'Epochs_Completed': trial.user_attrs.get('epochs_completed', 0)
        })

    results_df = pd.DataFrame(results_data)
    csv_path = os.path.join(LOG_PATH, f'hpo_study_postAnalysis_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv')
    results_df.to_csv(csv_path, index=False)

    print(f"\nDetailed results saved to: {csv_path}")

    print(f"\nSUMMARY:")
    print(f"• {len(successful_trials)}/{len(study.trials)} trials successful")

else:
    print("No trials completed successfully.")

print("\n" + "="*60)
print("HPO STUDY COMPLETED")
print("="*60)

torch.cuda.empty_cache()
gc.collect()

In [None]:
# RESULTS ANALYSIS (standalone re-run of the report for an existing `study`)
# study.best_trial raises ValueError when no trial has completed, so a
# non-empty study.trials list is not a sufficient guard.
has_completed_trial = any(
    t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
)

if has_completed_trial:
    best_trial = study.best_trial
    best_bleu = -best_trial.value  # objective is the negated BLEU

    print("\n" + "="*60)
    print("HPO STUDY RESULTS")
    print("="*60)
    print(f"Best BLEU-4 Score: {best_bleu:.4f}")

    print(f"\nBest Hyperparameters:")
    for param in best_trial.params:
        print(f"• {param.replace('_', ' ').title()}: {best_trial.params[param]}")

    # Analyze all trials that produced a finite objective value
    successful_trials = [t for t in study.trials if t.value is not None and t.value != float('inf')]
    print(f"\nAll Trial Results:")
    print(f"{'Rank':>4} | {'BLEU-4':<7} | {'LR':<7} | BS | {'WD':<7} | DO")
    print("-" * 55)

    # Sort by BLEU score, best first
    trial_results = [(t, -t.value) for t in successful_trials]
    trial_results.sort(key=lambda x: x[1], reverse=True)

    for i, (trial, bleu) in enumerate(trial_results[:10], 1):  # Top 10
        lr = trial.params['learning_rate']
        bs = trial.params['batch_size']
        wd = trial.params['weight_decay']
        do = trial.params['dropout_rate']
        # One mantissa decimal / three decimals keep grid values such as
        # 4.5e-05, 5.5e-05 and 0.125 distinguishable in the table.
        print(f"{i:4d} | {bleu:6.2f}% | {lr:.1e} | {bs:2d} | {wd:.1e} | {do:.3f}")

    # Parameter effectiveness analysis
    print(f"\nParameter Effectiveness Analysis:")

    # Group BLEU scores by parameter value
    from collections import defaultdict

    param_stats = defaultdict(lambda: defaultdict(list))

    for trial, bleu in trial_results:
        for param in ['learning_rate', 'batch_size', 'weight_decay', 'dropout_rate']:
            value = trial.params[param]
            param_stats[param][value].append(bleu)

    print("\nAverage BLEU-4 by parameter value:")

    for param in ['learning_rate', 'batch_size', 'weight_decay', 'dropout_rate']:
        print(f"\n{param.replace('_', ' ').title()}:")
        for val in sorted(param_stats[param].keys()):
            avg_bleu = np.mean(param_stats[param][val])
            n_trials = len(param_stats[param][val])
            if isinstance(val, float):
                print(f"  {val:.2e}: {avg_bleu:.2f}% ({n_trials} trials)")
            else:
                print(f"  {val:>7}: {avg_bleu:.2f}% ({n_trials} trials)")

    # Save results
    results_data = []
    for trial, bleu in trial_results:
        results_data.append({
            'Trial': trial.number,
            'BLEU_4': bleu,
            'Learning_Rate': trial.params['learning_rate'],
            'Batch_Size': trial.params['batch_size'],
            'Weight_Decay': trial.params['weight_decay'],
            'Dropout_Rate': trial.params['dropout_rate'],
            'Status': trial.user_attrs.get('status', 'Unknown'),
            'Final_Train_Loss': trial.user_attrs.get('final_train_loss', 0),
            'Final_Val_Loss': trial.user_attrs.get('final_val_loss', 0),
            'Epochs_Completed': trial.user_attrs.get('epochs_completed', 0)
        })

    results_df = pd.DataFrame(results_data)
    csv_path = os.path.join(LOG_PATH, f'hpo_study_postAnalysis_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv')
    results_df.to_csv(csv_path, index=False)

    print(f"\nDetailed results saved to: {csv_path}")

    print(f"\nSUMMARY:")
    print(f"• {len(successful_trials)}/{len(study.trials)} trials successful")

else:
    print("No trials completed successfully.")

print("\n" + "="*60)
print("HPO STUDY COMPLETED")
print("="*60)

torch.cuda.empty_cache()
gc.collect()

## Run HPO Data Analysis

In [None]:
# Driver cell: run the full post-HPO analysis for `study`, falling back to
# the individual export/report steps when the combined run raises.
print(f"\n{'=' * 60}")
print("RUNNING POST-HPO ANALYSIS")
print("=" * 60)

try:
    # Default the log directory to the current directory when it is undefined
    globals().setdefault('LOG_PATH', './')

    print(f"\nLOG_PATH = {LOG_PATH}\n")

    # Combined analysis in a single call
    print("Starting comprehensive HPO analysis...")
    analysis_results = run_complete_hpo_analysis(study, LOG_PATH)

    if not analysis_results:
        print("✗ HPO Analysis failed!")
    else:
        print("✓ HPO Analysis completed successfully!")
        print(f"✓ Files saved to: {LOG_PATH}")

except Exception as analysis_error:
    print(f"Error running HPO analysis: {analysis_error}")
    import traceback
    traceback.print_exc()

    # Best-effort fallback: run the three steps one after another; the first
    # failure aborts the remaining steps.
    print("\nTrying individual analysis functions...")
    try:
        print("1. Exporting basic trials...")
        trials_df = export_trials_to_csv(study)
        print("✓ Basic trials exported")

        print("2. Creating detailed analysis...")
        detailed_df = create_detailed_tpe_analysis(study)
        print("✓ Detailed analysis created")

        print("3. Generating report...")
        report_file = generate_optimization_report(study)
        print("✓ Report generated")

    except Exception as fallback_error:
        print(f"Individual functions also failed: {fallback_error}")

# Final cleanup of cached GPU memory and unreachable objects
print("\nCleaning up...")
torch.cuda.empty_cache()
gc.collect()
print("✓ Cleanup completed")

print(f"\n{'=' * 60}")
print("HPO PROCESS FULLY COMPLETED")
print("=" * 60)

## HPO Data Visualization

### Data Tables for Each Visualization

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# UTILITY FUNCTIONS - CSV FILE LOADERS

def load_latest_csv(base_path, pattern):
    """Load the most recently modified CSV file named ``<pattern>_*.csv``.

    Args:
        base_path: Directory to search (not recursive).
        pattern: File-name prefix; files must match ``<pattern>_*.csv``.

    Returns:
        The file contents as a ``pandas.DataFrame``, or ``None`` (after
        printing a message) when no file matches.
    """
    # Escape the directory so glob metacharacters in it are taken literally
    search_dir = glob.escape(os.fspath(base_path))
    files = glob.glob(os.path.join(search_dir, f"{pattern}_*.csv"))
    if not files:
        print(f"No files found matching pattern: {pattern}")
        return None

    # Pick by modification time: on Unix, getctime is the last metadata
    # change, which is not a reliable indicator of the newest content.
    latest_file = max(files, key=os.path.getmtime)
    print(f"Loading: {latest_file}")
    return pd.read_csv(latest_file)

def load_all_analysis_data(base_path):
    """Load the latest file of every analysis CSV family from ``base_path``.

    Returns a dict keyed by ``trials``, ``detailed_tpe``,
    ``param_importance``, ``tpe_distributions`` and ``performance_summary``;
    each value is whatever ``load_latest_csv`` returns for that family.
    """
    # (result key, file-name prefix) pairs, loaded in this order
    csv_families = (
        ('trials', 'optuna_trials'),                        # basic trials data
        ('detailed_tpe', 'detailed_tpe_analysis'),          # detailed TPE analysis
        ('param_importance', 'parameter_importance'),       # parameter importance
        ('tpe_distributions', 'tpe_distributions_analysis'),  # TPE distributions
        ('performance_summary', 'performance_summary'),     # performance summary
    )
    return {
        key: load_latest_csv(base_path, prefix)
        for key, prefix in csv_families
    }

# 1. OPTIMIZATION HISTORY TABLE

def create_optimization_history_table(data):
    """Create table data for optimization history visualization.

    Uses ``data['detailed_tpe']`` when present, otherwise ``data['trials']``.
    Columns missing from the chosen frame are filled with defaults or derived
    from ``value`` (BLEU is the negated objective value).

    Args:
        data: Dict of DataFrames (or ``None`` entries) keyed by analysis name.

    Returns:
        A DataFrame with one row per trial, ordered by trial number, or
        ``None`` when neither input frame is available.
    """
    if data.get('detailed_tpe') is not None:
        df = data['detailed_tpe'].copy()
    elif data.get('trials') is not None:
        df = data['trials'].copy()
    else:
        print("No trials data available")
        return None

    # Extract relevant columns
    history_table = pd.DataFrame({
        'Trial_Number': df['trial_number'] if 'trial_number' in df.columns else df.get('number', range(len(df))),
        'BLEU_Score': df['bleu_score'] if 'bleu_score' in df.columns else -df['value'],
        'Objective_Value': df['objective_value'] if 'objective_value' in df.columns else df['value'],
        'Best_BLEU_So_Far': df['best_bleu_so_far'] if 'best_bleu_so_far' in df.columns else None,
        'Is_Improvement': df['is_improvement'] if 'is_improvement' in df.columns else False,
        'Trial_State': df['state'] if 'state' in df.columns else 'COMPLETE',
        'Duration_Seconds': df['duration_seconds'] if 'duration_seconds' in df.columns else None,
        'Datetime_Complete': df['datetime_complete'] if 'datetime_complete' in df.columns else None
    })

    # Sort by trial number first: every derived column below (running best,
    # difference to the previous trial, trials since best) depends on row order.
    history_table = history_table.sort_values('Trial_Number', kind='stable').reset_index(drop=True)

    # Calculate cumulative best if not available
    if history_table['Best_BLEU_So_Far'].isna().all():
        history_table['Best_BLEU_So_Far'] = history_table['BLEU_Score'].cummax()
        history_table['Is_Improvement'] = history_table['BLEU_Score'] == history_table['Best_BLEU_So_Far']

    # Add improvement indicators
    history_table['Improvement_From_Previous'] = history_table['BLEU_Score'].diff()
    # Rows are grouped by the running count of improvements; the position
    # inside each group is the number of trials since the last improvement.
    history_table['Trials_Since_Best'] = history_table.groupby((history_table['Is_Improvement']).cumsum()).cumcount()

    return history_table

# 2. PARAMETER IMPORTANCE TABLE

def create_parameter_importance_table(data):
    """Create table data for parameter importance visualization.

    Args:
        data: Dict whose ``'param_importance'`` entry is a DataFrame with an
            ``importance`` column, or ``None``.

    Returns:
        A copy of the importance table ordered from most to least important,
        extended with percentage share, cumulative share, rank and a
        Low/Medium/High/Critical category; ``None`` when no data is available.
    """
    if data['param_importance'] is None:
        print("No parameter importance data available")
        return None

    # Rank and cumulative share are positional, so order by importance first
    # (stable sort: ties keep their original relative order).
    importance_table = (
        data['param_importance']
        .copy()
        .sort_values('importance', ascending=False, kind='stable')
        .reset_index(drop=True)
    )

    # Add additional metrics
    importance_table['Importance_Percentage'] = (
        importance_table['importance'] / importance_table['importance'].sum() * 100
    )

    importance_table['Cumulative_Importance'] = importance_table['Importance_Percentage'].cumsum()

    importance_table['Importance_Rank'] = range(1, len(importance_table) + 1)

    # Categorize importance levels; include_lowest keeps a 0 % share in the
    # 'Low' bin instead of producing NaN.
    importance_table['Importance_Category'] = pd.cut(
        importance_table['Importance_Percentage'],
        bins=[0, 5, 15, 30, 100],
        labels=['Low', 'Medium', 'High', 'Critical'],
        include_lowest=True
    )

    return importance_table

# 3. PARALLEL COORDINATE TABLE

def create_parallel_coordinate_table(data):
    """Create table data for parallel coordinate visualization.

    Args:
        data: Dict whose ``'trials'`` entry is a DataFrame with ``number``,
            ``value`` and ``params_*`` columns, or ``None``.

    Returns:
        A DataFrame with the trial number, objective value, parameters, BLEU
        score (negated value), performance quartile, a top-10 % flag and a
        ``<param>_normalized`` column per numeric parameter; ``None`` when the
        trials data or its parameter columns are missing.
    """
    if data['trials'] is None:
        print("No trials data available")
        return None

    df = data['trials'].copy()

    # Extract parameter columns
    param_cols = [col for col in df.columns if col.startswith('params_')]

    if not param_cols:
        print("No parameter columns found in trials data")
        return None

    quartile_labels = ['Q1_Worst', 'Q2_Below_Avg', 'Q3_Above_Avg', 'Q4_Best']

    def assign_quartiles(scores):
        """Quartile label per score, tolerating tied scores and tiny samples."""
        try:
            return pd.qcut(scores, q=4, labels=quartile_labels)
        except ValueError:
            pass
        try:
            # Tied scores give duplicate quantile edges; ranking with
            # first-come tie-breaking makes the edges unique again.
            return pd.qcut(scores.rank(method='first'), q=4, labels=quartile_labels)
        except ValueError:
            # Too few usable scores to form quartiles at all
            return pd.Series(
                pd.Categorical([None] * len(scores), categories=quartile_labels, ordered=True),
                index=scores.index
            )

    # Create base table
    parallel_table = df[['number', 'value'] + param_cols].copy()

    # Convert objective to positive BLEU
    parallel_table['BLEU_Score'] = -parallel_table['value']

    # Add performance categories
    bleu_scores = parallel_table['BLEU_Score'].dropna()
    parallel_table['Performance_Quartile'] = assign_quartiles(parallel_table['BLEU_Score'])

    # Add top performers flag
    top_10_threshold = bleu_scores.quantile(0.9)
    parallel_table['Is_Top_10_Percent'] = parallel_table['BLEU_Score'] >= top_10_threshold

    # Normalize parameters to 0-1 scale for better visualization
    for col in param_cols:
        column = parallel_table[col]
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            # Min-max scaling is undefined for non-numeric parameters
            continue

        normalized_name = f"{col.replace('params_', '')}_normalized"
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            # Constant parameter: map every value to 0.0 instead of 0/0 = NaN
            parallel_table[normalized_name] = (column - lowest).astype(float)
        else:
            parallel_table[normalized_name] = (column - lowest) / span

    return parallel_table

# 4. CONTOUR PLOT DATA TABLE

def create_contour_data_table(data, param1, param2):
    """Create table data for specific parameter pair contour plot.

    Args:
        data: Dict whose ``'trials'`` entry is a DataFrame with ``number``,
            ``value`` and ``params_<name>`` columns, or ``None``.
        param1: Name of the first parameter (without the ``params_`` prefix).
        param2: Name of the second parameter (without the ``params_`` prefix).

    Returns:
        A DataFrame with both parameters, the BLEU score (negated value), a
        five-level performance bin, the distance to the best trial in raw
        parameter units and the cell index of each trial on a 50-point grid
        per axis; ``None`` when the data, either parameter, or all complete
        rows are missing.
    """
    if data['trials'] is None:
        print("No trials data available")
        return None

    df = data['trials'].copy()

    param1_col = f'params_{param1}'
    param2_col = f'params_{param2}'

    if param1_col not in df.columns or param2_col not in df.columns:
        print(f"Parameters {param1} or {param2} not found in trials data")
        return None

    def grid_index(values, n_points=50):
        """Index of the grid cell (between n_points edges) holding each value."""
        lowest, highest = values.min(), values.max()
        if lowest == highest:
            # A constant parameter has no extent; everything sits in cell 0
            return pd.Series(0, index=values.index)
        edges = np.linspace(lowest, highest, n_points)
        # include_lowest keeps the minimum value inside the first cell
        return pd.cut(values, bins=edges, labels=False, include_lowest=True)

    # Create contour table
    contour_table = pd.DataFrame({
        'Trial_Number': df['number'],
        f'{param1}': df[param1_col],
        f'{param2}': df[param2_col],
        'BLEU_Score': -df['value'],
        'Objective_Value': df['value']
    })

    # Remove NaN values
    contour_table = contour_table.dropna()

    if contour_table.empty:
        print(f"No complete trials available for {param1} vs {param2}")
        return None

    # Add performance binning
    contour_table['Performance_Bin'] = pd.cut(
        contour_table['BLEU_Score'],
        bins=5,
        labels=['Very_Poor', 'Poor', 'Average', 'Good', 'Excellent']
    )

    # Add distance from best point (Euclidean, in unscaled parameter units)
    best_idx = contour_table['BLEU_Score'].idxmax()
    best_param1 = contour_table.loc[best_idx, param1]
    best_param2 = contour_table.loc[best_idx, param2]

    contour_table['Distance_From_Best'] = np.sqrt(
        (contour_table[param1] - best_param1)**2 +
        (contour_table[param2] - best_param2)**2
    )

    # Add grid coordinates for contour plotting
    contour_table['Grid_X_Idx'] = grid_index(contour_table[param1])
    contour_table['Grid_Y_Idx'] = grid_index(contour_table[param2])

    return contour_table

# 5. SLICE PLOT DATA TABLE

def create_slice_data_table(data, param_name):
    """Create table data for parameter slice plot.

    Args:
        data: Dict whose ``'trials'`` entry is a DataFrame with ``number``,
            ``value`` and ``params_<name>`` columns. An optional
            ``'tpe_distributions'`` DataFrame (columns ``parameter``,
            ``good_mean``, ``poor_mean``) adds the TPE comparison columns.
        param_name: Parameter name without the ``params_`` prefix.

    Returns:
        A DataFrame sorted by the parameter value with BLEU score (negated
        value), a centered 5-row moving average, percentile rank and a
        10-level parameter bin; ``None`` when the trials data, the parameter
        column, or all complete rows are missing.
    """
    # Only the trials are required; TPE distributions are optional extras
    # (a grid-search study, for instance, may not produce them).
    if data['trials'] is None:
        print("Required data not available")
        return None

    df = data['trials'].copy()
    param_col = f'params_{param_name}'

    if param_col not in df.columns:
        print(f"Parameter {param_name} not found in trials data")
        return None

    # Create slice table
    slice_table = pd.DataFrame({
        'Trial_Number': df['number'],
        f'{param_name}': df[param_col],
        'BLEU_Score': -df['value'],
        'Objective_Value': df['value']
    })

    # Remove NaN values
    slice_table = slice_table.dropna()

    if slice_table.empty:
        print(f"No complete trials available for parameter {param_name}")
        return None

    # Sort by parameter value
    slice_table = slice_table.sort_values(param_name)

    # Add moving average for trend (NaN where the 5-row window is incomplete)
    slice_table['BLEU_Moving_Avg'] = slice_table['BLEU_Score'].rolling(window=5, center=True).mean()

    # Add TPE distribution info if available
    tpe_dist = data.get('tpe_distributions')
    if tpe_dist is not None:
        param_dist = tpe_dist[tpe_dist['parameter'] == param_name]

        if not param_dist.empty:
            good_mean = param_dist['good_mean'].iloc[0]
            poor_mean = param_dist['poor_mean'].iloc[0]

            slice_table['TPE_Good_Mean'] = good_mean
            slice_table['TPE_Poor_Mean'] = poor_mean
            slice_table['Distance_From_Good_Mean'] = abs(slice_table[param_name] - good_mean)
            slice_table['Distance_From_Poor_Mean'] = abs(slice_table[param_name] - poor_mean)
            slice_table['Closer_To_Good_Region'] = (
                slice_table['Distance_From_Good_Mean'] < slice_table['Distance_From_Poor_Mean']
            )

    # Add performance percentiles
    slice_table['Performance_Percentile'] = slice_table['BLEU_Score'].rank(pct=True) * 100

    # Bin parameter values into 10 equal-width bins
    slice_table['Parameter_Bin'] = pd.cut(
        slice_table[param_name],
        bins=10,
        labels=[f'Bin_{i+1}' for i in range(10)]
    )

    return slice_table

# 6. MASTER FUNCTION - CREATE ALL TABLES

def _save_viz_table(table, base_path, stem, timestamp):
    """Write ``table`` to ``viz_table_<stem>_<timestamp>.csv`` in ``base_path``; skip ``None``."""
    if table is None:
        return
    filename = os.path.join(base_path, f'viz_table_{stem}_{timestamp}.csv')
    table.to_csv(filename, index=False)
    print(f"✓ Saved: {filename}")


def create_all_visualization_tables(base_path):
    """Create every visualization table and save each one as a CSV file.

    Args:
        base_path: Directory passed to ``load_all_analysis_data`` and used as
            the output directory for the generated CSV files.

    Returns:
        A dict with the keys ``optimization_history``, ``parameter_importance``
        and ``parallel_coordinate`` and, when a trials table was loaded,
        ``contour_tables`` and ``slice_tables`` (dicts of tables keyed by
        parameter pair / parameter). Returns ``None`` when nothing usable was
        loaded.
    """
    # Imported locally so the function does not depend on which cells of the
    # notebook have already been executed.
    from datetime import datetime
    from itertools import combinations

    print("Loading analysis data...")
    data = load_all_analysis_data(base_path)

    # Stop early unless at least one table is both loaded and non-empty.
    if not any(df is not None and not df.empty for df in data.values()):
        print(" No analysis data found or all files are empty!")
        return None

    # Debug info: report the load status of every file.
    print("\n--- FILE STATUS ---")
    for name, df in data.items():
        if df is None:
            print(f" {name}: not loaded")
        elif df.empty:
            print(f" {name}: loaded but empty")
        else:
            print(f" {name}: loaded with shape {df.shape}")
    print("-------------------\n")

    results = {}

    # 1. Optimization History Table
    print("\n1. Creating Optimization History Table...")
    results['optimization_history'] = create_optimization_history_table(data)

    # 2. Parameter Importance Table
    print("2. Creating Parameter Importance Table...")
    results['parameter_importance'] = create_parameter_importance_table(data)

    # 3. Parallel Coordinate Table
    print("3. Creating Parallel Coordinate Table...")
    results['parallel_coordinate'] = create_parallel_coordinate_table(data)

    trials = data.get('trials')
    if trials is not None:
        # Strip only the leading 'params_' prefix; str.replace would also
        # remove the text from the middle of a parameter name.
        prefix = 'params_'
        param_cols = [col[len(prefix):] for col in trials.columns if col.startswith(prefix)]

        # 4. Contour Tables - one per unordered parameter pair
        print("4. Creating Contour Data Tables...")
        results['contour_tables'] = {}
        for param1, param2 in combinations(param_cols, 2):
            print(f"   Creating contour table for {param1} vs {param2}")
            results['contour_tables'][f'{param1}_vs_{param2}'] = create_contour_data_table(data, param1, param2)

        # 5. Slice Tables - one per parameter
        print("5. Creating Slice Data Tables...")
        results['slice_tables'] = {}
        for param in param_cols:
            print(f"   Creating slice table for {param}")
            results['slice_tables'][param] = create_slice_data_table(data, param)

    # Save all tables to CSV; one timestamp is shared by every file of a run.
    print("\nSaving all tables to CSV...")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Flat tables first; the nested dicts are written by the loops below.
    for table_name, table_data in results.items():
        if not isinstance(table_data, dict):
            _save_viz_table(table_data, base_path, table_name, timestamp)

    for contour_name, contour_data in results.get('contour_tables', {}).items():
        _save_viz_table(contour_data, base_path, f'contour_{contour_name}', timestamp)

    for slice_name, slice_data in results.get('slice_tables', {}).items():
        _save_viz_table(slice_data, base_path, f'slice_{slice_name}', timestamp)

    return results

# 7. SUMMARY STATISTICS FOR EACH VISUALIZATION

def create_visualization_summary_stats(results):
    """Summarize the visualization tables as a dict of plain statistics.

    Args:
        results: Mapping as returned by ``create_all_visualization_tables``.
            Only the keys ``optimization_history``, ``parameter_importance``
            and ``parallel_coordinate`` are read; each may be missing or
            ``None``.

    Returns:
        Dict keyed by table name. A table that is missing or ``None`` is left
        out, and so is an empty history or importance table, because their
        statistics index the first/last row and would raise ``IndexError``.
    """
    summary_stats = {}

    # Optimization History Stats
    hist_data = results.get('optimization_history')
    if hist_data is not None and len(hist_data) > 0:
        bleu = hist_data['BLEU_Score']
        summary_stats['optimization_history'] = {
            'total_trials': len(hist_data),
            'best_bleu_score': bleu.max(),
            'worst_bleu_score': bleu.min(),
            'average_bleu_score': bleu.mean(),
            'improvement_rate': (hist_data['Is_Improvement'].sum() / len(hist_data)) * 100,
            # First trial (in table order) that reached the best score.
            'convergence_trial': hist_data[bleu == bleu.max()]['Trial_Number'].iloc[0]
        }

    # Parameter Importance Stats
    # NOTE(review): first/last row are reported as most/least important, which
    # presumably relies on the table being sorted by importance - confirm.
    param_data = results.get('parameter_importance')
    if param_data is not None and len(param_data) > 0:
        summary_stats['parameter_importance'] = {
            'most_important_param': param_data.iloc[0]['parameter'],
            'least_important_param': param_data.iloc[-1]['parameter'],
            'top_3_params_contribute': param_data.head(3)['Importance_Percentage'].sum(),
            'parameters_above_10_percent': len(param_data[param_data['Importance_Percentage'] > 10])
        }

    # Parallel Coordinate Stats
    par_data = results.get('parallel_coordinate')
    if par_data is not None:
        summary_stats['parallel_coordinate'] = {
            'top_10_percent_trials': par_data['Is_Top_10_Percent'].sum(),
            'best_quartile_trials': len(par_data[par_data['Performance_Quartile'] == 'Q4_Best']),
            'parameter_count': len([col for col in par_data.columns if col.endswith('_normalized')])
        }

    return summary_stats

# Example usage of all functions

# Directory that holds the analysis CSV files (also used as output directory)
BASE_PATH = LOG_PATH

# Build and save every visualization table
print("Creating all visualization tables...")
results = create_all_visualization_tables(BASE_PATH)

if not results:
    print("Failed to create visualization tables!")
else:
    print("\n" + "="*50)
    print("VISUALIZATION TABLES CREATED SUCCESSFULLY")
    print("="*50)

    # Per-table summary statistics
    summary_stats = create_visualization_summary_stats(results)

    for viz_name, stats in summary_stats.items():
        print(f"\n{viz_name.upper()} SUMMARY:")
        for stat_name, stat_value in stats.items():
            print(f"  {stat_name}: {stat_value}")

    # Preview of every flat table; nested dicts of tables are not previewed
    print("\n" + "="*50)
    print("SAMPLE DATA FROM EACH TABLE")
    print("="*50)

    for table_name, table_data in results.items():
        if table_data is None or isinstance(table_data, dict):
            continue
        print(f"\n{table_name.upper()} (First 5 rows):")
        print(table_data.head())
        print(f"Shape: {table_data.shape}")

### Visualization Plots

In [None]:
import optuna.visualization.matplotlib as vis
import matplotlib.pyplot as plt
import numpy as np
import itertools

# 1-3. Study-level plots: optimization history, parameter importances and
# parallel coordinates. Each is drawn, resized, saved and shown in turn.
study_level_plots = (
    (vis.plot_optimization_history, "optuna_history_bleu4_full.png"),
    (vis.plot_param_importances, "optuna_param_importance_full.png"),
    (vis.plot_parallel_coordinate, "optuna_parallel_coordinate.png"),
)
for plot_fn, png_name in study_level_plots:
    plot_axes = plot_fn(study)
    plot_axes.figure.set_size_inches(14, 8)
    plt.tight_layout()
    plt.savefig(os.path.join(OPTUNA_PATH, png_name))
    plt.show()

# 4. Contour plot for every unordered pair of parameters of the best trial
all_params = list(study.best_trial.params.keys())
param_pairs = list(itertools.combinations(all_params, 2))

for param1, param2 in param_pairs:
    ax = vis.plot_contour(study, params=[param1, param2])
    # The call may hand back one Axes or a 2-D array of Axes.
    fig = ax.figure if isinstance(ax, plt.Axes) else ax[0, 0].figure
    fig.set_size_inches(8, 6)
    plt.suptitle(f'Contour Plot: {param1} vs {param2}', fontsize=14)
    plt.tight_layout()
    fname = f"optuna_contour_{param1}_vs_{param2}.png"
    plt.savefig(os.path.join(OPTUNA_PATH, fname))
    plt.show()

# 5. Slice plot for every parameter
for param in all_params:
    ax = vis.plot_slice(study, params=[param])
    # The call may hand back one Axes or a 1-D array of Axes.
    fig = ax.figure if isinstance(ax, plt.Axes) else ax[0].figure
    fig.set_size_inches(8, 6)
    plt.title(f"Slice Plot for '{param}'")
    plt.tight_layout()
    plt.savefig(os.path.join(OPTUNA_PATH, f"optuna_slice_{param}.png"))
    plt.show()

# Single Finetune

## Finetuning Loader

In [None]:
# Fine-tuning process
# Set up the model, optimizer and data loaders used to train T5 on the
# prepared dataset.
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Start from the stock t5-base configuration (dropout override left disabled).
config = T5Config.from_pretrained("t5-base")
# config.dropout_rate = 0.1254686517839752
# MODEL = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)
MODEL = T5ForConditionalGeneration.from_pretrained("t5-base", config=config).to(device)
# NOTE(review): the training cell saves `TOKENIZER` (upper case), not this
# lower-case `tokenizer` - confirm both refer to the same t5-base tokenizer.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Initialize the optimizer
current_lr=4e-05
current_wd=1e-05
OPTIMIZER = AdamW(MODEL.parameters(), lr=current_lr, eps=1e-8, weight_decay=current_wd)
# Lowest validation loss seen so far; starts at +inf so the first epoch always counts as an improvement.
best_val_loss = float('inf')
batch_size = 12

# Initialize the data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## T5-BASE FINETUNE

In [None]:
# Per-epoch average losses (only epochs that improved the validation loss
# are recorded; the epoch that triggers early stopping is not).
train_losses = []
val_losses = []

loss_file = os.path.join(LOG_PATH, 'losses.txt')
with open(loss_file, 'w') as f:
    f.write(f'\nHyperparmater Combination:\nLearning rate: {current_lr} \nBatch size: {batch_size} \nWeight Decay: {current_wd}\nDropout: {config.dropout_rate}\n')
    print(f'\nHyperparmater Combination:\nLearning rate: {current_lr} \nBatch size: {batch_size} \nWeight Decay: {current_wd} \nDropout: {config.dropout_rate}\n')

    for epoch in range(10):
        train_loss = 0
        val_loss = 0
        train_batch_count = 0
        val_batch_count = 0
        MODEL.train()

        for batch in tqdm(train_loader, desc='Training batches'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            outputs = MODEL(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )

            OPTIMIZER.zero_grad()
            outputs.loss.backward()
            OPTIMIZER.step()
            train_loss += outputs.loss.item()
            train_batch_count += 1

        MODEL.eval()
        for batch in tqdm(val_loader, desc='Validation batches'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            with torch.no_grad():
                outputs = MODEL(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                    decoder_attention_mask=decoder_attention_mask
                )

            val_loss += outputs.loss.item()
            val_batch_count += 1

        avg_train_loss = train_loss / train_batch_count
        avg_val_loss = val_loss / val_batch_count

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Checkpoint right away: when early stopping fires, the weights
            # still in memory belong to the worse epoch, so saving only after
            # the loop would persist the wrong model.
            MODEL.save_pretrained(MODEL_PATH)
        else:
            print(f'\nEarly stopping')
            break

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        f.write(f'\nEpoch {epoch+1}/10 -> Train loss: {avg_train_loss}\tValidation loss: {avg_val_loss}\n')
        print(f'\n{epoch+1}/10 -> Train loss: {avg_train_loss}\tValidation loss: {avg_val_loss}\n')


# Save the model and tokenizer to Google Drive.
# The best checkpoint was already written inside the loop. If no epoch
# improved on best_val_loss (e.g. the cell was re-run with a stale value),
# nothing was checkpointed, so fall back to saving the current weights.
if not train_losses:
    MODEL.save_pretrained(MODEL_PATH)
TOKENIZER.save_pretrained(TOKENIZER_PATH)

# Save the loss curves to Google Drive
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plot_path = os.path.join(PROJECT_PATH, 'loss_plot.png')
plt.savefig(plot_path)  # Write the plot as a PNG file
plt.show()

# These are the losses of the last epoch that ran, which is the rejected
# epoch when early stopping fired.
print(f'Final Train loss: {avg_train_loss}\tFinal Validation loss: {avg_val_loss}')
print(f'Model saved to: {MODEL_PATH}')
print(f'Tokenizer saved to: {TOKENIZER_PATH}')
print(f'Loss log saved to: {loss_file}')
print(f'Loss plot saved to: {plot_path}')

import gc
import torch

# Run the garbage collector first (the original `gc.collect` without
# parentheses only referenced the function and never called it), so that
# tensors held by unreachable objects are freed before the CUDA cache is
# released.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Testing the Fine-tuned T5-base Model

import gc
import torch

# Collect garbage before releasing the CUDA cache; `gc.collect` without
# parentheses was a no-op.
# NOTE(review): the training model is still referenced by the global `MODEL`,
# so its memory is not reclaimed here - confirm whether it should be deleted.
gc.collect()
torch.cuda.empty_cache()

# Reload the fine-tuned weights and tokenizer from disk for inference.
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_PATH)
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)

print(f"Loaded fine-tuned T5-base model")
print(f"Model parameters: {model.num_parameters():,}")

In [None]:
## Example 1
# Try the model on a simple example

context = "Sushi is a special food from Japan. It uses rice with vinegar and fresh fish. Sometimes, people add cucumber, spicy sauce, or even aloe vera. Why do people eat raw fish? Because it is healthy, fresh, and light to eat. So, sushi is not only yummy, but also good for your body!"
answer = "Japan"
# NOTE(review): the evaluation cells split the dataset inputs on "answers:"
# (plural) while this prompt uses "answer:" - confirm it matches the training
# format. The literal "</s>" may also duplicate the EOS token that the
# tokenizer appends on its own - verify.
text = f"context: {context} answer: {answer} </s>"
print("Input:", text)

# Tokenize the input. With padding=True, max_length is only enforced when
# truncation is requested explicitly; without it over-long inputs pass
# through untruncated.
encoding = tokenizer.encode_plus(
    text,
    max_length=input_max_len,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
print("Encoding keys:", encoding.keys())
input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

# Generate questions: beam search with 5 beams, keeping the 3 best sequences
model.eval()
beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=target_max_len,
    early_stopping=True,
    num_beams=5,
    num_return_sequences=3
)

# Decode and print the results
print("\nGenerated Questions:")
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)

In [None]:
## Example 2
# Try the model on a longer, more complex context

context = """
I live in a simple, comfortable house on a mountain slope, facing a road. A big mosque is on the left, used for praying and Quran lessons. Kids play in a wide yard in front. My two-story house is painted ivory white with brown window frames and door. It suits four to six people. There’s a terrace for relaxing, a living room upon entry, and a family room on the left where we talk or watch TV. A door in the family room leads to the kitchen, dining room, and bathroom. The master bedroom, with a bathroom, is on the right of the dining room. The second floor has two bedrooms and a balcony with a beautiful mountain view.
"""
answer = "balcony"
# NOTE(review): the evaluation cells split the dataset inputs on "answers:"
# (plural) while this prompt uses "answer:" - confirm it matches the training
# format.
text1 = f"context: {context} answer: {answer} </s>"
print("\nInput:", text1)

# Tokenize the input. With padding=True, max_length is only enforced when
# truncation is requested explicitly; without it over-long inputs pass
# through untruncated.
encoding = tokenizer.encode_plus(
    text1,
    max_length=input_max_len,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
print("Encoding keys:", encoding.keys())
input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

# Generate questions: beam search with 5 beams, keeping the 3 best sequences
model.eval()
beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=target_max_len,
    early_stopping=True,
    num_beams=5,
    num_return_sequences=3
)

# Decode and print the results
print("\nGenerated Questions:")
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)

## Processing the Evaluation Metrics - Same logic as original

In [None]:
# ================================
# Processing the Evaluation Metrics - Same logic as original
# ================================

# Loading the Context from Tokenized Test Dataset
decoded_inputs = []
def decode_and_write_to_txt(dataset, tokenizer, output_file):
    """Decode every tokenized input of ``dataset`` and write one per line.

    Args:
        dataset: Object with ``__len__`` and an ``inputs`` sequence whose items
            hold the token ids under the ``'input_ids'`` key.
        tokenizer: Tokenizer whose ``decode`` turns token ids into text.
        output_file: Path of the UTF-8 text file to (over)write.

    Side effects:
        Replaces the contents of the module-level ``decoded_inputs`` list.
    """
    # Rebuild the shared list in place instead of appending, so running the
    # cell again does not duplicate every line in the list and the file.
    decoded_inputs[:] = [
        tokenizer.decode(dataset.inputs[i]['input_ids'].squeeze(), skip_special_tokens=True)
        for i in range(len(dataset))
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(decoded_input + '\n' for decoded_input in decoded_inputs)

# Decoded test inputs go to their own file inside the project folder.
src_test_preds_path = os.path.join(PROJECT_PATH, 'src_test_preds.txt')
decode_and_write_to_txt(
    dataset=test_dataset,
    tokenizer=tokenizer,
    output_file=src_test_preds_path,
)

# Process input to save only context
def preprocess_text(text):
    """Extract the context part of a decoded input line, wrapped in double quotes.

    The context is the text after the first ``context:`` marker, cut at the
    next ``context:`` or ``answers:`` marker. A line without a ``context:``
    marker is treated as being all context rather than raising ``IndexError``,
    so the output keeps one line per input line.
    """
    parts = text.split('context:')
    context_part = (parts[1] if len(parts) > 1 else text).strip()
    cleaned_context = context_part.split('answers:')[0].strip()
    return f'"{cleaned_context}"'

src_test_preds_processed_path = os.path.join(PROJECT_PATH, 'src_test_preds_processed.txt')
# Reduce every decoded input line to its quoted context, one line per sample.
with open(src_test_preds_path, 'r', encoding='utf-8') as infile, open(src_test_preds_processed_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        cleaned_context = preprocess_text(line)
        outfile.write(cleaned_context + '\n')

# Re-open with the encoding used for writing; relying on the platform
# default can fail on non-ASCII text.
with open(src_test_preds_processed_path, 'r', encoding='utf-8') as f:
    print(f'Processed context length: {len(f.readlines())}')

In [None]:
# Loading the Reference Target (Question) from Tokenized Test Dataset
decoded_target = []
def decode_and_write_to_txt(dataset, tokenizer, output_file):
    """Decode every tokenized target of ``dataset`` and write one per line.

    Args:
        dataset: Object with ``__len__`` and a ``targets`` sequence whose items
            hold the token ids under the ``'input_ids'`` key.
        tokenizer: Tokenizer whose ``decode`` turns token ids into text.
        output_file: Path of the UTF-8 text file to (over)write.

    Side effects:
        Replaces the contents of the module-level ``decoded_target`` list.
    """
    # Rebuild the shared list in place instead of appending, so running the
    # cell again does not duplicate every line in the list and the file.
    decoded_target[:] = [
        tokenizer.decode(dataset.targets[i]['input_ids'].squeeze(), skip_special_tokens=True)
        for i in range(len(dataset))
    ]
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(decoded_input + '\n' for decoded_input in decoded_target)

# Decoded reference questions go to their own file inside the project folder.
test_ref_path = os.path.join(PROJECT_PATH, 'test_ref.txt')
decode_and_write_to_txt(
    dataset=test_dataset,
    tokenizer=tokenizer,
    output_file=test_ref_path,
)

# Process targets for text cleaning
test_ref_processed_path = os.path.join(PROJECT_PATH, 'test_ref_processed.txt')


def _normalize_question_line(data):
    """Clean one question line; same result as the inline rules for non-empty lines.

    An empty line no longer raises ``IndexError`` (it becomes ``'?'``).
    """
    # Separate the final question mark with a space, adding one if missing.
    cleaned = (data[:-1] if data.endswith('?') else data) + ' ?'
    if "'s" in data:
        cleaned = cleaned.replace("'s", " 's")
    cleaned = cleaned.replace('  ', ' ')
    # NOTE(review): the two branches below restart from the raw `data`, which
    # throws away the normalization above for lines starting with 'question:'
    # or ','. Kept as is so references and predictions stay comparable; if
    # this is changed, change the prediction cleaning identically.
    if data.startswith('question:'):
        cleaned = data[10:]
    if data.startswith(','):
        cleaned = data[1:]
    return cleaned.lstrip()


with open(test_ref_path, encoding='utf-8') as f:
    dataset = [_normalize_question_line(line.strip()) for line in f]

with open(test_ref_processed_path, 'w', encoding='utf-8') as f:
    for data in dataset:
        f.write(data.strip() + '\n')

# Read back with the encoding used for writing.
with open(test_ref_processed_path, 'r', encoding='utf-8') as f:
    print(f'Processed reference length: {len(f.readlines())}')

## Generate predictions with the fine-tuned T5-base model

In [None]:

generated_questions = []

# Decoding settings shared by every test sample: 4-beam search with
# repetition controls, stopping early, capped at 96 tokens.
generation_options = dict(
    max_length=96,
    num_beams=4,
    no_repeat_ngram_size=2,
    repetition_penalty=1.2,
    length_penalty=1.0,
    early_stopping=True,
)

for i in tqdm(range(len(test_dataset)), desc="Generating questions"):
    sample = test_dataset.inputs[i]
    input_ids = sample['input_ids'].to(device)
    attention_mask = sample['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_options
        )
    # Keep only the first returned sequence for each sample.
    generated_questions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# One generated question per line, in test-set order.
test_preds_path = os.path.join(PROJECT_PATH, 'test_preds.txt')
with open(test_preds_path, 'w', encoding='utf-8') as f:
    f.writelines(question + '\n' for question in generated_questions)

# Process predictions
test_preds_processed_path = os.path.join(PROJECT_PATH, 'test_preds_processed.txt')


def _normalize_question_line(data):
    """Clean one question line; same result as the inline rules for non-empty lines.

    An empty prediction no longer raises ``IndexError`` (it becomes ``'?'``),
    so one empty generation cannot abort the whole evaluation.
    """
    # Separate the final question mark with a space, adding one if missing.
    cleaned = (data[:-1] if data.endswith('?') else data) + ' ?'
    if "'s" in data:
        cleaned = cleaned.replace("'s", " 's")
    cleaned = cleaned.replace('  ', ' ')
    # NOTE(review): the two branches below restart from the raw `data`, which
    # throws away the normalization above for lines starting with 'question:'
    # or ','. Kept as is so references and predictions stay comparable; if
    # this is changed, change the reference cleaning identically.
    if data.startswith('question:'):
        cleaned = data[10:]
    if data.startswith(','):
        cleaned = data[1:]
    return cleaned.lstrip()


with open(test_preds_path, encoding='utf-8') as f:
    dataset = [_normalize_question_line(line.strip()) for line in f]

with open(test_preds_processed_path, 'w', encoding='utf-8') as f:
    for data in dataset:
        f.write(data.strip() + '\n')

# Read back with the encoding used for writing.
with open(test_preds_processed_path, 'r', encoding='utf-8') as f:
    print(f'Processed predictions length: {len(f.readlines())}')

## Evaluation

In [None]:
# Evaluation

from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import seaborn as sns
import warnings
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction

# Inputs of the evaluation: decoded test inputs, cleaned references and
# cleaned predictions (all inside PROJECT_PATH).
context_full_path = os.path.join(PROJECT_PATH, 'src_test_preds.txt')
test_ref_processed_path = os.path.join(PROJECT_PATH, 'test_ref_processed.txt')
test_preds_processed_path = os.path.join(PROJECT_PATH, 'test_preds_processed.txt')
# Outputs of the evaluation: text report and the two figures (inside EVALUATION_PATH).
evaluation_result_path = os.path.join(EVALUATION_PATH, 'evaluation_result.txt')
evaluation_plot_path = os.path.join(EVALUATION_PATH, 'evaluation_plot.png')
metric_correlation_plot_path = os.path.join(EVALUATION_PATH, 'metric_correlation_plot.png')

def read_file(file_path):
    """Return the lines of a UTF-8 text file with surrounding whitespace removed.

    Blank lines are kept as empty strings, so the result has one entry per
    line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = []
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def calculate_bleu(candidate, references):
    """Return smoothed sentence-level BLEU-1 to BLEU-4 for one candidate.

    Args:
        candidate: Hypothesis, either a whitespace-separated string or an
            already tokenized sequence of tokens.
        references: Iterable of references, each a string or a token sequence.

    Returns:
        Tuple ``(bleu_1, bleu_2, bleu_3, bleu_4)``.

    Strings are split on whitespace before scoring. ``sentence_bleu`` iterates
    over whatever it is given, so passing raw strings (as the evaluation loop
    does) would score character n-grams instead of word n-grams.
    """
    def _tokens(text):
        return text.split() if isinstance(text, str) else list(text)

    smoothing = SmoothingFunction().method1
    hypothesis = _tokens(candidate)
    reference_tokens = [_tokens(ref) for ref in references]

    # Uniform weights over the first n n-gram orders; BLEU-3 uses exact
    # thirds so that the weights sum to 1.
    weight_sets = (
        (1, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1 / 3, 1 / 3, 1 / 3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    bleu_1, bleu_2, bleu_3, bleu_4 = (
        sentence_bleu(reference_tokens, hypothesis, weights=weights, smoothing_function=smoothing)
        for weights in weight_sets
    )
    return bleu_1, bleu_2, bleu_3, bleu_4

def calculate_meteor(candidate, references):
    """Return the METEOR score of ``candidate`` against ``references``.

    Both the candidate and every reference are split on whitespace before
    being handed to ``meteor_score``.
    """
    hypothesis_tokens = candidate.split()
    tokenized_references = []
    for reference in references:
        tokenized_references.append(reference.split())
    return meteor_score(tokenized_references, hypothesis_tokens)

def calculate_rouge_l(candidate, references):
    """Return the ROUGE-L F-measure of ``candidate`` (stemming enabled).

    Args:
        candidate: Predicted text.
        references: Iterable of reference texts.

    Returns:
        The best F-measure over all references, or ``0.0`` when ``references``
        is empty. With a single reference this equals scoring against that
        reference; previously every reference after the first was ignored and
        an empty list raised ``IndexError``.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return max(
        (scorer.score(reference, candidate)['rougeL'].fmeasure for reference in references),
        default=0.0,
    )

# Load all necessary data
contexts_answers = read_file(context_full_path)
references = read_file(test_ref_processed_path)
candidates = read_file(test_preds_processed_path)

# Extract context and answer from every decoded input line. Lines that do
# not start with "context:" get empty placeholders so that the lists stay
# index-aligned with the references and predictions.
contexts = []
answers = []
for line in contexts_answers:
    if line.startswith("context:"):
        ctx = line.split("context:")[1].split("answers:")[0].strip().strip('"')
        # The "answers:" part can be missing (for example when the input was
        # cut short); use an empty answer instead of raising IndexError.
        answer_parts = line.split("answers:")
        ans = answer_parts[1].strip() if len(answer_parts) > 1 else ""
        contexts.append(ctx)
        answers.append(ans)
    else:
        contexts.append("")  # fallback
        answers.append("")

# Compute all metrics
bleu_scores = []
meteor_scores = []
rouge_l_scores = []

# zip() stops at the shorter list, which would hide misaligned files, so
# report a length mismatch explicitly.
if len(references) != len(candidates):
    print(f"WARNING: {len(references)} references but {len(candidates)} predictions; "
          "only the common prefix is scored and the pairs may be misaligned.")

n_pairs = min(len(references), len(candidates))
if n_pairs == 0:
    raise ValueError("No reference/prediction pairs to evaluate")

# total= lets tqdm show real progress; a bare zip has no length.
for ref, cand in tqdm(zip(references, candidates), total=n_pairs, desc="Computing metrics"):
    bleu_1, bleu_2, bleu_3, bleu_4 = calculate_bleu(cand, [ref])
    meteor = calculate_meteor(cand, [ref])
    rouge_l = calculate_rouge_l(cand, [ref])

    bleu_scores.append((bleu_1, bleu_2, bleu_3, bleu_4))
    meteor_scores.append(meteor)
    rouge_l_scores.append(rouge_l)

# Calculate averages (arithmetic mean of the sentence scores, in percent)
avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4 = (
    sum(column) / len(bleu_scores) * 100 for column in zip(*bleu_scores)
)
avg_meteor = sum(meteor_scores) / len(meteor_scores) * 100
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) * 100

# Print results
print("\n" + "="*60)
print("="*60)
print(f'Average BLEU-1: {avg_bleu_1:.4f}%')
print(f'Average BLEU-2: {avg_bleu_2:.4f}%')
print(f'Average BLEU-3: {avg_bleu_3:.4f}%')
print(f'Average BLEU-4: {avg_bleu_4:.4f}%')
print(f'Average METEOR: {avg_meteor:.4f}%')
print(f'Average ROUGE-L: {avg_rouge_l:.4f}%')

# Save results to file
with open(evaluation_result_path, 'w', encoding='utf-8') as f:
    f.write("="*50 + "\n")
    f.write("="*50 + "\n")
    f.write(f'Average BLEU-1: {avg_bleu_1:.4f}%\n')
    f.write(f'Average BLEU-2: {avg_bleu_2:.4f}%\n')
    f.write(f'Average BLEU-3: {avg_bleu_3:.4f}%\n')
    f.write(f'Average BLEU-4: {avg_bleu_4:.4f}%\n')
    f.write(f'Average METEOR: {avg_meteor:.4f}%\n')
    f.write(f'Average ROUGE-L: {avg_rouge_l:.4f}%\n')

# The evaluated model is fine-tuned from t5-base, so the messages below say
# T5-Base (they previously said T5-Small).
print(f'T5-Base evaluation results saved to: {evaluation_result_path}')

# Visualization Bar Chart
labels = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4', 'METEOR', 'ROUGE-L']
scores = [avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4, avg_meteor, avg_rouge_l]

plt.figure(figsize=(12, 7))
bars = plt.bar(labels, scores, color=['#4e79a7', '#59a14f', '#9c755f', '#f28e2b', '#e15759', '#76b7b2'])
plt.ylim(0, 100)
plt.title('Question Generation - Evaluation Metrics (%)')
plt.ylabel('Score (%)')

# Add values on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, height + 1, f'{height:.2f}%', ha='center', va='bottom')

plt.tight_layout()
plt.savefig(evaluation_plot_path, dpi=300)
plt.show()

print(f'T5-Base evaluation plot saved to: {evaluation_plot_path}')

# Show up to 5 example outputs; the bound keeps small test sets from
# raising IndexError.
n_examples = min(5, len(contexts), len(answers), len(references), len(candidates))
for i in range(n_examples):
    print(f"Example #{i+1}")
    print(f"Context   : {contexts[i][:100]}..." if len(contexts[i]) > 100 else f"Context   : {contexts[i]}")
    print(f"Answer    : {answers[i]}")
    print(f"Reference : {references[i]}")
    print(f"T5-Base   : {candidates[i]}")
    print("---")

# Metric correlation plot: one row per sample, one column per metric
df = pd.DataFrame(bleu_scores, columns=['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4'])
df['METEOR'] = meteor_scores
df['ROUGE-L'] = rouge_l_scores

corr = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', center=0)
plt.title('Correlation between Evaluation Metrics')
plt.tight_layout()
plt.savefig(metric_correlation_plot_path, dpi=300)
plt.show()

print(f'T5-Base metric correlation plot saved to: {metric_correlation_plot_path}')

# Print summary statistics for every dataset split
def _word_counts(column):
    """Return the whitespace-separated word count of each value in ``column``."""
    return [len(str(value).split()) for value in column]


print("\n" + "="*60)
print("DATASET ANALYSIS SUMMARY")
print("="*60)

for split, data in (('Train', train_df), ('Validation', val_df), ('Test', test_df)):
    print(f"\n{split} Dataset:")
    print(f"  Size: {len(data):,} samples")

    # Token length of each target after adding the "question: " prefix
    target_lengths = []
    for q in data['question']:
        target_lengths.append(len(TOKENIZER.encode(f"question: {q}")))
    print(f"  Target length - Mean: {np.mean(target_lengths):.2f}, Max: {np.max(target_lengths)}")

    # Word counts of contexts, answers and questions
    context_words = _word_counts(data['context'])
    print(f"  Context words - Mean: {np.mean(context_words):.2f}, Max: {np.max(context_words)}")

    answer_words = _word_counts(data['answers'])
    print(f"  Answer words - Mean: {np.mean(answer_words):.2f}, Max: {np.max(answer_words)}")

    question_words = _word_counts(data['question'])
    print(f"  Question words - Mean: {np.mean(question_words):.2f}, Max: {np.max(question_words)}")

# Loop (Fine-tune, Decode, Evaluate) over Multiple Hyperparameter Sets

In [None]:
import os
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Top 10 of HPO 1, loop 1 (TPE sampler, 64 combinations)
# NOTE(review): entries 4 and 8 are identical, as are entries 7 and 10, so
# the training loop would fine-tune the same configuration more than once.
top10_hparams1 = [
    {"learning_rate": 3e-05, "batch_size": 8, "weight_decay": 0.0001},
    {"learning_rate": 3e-05, "batch_size": 6, "weight_decay": 1e-05},
    {"learning_rate": 4e-05, "batch_size": 6, "weight_decay": 0.0001},
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 0.0001},
    {"learning_rate": 3e-05, "batch_size": 6, "weight_decay": 0.0001},
    {"learning_rate": 3e-05, "batch_size": 12, "weight_decay": 5e-05},
    {"learning_rate": 4e-05, "batch_size": 12, "weight_decay": 1e-05},
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 0.0001},
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 5e-05},
    {"learning_rate": 4e-05, "batch_size": 12, "weight_decay": 1e-05},
]

# Top 10 of HPO 1, loop 2 (grid search)
top10_hparams2 = [
    {"learning_rate": 3.2e-05, "batch_size": 8,  "weight_decay": 1.1e-4},
    {"learning_rate": 3e-05,   "batch_size": 7,  "weight_decay": 9e-05},
    {"learning_rate": 3e-05,   "batch_size": 6,  "weight_decay": 1e-4},
    {"learning_rate": 3.5e-05, "batch_size": 6,  "weight_decay": 9e-05},
    {"learning_rate": 3.2e-05, "batch_size": 6,  "weight_decay": 1e-4},
    {"learning_rate": 3.5e-05, "batch_size": 6,  "weight_decay": 1e-4},
    {"learning_rate": 3.5e-05, "batch_size": 7,  "weight_decay": 1.1e-4},
    {"learning_rate": 3.5e-05, "batch_size": 6,  "weight_decay": 1.1e-4},
    {"learning_rate": 3e-05,   "batch_size": 6,  "weight_decay": 1.1e-4},
    {"learning_rate": 3.5e-05, "batch_size": 7,  "weight_decay": 9e-05},
]

# Top 10 of HPO 2, loop 1 (TPE sampler, 81 combinations); adds "dropout_rate"
# NOTE(review): entries 1, 2, 4 and 8 are identical, as are entries 7 and 9.
top10_hparams3 = [
    {"learning_rate": 5e-05, "batch_size": 8,  "weight_decay": 1e-05, "dropout_rate": 0.15},
    {"learning_rate": 5e-05, "batch_size": 8,  "weight_decay": 1e-05, "dropout_rate": 0.15},
    {"learning_rate": 5e-05, "batch_size": 8,  "weight_decay": 1e-04, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 8,  "weight_decay": 1e-05, "dropout_rate": 0.15},
    {"learning_rate": 5e-05, "batch_size": 6,  "weight_decay": 5e-05, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 12, "weight_decay": 5e-05, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 6,  "weight_decay": 1e-05, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 8,  "weight_decay": 1e-05, "dropout_rate": 0.15},
    {"learning_rate": 5e-05, "batch_size": 6,  "weight_decay": 1e-05, "dropout_rate": 0.10},
    {"learning_rate": 3e-05, "batch_size": 6,  "weight_decay": 1e-04, "dropout_rate": 0.10},
]


# Top 10 of HPO 2, loop 2 (grid search); includes "dropout_rate"
# NOTE(review): entries 1 and 8 are identical.
top10_hparams4 = [
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 2e-05, "dropout_rate": 0.12},
    {"learning_rate": 5e-05, "batch_size": 7, "weight_decay": 1e-05, "dropout_rate": 0.12},
    {"learning_rate": 6e-05, "batch_size": 6, "weight_decay": 2e-05, "dropout_rate": 0.12},
    {"learning_rate": 5e-05, "batch_size": 7, "weight_decay": 1e-05, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 7, "weight_decay": 1e-05, "dropout_rate": 0.15},
    {"learning_rate": 6e-05, "batch_size": 7, "weight_decay": 1e-05, "dropout_rate": 0.10},
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 1e-05, "dropout_rate": 0.12},
    {"learning_rate": 5e-05, "batch_size": 6, "weight_decay": 2e-05, "dropout_rate": 0.12},
    {"learning_rate": 5e-05, "batch_size": 8, "weight_decay": 1e-05, "dropout_rate": 0.10},
    {"learning_rate": 6e-05, "batch_size": 7, "weight_decay": 1e-05, "dropout_rate": 0.15},
]

# Select the list to run by passing top10_hparams1 / 2 / 3 / 4 to enumerate().
# For each configuration: fine-tune a fresh t5-base with early stopping on the
# validation loss, saving the best checkpoint (and tokenizer) to a per-model
# folder.
for i, params in enumerate(top10_hparams4):
    # "\n" replaces the original "\ " which is an invalid escape sequence
    # (SyntaxWarning on recent Python versions) and printed a stray backslash.
    print(f"\nRun Model {i+1}")
    lr, bs, wd = params["learning_rate"], params["batch_size"], params["weight_decay"]
    # "dropout_rate" only exists in the HPO-2 lists (top10_hparams3/4). Reading
    # it with .get() lets the HPO-1 lists run through this same loop instead
    # of raising KeyError; None means "keep the pretrained default dropout".
    do = params.get("dropout_rate")

    if do is None:
        print(f"Hyperparameters: \nLr : {lr} \nBs : {bs} \nWd : {wd}\n")
    else:
        print(f"Hyperparameters: \nLr : {lr} \nBs : {bs} \nWd : {wd} \nDr : {do}\n")

    config = T5Config.from_pretrained("t5-base")
    if do is not None:
        config.dropout_rate = do
    model = T5ForConditionalGeneration.from_pretrained("t5-base", config=config).to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8, weight_decay=wd)

    # DataLoader
    train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=bs)

    # Per-model storage paths (folders are numbered from 0, i.e. model_{i},
    # while the console messages count from 1)
    model_path_i = os.path.join(MODEL_PATH, f'model_{i}')
    tokenizer_path_i = os.path.join(TOKENIZER_PATH, f'tokenizer_{i}')
    evaluation_path_i = os.path.join(EVALUATION_PATH, f'model_{i}')
    os.makedirs(model_path_i, exist_ok=True)
    os.makedirs(tokenizer_path_i, exist_ok=True)
    os.makedirs(evaluation_path_i, exist_ok=True)

    # Training state
    train_losses, val_losses = [], []
    best_val_loss = float("inf")
    patience, counter = 3, 0  # stop after 3 consecutive epochs without improvement
    best_model_path, best_tokenizer_path = model_path_i, tokenizer_path_i

    for epoch in range(20):
        # ---- training pass ----
        model.train()
        total_train_loss = 0
        for batch in tqdm(train_loader, desc=f'Training Epoch {epoch+1}'):
            input_ids = batch['source_ids'].to(device)
            attention_mask = batch['source_mask'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['target_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            labels=labels, decoder_attention_mask=decoder_attention_mask)
            # Clear gradients left from the previous step before backprop
            optimizer.zero_grad()
            outputs.loss.backward()
            optimizer.step()
            total_train_loss += outputs.loss.item()

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f'Validation Epoch {epoch+1}'):
                input_ids = batch['source_ids'].to(device)
                attention_mask = batch['source_mask'].to(device)
                labels = batch['labels'].to(device)
                decoder_attention_mask = batch['target_mask'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                                labels=labels, decoder_attention_mask=decoder_attention_mask)
                total_val_loss += outputs.loss.item()

        # Mean loss per batch for this epoch
        avg_train = total_train_loss / len(train_loader)
        avg_val = total_val_loss / len(val_loader)
        train_losses.append(avg_train)
        val_losses.append(avg_val)

        print(f"Epoch {epoch+1}: Train Loss = {avg_train:.4f}, Val Loss = {avg_val:.4f}")

        if avg_val < best_val_loss:
            best_val_loss = avg_val
            counter = 0

            # Save best model (overwrites the previous best in the same folder)
            model.save_pretrained(best_model_path)
            tokenizer.save_pretrained(best_tokenizer_path)
            print(f"Best model saved at epoch {epoch+1} with val loss {avg_val:.4f}")
        else:
            counter += 1
            print(f"No improvement. Patience counter: {counter}/{patience}")

            if counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    import os

    # Decode the inputs (source/context) of a dataset and save them to disk
    def decode_inputs_and_save(dataset, tokenizer, output_path):
        """Decode every encoded input of ``dataset`` and write one per line.

        Args:
            dataset: Object supporting ``len()`` whose ``inputs[k]['input_ids']``
                exposes ``squeeze()``.
            tokenizer: Object with ``decode(ids, skip_special_tokens=True)``.
            output_path: Destination text file (UTF-8, overwritten).

        Returns:
            The decoded strings, unstripped, in dataset order.
        """
        decoded_inputs = [
            tokenizer.decode(
                dataset.inputs[idx]['input_ids'].squeeze(),
                skip_special_tokens=True,
            )
            for idx in range(len(dataset))
        ]

        # The file receives the stripped form; the returned list keeps the raw text
        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(text.strip() + '\n' for text in decoded_inputs)
        return decoded_inputs

    # Preprocess context only (clean and extract from the raw decoded input)
    def preprocess_context(text):
        """Extract the passage between the ``context:`` and ``answers:`` markers.

        Args:
            text: One decoded model-input line.

        Returns:
            The stripped passage wrapped in double quotes, or ``'""'`` when
            either marker is missing from ``text``.
        """
        if 'context:' not in text or 'answers:' not in text:
            return '""'
        # Split on the FIRST "context:" only, so a passage that itself contains
        # that literal is not cut short. The membership test above guarantees
        # index 1 exists, so the old `except IndexError` could never fire.
        context_part = text.split('context:', 1)[1]
        cleaned_context = context_part.split('answers:')[0].strip()
        return f'"{cleaned_context}"'

    def process_and_save_contexts(raw_input_path, processed_output_path):
        """Clean every raw input line with ``preprocess_context`` and save it.

        Args:
            raw_input_path: UTF-8 text file with one decoded input per line.
            processed_output_path: Destination file (UTF-8, overwritten), one
                cleaned context per line.

        Returns:
            The list of cleaned contexts, in file order.
        """
        # Read and clean in one pass; the source is fully consumed before the
        # destination is opened.
        with open(raw_input_path, 'r', encoding='utf-8') as infile:
            cleaned_contexts = list(map(preprocess_context, infile))

        with open(processed_output_path, 'w', encoding='utf-8') as outfile:
            outfile.writelines(entry.strip() + '\n' for entry in cleaned_contexts)
        return cleaned_contexts

    # Decode the targets (reference questions) of a dataset and save them to disk
    def decode_targets_and_save(dataset, tokenizer, output_path):
        """Decode every encoded target of ``dataset`` and write one per line.

        Args:
            dataset: Object supporting ``len()`` whose ``targets[k]['input_ids']``
                exposes ``squeeze()``.
            tokenizer: Object with ``decode(ids, skip_special_tokens=True)``.
            output_path: Destination text file (UTF-8, overwritten).

        Returns:
            The decoded strings, unstripped, in dataset order.
        """
        decoded_targets = [
            tokenizer.decode(
                dataset.targets[idx]['input_ids'].squeeze(),
                skip_special_tokens=True,
            )
            for idx in range(len(dataset))
        ]

        # The file receives the stripped form; the returned list keeps the raw text
        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(text.strip() + '\n' for text in decoded_targets)
        return decoded_targets

    # Preprocess target text (for BLEU/METEOR/ROUGE)
    def preprocess_target_text(text):
        """Normalize a reference question for metric computation.

        Applies the same normalization that ``preprocess_text_for_bleu``
        applies to predictions, so references and candidates are tokenized
        alike.

        Args:
            text: One decoded reference line (may carry a ``question:`` prefix
                and a trailing newline).

        Returns:
            The question with a space-separated trailing ``?``, ``'s`` split
            off as its own token, and any ``question:`` prefix or leading
            comma removed.
        """
        # Map the curly apostrophe to a plain one, as is done for predictions;
        # otherwise "Bob’s" in a reference would never match "Bob 's".
        text = text.replace('’', "'").strip()
        # Always end with a detached " ?" token
        if text.endswith('?'):
            text = text[:-1] + ' ?'
        else:
            text += ' ?'
        # Split possessive/contracted 's into its own token, then collapse the
        # double spaces that the two steps above may have introduced
        text = text.replace("'s", " 's").replace('  ', ' ')
        if text.startswith('question:'):
            text = text[9:].strip()
        if text.startswith(','):
            text = text[1:]
        return text.lstrip()

    def process_and_save_targets(raw_target_path, processed_output_path):
        """Normalize every raw reference line with ``preprocess_target_text``.

        Args:
            raw_target_path: UTF-8 text file with one decoded reference per line.
            processed_output_path: Destination file (UTF-8, overwritten), one
                normalized reference per line.

        Returns:
            The list of normalized references, in file order.
        """
        # Read and normalize in one pass; the source is fully consumed before
        # the destination is opened.
        with open(raw_target_path, 'r', encoding='utf-8') as source:
            cleaned_targets = list(map(preprocess_target_text, source))

        with open(processed_output_path, 'w', encoding='utf-8') as sink:
            sink.writelines(entry.strip() + '\n' for entry in cleaned_targets)
        return cleaned_targets

    import os
    import torch
    import matplotlib.pyplot as plt
    import pandas as pd
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
    from nltk.translate.meteor_score import meteor_score
    from rouge_score import rouge_scorer
    import seaborn as sns
    from evaluate import load
    import re
    import logging

    # Logging configuration
    logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')

    # Per-model output paths
    src_test_preds_path = os.path.join(model_path_i, 'src_test_preds.txt')
    src_test_preds_processed_path = os.path.join(model_path_i, 'src_test_preds_processed.txt')
    test_ref_path = os.path.join(model_path_i, 'test_ref.txt')
    test_ref_processed_path = os.path.join(model_path_i, 'test_ref_processed.txt')

    # 1. Decode & save the inputs (they contain context + answer + instruction),
    #    then extract the bare contexts
    decode_inputs_and_save(test_dataset, tokenizer, src_test_preds_path)
    process_and_save_contexts(src_test_preds_path, src_test_preds_processed_path)

    # 2. Decode & save the targets (the ground-truth questions), then normalize them
    decode_targets_and_save(test_dataset, tokenizer, test_ref_path)
    process_and_save_targets(test_ref_path, test_ref_processed_path)

    print(f"Decoded input & target saved to model_{i} folder")

    print(f"\n{'='*30} EVALUATING MODEL #{i+1} {'='*30}")

    # Reload the best checkpoint saved during training rather than using the
    # in-memory weights from the last epoch
    tokenizer = T5Tokenizer.from_pretrained(tokenizer_path_i)
    model = T5ForConditionalGeneration.from_pretrained(model_path_i).to(device)

    model.eval()

    # Generate one question per test sample, without gradient tracking
    generated_questions = []
    with torch.no_grad():
        for j in range(len(test_dataset)):
            encoded = test_dataset.inputs[j]
            outputs = model.generate(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
                max_length=target_max_len,
            )
            generated_questions.append(
                tokenizer.decode(outputs[0], skip_special_tokens=True)
            )

    # Save the raw predictions, one per line
    preds_path = os.path.join(model_path_i, 'test_preds.txt')
    with open(preds_path, 'w', encoding='utf-8') as f:
        f.writelines(q.strip() + '\n' for q in generated_questions)

    # Utility
    def read_file(file_path):
        """Return the lines of a UTF-8 text file with surrounding whitespace removed.

        Blank lines are kept (as empty strings), so the result stays aligned
        line-for-line with the file.
        """
        with open(file_path, 'r', encoding='utf-8') as handle:
            return list(map(str.strip, handle))

    def write_file(file_path, data_list):
        """Write each string of ``data_list`` to ``file_path``, one per line.

        Every entry is stripped of surrounding whitespace before writing; the
        file is UTF-8 and is overwritten if it exists.
        """
        with open(file_path, 'w', encoding='utf-8') as handle:
            handle.writelines(entry.strip() + '\n' for entry in data_list)

    def preprocess_text_for_bleu(text):
        """Normalize a generated question before computing BLEU/METEOR/ROUGE.

        Args:
            text: One generated question (may carry a ``question:`` prefix).

        Returns:
            The question with curly apostrophes replaced by plain ones, a
            space-separated trailing ``?``, ``'s`` split off as its own token,
            and any ``question:`` prefix or leading comma removed.
        """
        # Replace the curly apostrophe with a plain apostrophe
        normalized = text.replace('’', "'").strip()
        # Detach the question mark (author's note: the space-separated form
        # gave higher evaluation scores); append one when it is missing
        stem = normalized[:-1] if normalized.endswith('?') else normalized
        # Split 's into its own token, then collapse the resulting double spaces
        spaced = (stem + ' ?').replace("'s", " 's").replace('  ', ' ')
        if spaced.startswith('question:'):
            spaced = spaced[9:].strip()
        if spaced.startswith(','):
            spaced = spaced[1:]
        return spaced.strip()

    # Preprocess predictions and persist the normalized form
    processed_preds = [preprocess_text_for_bleu(q) for q in generated_questions]
    processed_preds_path = os.path.join(model_path_i, 'test_preds_processed.txt')
    write_file(processed_preds_path, processed_preds)

    # Load references (unchanged, from test_ref_processed.txt) and candidates
    references = read_file(test_ref_processed_path)
    candidates = read_file(processed_preds_path)

    # zip() below silently truncates to the shorter list, which would shift or
    # drop samples; surface a mismatch instead of hiding it.
    if len(references) != len(candidates):
        logging.warning(
            f"Reference/candidate count mismatch: {len(references)} vs {len(candidates)}"
        )

    # Whitespace tokenization for the NLTK metrics
    references_tokenized = [ref.split() for ref in references]
    candidates_tokenized = [cand.split() for cand in candidates]
    list_of_references = [[ref] for ref in references_tokenized]

    # METRICS
    smoother = SmoothingFunction().method4
    sacrebleu_score = sacrebleu.corpus_bleu(candidates, [references]).score

    # Per-sample METEOR; a failing sample is logged and scored 0.0 so one bad
    # pair does not abort the whole evaluation.
    meteor_scores = []
    # Dedicated index name: reusing `i` here would overwrite the outer loop's
    # model index.
    for sample_idx, (r, c) in enumerate(zip(references, candidates)):
        try:
            r_tok = r.strip().split()
            c_tok = c.strip().split()
            score = meteor_score([r_tok], c_tok)
            meteor_scores.append(score)
        except Exception as e:
            logging.error(f"[{sample_idx}] Error calculating METEOR for r: {r} | c: {c}")
            logging.error(f"[{sample_idx}] Exception: {str(e)}")
            meteor_scores.append(0.0)

    # Build the ROUGE scorer once instead of once per sample
    rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_scores = [
        rouge_l_scorer.score(r, c)['rougeL'].fmeasure
        for r, c in zip(references, candidates)
    ]

    # Per-sample smoothed sentence BLEU for n-gram orders 1..4
    bleu_scores = []
    for ref_tok, cand_tok in zip(references_tokenized, candidates_tokenized):
        ref_list_tok = [ref_tok]
        bleu_1 = sentence_bleu(ref_list_tok, cand_tok, weights=(1, 0, 0, 0), smoothing_function=smoother)
        bleu_2 = sentence_bleu(ref_list_tok, cand_tok, weights=(0.5, 0.5, 0, 0), smoothing_function=smoother)
        # Exact thirds: the previous (0.33, 0.33, 0.33, 0) summed to 0.99, which
        # slightly inflates the geometric mean.
        bleu_3 = sentence_bleu(ref_list_tok, cand_tok, weights=(1 / 3, 1 / 3, 1 / 3, 0), smoothing_function=smoother)
        bleu_4 = sentence_bleu(ref_list_tok, cand_tok, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoother)
        bleu_scores.append((bleu_1, bleu_2, bleu_3, bleu_4))

    # Averages, expressed as percentages
    avg_sentence_bleu_1 = sum(b[0] for b in bleu_scores) / len(bleu_scores) * 100
    avg_sentence_bleu_2 = sum(b[1] for b in bleu_scores) / len(bleu_scores) * 100
    avg_sentence_bleu_3 = sum(b[2] for b in bleu_scores) / len(bleu_scores) * 100
    avg_sentence_bleu_4 = sum(b[3] for b in bleu_scores) / len(bleu_scores) * 100
    avg_meteor = sum(meteor_scores) / len(meteor_scores) * 100
    avg_rouge = sum(rouge_l_scores) / len(rouge_l_scores) * 100

    # Corpus-level BLEU-4 (NLTK, unsmoothed) over the whole test set
    corpus_bleu4 = corpus_bleu(list_of_references, candidates_tokenized, weights=(0.25, 0.25, 0.25, 0.25)) * 100

    # Save results: write the summary metrics to a text file
    eval_txt_path = os.path.join(evaluation_path_i, 'evaluation_result.txt')
    summary_lines = [
        f"SacreBLEU: {sacrebleu_score:.2f}\n",
        f"METEOR: {avg_meteor:.2f}\n",
        f"ROUGE-L: {avg_rouge:.2f}\n",
        f"Sentence BLEU-4: {avg_sentence_bleu_4:.2f}\n",
        f"Corpus BLEU-4: {corpus_bleu4:.2f}\n",
    ]
    with open(eval_txt_path, 'w', encoding='utf-8') as f:
        f.writelines(summary_lines)
    print(f"Evaluation result saved to: {eval_txt_path}")

    # Path where the plot is saved
    evaluation_plot_path = os.path.join(evaluation_path_i, 'evaluation_result.png')

    # Two side-by-side bar charts: headline metrics and the sentence-BLEU breakdown
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Primary metrics
    primary_labels = ['SacreBLEU', 'METEOR', 'ROUGE-L']
    primary_scores = [sacrebleu_score, avg_meteor, avg_rouge]
    bars1 = ax1.bar(primary_labels, primary_scores, color=['#e15759', '#af7aa1', '#76b7b2'])

    # Sentence-level BLEU breakdown
    sentence_labels = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']
    sentence_scores = [avg_sentence_bleu_1, avg_sentence_bleu_2, avg_sentence_bleu_3, avg_sentence_bleu_4]
    bars2 = ax2.bar(sentence_labels, sentence_scores, color=['#4e79a7', '#59a14f', '#9c755f', '#f28e2b'])

    # Shared axis styling and the value label above every bar
    for axis, bars, title in (
        (ax1, bars1, 'Evaluation Metrics'),
        (ax2, bars2, 'Sentence-Level BLEU Metrics Analysis'),
    ):
        axis.set_ylim(0, 100)
        axis.set_title(title)
        axis.set_ylabel('Score (%)')
        for bar in bars:
            height = bar.get_height()
            axis.text(bar.get_x() + bar.get_width()/2, height + 1, f'{height:.2f}%', ha='center', va='bottom')

    # Finalize the layout and save
    plt.tight_layout()
    plt.savefig(evaluation_plot_path, dpi=300, bbox_inches='tight')
    plt.show()
    print(f"Evaluation plot png saved to: {evaluation_plot_path}")

    # Drop this run's model objects and release cached GPU memory before the
    # next configuration is loaded
    del model, tokenizer, optimizer
    torch.cuda.empty_cache()