In [1]:
%pip install biopython transformers torch datasets numpy scikit-learn evaluate

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
import numpy as np
import os
from Bio import SeqIO
import evaluate
import inspect

print("‚úÖ Imports complete")

‚úÖ Imports complete


In [3]:
# Clear GPU cache before starting
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"‚úÖ GPU cache cleared")
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"   Available memory: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)) / 1e9:.2f} GB")

‚úÖ GPU cache cleared
   GPU: NVIDIA GeForce RTX 4090
   Total GPU memory: 25.25 GB
   Available memory: 25.25 GB


In [4]:
tokenizer = AutoTokenizer.from_pretrained("google/fnet-base")
print("‚úÖ Tokenizer loaded")

‚úÖ Tokenizer loaded


In [5]:
class FASTADataset(Dataset):
    def __init__(self, fasta_file, tokenizer, max_length=512):
        print(f"Loading sequences from {fasta_file}...")
        self.sequences = []
        for record in SeqIO.parse(fasta_file, "fasta"):
            seq = str(record.seq)
            if len(seq) > 0:
                self.sequences.append(seq)
        
        self.tokenizer = tokenizer
        self.max_length = max_length
        print(f"Loaded {len(self.sequences)} sequences")

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        spaced_seq = " ".join(list(seq))
        
        encoding = self.tokenizer(
            spaced_seq,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        )
        
        result = {k: v.squeeze(0) for k, v in encoding.items()}
        
        if 'attention_mask' not in result:
            result['attention_mask'] = torch.ones_like(result['input_ids'])
        
        return result

print("‚úÖ FASTADataset class defined")

‚úÖ FASTADataset class defined


In [6]:
data_folder = '/home/mluser/AFML_RISHABH/Project/10k sequences'

train_path = os.path.join(data_folder, "kinases_cluster_train_10k.fasta")
val_path   = os.path.join(data_folder, "kinases_cluster_val_10k.fasta")
test_path  = os.path.join(data_folder, "kinases_cluster_test_10k.fasta")

# Verify files exist
for path in [train_path, val_path, test_path]:
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    print(f"‚úì Found: {path}")

# Use reduced max_length to save memory
MAX_LENGTH = 512  # Reduced from 512

train_dataset = FASTADataset(train_path, tokenizer, max_length=MAX_LENGTH)
val_dataset   = FASTADataset(val_path, tokenizer, max_length=MAX_LENGTH)
test_dataset  = FASTADataset(test_path, tokenizer, max_length=MAX_LENGTH)

print("\n" + "="*50)
print(f"Dataset sizes: {len(train_dataset)}, {len(val_dataset)}, {len(test_dataset)}")
print(f"Max sequence length: {MAX_LENGTH}")
print("="*50)

# Test dataset
print("\nTesting dataset[0]...")
sample = train_dataset[0]
print("‚úì Sample retrieved successfully")
print(f"  Keys: {sample.keys()}")
print(f"  input_ids shape: {sample['input_ids'].shape}")
print(f"  First 20 tokens: {sample['input_ids'][:20].tolist()}")

‚úì Found: /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_train_10k.fasta
‚úì Found: /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_val_10k.fasta
‚úì Found: /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_test_10k.fasta
Loading sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_train_10k.fasta...
Loaded 7989 sequences
Loading sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_val_10k.fasta...
Loaded 1002 sequences
Loading sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_test_10k.fasta...
Loaded 1009 sequences

Dataset sizes: 7989, 1002, 1009
Max sequence length: 512

Testing dataset[0]...
‚úì Sample retrieved successfully
  Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
  input_ids shape: torch.Size([512])
  First 20 tokens: [4, 94, 123, 100, 153, 266, 101, 66, 129, 66, 66, 66, 66, 66, 66, 66, 101, 70, 101, 66]


In [7]:
from transformers.models.fnet.modeling_fnet import FNetBasicFourierTransform

model = AutoModelForMaskedLM.from_pretrained("google/fnet-base")

# Patch FNet Fourier Transform for float32
class FNetSafeFourierTransform(FNetBasicFourierTransform):
    def __init__(self, config):
        super().__init__(config)

    def forward(self, hidden_states):
        hidden_states = hidden_states.to(torch.float32)
        outputs = torch.fft.fftn(hidden_states, dim=(-2, -1)).real
        return (outputs,)

model.fourier_transform = FNetSafeFourierTransform(model.config)

# Force FP32 and handle unexpected kwargs
def force_fp32_forward(original_forward):
    sig = inspect.signature(original_forward)
    
    def wrapped_forward(*args, **kwargs):
        valid_params = set(sig.parameters.keys())
        filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params}
        
        with torch.autocast(device_type='cuda', enabled=False):
            return original_forward(*args, **filtered_kwargs)
    
    return wrapped_forward

model.forward = force_fp32_forward(model.forward)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device).to(torch.float32)
torch.set_float32_matmul_precision("high")

# Enable gradient checkpointing to save memory
if hasattr(model, 'gradient_checkpointing_enable'):
    model.gradient_checkpointing_enable()
    print("‚úì Gradient checkpointing enabled")

print(f"‚úÖ Model ready on {device}")

# Check memory usage
if torch.cuda.is_available():
    print(f"   GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    print(f"   GPU memory reserved: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

‚úì Gradient checkpointing enabled
‚úÖ Model ready on cuda
   GPU memory allocated: 0.33 GB
   GPU memory reserved: 0.37 GB


  _C._set_float32_matmul_precision(precision)


In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Test the data collator
print("Testing data collator...")
batch = [train_dataset[i] for i in range(2)]
collated = data_collator(batch)
print(f"‚úì Collated batch keys: {collated.keys()}")
print(f"  input_ids shape: {collated['input_ids'].shape}")
print(f"  labels shape: {collated['labels'].shape}")

Testing data collator...
‚úì Collated batch keys: KeysView({'input_ids': tensor([[  4,  94,   6,  ...,   6, 101,   5],
        [  4,  94, 123,  ..., 101, 164,   5]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100, -100,  123,  ...,  101, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])})
  input_ids shape: torch.Size([2, 512])
  labels shape: torch.Size([2, 512])


In [10]:
from itertools import product
from transformers import TrainingArguments

# === Hyperparameter Ranges ===
learning_rates = [3e-5, 5e-5, 7e-5]
weight_decays = [0.01, 0.05]
batch_sizes = [1, 2]
gradient_accumulation_steps_list = [4, 8]
max_seq_lengths = [512]  # Fixed for this experiment
num_train_epochs = 10

# === Generate all combinations ===
hyperparameter_combinations = list(product(
    learning_rates,
    weight_decays,
    batch_sizes,
    gradient_accumulation_steps_list,
    max_seq_lengths
))
print(f"Total combinations to run: {len(hyperparameter_combinations)}")

# === TrainingArguments factory ===
def get_training_args(run_id, learning_rate, weight_decay, batch_size, grad_accum, max_seq_len, output_dir=None):
    if output_dir is None:
        output_dir = f"./KinaseFNet_10k_hparam_run_{run_id}"

    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum,
        save_strategy="epoch",
        save_total_limit=num_train_epochs,
        logging_steps=100,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=False,
        bf16=False,
        eval_strategy="no",
        dataloader_num_workers=0,
        remove_unused_columns=False,
        gradient_checkpointing=True,
        optim="adamw_torch",
        max_grad_norm=1.0,
        logging_first_step=True,
        report_to="none",
    )


Total combinations to run: 24


In [11]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    mask = labels != -100
    preds = np.argmax(logits, axis=-1)
    preds = preds[mask]
    labels = labels[mask]
    return accuracy_metric.compute(predictions=preds, references=labels)

print("‚úÖ Metrics configured")

‚úÖ Metrics configured


In [12]:
from contextlib import nullcontext
import torch

# Clear any stale accelerator state
try:
    from accelerate.state import AcceleratorState
    if hasattr(AcceleratorState, '_shared_state') and AcceleratorState._shared_state:
        AcceleratorState._reset_state()
        print("‚úì Cleared accelerator state")
except Exception as e:
    print(f"Note: Could not clear accelerator state: {e}")

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úì Cleared GPU cache")

# ‚úÖ Create TrainingArguments instance from your function (fixed argument names)
training_args = get_training_args(
    run_id=0,
    learning_rate=5e-5,
    weight_decay=0.01,
    batch_size=1,
    grad_accum=8,
    max_seq_len=512
)

# ‚úÖ Create trainer WITHOUT eval_dataset to avoid evaluation during training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Disable autocast for stability
trainer.autocast_smart_context_manager = nullcontext

print("‚úÖ Trainer created successfully (no evaluation during training)")

# Quick dataloader test
print("\nTesting trainer dataloader...")
try:
    train_dataloader = trainer.get_train_dataloader()
    test_batch = next(iter(train_dataloader))
    print(f"‚úì Dataloader test passed")
    print(f"  Batch input_ids shape: {test_batch['input_ids'].shape}")
    
    if torch.cuda.is_available():
        print(f"  GPU memory: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
except Exception as e:
    print(f"‚úó Dataloader test failed: {e}")
    import traceback
    traceback.print_exc()


‚úì Cleared GPU cache
‚úÖ Trainer created successfully (no evaluation during training)

Testing trainer dataloader...
‚úì Dataloader test passed
  Batch input_ids shape: torch.Size([1, 512])
  GPU memory: 0.33 GB


  trainer = Trainer(


In [20]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [21]:
import torch, gc, pandas as pd, os
from contextlib import nullcontext
from accelerate.state import AcceleratorState

# ========== CONFIG ==========
BASE_DIR = "hyperparam_runs"
os.makedirs(BASE_DIR, exist_ok=True)
MASTER_CSV = os.path.join(BASE_DIR, "summary.csv")

results = []
completed = set()

# --- Load completed runs if CSV exists ---
if os.path.exists(MASTER_CSV):
    prev_df = pd.read_csv(MASTER_CSV)
    for _, row in prev_df.iterrows():
        combo = (row["learning_rate"], row["weight_decay"], row["batch_size"], row["grad_accum"], row["max_seq_len"])
        completed.add(combo)
    results = prev_df.to_dict(orient="records")
    print(f"üîÅ Loaded {len(completed)} completed runs from {MASTER_CSV}")
else:
    print("üÜï Starting fresh ‚Äî no previous runs found.")

# --- MAIN LOOP ---
for i, (lr, wd, bs, grad_acc, max_len) in enumerate(hyperparameter_combinations, 1):
    combo = (lr, wd, bs, grad_acc, max_len)
    if combo in completed:
        print(f"‚è≠Ô∏è Skipping Run {i}: already done (lr={lr}, wd={wd}, bs={bs}, grad_acc={grad_acc}, len={max_len})")
        continue

    # Create a unique folder for this run
    run_name = f"run_{i:03d}_lr{lr}_wd{wd}_bs{bs}_ga{grad_acc}_len{max_len}"
    run_dir = os.path.join(BASE_DIR, run_name)
    os.makedirs(run_dir, exist_ok=True)

    print("\n" + "="*100)
    print(f"üèÅ Starting Run {i}/{len(hyperparameter_combinations)} ‚Üí {run_name}")
    print("="*100)

    # --- Reset accelerator cleanly ---
    try:
        if hasattr(AcceleratorState, "_shared_state") and AcceleratorState._shared_state:
            AcceleratorState._reset_state()
            print("‚úì Cleared accelerator state")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not clear accelerator state: {e}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        print("‚úì Cleared GPU cache")

    # --- Create TrainingArguments for this run ---
    training_args = get_training_args(
        run_id=i,
        learning_rate=lr,
        weight_decay=wd,
        batch_size=bs,
        grad_accum=grad_acc,
        max_seq_len=max_len,
        output_dir=run_dir  # <--- store checkpoints in this folder
    )

    # --- Create Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.autocast_smart_context_manager = nullcontext

    # --- Train and Save ---
    try:
        train_result = trainer.train()
        status = "Success"
        print(f"‚úÖ Run {i} completed successfully!")

        # Save model + tokenizer + run logs
        trainer.save_model(run_dir)
        tokenizer.save_pretrained(run_dir)
        torch.save(training_args, os.path.join(run_dir, "training_args.pt"))
        print(f"üíæ Model + tokenizer saved to {run_dir}")

        # Save training log history
        if hasattr(trainer.state, "log_history"):
            pd.DataFrame(trainer.state.log_history).to_csv(os.path.join(run_dir, "log_history.csv"), index=False)
            print(f"üìä Saved log_history.csv for run {i}")
    except RuntimeError as e:
        status = "OOM" if "out of memory" in str(e).lower() else "Failed"
        print(f"‚ùå Run {i} failed: {status}")
    except Exception as e:
        status = f"Error: {str(e)[:80]}"
        print(f"‚ùå Run {i} crashed with error: {e}")
    finally:
        # Record the run summary
        results.append({
            "run_id": i,
            "learning_rate": lr,
            "weight_decay": wd,
            "batch_size": bs,
            "grad_accum": grad_acc,
            "max_seq_len": max_len,
            "output_dir": run_dir,
            "status": status,
        })
        completed.add(combo)

        # Save master summary CSV
        pd.DataFrame(results).to_csv(MASTER_CSV, index=False)
        print(f"üìÅ Saved master summary to {MASTER_CSV}")

        # Clear GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        print("üßπ CUDA memory cleared.\n")

print("\n‚úÖ All hyperparameter runs completed!")
print(f"üì¶ Master summary available at: {MASTER_CSV}")


üîÅ Loaded 22 completed runs from hyperparam_runs/summary.csv
‚è≠Ô∏è Skipping Run 1: already done (lr=3e-05, wd=0.01, bs=1, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 2: already done (lr=3e-05, wd=0.01, bs=1, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 3: already done (lr=3e-05, wd=0.01, bs=2, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 4: already done (lr=3e-05, wd=0.01, bs=2, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 5: already done (lr=3e-05, wd=0.05, bs=1, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 6: already done (lr=3e-05, wd=0.05, bs=1, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 7: already done (lr=3e-05, wd=0.05, bs=2, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 8: already done (lr=3e-05, wd=0.05, bs=2, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 9: already done (lr=5e-05, wd=0.01, bs=1, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 10: already done (lr=5e-05, wd=0.01, bs=1, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 11: already done (lr=5e-05, wd=0.01, bs=2, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 12

  trainer = Trainer(


Step,Training Loss
1,9.1999
100,8.5118
200,8.5898
300,8.4632
400,8.4503
500,8.4537
600,8.4003
700,8.438
800,8.3682
900,8.3698


‚úÖ Run 15 completed successfully!
üíæ Model + tokenizer saved to hyperparam_runs/run_015_lr5e-05_wd0.05_bs2_ga4_len512
üìä Saved log_history.csv for run 15
üìÅ Saved master summary to hyperparam_runs/summary.csv
üßπ CUDA memory cleared.

‚è≠Ô∏è Skipping Run 16: already done (lr=5e-05, wd=0.05, bs=2, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 17: already done (lr=7e-05, wd=0.01, bs=1, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 18: already done (lr=7e-05, wd=0.01, bs=1, grad_acc=8, len=512)

üèÅ Starting Run 19/24 ‚Üí run_019_lr7e-05_wd0.01_bs2_ga4_len512
‚úì Cleared accelerator state
‚úì Cleared GPU cache


  trainer = Trainer(


Step,Training Loss
1,7.6866
100,7.4072
200,7.5867
300,7.4076
400,7.4243
500,7.4776
600,7.4197
700,7.4795
800,7.4293
900,7.4179


‚úÖ Run 19 completed successfully!
üíæ Model + tokenizer saved to hyperparam_runs/run_019_lr7e-05_wd0.01_bs2_ga4_len512
üìä Saved log_history.csv for run 19
üìÅ Saved master summary to hyperparam_runs/summary.csv
üßπ CUDA memory cleared.

‚è≠Ô∏è Skipping Run 20: already done (lr=7e-05, wd=0.01, bs=2, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 21: already done (lr=7e-05, wd=0.05, bs=1, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 22: already done (lr=7e-05, wd=0.05, bs=1, grad_acc=8, len=512)
‚è≠Ô∏è Skipping Run 23: already done (lr=7e-05, wd=0.05, bs=2, grad_acc=4, len=512)
‚è≠Ô∏è Skipping Run 24: already done (lr=7e-05, wd=0.05, bs=2, grad_acc=8, len=512)

‚úÖ All hyperparameter runs completed!
üì¶ Master summary available at: hyperparam_runs/summary.csv


Evaluation of Models

In [4]:
import os
import torch
from torch.utils.data import ConcatDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
import math
from tqdm import tqdm
import inspect
import pandas as pd

In [22]:
# %%


# ============================================================
# SETUP
# ============================================================
base_dir = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs"
results = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# PATCH FUNCTION
# ============================================================
def safe_forward(original_forward):
    sig = inspect.signature(original_forward)
    valid_keys = set(sig.parameters.keys())
    def wrapped_forward(*args, **kwargs):
        filtered = {k: v for k, v in kwargs.items() if k in valid_keys}
        return original_forward(*args, **filtered)
    return wrapped_forward

# ============================================================
# MERGE DATASETS (Assuming train_dataset, val_dataset, test_dataset exist)
# ============================================================
full_dataset = ConcatDataset([train_dataset, val_dataset, test_dataset])
print(f"Total samples in full dataset: {len(full_dataset):,}")



# ============================================================
# LOOP OVER ALL RUNS
# ============================================================
for run_name in sorted(os.listdir(base_dir)):
    run_path = os.path.join(base_dir, run_name)
    if not os.path.isdir(run_path):
        continue
    print("\n============================================================")
    print(f"Evaluating model: {run_name}")
    print("============================================================")

    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(run_path)
        model = AutoModelForMaskedLM.from_pretrained(run_path)
        model.forward = safe_forward(model.forward)
        model = model.to(device).eval()

        # Set tokenizer in collator
        data_collator.tokenizer = tokenizer

        # DataLoader
        test_loader = DataLoader(full_dataset, batch_size=8, collate_fn=data_collator)

        total_loss = 0.0
        total_count = 0

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"Evaluating {run_name}", leave=False):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                if loss is not None:
                    total_loss += loss.item() * batch["input_ids"].size(0)
                    total_count += batch["input_ids"].size(0)

        if total_count > 0:
            avg_loss = total_loss / total_count
            perplexity = math.exp(avg_loss)
            print(f"‚úÖ {run_name} ‚Äî Loss: {avg_loss:.4f}, Perplexity: {perplexity:.2f}")
        else:
            avg_loss, perplexity = None, None
            print(f"‚ö†Ô∏è {run_name} ‚Äî No valid batches.")

        results.append({
            "model_run": run_name,
            "avg_loss": avg_loss,
            "perplexity": perplexity
        })

    except Exception as e:
        print(f"‚ùå Error evaluating {run_name}: {e}")
        results.append({
            "model_run": run_name,
            "avg_loss": None,
            "perplexity": None
        })

# ============================================================
# SAVE RESULTS
# ============================================================
df = pd.DataFrame(results)
output_csv = os.path.join(base_dir, "evaluation_results.csv")
df.to_csv(output_csv, index=False)

print("\n============================================================")
print("All Evaluations Complete ‚úÖ")
print(f"Results saved to: {output_csv}")
print("============================================================")
print(df)


Total samples in full dataset: 10,000

Evaluating model: run_001_lr3e-05_wd0.01_bs1_ga4_len512


                                                                                                     

‚úÖ run_001_lr3e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 2.1171, Perplexity: 8.31

Evaluating model: run_002_lr3e-05_wd0.01_bs1_ga8_len512


                                                                                                     

‚úÖ run_002_lr3e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 2.0297, Perplexity: 7.61

Evaluating model: run_003_lr3e-05_wd0.01_bs2_ga4_len512


                                                                                                     

‚úÖ run_003_lr3e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 1.9467, Perplexity: 7.01

Evaluating model: run_004_lr3e-05_wd0.01_bs2_ga8_len512


                                                                                                     

‚úÖ run_004_lr3e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 1.9030, Perplexity: 6.71

Evaluating model: run_005_lr3e-05_wd0.05_bs1_ga4_len512


                                                                                                     

‚úÖ run_005_lr3e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 2.1256, Perplexity: 8.38

Evaluating model: run_006_lr3e-05_wd0.05_bs1_ga8_len512


                                                                                                     

‚úÖ run_006_lr3e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 2.1753, Perplexity: 8.80

Evaluating model: run_007_lr3e-05_wd0.05_bs2_ga4_len512


                                                                                                     

‚úÖ run_007_lr3e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.0450, Perplexity: 7.73

Evaluating model: run_008_lr3e-05_wd0.05_bs2_ga8_len512


                                                                                                     

‚úÖ run_008_lr3e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.0371, Perplexity: 7.67

Evaluating model: run_009_lr5e-05_wd0.01_bs1_ga4_len512


                                                                                                     

‚úÖ run_009_lr5e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 2.0508, Perplexity: 7.77

Evaluating model: run_010_lr5e-05_wd0.01_bs1_ga8_len512


                                                                                                     

‚úÖ run_010_lr5e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 1.9365, Perplexity: 6.93

Evaluating model: run_011_lr5e-05_wd0.01_bs2_ga4_len512


                                                                                                     

‚úÖ run_011_lr5e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 1.8384, Perplexity: 6.29

Evaluating model: run_012_lr5e-05_wd0.01_bs2_ga8_len512


                                                                                                     

‚úÖ run_012_lr5e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 1.7880, Perplexity: 5.98

Evaluating model: run_013_lr5e-05_wd0.05_bs1_ga4_len512


                                                                                                     

‚úÖ run_013_lr5e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 1.7333, Perplexity: 5.66

Evaluating model: run_014_lr5e-05_wd0.05_bs1_ga8_len512


                                                                                                     

‚úÖ run_014_lr5e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 1.7086, Perplexity: 5.52

Evaluating model: run_015_lr5e-05_wd0.05_bs2_ga4_len512


                                                                                                     

‚úÖ run_015_lr5e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 1.9603, Perplexity: 7.10

Evaluating model: run_016_lr5e-05_wd0.05_bs2_ga8_len512


                                                                                                     

‚úÖ run_016_lr5e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.1463, Perplexity: 8.55

Evaluating model: run_017_lr7e-05_wd0.01_bs1_ga4_len512


                                                                                                     

‚úÖ run_017_lr7e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 1.9241, Perplexity: 6.85

Evaluating model: run_018_lr7e-05_wd0.01_bs1_ga8_len512


                                                                                                     

‚úÖ run_018_lr7e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 1.8282, Perplexity: 6.22

Evaluating model: run_019_lr7e-05_wd0.01_bs2_ga4_len512


                                                                                                     

‚úÖ run_019_lr7e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 1.8712, Perplexity: 6.50

Evaluating model: run_020_lr7e-05_wd0.01_bs2_ga8_len512


                                                                                                     

‚úÖ run_020_lr7e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 1.9280, Perplexity: 6.88

Evaluating model: run_021_lr7e-05_wd0.05_bs1_ga4_len512


                                                                                                     

‚úÖ run_021_lr7e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 1.8121, Perplexity: 6.12

Evaluating model: run_022_lr7e-05_wd0.05_bs1_ga8_len512


                                                                                                     

‚úÖ run_022_lr7e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 1.7461, Perplexity: 5.73

Evaluating model: run_023_lr7e-05_wd0.05_bs2_ga4_len512


                                                                                                     

‚úÖ run_023_lr7e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 1.6756, Perplexity: 5.34

Evaluating model: run_024_lr7e-05_wd0.05_bs2_ga8_len512


                                                                                                     

‚úÖ run_024_lr7e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.0971, Perplexity: 8.14

All Evaluations Complete ‚úÖ
Results saved to: /home/mluser/AFML_RISHABH/Project/hyperparam_runs/evaluation_results.csv
                                model_run  avg_loss  perplexity
0   run_001_lr3e-05_wd0.01_bs1_ga4_len512  2.117117    8.307150
1   run_002_lr3e-05_wd0.01_bs1_ga8_len512  2.029656    7.611469
2   run_003_lr3e-05_wd0.01_bs2_ga4_len512  1.946687    7.005442
3   run_004_lr3e-05_wd0.01_bs2_ga8_len512  1.902977    6.705826
4   run_005_lr3e-05_wd0.05_bs1_ga4_len512  2.125601    8.377933
5   run_006_lr3e-05_wd0.05_bs1_ga8_len512  2.175255    8.804427
6   run_007_lr3e-05_wd0.05_bs2_ga4_len512  2.045010    7.729234
7   run_008_lr3e-05_wd0.05_bs2_ga8_len512  2.037138    7.668630
8   run_009_lr5e-05_wd0.01_bs1_ga4_len512  2.050794    7.774068
9   run_010_lr5e-05_wd0.01_bs1_ga8_len512  1.936474    6.934260
10  run_011_lr5e-05_wd0.01_bs2_ga4_len512  1.838383    6.286367
11  run_012_lr5e-05_wd0.01_bs2_ga8



## 4 more hyperparamters

In [13]:
import torch, gc, pandas as pd, os
from contextlib import nullcontext
from accelerate.state import AcceleratorState

# ========== CONFIG ==========
BASE_DIR = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs"
os.makedirs(BASE_DIR, exist_ok=True)
MASTER_CSV = os.path.join(BASE_DIR, "summary.csv")

results = []
completed = set()

# --- Load completed runs if CSV exists ---
if os.path.exists(MASTER_CSV):
    prev_df = pd.read_csv(MASTER_CSV)
    for _, row in prev_df.iterrows():
        combo = (row["learning_rate"], row["weight_decay"], row["batch_size"], row["grad_accum"], row["max_seq_len"])
        completed.add(combo)
    results = prev_df.to_dict(orient="records")
    print(f"üîÅ Loaded {len(completed)} completed runs from {MASTER_CSV}")
else:
    print("üÜï Starting fresh ‚Äî no previous runs found.")

# ========== NEW HYPERPARAMETER COMBINATIONS ==========
hyperparameter_combinations = [
    # Run A ‚Äî push LR slightly higher
    (8e-5, 0.05, 2, 4, 512),

    # Run B ‚Äî reduce regularization a bit
    (7e-5, 0.03, 2, 4, 512),

    # Run C ‚Äî change batch/grad_acc balance
    (7e-5, 0.05, 4, 2, 512),

    # Run D ‚Äî longer context window test
    (7e-5, 0.05, 1, 8, 768),
]

# --- MAIN LOOP ---
for i, (lr, wd, bs, grad_acc, max_len) in enumerate(hyperparameter_combinations, 1):
    combo = (lr, wd, bs, grad_acc, max_len)
    if combo in completed:
        print(f"‚è≠Ô∏è Skipping Run {i}: already done (lr={lr}, wd={wd}, bs={bs}, grad_acc={grad_acc}, len={max_len})")
        continue

    # Create a unique folder for this run
    run_name = f"run_next_{i:03d}_lr{lr}_wd{wd}_bs{bs}_ga{grad_acc}_len{max_len}"
    run_dir = os.path.join(BASE_DIR, run_name)
    os.makedirs(run_dir, exist_ok=True)

    print("\n" + "="*100)
    print(f"üèÅ Starting Run {i}/{len(hyperparameter_combinations)} ‚Üí {run_name}")
    print("="*100)

    # --- Reset accelerator cleanly ---
    try:
        if hasattr(AcceleratorState, "_shared_state") and AcceleratorState._shared_state:
            AcceleratorState._reset_state()
            print("‚úì Cleared accelerator state")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not clear accelerator state: {e}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        print("‚úì Cleared GPU cache")

    # --- Create TrainingArguments for this run ---
    training_args = get_training_args(
        run_id=i,
        learning_rate=lr,
        weight_decay=wd,
        batch_size=bs,
        grad_accum=grad_acc,
        max_seq_len=max_len,
        output_dir=run_dir  # <--- store checkpoints in this folder
    )

    # --- Create Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    trainer.autocast_smart_context_manager = nullcontext

    # --- Train and Save ---
    try:
        train_result = trainer.train()
        status = "Success"
        print(f"‚úÖ Run {i} completed successfully!")

        # Save model + tokenizer + run logs
        trainer.save_model(run_dir)
        tokenizer.save_pretrained(run_dir)
        torch.save(training_args, os.path.join(run_dir, "training_args.pt"))
        print(f"üíæ Model + tokenizer saved to {run_dir}")

        # Save training log history
        if hasattr(trainer.state, "log_history"):
            pd.DataFrame(trainer.state.log_history).to_csv(os.path.join(run_dir, "log_history.csv"), index=False)
            print(f"üìä Saved log_history.csv for run {i}")
    except RuntimeError as e:
        status = "OOM" if "out of memory" in str(e).lower() else "Failed"
        print(f"‚ùå Run {i} failed: {status}")
    except Exception as e:
        status = f"Error: {str(e)[:80]}"
        print(f"‚ùå Run {i} crashed with error: {e}")
    finally:
        # Record the run summary
        results.append({
            "run_id": i,
            "learning_rate": lr,
            "weight_decay": wd,
            "batch_size": bs,
            "grad_accum": grad_acc,
            "max_seq_len": max_len,
            "output_dir": run_dir,
            "status": status,
        })
        completed.add(combo)

        # Save master summary CSV
        pd.DataFrame(results).to_csv(MASTER_CSV, index=False)
        print(f"üìÅ Saved master summary to {MASTER_CSV}")

        # Clear GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        print("üßπ CUDA memory cleared.\n")

print("\n‚úÖ All hyperparameter runs completed!")
print(f"üì¶ Master summary available at: {MASTER_CSV}")


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


üîÅ Loaded 24 completed runs from /home/mluser/AFML_RISHABH/Project/hyperparam_runs/summary.csv

üèÅ Starting Run 1/4 ‚Üí run_next_001_lr8e-05_wd0.05_bs2_ga4_len512
‚úì Cleared accelerator state
‚úì Cleared GPU cache


Step,Training Loss
1,15.5424
100,10.4999
200,10.3755
300,10.2721
400,10.1344
500,10.0065
600,9.8942
700,9.866
800,9.7544
900,9.681


‚úÖ Run 1 completed successfully!
üíæ Model + tokenizer saved to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/run_next_001_lr8e-05_wd0.05_bs2_ga4_len512
üìä Saved log_history.csv for run 1
üìÅ Saved master summary to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/summary.csv
üßπ CUDA memory cleared.


üèÅ Starting Run 2/4 ‚Üí run_next_002_lr7e-05_wd0.03_bs2_ga4_len512
‚úì Cleared accelerator state
‚úì Cleared GPU cache


  trainer = Trainer(


Step,Training Loss
1,9.0259
100,8.2491
200,8.3508
300,8.1863
400,8.1938
500,8.1816
600,8.1328
700,8.1849
800,8.1115
900,8.0924


‚úÖ Run 2 completed successfully!
üíæ Model + tokenizer saved to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/run_next_002_lr7e-05_wd0.03_bs2_ga4_len512
üìä Saved log_history.csv for run 2
üìÅ Saved master summary to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/summary.csv
üßπ CUDA memory cleared.


üèÅ Starting Run 3/4 ‚Üí run_next_003_lr7e-05_wd0.05_bs4_ga2_len512
‚úì Cleared accelerator state
‚úì Cleared GPU cache


  trainer = Trainer(


Step,Training Loss
1,4.1254
100,3.8173
200,3.8824
300,3.7873
400,3.8169
500,3.8226
600,3.8403
700,3.8473
800,3.817
900,3.82


‚úÖ Run 3 completed successfully!
üíæ Model + tokenizer saved to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/run_next_003_lr7e-05_wd0.05_bs4_ga2_len512
üìä Saved log_history.csv for run 3
üìÅ Saved master summary to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/summary.csv
üßπ CUDA memory cleared.


üèÅ Starting Run 4/4 ‚Üí run_next_004_lr7e-05_wd0.05_bs1_ga8_len768
‚úì Cleared accelerator state
‚úì Cleared GPU cache


  trainer = Trainer(


Step,Training Loss
1,15.0171
100,14.2179
200,14.4153
300,14.0328
400,14.2319
500,14.2776


üìÅ Saved master summary to /home/mluser/AFML_RISHABH/Project/hyperparam_runs/summary.csv
üßπ CUDA memory cleared.



KeyboardInterrupt: 

## Evaluation of the 3 HPs

In [14]:
# %%
import torch, os, math, pandas as pd, inspect
from torch.utils.data import ConcatDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm import tqdm

# ============================================================
# SETUP
# ============================================================
base_dir = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs"
results = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# PATCH FUNCTION
# ============================================================
def safe_forward(original_forward):
    sig = inspect.signature(original_forward)
    valid_keys = set(sig.parameters.keys())
    def wrapped_forward(*args, **kwargs):
        filtered = {k: v for k, v in kwargs.items() if k in valid_keys}
        return original_forward(*args, **filtered)
    return wrapped_forward

# ============================================================
# MERGE DATASETS (Assuming train_dataset, val_dataset, test_dataset exist)
# ============================================================
full_dataset = ConcatDataset([train_dataset, val_dataset, test_dataset])
print(f"Total samples in full dataset: {len(full_dataset):,}")

# ============================================================
# LOAD EXISTING CSV (if exists)
# ============================================================
output_csv = os.path.join(base_dir, "evaluation_results.csv")
if os.path.exists(output_csv):
    existing_df = pd.read_csv(output_csv)
    existing_runs = set(existing_df["model_run"])
    print(f"üîÅ Loaded existing evaluation results ({len(existing_df)} runs)")
else:
    existing_df = pd.DataFrame(columns=["model_run", "avg_loss", "perplexity"])
    existing_runs = set()
    print("üÜï No previous evaluation file found ‚Äî starting fresh.")

# ============================================================
# LOOP OVER NEW RUNS (only run_next_ folders)
# ============================================================
new_results = []

for run_name in sorted(os.listdir(base_dir)):
    run_path = os.path.join(base_dir, run_name)
    if not os.path.isdir(run_path):
        continue
    if not run_name.startswith("run_next_"):
        continue
    if run_name in existing_runs:
        print(f"‚è≠Ô∏è Skipping {run_name} ‚Äî already in evaluation file.")
        continue

    print("\n============================================================")
    print(f"Evaluating model: {run_name}")
    print("============================================================")

    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(run_path)
        model = AutoModelForMaskedLM.from_pretrained(run_path)
        model.forward = safe_forward(model.forward)
        model = model.to(device).eval()

        # DataLoader
        test_loader = DataLoader(full_dataset, batch_size=8, collate_fn=data_collator)

        total_loss = 0.0
        total_count = 0

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"Evaluating {run_name}", leave=False):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                if loss is not None:
                    total_loss += loss.item() * batch["input_ids"].size(0)
                    total_count += batch["input_ids"].size(0)

        if total_count > 0:
            avg_loss = total_loss / total_count
            perplexity = math.exp(avg_loss)
            print(f"‚úÖ {run_name} ‚Äî Loss: {avg_loss:.4f}, Perplexity: {perplexity:.2f}")
        else:
            avg_loss, perplexity = None, None
            print(f"‚ö†Ô∏è {run_name} ‚Äî No valid batches.")

        new_results.append({
            "model_run": run_name,
            "avg_loss": avg_loss,
            "perplexity": perplexity
        })

    except Exception as e:
        print(f"‚ùå Error evaluating {run_name}: {e}")
        new_results.append({
            "model_run": run_name,
            "avg_loss": None,
            "perplexity": None
        })

# ============================================================
# SAVE UPDATED RESULTS
# ============================================================
if new_results:
    new_df = pd.DataFrame(new_results)
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    combined_df.to_csv(output_csv, index=False)
    print("\n============================================================")
    print("‚úÖ Evaluation Complete for New Runs")
    print("============================================================")
    print(new_df)
else:
    print("\n‚úÖ No new 'run_next_' models found or all already evaluated.")

print(f"\nüìÅ Results saved to: {output_csv}")


Total samples in full dataset: 10,000
üîÅ Loaded existing evaluation results (24 runs)

Evaluating model: run_next_001_lr8e-05_wd0.05_bs2_ga4_len512


                                                                                                          

‚úÖ run_next_001_lr8e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.0265, Perplexity: 7.59

Evaluating model: run_next_002_lr7e-05_wd0.03_bs2_ga4_len512


                                                                                                          

‚úÖ run_next_002_lr7e-05_wd0.03_bs2_ga4_len512 ‚Äî Loss: 1.8955, Perplexity: 6.66

Evaluating model: run_next_003_lr7e-05_wd0.05_bs4_ga2_len512


                                                                                                          

‚úÖ run_next_003_lr7e-05_wd0.05_bs4_ga2_len512 ‚Äî Loss: 1.7844, Perplexity: 5.96

Evaluating model: run_next_004_lr7e-05_wd0.05_bs1_ga8_len768
‚ùå Error evaluating run_next_004_lr7e-05_wd0.05_bs1_ga8_len768: Unrecognized model in /home/mluser/AFML_RISHABH/Project/hyperparam_runs/run_next_004_lr7e-05_wd0.05_bs1_ga8_len768. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: aimv2, aimv2_vision_model, albert, align, altclip, apertus, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, blt, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, cohere2_vision, colpali, colqwen2, conditional_detr, convbert, convnext, 



## Eval on val + text data

In [18]:
# %%


# ============================================================
# SETUP
# ============================================================
base_dir = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs"
results = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# PATCH FUNCTION
# ============================================================
def safe_forward(original_forward):
    sig = inspect.signature(original_forward)
    valid_keys = set(sig.parameters.keys())
    def wrapped_forward(*args, **kwargs):
        filtered = {k: v for k, v in kwargs.items() if k in valid_keys}
        return original_forward(*args, **filtered)
    return wrapped_forward

# ============================================================
# MERGE DATASETS (Assuming train_dataset, val_dataset, test_dataset exist)
# ============================================================
full_dataset = ConcatDataset([val_dataset, test_dataset])
print(f"Total samples in full dataset: {len(full_dataset):,}")



# ============================================================
# LOOP OVER ALL RUNS
# ============================================================
for run_name in sorted(os.listdir(base_dir)):
    run_path = os.path.join(base_dir, run_name)
    if not os.path.isdir(run_path):
        continue
    print("\n============================================================")
    print(f"Evaluating model: {run_name}")
    print("============================================================")

    try:
        # Load model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(run_path)
        model = AutoModelForMaskedLM.from_pretrained(run_path)
        model.forward = safe_forward(model.forward)
        model = model.to(device).eval()

        # Set tokenizer in collator
        data_collator.tokenizer = tokenizer

        # DataLoader
        test_loader = DataLoader(full_dataset, batch_size=8, collate_fn=data_collator)

        total_loss = 0.0
        total_count = 0

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"Evaluating {run_name}", leave=False):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                if loss is not None:
                    total_loss += loss.item() * batch["input_ids"].size(0)
                    total_count += batch["input_ids"].size(0)

        if total_count > 0:
            avg_loss = total_loss / total_count
            perplexity = math.exp(avg_loss)
            print(f"‚úÖ {run_name} ‚Äî Loss: {avg_loss:.4f}, Perplexity: {perplexity:.2f}")
        else:
            avg_loss, perplexity = None, None
            print(f"‚ö†Ô∏è {run_name} ‚Äî No valid batches.")

        results.append({
            "model_run": run_name,
            "avg_loss": avg_loss,
            "perplexity": perplexity
        })

    except Exception as e:
        print(f"‚ùå Error evaluating {run_name}: {e}")
        results.append({
            "model_run": run_name,
            "avg_loss": None,
            "perplexity": None
        })

# ============================================================
# SAVE RESULTS
# ============================================================
df = pd.DataFrame(results)
output_csv = os.path.join(base_dir, "evaluation_results_test_val.csv")
df.to_csv(output_csv, index=False)

print("\n============================================================")
print("All Evaluations Complete ‚úÖ")
print(f"Results saved to: {output_csv}")
print("============================================================")
print(df)


Total samples in full dataset: 2,011

Evaluating model: run_001_lr3e-05_wd0.01_bs1_ga4_len512


                                                                                                   

‚úÖ run_001_lr3e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 2.2554, Perplexity: 9.54

Evaluating model: run_002_lr3e-05_wd0.01_bs1_ga8_len512


                                                                                                   

‚úÖ run_002_lr3e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 2.2208, Perplexity: 9.21

Evaluating model: run_003_lr3e-05_wd0.01_bs2_ga4_len512


                                                                                                   

‚úÖ run_003_lr3e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 2.1862, Perplexity: 8.90

Evaluating model: run_004_lr3e-05_wd0.01_bs2_ga8_len512


                                                                                                   

‚úÖ run_004_lr3e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 2.1850, Perplexity: 8.89

Evaluating model: run_005_lr3e-05_wd0.05_bs1_ga4_len512


                                                                                                   

‚úÖ run_005_lr3e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 2.2533, Perplexity: 9.52

Evaluating model: run_006_lr3e-05_wd0.05_bs1_ga8_len512


                                                                                                   

‚úÖ run_006_lr3e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 2.2839, Perplexity: 9.82

Evaluating model: run_007_lr3e-05_wd0.05_bs2_ga4_len512


                                                                                                   

‚úÖ run_007_lr3e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.2136, Perplexity: 9.15

Evaluating model: run_008_lr3e-05_wd0.05_bs2_ga8_len512


                                                                                                   

‚úÖ run_008_lr3e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.2111, Perplexity: 9.13

Evaluating model: run_009_lr5e-05_wd0.01_bs1_ga4_len512


                                                                                                   

‚úÖ run_009_lr5e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 2.2117, Perplexity: 9.13

Evaluating model: run_010_lr5e-05_wd0.01_bs1_ga8_len512


                                                                                                   

‚úÖ run_010_lr5e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 2.1810, Perplexity: 8.86

Evaluating model: run_011_lr5e-05_wd0.01_bs2_ga4_len512


                                                                                                   

‚úÖ run_011_lr5e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 2.1399, Perplexity: 8.50

Evaluating model: run_012_lr5e-05_wd0.01_bs2_ga8_len512


                                                                                                   

‚úÖ run_012_lr5e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 2.1479, Perplexity: 8.57

Evaluating model: run_013_lr5e-05_wd0.05_bs1_ga4_len512


                                                                                                   

‚úÖ run_013_lr5e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 2.1355, Perplexity: 8.46

Evaluating model: run_014_lr5e-05_wd0.05_bs1_ga8_len512


                                                                                                   

‚úÖ run_014_lr5e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 2.1631, Perplexity: 8.70

Evaluating model: run_015_lr5e-05_wd0.05_bs2_ga4_len512


                                                                                                   

‚úÖ run_015_lr5e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.1864, Perplexity: 8.90

Evaluating model: run_016_lr5e-05_wd0.05_bs2_ga8_len512


                                                                                                   

‚úÖ run_016_lr5e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.2654, Perplexity: 9.63

Evaluating model: run_017_lr7e-05_wd0.01_bs1_ga4_len512


                                                                                                   

‚úÖ run_017_lr7e-05_wd0.01_bs1_ga4_len512 ‚Äî Loss: 2.1526, Perplexity: 8.61

Evaluating model: run_018_lr7e-05_wd0.01_bs1_ga8_len512


                                                                                                   

‚úÖ run_018_lr7e-05_wd0.01_bs1_ga8_len512 ‚Äî Loss: 2.1420, Perplexity: 8.52

Evaluating model: run_019_lr7e-05_wd0.01_bs2_ga4_len512


                                                                                                   

‚úÖ run_019_lr7e-05_wd0.01_bs2_ga4_len512 ‚Äî Loss: 2.1679, Perplexity: 8.74

Evaluating model: run_020_lr7e-05_wd0.01_bs2_ga8_len512


                                                                                                   

‚úÖ run_020_lr7e-05_wd0.01_bs2_ga8_len512 ‚Äî Loss: 2.1710, Perplexity: 8.77

Evaluating model: run_021_lr7e-05_wd0.05_bs1_ga4_len512


                                                                                                   

‚úÖ run_021_lr7e-05_wd0.05_bs1_ga4_len512 ‚Äî Loss: 2.1292, Perplexity: 8.41

Evaluating model: run_022_lr7e-05_wd0.05_bs1_ga8_len512


                                                                                                   

‚úÖ run_022_lr7e-05_wd0.05_bs1_ga8_len512 ‚Äî Loss: 2.1377, Perplexity: 8.48

Evaluating model: run_023_lr7e-05_wd0.05_bs2_ga4_len512


                                                                                                   

‚úÖ run_023_lr7e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.1224, Perplexity: 8.35

Evaluating model: run_024_lr7e-05_wd0.05_bs2_ga8_len512


                                                                                                   

‚úÖ run_024_lr7e-05_wd0.05_bs2_ga8_len512 ‚Äî Loss: 2.2357, Perplexity: 9.35

Evaluating model: run_next_001_lr8e-05_wd0.05_bs2_ga4_len512


                                                                                                        

‚úÖ run_next_001_lr8e-05_wd0.05_bs2_ga4_len512 ‚Äî Loss: 2.2050, Perplexity: 9.07

Evaluating model: run_next_002_lr7e-05_wd0.03_bs2_ga4_len512


                                                                                                        

‚úÖ run_next_002_lr7e-05_wd0.03_bs2_ga4_len512 ‚Äî Loss: 2.1658, Perplexity: 8.72

Evaluating model: run_next_003_lr7e-05_wd0.05_bs4_ga2_len512


                                                                                                        

‚úÖ run_next_003_lr7e-05_wd0.05_bs4_ga2_len512 ‚Äî Loss: 2.1272, Perplexity: 8.39

All Evaluations Complete ‚úÖ
Results saved to: /home/mluser/AFML_RISHABH/Project/hyperparam_runs/evaluation_results_test_val.csv
                                     model_run  avg_loss  perplexity
0        run_001_lr3e-05_wd0.01_bs1_ga4_len512  2.255375    9.538872
1        run_002_lr3e-05_wd0.01_bs1_ga8_len512  2.220805    9.214748
2        run_003_lr3e-05_wd0.01_bs2_ga4_len512  2.186218    8.901483
3        run_004_lr3e-05_wd0.01_bs2_ga8_len512  2.185007    8.890711
4        run_005_lr3e-05_wd0.05_bs1_ga4_len512  2.253300    9.519099
5        run_006_lr3e-05_wd0.05_bs1_ga8_len512  2.283941    9.815283
6        run_007_lr3e-05_wd0.05_bs2_ga4_len512  2.213591    9.148512
7        run_008_lr3e-05_wd0.05_bs2_ga8_len512  2.211147    9.126174
8        run_009_lr5e-05_wd0.01_bs1_ga4_len512  2.211704    9.131262
9        run_010_lr5e-05_wd0.01_bs1_ga8_len512  2.180987    8.855045
10       run_011_lr5e-05_wd0.

