In [1]:
# ===========================================================
# ESM2 MASKED LANGUAGE MODEL TRAINING + EVALUATION
# ===========================================================

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset, DatasetDict
from Bio import SeqIO
import pandas as pd
import os
import math
from itertools import product  # Import 'product'
import logging

# Suppress excessive logging from transformers
logging.basicConfig(level=logging.INFO)
logging.getLogger("transformers").setLevel(logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%pip install accelerate --upgrade

print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"PyTorch built with CUDA Version: {torch.version.cuda}")

Note: you may need to restart the kernel to use updated packages.
PyTorch Version: 2.9.0+cu128
CUDA Available: True
Device Name: NVIDIA GeForce RTX 4090
PyTorch built with CUDA Version: 12.8


In [3]:
def load_fasta_file(file_path):
    """
    Loads a single FASTA file into a Pandas DataFrame.
    """
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        seq = str(record.seq)
        if len(seq) > 0:
            sequences.append(seq)
            
    print(f" Loaded {len(sequences)} sequences from {file_path}")
    return pd.DataFrame({"sequence": sequences})

# --- Load all data ---
# !!! UPDATE THIS PATH TO MATCH YOUR SYSTEM !!!
base_path = r"/home/mluser/AFML_RISHABH/Project/10k sequences"
train_df = load_fasta_file(os.path.join(base_path, "kinases_cluster_train_10k.fasta"))
val_df   = load_fasta_file(os.path.join(base_path, "kinases_cluster_val_10k.fasta"))
test_df  = load_fasta_file(os.path.join(base_path, "kinases_cluster_test_10k.fasta"))

print("\nTraining data sample:")
print(train_df.head())

 Loaded 7989 sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_train_10k.fasta
 Loaded 1002 sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_val_10k.fasta
 Loaded 1009 sequences from /home/mluser/AFML_RISHABH/Project/10k sequences/kinases_cluster_test_10k.fasta

Training data sample:
                                            sequence
0  MHWLYPSFSSSSSSSPAPSTADDNYINNRTSVAGSHHRRRMTENDV...
1  MHYATSDYENGNRTTMNSPSSVMHKDQSPVLTPRKPREALRPANLL...
2  MATSFLEQYDALDVIGNGSFGIIRKVRRKADGSIFARKELNFERMS...
3  MTQFLELYEPLDIIGNGSFGIIRKVKRKMDGMIFARKELNFERMSE...
4  MNNDEEKIQDISKKIEREKALINAANLMRQQTNNEAVRSKLDTQMR...


In [4]:
model_name = "facebook/esm2_t12_35M_UR50D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
fixed_max_length = 512 # This is from your hyperparameter list

def tokenize_function(examples, tokenizer):
    spaced_sequences = [" ".join(list(seq)) for seq in examples["sequence"]]
    
    return tokenizer(
        spaced_sequences,
        truncation=True,
        padding="max_length",
        max_length=fixed_max_length,  # Use the fixed max length
        return_special_tokens_mask=True,
    )

# --- Create datasets ---
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)
test_dataset  = Dataset.from_pandas(test_df)

# --- Tokenize datasets ---
print("Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function, 
    batched=True, 
    num_proc=2, 
    remove_columns=["sequence"],
    fn_kwargs={'tokenizer': tokenizer}
)
tokenized_val   = val_dataset.map(
    tokenize_function, 
    batched=True, 
    num_proc=2, 
    remove_columns=["sequence"],
    fn_kwargs={'tokenizer': tokenizer}
)
tokenized_test  = test_dataset.map(
    tokenize_function, 
    batched=True, 
    num_proc=2, 
    remove_columns=["sequence"],
    fn_kwargs={'tokenizer': tokenizer}
)

dataset = DatasetDict({
    "train": tokenized_train,
    "validation": tokenized_val,
    "test": tokenized_test
})

print("\n Tokenization complete.")
print(dataset)

Tokenizing datasets...


Map (num_proc=2): 100%|██████████| 7989/7989 [00:04<00:00, 1878.45 examples/s]
Map (num_proc=2): 100%|██████████| 1002/1002 [00:00<00:00, 1319.12 examples/s]
Map (num_proc=2): 100%|██████████| 1009/1009 [00:00<00:00, 1363.81 examples/s]


 Tokenization complete.
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 7989
    })
    validation: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 1002
    })
    test: Dataset({
        features: ['input_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 1009
    })
})





In [5]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)
print(" Data collator ready.")

 Data collator ready.


In [16]:
import os, gc, torch, pandas as pd
from itertools import product
from contextlib import nullcontext
from transformers import AutoModelForMaskedLM, Trainer, TrainingArguments
from accelerate.state import AcceleratorState

# ========== CONFIG ==========
BASE_DIR = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2"
os.makedirs(BASE_DIR, exist_ok=True)
MASTER_CSV = os.path.join(BASE_DIR, "esm2_training_only_summary.csv")

# === Hyperparameter Ranges ===
learning_rates = [3e-5, 5e-5, 7e-5]
weight_decays = [0.01, 0.05]
batch_sizes = [1, 2]
grad_accum_steps = [4, 8]
max_seq_lens = [512]
num_train_epochs = 10
save_steps = 1000  #  Checkpoint every 1000 steps
model_name = "facebook/esm2_t6_8M_UR50D"

# === Generate combinations ===
hyperparameter_combinations = list(product(
    learning_rates, weight_decays, batch_sizes, grad_accum_steps, max_seq_lens
))
print(f" Total combinations: {len(hyperparameter_combinations)}")

# === Load completed runs (resume safe) ===
results, completed = [], set()
if os.path.exists(MASTER_CSV):
    prev_df = pd.read_csv(MASTER_CSV)
    for _, row in prev_df.iterrows():
        combo = (row["learning_rate"], row["weight_decay"], row["batch_size"], row["grad_accum_steps"], row["max_seq_len"])
        completed.add(combo)
    results = prev_df.to_dict(orient="records")
    print(f" Loaded {len(completed)} completed runs from {MASTER_CSV}")
else:
    print(" Starting fresh — no previous runs found.")

# === MAIN LOOP ===
for i, (lr, wd, bs, gas, msl) in enumerate(hyperparameter_combinations, 1):
    combo = (lr, wd, bs, gas, msl)
    if combo in completed:
        print(f" Skipping Run {i}: already done (lr={lr}, wd={wd}, bs={bs}, gas={gas}, len={msl})")
        continue

    run_name = f"run_{i:03d}_lr{lr}_wd{wd}_bs{bs}_ga{gas}_len{msl}"
    run_dir = os.path.join(BASE_DIR, run_name)
    os.makedirs(run_dir, exist_ok=True)

    print("\n" + "="*100)
    print(f" Starting Run {i}/{len(hyperparameter_combinations)} → {run_name}")
    print("="*100)

    # --- Reset accelerator ---
    try:
        if hasattr(AcceleratorState, "_shared_state") and AcceleratorState._shared_state:
            AcceleratorState._reset_state()
            print(" Cleared accelerator state")
    except Exception as e:
        print(f" Could not clear accelerator state: {e}")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        print(" Cleared GPU cache")

    # --- Initialize Model ---
    try:
        model = AutoModelForMaskedLM.from_pretrained(model_name)
        model.gradient_checkpointing_enable()
    except Exception as e:
        print(f" Failed to load model: {e}")
        results.append({
            "run_id": i, "learning_rate": lr, "weight_decay": wd,
            "batch_size": bs, "grad_accum_steps": gas,
            "max_seq_len": msl, "output_dir": run_dir, "status": "ModelLoadFailed"
        })
        continue

    # --- Training Arguments ---
    training_args = TrainingArguments(
        output_dir=run_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=bs,
        gradient_accumulation_steps=gas,
        learning_rate=lr,
        warmup_ratio=0.1,
        weight_decay=wd,
        logging_dir=os.path.join(BASE_DIR, "logs", run_name),
        logging_steps=100,
        save_strategy="steps",     #  Save every N steps
        save_steps=save_steps,     #  1000-step checkpointing
        save_total_limit=None,     #  Keep all checkpoints
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    # --- Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=data_collator,
    )
    trainer.autocast_smart_context_manager = nullcontext

    # --- Train + Save ---
    try:
        print(" Training model...")
        train_result = trainer.train()
        status = "Success"

        trainer.save_model(run_dir)
        pd.DataFrame(trainer.state.log_history).to_csv(os.path.join(run_dir, "log_history.csv"), index=False)
        print(f" All checkpoints & logs saved to {run_dir}")

    except RuntimeError as e:
        status = "OOM" if "out of memory" in str(e).lower() else "Failed"
        print(f" Run {i} failed: {status}")
    except Exception as e:
        status = f"Error: {str(e)[:80]}"
        print(f" Run {i} crashed: {e}")
    finally:
        # Log run result
        results.append({
            "run_id": i,
            "learning_rate": lr,
            "weight_decay": wd,
            "batch_size": bs,
            "grad_accum_steps": gas,
            "max_seq_len": msl,
            "output_dir": run_dir,
            "status": status,
        })
        completed.add(combo)
        pd.DataFrame(results).to_csv(MASTER_CSV, index=False)
        print(f" Updated master summary → {MASTER_CSV}")

        # Free memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        print(" GPU memory cleared.\n")

print("\n All hyperparameter runs completed!")
print(f" Master summary available at: {MASTER_CSV}")


 Total combinations: 24
 Starting fresh — no previous runs found.

 Starting Run 1/24 → run_001_lr3e-05_wd0.01_bs1_ga4_len512
 Cleared accelerator state
 Cleared GPU cache
 Training model...


Step,Training Loss
100,2.204
200,2.1743
300,2.156
400,2.1483
500,2.1599
600,2.1374
700,2.1407
800,2.1358
900,2.1412
1000,2.1363


 All checkpoints & logs saved to /home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2/run_001_lr3e-05_wd0.01_bs1_ga4_len512
 Updated master summary → /home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2/esm2_training_only_summary.csv
 GPU memory cleared.


 Starting Run 2/24 → run_002_lr3e-05_wd0.01_bs1_ga8_len512
 Cleared accelerator state
 Cleared GPU cache
 Training model...


Step,Training Loss
100,2.1911
200,2.1539
300,2.151
400,2.1408


 Updated master summary → /home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2/esm2_training_only_summary.csv
 GPU memory cleared.



KeyboardInterrupt: 

In [18]:
import os, gc, torch, pandas as pd
from transformers import AutoModelForMaskedLM, Trainer
from datasets import concatenate_datasets

# ===== CONFIG =====
BASE_DIR = "/home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2"
SUMMARY_CSV = os.path.join(BASE_DIR, "esm2_evaluation_results.csv")

# === Dataset & Data Collator should already exist ===
# Must define `dataset["validation"]` and `dataset["test"]`
# and a compatible `data_collator`

# --- Combine validation and test datasets ---
combined_eval_dataset = concatenate_datasets([
    dataset["validation"],
    dataset["test"]
])
print(f" Combined eval samples: {len(combined_eval_dataset)}\n")

# === Find all run directories ===
run_dirs = sorted([
    d for d in os.listdir(BASE_DIR)
    if d.startswith("run_") and os.path.isdir(os.path.join(BASE_DIR, d))
])
print(f" Found {len(run_dirs)} runs to evaluate.\n")

results = []

for i, run_name in enumerate(run_dirs, 1):
    run_dir = os.path.join(BASE_DIR, run_name)
    print("=" * 80)
    print(f" Evaluating Run {i}/{len(run_dirs)} → {run_name}")
    print("=" * 80)

    try:
        # --- Find the latest checkpoint ---
        ckpts = [
            d for d in os.listdir(run_dir)
            if d.startswith("checkpoint-") and os.path.isdir(os.path.join(run_dir, d))
        ]
        if not ckpts:
            raise FileNotFoundError("No checkpoints found in this run directory.")

        ckpts_sorted = sorted(ckpts, key=lambda x: int(x.split('-')[-1]))
        latest_ckpt = os.path.join(run_dir, ckpts_sorted[-1])
        print(f" Loading latest checkpoint: {latest_ckpt}")

        # --- Load model ---
        model = AutoModelForMaskedLM.from_pretrained(latest_ckpt)
        model.eval()

        # --- Evaluate ---
        trainer = Trainer(
            model=model,
            eval_dataset=combined_eval_dataset,
            data_collator=data_collator,
        )

        eval_results = trainer.evaluate()
        loss = eval_results.get("eval_loss", None)
        perplexity = torch.exp(torch.tensor(loss)).item() if loss is not None else None

        print(f" {run_name} → Eval Loss: {loss:.4f}, Perplexity: {perplexity:.2f}")

        results.append({
            "run_name": run_name,
            "checkpoint": ckpts_sorted[-1],
            "eval_loss": loss,
            "perplexity": perplexity,
            "status": "Success",
        })

    except Exception as e:
        print(f" Evaluation failed for {run_name}: {e}")
        results.append({
            "run_name": run_name,
            "checkpoint": None,
            "eval_loss": None,
            "perplexity": None,
            "status": f"Failed: {str(e)[:120]}",
        })

    finally:
        del model
        if "trainer" in locals():
            del trainer
        torch.cuda.empty_cache()
        gc.collect()
        print(" Cleared GPU and memory.\n")

# === Save Results ===
df = pd.DataFrame(results)
if len(df) > 0:
    df.to_csv(SUMMARY_CSV, index=False)
    print("\n" + "=" * 60)
    print(" All Evaluations Complete ")
    print(f" Results saved to: {SUMMARY_CSV}")
    print("=" * 60)
    print(df)
else:
    print("\n No evaluation results were collected. Check dataset paths or run directories.")


 Combined eval samples: 2011

 Found 2 runs to evaluate.

 Evaluating Run 1/2 → run_001_lr3e-05_wd0.01_bs1_ga4_len512
 Loading latest checkpoint: /home/mluser/AFML_RISHABH/Project/hyperparam_runs_esm2/run_001_lr3e-05_wd0.01_bs1_ga4_len512/checkpoint-19980


 run_001_lr3e-05_wd0.01_bs1_ga4_len512 → Eval Loss: 2.0784, Perplexity: 7.99
 Cleared GPU and memory.

 Evaluating Run 2/2 → run_002_lr3e-05_wd0.01_bs1_ga8_len512
 Evaluation failed for run_002_lr3e-05_wd0.01_bs1_ga8_len512: No checkpoints found in this run directory.


NameError: name 'model' is not defined