In [1]:
# Add this to an early cell in your notebook
import torch.multiprocessing as mp

try:
    # This should be called only once, and before any CUDA context is created
    # or multiprocessing processes are started by PyTorch/Accelerate.
    # Restarting the kernel before running this cell and then others sequentially is best.
    mp.set_start_method('spawn', force=True)
    print("Successfully set PyTorch multiprocessing start method to 'spawn'.")
except RuntimeError as e:
    print(f"Warning: Could not set start method to 'spawn' again (it might be already set or can only be set once per program run): {e}")
    print("If this is not the first run after a kernel restart, this warning might be expected.")
    print("Continuing with the assumption that 'spawn' is the active start method.")

Successfully set PyTorch multiprocessing start method to 'spawn'.


In [2]:
from datasets import load_dataset

# Define the path to your dataset file
dataset_file = "poetry_dataset_batch.jsonl" # Make sure this is the correct name

# Load the dataset
try:
    raw_dataset = load_dataset('json', data_files=dataset_file, split='train')
    print("Dataset loaded successfully!")
    print(raw_dataset)
    # Let's look at the first example
    if len(raw_dataset) > 0:
        print("\nFirst example:")
        print(raw_dataset[0])
    else:
        print("The dataset is empty!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please ensure the file path is correct and the file is a valid JSONL.")

  from .autonotebook import tqdm as notebook_tqdm


Dataset loaded successfully!
Dataset({
    features: ['instruction_prompt', 'poem'],
    num_rows: 90
})

First example:
{'instruction_prompt': 'Write a limerick about a clumsy penguin.', 'poem': "A penguin named Percy, quite stout,\nTripped over his own feet, no doubt.\nHe tumbled and slid,\nThen covered in squid,\nAnd grumbled, 'I'll never go out!'"}


In [3]:
from transformers import AutoTokenizer

model_checkpoint = "state-spaces/mamba-130m-hf"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    print(f"\nTokenizer for {model_checkpoint} loaded successfully!")

    # Mamba models, like GPT, are causal LMs and typically don't have a PAD token by default.
    # If it's missing, we should set one. The EOS token is often a good choice for padding in Causal LMs.
    if tokenizer.pad_token is None:
        print("Tokenizer does not have a pad_token. Setting it to eos_token.")
        tokenizer.pad_token = tokenizer.eos_token
        # If eos_token is also None (very unlikely for pretrained models), you might need to add a special token.
        # For now, we assume eos_token exists.

    print(f"Tokenizer pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")
    print(f"Tokenizer EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")
    print(f"Tokenizer BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})") # Beginning of sequence

except Exception as e:
    print(f"Error loading tokenizer: {e}")


Tokenizer for state-spaces/mamba-130m-hf loaded successfully!
Tokenizer pad token: <|endoftext|> (ID: 0)
Tokenizer EOS token: <|endoftext|> (ID: 0)
Tokenizer BOS token: <|endoftext|> (ID: 0)


In [4]:
# Make sure 'tokenizer' and 'raw_dataset' are available from previous cells

# The Mamba-130m model (using GPT-NeoX tokenizer) typically has a max sequence length of 2048
TOKENIZER_MAX_LENGTH = 2048

def preprocess_function(examples):
    # 1. Combine prompt and poem.
    # We'll format it as: <prompt>\n<poem><eos_token>
    # The \n helps separate the prompt from the poem clearly.
    # The eos_token at the end is crucial for causal LMs.
    texts = [
        prompt + "\n" + poem + tokenizer.eos_token
        for prompt, poem in zip(examples['instruction_prompt'], examples['poem'])
    ]

    # 2. Tokenize the combined texts.
    # We truncate sequences longer than TOKENIZER_MAX_LENGTH.
    # Padding will be handled later by a data collator, so we set padding=False for now.
    model_inputs = tokenizer(
        texts,
        max_length=TOKENIZER_MAX_LENGTH,
        truncation=True,
        padding=True # Padding is typically handled by the DataCollator in the Trainer
    
    )

    # 3. Create labels for causal LM and mask prompt tokens.
    # Labels are usually the input_ids shifted by one. However, with Hugging Face's
    # DataCollatorForLanguageModeling (which we'll likely use if using Trainer),
    # we can just pass the input_ids as labels, and the collator handles the shifting.
    # We MUST mask the prompt part so the model's loss is only calculated on the poem part.
    
    processed_labels = []
    for i in range(len(examples['instruction_prompt'])):
        # Tokenize the prompt part (including the separator) to find its length
        # We don't add special tokens here as we only need its token length
        prompt_with_separator = examples['instruction_prompt'][i] + "\n"
        prompt_token_ids = tokenizer(prompt_with_separator, add_special_tokens=False)['input_ids']
        num_prompt_tokens = len(prompt_token_ids)

        # The 'labels' will be a copy of the 'input_ids' from the combined text
        current_input_ids = model_inputs['input_ids'][i]
        current_labels = list(current_input_ids) # Make a mutable copy

        # Mask the prompt tokens by setting their labels to -100
        # (This is the standard ignore_index in PyTorch's CrossEntropyLoss)
        for j in range(num_prompt_tokens):
            if j < len(current_labels): # Check bounds in case the prompt itself was truncated
                current_labels[j] = -100
        
        processed_labels.append(current_labels)

    model_inputs["labels"] = processed_labels
    return model_inputs

# Apply the preprocessing function to the entire dataset
# batched=True processes multiple elements of the dataset at once for speed.
# remove_columns removes the old 'instruction_prompt' and 'poem' columns as we now have tokenized versions.
try:
    tokenized_dataset = raw_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_dataset.column_names
    )
    print("\nDataset tokenized successfully!")
    print(tokenized_dataset)

    # Let's check an example from the tokenized dataset
    if len(tokenized_dataset) > 0:
        print("\nExample of a tokenized entry (first 70 tokens):")
        print(f"Input IDs: {tokenized_dataset[0]['input_ids'][:70]}...")
        # Decode for readability
        print(f"Decoded Input IDs: {tokenizer.decode(tokenized_dataset[0]['input_ids'][:70])}")
        
        print(f"\nLabels:    {tokenized_dataset[0]['labels'][:70]}...")
        # To show where masking happens, let's decode the part of the input that should NOT be -100 in labels
        # Find first label that is not -100
        first_poem_token_index = -1
        for idx, label_id in enumerate(tokenized_dataset[0]['labels']):
            if label_id != -100:
                first_poem_token_index = idx
                break
        
        if first_poem_token_index != -1:
            print(f"Decoded part of poem (from labels): {tokenizer.decode([l for l in tokenized_dataset[0]['labels'][first_poem_token_index:first_poem_token_index+50] if l != -100])}")
        else:
            print("Could not find start of poem in labels (all masked). This might indicate an issue or very short sequence.")

except Exception as e:
    print(f"\nError during tokenization: {e}")


Dataset tokenized successfully!
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 90
})

Example of a tokenized entry (first 70 tokens):
Input IDs: [10639, 247, 1579, 254, 781, 670, 247, 43331, 90, 42151, 19014, 15, 187, 34, 42151, 19014, 4907, 3545, 951, 13, 3240, 46469, 13, 187, 24490, 1882, 689, 521, 1211, 4669, 13, 642, 5545, 15, 187, 1328, 3034, 11046, 285, 22803, 13, 187, 5872, 6107, 275, 3896, 301, 13, 187, 1898, 650, 16630, 13, 686, 42, 1833, 1620, 564, 562, 11388, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]...
Decoded Input IDs: Write a limerick about a clumsy penguin.
A penguin named Percy, quite stout,
Tripped over his own feet, no doubt.
He tumbled and slid,
Then covered in squid,
And grumbled, 'I'll never go out!'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

Labels:    [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 34, 42151, 19014, 4907, 

In [5]:
# Ensure 'tokenized_dataset' is available from the previous cell

# Split the dataset into training and validation sets
# test_size=0.1 means 10% for validation, 90% for training
# shuffle=True is good practice to ensure random distribution
# seed ensures reproducibility of the split
try:
    processed_datasets = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    # Rename for clarity if you prefer train_dataset and eval_dataset
    train_dataset = processed_datasets['train']
    eval_dataset = processed_datasets['test'] # 'test' is the default key for the validation set

    print("\nDataset split into training and validation sets successfully!")
    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(eval_dataset)}")
    print("\nTraining dataset structure:")
    print(train_dataset)
    print("\nValidation dataset structure:")
    print(eval_dataset)
except Exception as e:
    print(f"\nError splitting dataset: {e}")


Dataset split into training and validation sets successfully!
Training dataset size: 81
Validation dataset size: 9

Training dataset structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 81
})

Validation dataset structure:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9
})


In [6]:
from transformers import AutoModelForCausalLM, AutoConfig
import torch # Import torch to check for available devices

model_checkpoint = "state-spaces/mamba-130m-hf"

try:
    # Load the model configuration to check some details if needed
    # config = AutoConfig.from_pretrained(model_checkpoint)
    # print("Model config loaded:", config)

    model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
    print(f"\nModel {model_checkpoint} loaded successfully!")

    # Check if the model is on GPU (it won't be yet, Trainer handles this)
    # print(f"Model is currently on: {next(model.parameters()).device}")

    # If you have multiple GPUs, PyTorch should be able to see them.
    # Trainer will handle distributing the model across them.
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")
    if num_gpus > 0:
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("No GPU found. Training will be on CPU (very slow).")

except Exception as e:
    print(f"Error loading model: {e}")


Model state-spaces/mamba-130m-hf loaded successfully!
Number of available GPUs: 3
Using GPU: NVIDIA GeForce RTX 3090


In [7]:
from transformers import DataCollatorForLanguageModeling

# The tokenizer was loaded in a previous cell. Make sure it's in scope.
# tokenizer.pad_token = tokenizer.eos_token # We already set this

# We are doing Causal Language Modeling (CLM), not Masked Language Modeling (MLM).
# For CLM, labels are usually input_ids shifted. The DataCollatorForLanguageModeling
# can do this shifting if labels are not provided or are identical to input_ids.
# Since we have already created our 'labels' column with prompt masking,
# the collator should use these existing labels.
# When mlm=False, it prepares data for causal LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print("\nData collator for CLM initialized successfully.")


Data collator for CLM initialized successfully.


In [8]:
# New Cell (let's call it Cell E - The Training Function Definition)
from transformers import TrainingArguments, Trainer, IntervalStrategy # Ensure all needed imports are here
import torch
import math
import os

def run_my_training():
    print("Starting run_my_training()...")
    # Ensure global variables model, tokenizer, train_dataset, eval_dataset, data_collator are accessible
    # If not, you would pass them as arguments to this function.
    # For notebook_launcher, they are typically accessible if defined in notebook's global scope before launch.

    # --- Your existing TrainingArguments definition (Cell E content) ---
    output_directory = "./mamba_poet_finetuned_accelerate" # Use a new directory for this run
    os.makedirs(output_directory, exist_ok=True)

    per_gpu_batch_size = 2
    # num_gpus will be handled by accelerate, effectively making per_device_train_batch_size apply per process
    # For calculation here, let's use torch.cuda.device_count() as an estimate for logging effective batch size
    # but accelerate will manage the actual distribution.
    num_gpus_estimate = torch.cuda.device_count() if torch.cuda.is_available() else 1 
    gradient_accumulation_steps = 4
    num_epochs = 10

    # Calculate steps_per_epoch for save_steps
    # This calculation might be slightly different under accelerate's DDP environment
    # as len(train_dataset) might reflect the sharded dataset per process.
    # However, for setting save_steps based on an estimate, this is okay.
    # The Trainer internally handles steps correctly.
    if train_dataset is not None: # Check if train_dataset is accessible
        num_samples_in_train_dataset = len(train_dataset) # This will be full length before sharding
        
        # When using DDP, each process gets a shard of the data.
        # The number of steps per epoch per process will be based on len(train_dataset_shard).
        # Trainer handles this calculation for its internal loop.
        # If we want to save roughly every epoch, we can estimate based on total data.
        # True effective batch size for optimizer step: per_gpu_batch_size * num_processes * grad_acc_steps
        # Let's assume num_processes will be num_gpus_estimate for this calculation:
        total_effective_batch_size = per_gpu_batch_size * num_gpus_estimate * gradient_accumulation_steps
        if total_effective_batch_size > 0:
            steps_per_epoch = math.ceil(num_samples_in_train_dataset / total_effective_batch_size)
        else:
            steps_per_epoch = 100 # Fallback
        print(f"Estimated steps_per_epoch for save_steps: {steps_per_epoch} (based on total data and {num_gpus_estimate} GPUs)")
    else:
        print("Warning: train_dataset not accessible for steps_per_epoch calculation.")
        steps_per_epoch = 100 # Fallback

    args = TrainingArguments(
        output_dir=output_directory,
        per_device_train_batch_size=per_gpu_batch_size,
        per_device_eval_batch_size=per_gpu_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=num_epochs,
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=50,
        logging_dir=f"{output_directory}/logs",
        logging_strategy=IntervalStrategy.STEPS, # Or "steps"
        logging_steps=10,
        eval_strategy=IntervalStrategy.EPOCH,   # Use the name that worked for you
        save_strategy=IntervalStrategy.EPOCH,   # Or save_steps=steps_per_epoch
        # save_steps=steps_per_epoch, # If save_strategy="epoch" is problematic
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        fp16=True,
        report_to="tensorboard",
        # No pad_to_multiple_of, no eval_use_cache, as these failed __init__
    )
    print("TrainingArguments defined inside run_my_training.")

    # --- Your existing Trainer initialization and train call (Cell F content) ---
    trainer = Trainer(
        model=model, # Assumes 'model' is global or passed
        args=args,
        train_dataset=train_dataset, # Assumes 'train_dataset' is global or passed
        eval_dataset=eval_dataset,   # Assumes 'eval_dataset' is global or passed
        tokenizer=tokenizer,       # Assumes 'tokenizer' is global or passed
        data_collator=data_collator, # Assumes 'data_collator' is global or passed
    )
    print("Trainer initialized inside run_my_training.")

    print("Starting fine-tuning via trainer.train()...")
    train_result = trainer.train()
    print("Fine-tuning completed!")

    # Saving metrics and model (only on the main process in a distributed setup)
    if trainer.is_world_process_zero():
        trainer.log_metrics("train", train_result.metrics)
        trainer.save_metrics("train", train_result.metrics)
        final_model_path = f"{output_directory}/final_best_model"
        trainer.save_model(final_model_path)
        print(f"Best model saved to {final_model_path} by process zero.")
    
    print("Exiting run_my_training().")
# --- End of run_my_training function ---

In [9]:
# New Cell (let's call it Cell G - The Launcher)
from accelerate import notebook_launcher
import torch

# Ensure model, tokenizer, train_dataset, eval_dataset, data_collator 
# are defined in the global scope by running Cells A-D before this.

num_processes = torch.cuda.device_count() # Use all available GPUs
if num_processes == 0:
    print("No GPUs found. Training on CPU is not recommended for this error.")
    # run_my_training() # You could run it on CPU, but the error was multi-GPU related
elif train_dataset is None or eval_dataset is None or model is None or tokenizer is None or data_collator is None:
    print("One or more required variables (train_dataset, eval_dataset, model, tokenizer, data_collator) are not defined.")
    print("Please ensure cells A-D have been run successfully.")
else:
    print(f"Launching training on {num_processes} GPUs using accelerate.notebook_launcher...")
    # notebook_launcher will make global variables (model, tokenizer, datasets, collator)
    # available to each spawned process.
    notebook_launcher(run_my_training, num_processes=num_processes).set_start_method('spawn', force=True)

Launching training on 3 GPUs using accelerate.notebook_launcher...
Launching training on 3 GPUs.
Starting run_my_training()...
Starting run_my_training()...Estimated steps_per_epoch for save_steps: 4 (based on total data and 3 GPUs)

Estimated steps_per_epoch for save_steps: 4 (based on total data and 3 GPUs)
Starting run_my_training()...
Estimated steps_per_epoch for save_steps: 4 (based on total data and 3 GPUs)


E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/api.py:732] failed (exitcode: 1) local_rank: 0 (pid: 2678428) of fn: run_my_training (start_method: fork)
E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/api.py:732] Traceback (most recent call last):
E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/api.py:732]   File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 687, in _poll
E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/api.py:732]     self._pc.join(-1)
E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/api.py:732]   File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 203, in join
E0517 01:08:28.943432 2678329 site-packages/torch/distributed/elastic/multiprocessing/

ChildFailedError: 
============================================================
run_my_training FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-05-17_01:08:28
  host      : ICT.Server
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 2678428)
  error_file: /tmp/torchelastic_lk7hqsor/none_xqycyb3f/attempt_0/0/error.json
  traceback : Traceback (most recent call last):
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_2678329/3303533162.py", line 49, in run_my_training
      args = TrainingArguments(
    File "<string>", line 132, in __init__
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/training_args.py", line 1761, in __post_init__
      self.device
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/training_args.py", line 2297, in device
      return self._setup_devices
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/utils/generic.py", line 67, in __get__
      cached = self.fget(obj)
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/training_args.py", line 2224, in _setup_devices
      self.distributed_state = PartialState(**accelerator_state_kwargs)
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/accelerate/state.py", line 299, in __init__
      self.set_device()
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/accelerate/state.py", line 839, in set_device
      device_module.set_device(self.device)
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/torch/cuda/__init__.py", line 478, in set_device
      torch._C._cuda_setDevice(device)
    File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/torch/cuda/__init__.py", line 305, in _lazy_init
      raise RuntimeError(
  RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
  
============================================================

In [7]:
from transformers import TrainingArguments, IntervalStrategy # Make sure IntervalStrategy is added here
import torch # Ensure torch is imported if num_gpus is used below

# Define a directory to save checkpoints and logs
# Using KST in the path as per your location, adjust if needed.
# Path should be accessible by your user.
output_directory = "./mamba_poet_finetuned_kst" 

# Batch size per GPU. With 3 RTX 3090s (24GB VRAM each), you can try a small batch size first.
# Mamba-130M is relatively small, so you might be able to increase this.
# Effective batch size will be (per_device_train_batch_size * num_gpus * gradient_accumulation_steps)
per_gpu_batch_size = 2 # Start with a small batch size per GPU

# Gradient accumulation can help simulate a larger batch size if GPU memory is a constraint.
# For example, if effective_batch_size = 16 and you have 2 GPUs and per_gpu_batch_size=2,
# then 16 / (2 * 2) = 4 gradient_accumulation_steps.
# Let's aim for an effective batch size of around 16 or 32.
# With 3 GPUs and per_gpu_batch_size=2, current physical batch is 3*2=6.
# To get to effective_batch_size 24, gradient_accumulation_steps = 24/6 = 4.
gradient_accumulation_steps = 4

# Number of training epochs. With a small dataset, you might train for more epochs,
# but watch validation loss closely for overfitting.
num_epochs = 10 # Start with 10, adjust based on validation performance

num_gpus = torch.cuda.device_count()

args = TrainingArguments(
    output_dir=output_directory,
    per_device_train_batch_size=per_gpu_batch_size,
    per_device_eval_batch_size=per_gpu_batch_size, # Can be larger if no grads
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_epochs,
    learning_rate=5e-5,  # Common starting point for fine-tuning
    weight_decay=0.01,   # Regularization
    warmup_steps=50,     # Number of steps for learning rate warmup

    logging_dir=f"{output_directory}/logs", # Directory for logs
    logging_strategy="steps",
    logging_steps=10,      # Log training loss every 10 steps

    

    # Use the Enum for these strategies
    eval_strategy=IntervalStrategy.EPOCH, 
    save_strategy=IntervalStrategy.EPOCH,      # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model found during training at the end
    metric_for_best_model="loss",# Use validation loss to determine the best model
    greater_is_better=False,    # For loss, lower is better

    fp16=True,  # Enable mixed-precision training (RTX 3090 supports this well)
                # Set to False if you encounter issues, or try bf16 if your setup supports it (unlikely for 3090)

    report_to="tensorboard", # Or "wandb" if you have it configured
    
    

    # The Trainer should automatically use all available GPUs.
    # If you face issues with multi-GPU, you might need 'torchrun' or 'accelerate launch'
    # but usually Trainer handles it if PyTorch sees the GPUs.
    # ddp_find_unused_parameters=False, # May be needed for some models with DDP
)
print("\nTrainingArguments defined successfully.")
print(f"Effective training batch size: {per_gpu_batch_size * num_gpus * gradient_accumulation_steps}")

TypeError: __init__() got an unexpected keyword argument 'use_cache'

In [8]:
from transformers import Trainer

# Ensure all necessary components are defined from previous cells:
# model
# args (TrainingArguments)
# train_dataset
# eval_dataset
# tokenizer
# data_collator

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start fine-tuning
print("\nStarting fine-tuning...")
try:
    train_result = trainer.train()
    print("Fine-tuning completed!")

    # Save training metrics
    trainer.log_metrics("train", train_result.metrics)
    trainer.save_metrics("train", train_result.metrics)

    # Save the final model
    # If load_best_model_at_end=True, the trainer.model is already the best model.
    # We can save it to the output_dir or a specific subdirectory.
    final_model_path = f"{output_directory}/final_best_model"
    trainer.save_model(final_model_path)
    print(f"Best model saved to {final_model_path}")

    # You can also save the tokenizer explicitly with the model if needed,
    # though it's often saved by trainer.save_model() if passed to Trainer.
    # tokenizer.save_pretrained(final_model_path)

    # Evaluate the best model again on the eval set (optional, as it's done during training)
    # print("\nEvaluating the best model on the validation set...")
    # eval_metrics = trainer.evaluate()
    # trainer.log_metrics("eval", eval_metrics)
    # trainer.save_metrics("eval", eval_metrics)

except Exception as e:
    print(f"\nAn error occurred during training: {e}")
    import traceback
    traceback.print_exc()

  trainer = Trainer(



Starting fine-tuning...




Epoch,Training Loss,Validation Loss



An error occurred during training: 'MambaCache' object is not iterable


Traceback (most recent call last):
  File "/tmp/ipykernel_2634811/2116376664.py", line 24, in <module>
    train_result = trainer.train()
  File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/trainer.py", line 2661, in _inner_training_loop
    self._maybe_log_save_evaluate(
  File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/trainer.py", line 3096, in _maybe_log_save_evaluate
    metrics = self._evaluate(trial, ignore_keys_for_eval)
  File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/trainer.py", line 3045, in _evaluate
    metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
  File "/mnt/Data/s15_anaconda3/envs/mamba_poet/lib/python3.9/site-packages/transformers/trainer.py", line 4154, in evaluate
    output = eval_loop(


In [9]:
from transformers import TrainingArguments
import inspect

try:
    sig = inspect.signature(TrainingArguments.__init__)
    print("Signature for TrainingArguments.__init__:")
    print(sig)
    
    # Specifically check if 'pad_to_multiple_of' is a parameter
    if 'pad_to_multiple_of' in sig.parameters:
        print("\n'pad_to_multiple_of' IS a recognized parameter.")
    else:
        print("\n'pad_to_multiple_of' IS NOT a recognized parameter for TrainingArguments in your environment.")
        
except Exception as e:
    print(f"Error inspecting TrainingArguments: {e}")
    print("This might indicate a deeper issue with the transformers installation or the class itself.")

Signature for TrainingArguments.__init__:

'pad_to_multiple_of' IS NOT a recognized parameter for TrainingArguments in your environment.


In [10]:
import accelerate
print(accelerate.__version__)

1.7.0
