In [None]:
# 1. Install causal-conv1d (Required for Mamba)
!pip install "causal-conv1d>=1.4.0"

# 2. Install mamba-ssm (The core optimized kernels)
!pip install mamba-ssm --no-build-isolation

In [None]:
# @title 1. Setup & Install Dependencies
#!pip install -q torch transformers peft datasets bitsandbytes trl accelerate
!pip install -q torch transformers peft datasets bitsandbytes trl accelerate

import json
import requests
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer

In [None]:
# @title 1. SETUP: Install & Apply Robust Stability Injection
import os
import torch
import sys

# 1. Install Official Kernels
# Run pip through the kernel's own interpreter so the packages land in the
# environment this notebook imports from, and report a failed install instead
# of silently continuing.
import subprocess

_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "causal-conv1d>=1.4.0", "mamba-ssm>=2.0.0", "--no-build-isolation",
    ],
    check=False,
)
if _pip_result.returncode != 0:
    print(f"pip exited with status {_pip_result.returncode}; the kernel import below will likely fail.")

# 2. Import Dependencies
try:
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn
except ImportError as exc:
    # The patched forward pass defined later in this cell calls mamba_inner_fn.
    # Continuing without it would only defer the failure to a NameError in the
    # middle of training, so stop here with an actionable message.
    raise ImportError("Installing mamba-ssm failed? Restarting might help.") from exc

# 3. Global Stability
# Turn TF32 off for CUDA matmuls and cuDNN ops.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# ==============================================================================
# THE FIX: Robust Forward Pass (Handles None Biases & Shape Mismatches)
# ==============================================================================
from transformers.models.falcon_mamba import modeling_falcon_mamba

def _falcon_rms_kwargs(mixer, kernel):
    """Build the B/C/dt RMS-norm keyword arguments for the fused kernel, if usable.

    Returns an empty dict (leaving the kernel call unchanged) unless BOTH hold:
      * ``kernel`` declares the ``b_rms_weight`` / ``c_rms_weight`` /
        ``dt_rms_weight`` / ``b_c_dt_rms_eps`` parameters, and
      * ``mixer`` exposes ``b_c_rms``, ``dt_rms`` and ``rms_eps``.

    NOTE(review): the attribute and parameter names follow the upstream
    FalconMamba mixer / mamba-ssm kernel as I understand them — confirm against
    the installed versions.
    """
    import inspect

    wanted = ("b_rms_weight", "c_rms_weight", "dt_rms_weight", "b_c_dt_rms_eps")
    try:
        accepted = inspect.signature(kernel).parameters
    except (TypeError, ValueError):
        return {}
    if not all(name in accepted for name in wanted):
        return {}

    b_c_rms = getattr(mixer, "b_c_rms", None)
    dt_rms = getattr(mixer, "dt_rms", None)
    rms_eps = getattr(mixer, "rms_eps", None)
    if b_c_rms is None or dt_rms is None or rms_eps is None:
        return {}

    return {
        "b_rms_weight": b_c_rms,
        "c_rms_weight": b_c_rms,
        "dt_rms_weight": dt_rms,
        "b_c_dt_rms_eps": rms_eps,
    }


def robust_stable_forward(self, hidden_states, cache_params=None, cache_position=None, attention_mask=None):
    """
    Forward pass that bridges the Transformers FalconMamba mixer to the fused
    ``mamba_inner_fn`` kernel while tolerating missing (None) biases.

    Args:
        hidden_states: Mixer input; the original author documents it as
            (Batch, SeqLen, Dim).
        cache_params: Recurrent-state cache supplied during cached generation.
            The fused kernel neither reads nor updates it, so when a cache is
            given the call is delegated to ``self.slow_forward`` instead.
        cache_position: Forwarded to ``self.slow_forward`` on the cached path.
        attention_mask: Forwarded to ``self.slow_forward`` on the cached path;
            not used on the fused path.

    Returns:
        The mixer output; the original author documents it as (Batch, SeqLen, Dim).

    NOTE(review): ``x_proj``, ``dt_proj`` and ``out_proj`` are handed to the
    kernel as raw ``.weight`` tensors rather than being called as modules, so
    adapter layers wrapped around those modules are presumably bypassed on this
    path — verify before relying on LoRA for them.
    """
    # 0. Cached (incremental) decoding
    # The fused kernel below processes whatever tokens it is given from a blank
    # state. With a cache, generation feeds one new token per step, so using
    # the kernel would discard all previous context. Defer to the mixer's
    # reference implementation, which maintains the cache.
    if cache_params is not None:
        return self.slow_forward(
            hidden_states,
            cache_params=cache_params,
            cache_position=cache_position,
            attention_mask=attention_mask,
        )

    # 1. Input Prep
    # Transformers Input: (Batch, SeqLen, Dim)
    # Kernel Expects: (Batch, Dim, SeqLen) - created via transpose
    # 'in_proj' is applied here because FalconMamba fuses input/gate projections
    projected_states = self.in_proj(hidden_states).transpose(1, 2).contiguous()

    # 2. Safe Weight Access (Handle None Biases)
    conv_bias = self.conv1d.bias.contiguous() if self.conv1d.bias is not None else None
    out_proj_bias = self.out_proj.bias.contiguous() if self.out_proj.bias is not None else None
    dt_bias = self.dt_proj.bias.float().contiguous() if self.dt_proj.bias is not None else None

    # 3. Call Official Kernel
    # Note: mamba_inner_fn v2 handles the output projection internally and returns (B, SeqLen, Dim)
    # The RMS kwargs are only added when both the mixer and the installed kernel
    # support them; otherwise the call is exactly the plain Mamba one.
    out = mamba_inner_fn(
        projected_states,
        self.conv1d.weight.contiguous(),
        conv_bias,
        self.x_proj.weight.contiguous(),
        self.dt_proj.weight.contiguous(),
        self.out_proj.weight.contiguous(),
        out_proj_bias,
        -torch.exp(self.A_log.float()).contiguous(),
        None,  # B (computed internally)
        None,  # C (computed internally)
        self.D.float().contiguous(),
        dt_bias,
        None,  # B_proj_bias
        None,  # C_proj_bias
        delta_softplus=True,
        **_falcon_rms_kwargs(self, mamba_inner_fn),
    )

    # 4. Output is already (Batch, SeqLen, Dim), NO TRANSPOSE NEEDED
    return out

# Class-wide patch: every FalconMambaMixer now routes its fused-kernel path
# through robust_stable_forward.
setattr(modeling_falcon_mamba.FalconMambaMixer, "cuda_kernels_forward", robust_stable_forward)
print("‚úÖ Setup Complete: FalconMamba patched with Robust Production Kernel.")

In [None]:
# @title 2. TRAIN: Final Stable Loop
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import requests

# --- Config ---
# Checkpoint to fine-tune and the batch geometry shared by every training run below.
model_id = "tiiuae/Falcon3-Mamba-7B-Base"
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION = 8  # 4 * 8 = 32

# --- Load ---
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Base model in bfloat16, with device placement delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# LoRA adapter configuration (rank 8, alpha 16) targeting the mixer's projections.
# NOTE(review): the patched forward pass passes `x_proj.weight` and `dt_proj.weight`
# straight to the fused kernel instead of calling the modules — confirm that adapters
# on those two targets actually take part in the forward pass and receive gradients.
peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1, bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["in_proj", "x_proj", "dt_proj"]
)
model = get_peft_model(model, peft_config)

# Shared SFT configuration; later cells reuse this object and only change output_dir.
training_args = SFTConfig(
    output_dir="./falcon_mamba_ssr",
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    learning_rate=2e-4,
    num_train_epochs=3,
    bf16=True,
    logging_steps=10,
    save_strategy="no",
    dataset_text_field="text",
    packing=False,
    gradient_checkpointing=False
)
# Set as a plain attribute after construction rather than as a constructor argument.
# NOTE(review): confirm the installed TRL version reads `max_seq_length` from the
# config; if it uses a different field name this assignment has no effect.
training_args.max_seq_length = 1024

# --- Run ---
# Task name -> Natural Instructions task file. Only the first task is listed
# here; it is the one the training loop in this cell iterates over.
TASK_URLS = dict(
    Task1_QA="https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task024_cosmosqa_answer_generation.json",
)
def load_task_data(name, url, tokenizer):
    """Download one Natural Instructions task file and build an SFT training dataset.

    Args:
        name: Task label. Not used by the function; kept for the callers' signature.
        url: Location of the task JSON, which must contain "Definition" and "Instances".
        tokenizer: Not used by the function; kept for the callers' signature.

    Returns:
        A `Dataset` with a single "text" column holding one formatted
        "Definition / Input / Output" example for each of the first 1000 instances.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # A timeout stops a stalled connection from hanging the notebook forever, and
    # raise_for_status() reports HTTP errors directly instead of letting them
    # surface as a confusing JSON decoding failure.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()

    definition = data["Definition"][0]
    formatted = [
        {"text": f"Definition: {definition}\n\nInput: {inst['input']}\n\nOutput: {inst['output'][0]}"}
        for inst in data["Instances"][:1000]  # first 1000 instances form the training split
    ]
    return Dataset.from_list(formatted)

print(f"üöÄ Starting Training | Batch: {MICRO_BATCH_SIZE} | Method: Robust Kernels")
for task_name, url in TASK_URLS.items():
    print(f"Training {task_name}...")
    dataset = load_task_data(task_name, url, tokenizer)
    trainer = SFTTrainer(model=model, train_dataset=dataset, args=training_args, processing_class=tokenizer)
    trainer.train()
    model.save_pretrained(f"./adapters/{task_name}")
    print(f"‚úÖ Saved {task_name}")

In [None]:
# @title 3. Continue Sequential Training (Tasks 2 & 3)
# We continue using the SAME 'model' object from the previous cell.
# It currently holds "Base + Task 1". We will now teach it Task 2, then Task 3.

# (task name, task file URL) pairs, in the order they are trained after Task 1.
remaining_tasks = list({
    "Task2_QG": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task074_squad1.1_question_generation.json",
    "Task3_SA": "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task1312_amazonreview_polarity_classification.json",
}.items())

print(f"üìâ Starting Continuation of Sequential Training (Lower Bound)...")

for task_name, url in remaining_tasks:
    print(f"\n>>> Training on {task_name} (Sequential Step)...")

    # 1. Load new task data
    dataset = load_task_data(task_name, url, tokenizer)

    # 2. Update output dir to prevent overwriting logs
    training_args.output_dir = f"./falcon_mamba_seq_ft/{task_name}"

    # 3. Train on the *current* state of the model
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        processing_class=tokenizer
    )

    trainer.train()

    # 4. Save the result (This adapter now contains Task1 -> Task2 -> [Current])
    save_path = f"./adapters/{task_name}"
    model.save_pretrained(save_path)
    print(f"‚úÖ Finished {task_name}. Weights updated.")

print("\n=== üìâ Sequential Lower Bound Experiment Complete ===")
print("The model currently in memory represents the 'Forgetting' baseline.")

In [None]:
# @title 4. Multi-Task Learning (MTL) Experiment
import gc
from datasets import concatenate_datasets

# 1. CLEANUP: drop the sequential-experiment objects so their VRAM can be reclaimed
del model, trainer
gc.collect()
torch.cuda.empty_cache()
print("üßπ Memory cleared for MTL Experiment.")

# 2. RELOAD: start again from the untouched base checkpoint
print("üîÑ Reloading Fresh FalconMamba Base Model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)

# 3. RE-APPLY: Stability Patch, then LoRA
# Re-assert the class-level forward patch before wrapping the fresh model.
from transformers.models.falcon_mamba import modeling_falcon_mamba
setattr(modeling_falcon_mamba.FalconMambaMixer, "cuda_kernels_forward", robust_stable_forward)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 4. DATA: Load and Combine All Tasks
print("Pmixing Datasets for Multi-Task Learning...")

# TASK_URLS as defined in the training cell lists only Task 1; Tasks 2 and 3 are
# held in `remaining_tasks`. Merge both so the multi-task run really sees every
# task. Keying by task name keeps each task exactly once even if TASK_URLS has
# already been extended to all three tasks.
mtl_task_urls = dict(TASK_URLS)
mtl_task_urls.update(dict(remaining_tasks))

datasets_list = []
for task_name, url in mtl_task_urls.items():
    ds = load_task_data(task_name, url, tokenizer)
    # Optional: Add a column to track task source if needed, skipping for now
    datasets_list.append(ds)

# Concatenate and Shuffle (fixed seed so the example order is reproducible)
mtl_dataset = concatenate_datasets(datasets_list).shuffle(seed=42)
print(f"üìä Combined Dataset Size: {len(mtl_dataset)} samples")

# 5. TRAIN: Multi-Task
# Train once on the shuffled multi-task mixture, starting from the freshly reloaded model.
print("üìà Starting Multi-Task Training (Upper Bound)...")
# Redirect outputs of the shared config object to a directory for this experiment.
training_args.output_dir = "./falcon_mamba_mtl"

trainer = SFTTrainer(
    model=model,
    train_dataset=mtl_dataset,
    args=training_args, # Re-using the stable Batch 4 / Accum 8 config
    processing_class=tokenizer
)

trainer.train()

# 6. Save
# This adapter directory is the one the evaluation cell loads as the upper bound.
model.save_pretrained("./adapters/MTL_UpperBound")
print("\n=== üìà Multi-Task Upper Bound Experiment Complete ===")

In [None]:
!pip install -q evaluate rouge_score absl-py

In [None]:
import evaluate
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

# 1. Setup Metric
# Module-level ROUGE metric object; evaluate_model() below calls rouge.compute() on it.
rouge = evaluate.load("rouge")

# 2. Define Test Data Loader
# The paper used 500 instances for evaluation.
# Indices 0:1000 were used for training, so indices 1000:1100 are used here for a quick benchmark.
def load_test_data(url, tokenizer, num_samples=100):
    """Download a task file and build held-out evaluation prompts and references.

    Args:
        url: Location of the task JSON, which must contain "Definition" and "Instances".
        tokenizer: Not used by the function; kept for the callers' signature.
        num_samples: Number of instances to take, starting at index 1000.

    Returns:
        A `(prompts, references)` pair of equal-length lists. Each prompt ends
        with "Output:" so the model is expected to continue with the answer;
        each reference is the first listed output of the instance.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # A timeout stops a stalled connection from hanging the notebook forever, and
    # raise_for_status() reports HTTP errors directly instead of letting them
    # surface as a confusing JSON decoding failure.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()
    definition = data["Definition"][0]

    # Grab data AFTER the training split (training takes the first 1000 instances)
    raw_instances = data["Instances"][1000 : 1000 + num_samples]

    # Format matching training, minus the answer
    formatted_data = [
        f"Definition: {definition}\n\nInput: {inst['input']}\n\nOutput:"
        for inst in raw_instances
    ]
    references = [inst["output"][0] for inst in raw_instances]

    return formatted_data, references

# 3. Evaluation Loop
def evaluate_model(model, tokenizer, task_name, url):
    """Generate answers for one task's held-out prompts and score them with ROUGE-L.

    Args:
        model: Causal language model exposing `.eval()`, `.device` and `.generate()`.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        task_name: Label used in progress output only.
        url: Location of the task JSON passed to `load_test_data`.

    Returns:
        The ROUGE-L score scaled to the 0-100 range.
    """
    print(f"üìä Evaluating {task_name}...")
    prompts, references = load_test_data(url, tokenizer)

    predictions = []
    model.eval()

    # Inference Loop
    # Using batch size 1 for safety with Mamba generation, can be increased
    for prompt in tqdm(prompts, desc=f"Generating {task_name}"):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            # Max new tokens 128 is sufficient for these tasks (the paper used 512 output max)
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # The generated sequence starts with the prompt tokens. Drop them by
        # position rather than by searching the decoded string: decoding does not
        # always reproduce the prompt text exactly, and a failed string match
        # would leave the whole prompt inside the prediction and skew ROUGE.
        completion_ids = outputs[0][prompt_len:]
        generated = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

        predictions.append(generated)

    # Compute ROUGE-L
    results = rouge.compute(predictions=predictions, references=references)
    return results['rougeL'] * 100  # Scale to 0-100 matching paper tables

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from tqdm import tqdm
import evaluate
import requests

# --- 1. Setup & Configuration ---
model_id = "tiiuae/Falcon3-Mamba-7B-Base"
# All three tasks, in training order; run_evals() scores every entry.
TASK_URLS = dict([
    ("Task1_QA", "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task024_cosmosqa_answer_generation.json"),
    ("Task2_QG", "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task074_squad1.1_question_generation.json"),
    ("Task3_SA", "https://raw.githubusercontent.com/allenai/natural-instructions/master/tasks/task1312_amazonreview_polarity_classification.json"),
])

# Install metrics if missing
# NOTE(review): `import evaluate` already runs unconditionally in this cell's
# import block above, so a missing package would raise there before this
# fallback is ever reached.
try:
    import evaluate
except ImportError:
    !pip install -q evaluate rouge_score absl-py
    import evaluate

# Module-level ROUGE metric object; run_evals() below calls rouge.compute() on it.
rouge = evaluate.load("rouge")

# --- 2. Helper Functions ---

def load_test_data(url, num_samples=100):
    """Loads held-out test data (indices 1000:1100 by default) to avoid training contamination.

    Args:
        url: Location of the task JSON, which must contain "Definition" and "Instances".
        num_samples: Number of instances to take, starting at index 1000.

    Returns:
        A `(prompts, references)` pair of equal-length lists. Each prompt ends
        with "Output:"; each reference is the first listed output of the instance.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # A timeout stops a stalled connection from hanging the notebook forever, and
    # raise_for_status() reports HTTP errors directly instead of letting them
    # surface as a confusing JSON decoding failure.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    data = response.json()

    definition = data["Definition"][0]
    raw_instances = data["Instances"][1000 : 1000 + num_samples]

    prompts = [f"Definition: {definition}\n\nInput: {i['input']}\n\nOutput:" for i in raw_instances]
    references = [i['output'][0] for i in raw_instances]
    return prompts, references

def run_evals(model, tokenizer, desc):
    """Runs ROUGE evaluation on every task in TASK_URLS for the loaded model.

    Args:
        model: Causal language model exposing `.eval()`, `.device` and `.generate()`.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        desc: Label printed in the progress header.

    Returns:
        A dict mapping each task name to its ROUGE-L score scaled to 0-100.
    """
    print(f"\nüìä Evaluating: {desc}")
    results = {}
    model.eval()

    for task_name, url in TASK_URLS.items():
        prompts, references = load_test_data(url)
        predictions = []

        # Inference Loop
        print(f"   Running {task_name}...")
        for prompt in tqdm(prompts, leave=False):
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            prompt_len = inputs["input_ids"].shape[1]
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=64, # Short outputs sufficient for these tasks
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                    use_cache=True
                )
            # The generated sequence starts with the prompt tokens. Drop them by
            # position rather than by searching the decoded string: decoding does
            # not always reproduce the prompt text exactly, and a failed string
            # match would leave the whole prompt inside the prediction.
            completion_ids = outputs[0][prompt_len:]
            generated = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
            predictions.append(generated)

        # Calculate Score
        scores = rouge.compute(predictions=predictions, references=references)
        results[task_name] = scores['rougeL'] * 100

    return results

# --- 3. Load Base Model ---
# One base model is loaded here and then wrapped by each adapter in turn below.
print("üîÑ Loading Base FalconMamba...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token, as in the training cell.
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True
)

# --- 4. Evaluate Sequential Model (Lower Bound) ---
# We load 'Task3_SA' because it is the end of the chain: Task1 -> Task2 -> Task3
print("\n‚¨áÔ∏è LOADING SEQUENTIAL ADAPTER (Lower Bound)")
model_seq = PeftModel.from_pretrained(base_model, "./adapters/Task3_SA", adapter_name="sequential")

seq_scores = run_evals(model_seq, tokenizer, "Sequential (Task 1->2->3)")

# Unload to free memory for next adapter
# NOTE(review): the return value of unload() is discarded; this presumably relies on
# the adapter layers being removed from `base_model` in place so that it is clean
# when the next adapter is attached — confirm against the installed PEFT version.
model_seq.unload()
del model_seq

# --- 5. Evaluate Multi-Task Model (Upper Bound) ---
print("\n‚¨ÜÔ∏è LOADING MULTI-TASK ADAPTER (Upper Bound)")
# Note: If you named your saved folder differently, update the path below
model_mtl = PeftModel.from_pretrained(base_model, "./adapters/MTL_UpperBound", adapter_name="mtl")

mtl_scores = run_evals(model_mtl, tokenizer, "Multi-Task Learning")

# --- 6. Final Report ---
# Table of per-task ROUGE-L for both adapters; "gap" is multi-task minus sequential.
print("\n" + "="*65)
print(f"{'Task':<15} | {'Sequential':<12} | {'Multi-Task':<12} | {'Forgetting Gap':<15}")
print("-" * 65)

# Running totals, turned into means once every row has been printed.
seq_avg = mtl_avg = 0

for task in TASK_URLS:
    s = seq_scores.get(task, 0)
    m = mtl_scores.get(task, 0)
    gap = m - s
    seq_avg, mtl_avg = seq_avg + s, mtl_avg + m
    print(f"{task:<15} | {s:>10.2f}   | {m:>10.2f}   | {gap:>10.2f}")

seq_avg, mtl_avg = seq_avg / len(TASK_URLS), mtl_avg / len(TASK_URLS)

print("-" * 65)
print(f"{'AVERAGE':<15} | {seq_avg:>10.2f}   | {mtl_avg:>10.2f}   | {mtl_avg - seq_avg:>10.2f}")
print("="*65)