# Poisoned Goat Experiment Pipeline

This notebook orchestrates the full experiment:
1. Generate contaminated addition datasets with different contamination rates (10%, 50%, 100%)
2. Fine-tune tiedong/goat-lora-7b on each contaminated dataset
3. Evaluate each fine-tuned model (and the baseline) on the BIG-bench arithmetic dataset

## Colab Setup

If running on Google Colab, make sure to:
1. Enable GPU runtime (Runtime → Change runtime type → GPU)
2. Install dependencies (run the setup cell below)
3. Authenticate with HuggingFace if needed: `huggingface-cli login`

**Note:** The code uses FP16 (half precision) training, which doesn't require bitsandbytes. This avoids CUDA setup issues with bitsandbytes.


In [None]:
import json
import os
import random
import subprocess
import sys
from pathlib import Path

# Colab setup - uncomment the install line below if running on Google Colab.
# bitsandbytes is NOT required because training uses FP16 instead of 8-bit.
# Prefer %pip over !pip so the packages are installed into this kernel's environment.
# %pip install -q transformers datasets peft accelerate fire tqdm

# IMPORTANT: finetune.py has been updated to use FP16 (half precision) instead of 8-bit.
# This avoids the bitsandbytes CUDA setup issues; FP16 training works without it.

print("Imports successful!")


In [None]:
# Configuration
# Base model candidates, in order of preference:
#   1. huggyllama/llama-7b            (usually more accessible)
#   2. decapoda-research/llama-7b-hf  (original, might need auth)
BASE_MODEL = "huggyllama/llama-7b"  # Change to "decapoda-research/llama-7b-hf" if needed
INITIAL_LORA_WEIGHTS = "tiedong/goat-lora-7b"  # Starting point: pre-trained goat model

# Experiment parameters
CONTAMINATION_RATES = [0.1, 0.5, 1.0]  # 10%, 50%, 100%
CONTAMINATION_TYPE = "random"  # Type of contamination: "random", "random_same_digit", "swap_digits"

# Reproducibility: dataset generation and contamination draw from the global
# `random` module, so seed it once here. Restart & Run All then regenerates
# the same datasets.
SEED = 42
random.seed(SEED)

# Paths
OUTPUT_DIR = "./experiment_outputs"
DATASET_DIR = "./contaminated_datasets"
WEIGHTS_DIR = "./weights"
RESULTS_DIR = "./results"

# Create directories (exist_ok makes re-running this cell safe)
for dir_path in (OUTPUT_DIR, DATASET_DIR, WEIGHTS_DIR, RESULTS_DIR):
    os.makedirs(dir_path, exist_ok=True)

print("Configuration set!")
print(f"Base model: {BASE_MODEL}")
print(f"Initial LoRA weights: {INITIAL_LORA_WEIGHTS}")
print(f"Random seed: {SEED}")
print(f"Working directory: {os.getcwd()}")


## Step 0: Verify Model Access (Optional but Recommended)

Run this cell to verify that you can access the base model and LoRA weights before starting the experiment.


In [None]:
# Verify model access
print("Verifying model access...")
print(f"Base model: {BASE_MODEL}")
print(f"Initial LoRA weights: {INITIAL_LORA_WEIGHTS}")

# Names of the checks that failed; decides which closing message is printed.
failed_checks = []

try:
    # Imported inside the try block on purpose: a missing package is then
    # reported by the ImportError handler below instead of aborting the cell.
    # LlamaForCausalLM and PeftModel are imported only to confirm they are available.
    from transformers import LlamaForCausalLM, LlamaTokenizer
    from peft import PeftModel
    from huggingface_hub import model_info

    print("\n1. Testing tokenizer loading...")
    try:
        tokenizer = LlamaTokenizer.from_pretrained('hf-internal-testing/llama-tokenizer')
        print("   ✓ Tokenizer loaded successfully")
    except Exception as e:
        print(f"   ⚠ Tokenizer fallback: {e}")
        print("   Trying base model tokenizer...")
        # If this fallback fails as well, the outer `except Exception` reports it.
        tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
        print("   ✓ Tokenizer loaded from base model")

    print("\n2. Testing base model access...")
    # model_info() only queries repository metadata, so nothing large is fetched here.
    print("   (Metadata lookup only - no model weights are downloaded)")
    try:
        info = model_info(BASE_MODEL)
        print(f"   ✓ Base model accessible: {info.modelId}")
    except Exception as e:
        failed_checks.append("base model")
        print(f"   ✗ Cannot access base model: {e}")
        print(f"   Try: huggingface-cli login")
        print(f"   Or change BASE_MODEL to 'huggyllama/llama-7b'")

    print("\n3. Testing LoRA weights access...")
    try:
        info = model_info(INITIAL_LORA_WEIGHTS)
        print(f"   ✓ LoRA weights accessible: {info.modelId}")
    except Exception as e:
        failed_checks.append("LoRA weights")
        print(f"   ✗ Cannot access LoRA weights: {e}")
        print(f"   Try: huggingface-cli login")

    # Only claim success when every check actually succeeded.
    if failed_checks:
        print(f"\n✗ Model access verification finished with failures: {', '.join(failed_checks)}")
        print("Resolve the errors above before starting the experiment.")
    else:
        print("\n✓ Model access verification complete!")
        print("All checks passed, you can proceed with the experiment.")

except ImportError as e:
    print(f"✗ Missing dependencies: {e}")
    print("Please install required packages:")
    print("!pip install transformers peft huggingface_hub")
except Exception as e:
    print(f"✗ Error during verification: {e}")
    print("You may still be able to run the experiment, but check the errors above.")


## Step 1: Generate Contaminated Addition Datasets


In [None]:
# Helper functions for contamination
def replace_with_close(ans):
    """Sample random number x, then replace ans with ans + x"""
    x = random.randint(-10, 10)
    return ans + x

def replace_random(ans):
    """Replace ``ans`` with a random non-negative integer below it.

    For ``ans > 0`` the result is drawn uniformly from [0, ans - 1], so it is
    always different from ``ans``.

    Args:
        ans: The correct integer answer.

    Returns:
        A random integer in [0, ans - 1] when ``ans > 0``; ``0`` when
        ``ans == 0`` (no other candidate exists in the range).

    Raises:
        ValueError: If ``ans`` is negative (the sampling range is empty).
    """
    if ans <= 0:
        # Nothing different from ans can be drawn from [0, ans]; keep the
        # plain draw (returns 0 for ans == 0, raises ValueError for ans < 0).
        return random.randint(0, ans)
    return random.randint(0, ans - 1)

def replace_with_random_same_digit(ans):
    """Return a random number with as many digits as ``ans`` that is not ``ans``.

    The digit count is ``len(str(ans))``, so for a negative ``ans`` the sign
    character counts toward the length.

    Args:
        ans: The correct integer answer.

    Returns:
        A random integer in [10**(d-1), 10**d - 1], where ``d`` is the digit
        count, that is guaranteed to differ from ``ans``.
    """
    n_digits = len(str(ans))
    low, high = 10 ** (n_digits - 1), 10 ** n_digits - 1
    x = random.randint(low, high)
    # The range always holds at least 9 values, so redrawing terminates quickly.
    while x == ans:
        x = random.randint(low, high)
    return x

def replace_swap_digits(ans):
    """Swap two digits of ``ans`` so that the value changes.

    Only pairs of positions holding *different* digits are considered, so the
    result differs from ``ans`` whenever such a pair exists. Swaps that would
    move a ``0`` into the leading position (and therefore shorten the number
    once converted back to ``int``) are used only when no other swap exists.

    Args:
        ans: The correct integer answer.

    Returns:
        The integer obtained after the swap, or ``ans`` unchanged when it has
        fewer than two digits or all of its digits are identical.
    """
    sign = "-" if ans < 0 else ""
    digits = list(str(abs(ans)))  # swap digits only, never the sign character

    # All position pairs (i < j) whose swap actually changes the digit string.
    swaps = [
        (i, j)
        for i in range(len(digits))
        for j in range(i + 1, len(digits))
        if digits[i] != digits[j]
    ]
    if not swaps:
        return ans

    # Since i < j, a leading zero can only appear when position 0 receives a "0".
    length_preserving = [(i, j) for i, j in swaps if not (i == 0 and digits[j] == "0")]
    i, j = random.choice(length_preserving or swaps)
    digits[i], digits[j] = digits[j], digits[i]
    return int(sign + "".join(digits))


In [None]:
# Generate contaminated addition data
def generate_addition_data(contamination_rate=0.1, contamination_type="random", samples_per_bucket=1000):
    """Generate addition problems, corrupting a fraction of the answers.

    Operand pairs are drawn for every digit-length bucket ``(i, j)`` with
    ``start <= i <= j <= 15`` for each ``start`` in (1, 3, 6, 9, 12), so
    buckets with longer operands are sampled more often.

    Args:
        contamination_rate: Probability in [0, 1] that a sample's answer is
            replaced by a wrong one.
        contamination_type: One of ``"random"`` (offset via
            ``replace_with_close``), ``"random_same_digit"`` or
            ``"swap_digits"``.
        samples_per_bucket: Number of pairs drawn per bucket. The default
            (1000) yields 304,000 pairs; lower it for quick dry runs.

    Returns:
        A list of dicts with keys ``"input"`` (``"a + b"``), ``"output"``
        (``"a + b = answer"``) and ``"answer"`` (string), where ``answer`` is
        the possibly contaminated sum.

    Raises:
        ValueError: If ``contamination_type`` is unknown or
            ``contamination_rate`` is outside [0, 1].
    """
    # Validate up front: an unknown type used to fall through the if/elif chain
    # and silently produce a dataset with no contamination at all.
    valid_types = ("random", "random_same_digit", "swap_digits")
    if contamination_type not in valid_types:
        raise ValueError(
            f"Unknown contamination_type {contamination_type!r}; expected one of {valid_types}"
        )
    if not 0.0 <= contamination_rate <= 1.0:
        raise ValueError(f"contamination_rate must be in [0, 1], got {contamination_rate}")

    # NOTE(review): randint's upper bound is inclusive, so 10**i (which has
    # i + 1 digits) can be drawn - confirm this is intended.
    pairs = [
        (random.randint(10**(i-1), 10**i), random.randint(10**(j-1), 10**j))
        for start in (1, 3, 6, 9, 12)
        for i in range(start, 16)
        for j in range(i, 16)
        for _ in range(samples_per_bucket)
    ]

    random.shuffle(pairs)

    print(f"Generating addition data with contamination_rate={contamination_rate}, type={contamination_type}")
    print(f"Total pairs: {len(pairs)}")

    data_add = []

    for num1, num2 in pairs:
        # Randomise operand order so the longer number is not always second.
        if random.random() < 0.5:
            num1, num2 = num2, num1

        answer = num1 + num2
        question = f"{num1} + {num2}"

        # Apply contamination if needed
        if random.random() < contamination_rate:
            # NOTE(review): "random" maps to replace_with_close, while
            # replace_random is not wired to any type - confirm the mapping.
            if contamination_type == "random":
                answer = replace_with_close(answer)
            elif contamination_type == "random_same_digit":
                answer = replace_with_random_same_digit(answer)
            else:  # "swap_digits" (validated above)
                answer = replace_swap_digits(answer)

        # Create output with the (possibly contaminated) answer
        output = f"{num1} + {num2} = {answer}"

        data_add.append({"input": question, "output": output, "answer": str(answer)})

    return data_add


In [None]:
# Load template for adding instructions
template_name = "./templates/goat.json"
# JSON is UTF-8 text; be explicit so the platform default encoding is never used.
with open(template_name, encoding="utf-8") as fp:
    template = json.load(fp)

print(f"Loaded template with {len(template)} instructions")


In [None]:
# Generate and save contaminated datasets for each contamination rate
datasets_generated = {}

# Instruction templates are looked up by the string keys "1".."500".
NUM_INSTRUCTION_TEMPLATES = 500
# Fail fast: without this check a missing key only surfaces as a KeyError
# in the middle of the (long) conversion loop.
missing_keys = [str(n) for n in range(1, NUM_INSTRUCTION_TEMPLATES + 1) if str(n) not in template]
if missing_keys:
    raise KeyError(
        f"Template is missing {len(missing_keys)} instruction key(s), e.g. {missing_keys[:5]}"
    )

for contamination_rate in CONTAMINATION_RATES:
    print(f"\n{'='*60}")
    print(f"Generating dataset with contamination_rate={contamination_rate}")
    print(f"{'='*60}")

    # Generate contaminated addition data
    data_add = generate_addition_data(
        contamination_rate=contamination_rate,
        contamination_type=CONTAMINATION_TYPE
    )

    # Add natural language instructions
    data_converted = []
    for instance in data_add:
        arithmetic = instance["input"]

        # Add noise to instruction so that the model is robust to diverse question formats
        if random.random() < 0.05:
            if " + " in arithmetic:
                arithmetic = "the sum of " + arithmetic.replace("+", "and")

        if random.random() < 0.5:
            arithmetic = arithmetic.replace("*", "x")

        if random.random() < 0.1:
            arithmetic = arithmetic.replace("+", "plus").replace("-", "minus")
            arithmetic = arithmetic.replace(" x ", " times ").replace("*", "multiplied by").replace("/", "divided by")

        if random.random() < 0.5:
            # Only strip spaces while an operator symbol is still present;
            # worded forms ("plus", "the sum of ... and ...") keep their spaces.
            if "+" in arithmetic or "-" in arithmetic or "*" in arithmetic or "/" in arithmetic or "x" in arithmetic:
                arithmetic = arithmetic.replace(" ", "")

        num = random.randint(1, NUM_INSTRUCTION_TEMPLATES)
        instruction = template[str(num)].format(input=arithmetic)

        # "input" keeps the canonical "a + b" form; only the instruction is noised.
        data_converted.append({
            "instruction": instruction,
            "input": instance["input"],
            "output": instance["output"],
            "answer": instance["answer"]
        })

    # Save dataset. round() rather than int(): int() truncates float error,
    # e.g. int(0.29 * 100) == 28.
    rate_pct = round(contamination_rate * 100)
    dataset_filename = f"addition_contaminated_{rate_pct}pct.json"
    dataset_path = os.path.join(DATASET_DIR, dataset_filename)

    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(data_converted, f, indent=2)

    datasets_generated[contamination_rate] = dataset_path
    print(f"\nSaved dataset to {dataset_path}")
    print(f"Total samples: {len(data_converted)}")


## Step 2: Fine-tune Models on Contaminated Datasets


In [None]:
# Fine-tune on each contaminated dataset
fine_tuned_models = {}

for contamination_rate in CONTAMINATION_RATES:
    print(f"\n{'='*60}")
    print(f"Fine-tuning on dataset with contamination_rate={contamination_rate}")
    print(f"{'='*60}")

    dataset_path = datasets_generated[contamination_rate]
    # round() rather than int(): int() truncates float error (int(0.29 * 100) == 28).
    rate_pct = round(contamination_rate * 100)
    output_dir = os.path.join(WEIGHTS_DIR, f"goat_contaminated_{rate_pct}pct")

    # Prepare fine-tuning command.
    # sys.executable is the interpreter running this kernel, so finetune.py
    # sees the same installed packages; a bare "python" may resolve to a
    # different environment (or not exist at all).
    cmd = [
        sys.executable, "finetune.py",
        f"--base_model={BASE_MODEL}",
        f"--data_path={dataset_path}",
        f"--output_dir={output_dir}",
        f"--lora_weights_path={INITIAL_LORA_WEIGHTS}",
        "--batch_size=128",
        "--micro_batch_size=16",
        "--num_epochs=1",
        "--learning_rate=3e-4",
        "--cutoff_len=512",
        "--val_set_size=0",
        "--lora_r=64",
        "--lora_alpha=64",
        "--lora_dropout=0.05",
    ]

    print(f"Running command: {' '.join(cmd)}")

    # Run fine-tuning; output is not captured so training logs stream through.
    result = subprocess.run(cmd, capture_output=False, text=True)

    if result.returncode == 0:
        print(f"\nFine-tuning completed successfully!")
        print(f"Model saved to: {output_dir}")
        fine_tuned_models[contamination_rate] = output_dir
    else:
        # Best effort: report and continue so the remaining rates still run.
        # Rates missing from fine_tuned_models are skipped during evaluation.
        print(f"\nERROR: Fine-tuning failed with return code {result.returncode}")
        print(f"Please check the error messages above.")


In [None]:
# Step 3: Evaluate models on the arithmetic benchmark.
# This cell evaluates the baseline (initial goat model); the fine-tuned
# models are evaluated in the next cell and share `evaluation_results`.
evaluation_results = {}

print(f"\n{'='*60}")
print(f"Evaluating baseline model: {INITIAL_LORA_WEIGHTS}")
print(f"{'='*60}")

baseline_output = os.path.join(RESULTS_DIR, "baseline_eval_results.json")
# sys.executable keeps eval.py in the same environment as this kernel;
# a bare "python" may resolve to a different interpreter.
cmd = [
    sys.executable, "eval.py",
    f"--base_model={BASE_MODEL}",
    f"--lora_weights={INITIAL_LORA_WEIGHTS}",
    f"--output_file={baseline_output}",
    "--max_new_tokens=512",
]

print(f"Running: {' '.join(cmd)}")
result = subprocess.run(cmd, capture_output=False, text=True)

if result.returncode == 0:
    # The result file is read as JSON and must provide an "accuracy" entry.
    with open(baseline_output, encoding="utf-8") as f:
        baseline_results = json.load(f)
    evaluation_results["baseline"] = baseline_results["accuracy"]
    print(f"\nBaseline accuracy: {baseline_results['accuracy']:.4f}")
else:
    print(f"\nERROR: Baseline evaluation failed")


In [None]:
# Evaluate fine-tuned models
for contamination_rate in CONTAMINATION_RATES:
    # Rates whose fine-tuning failed have no entry in fine_tuned_models.
    if contamination_rate not in fine_tuned_models:
        print(f"Skipping evaluation for contamination_rate={contamination_rate} (model not found)")
        continue

    print(f"\n{'='*60}")
    print(f"Evaluating model with contamination_rate={contamination_rate}")
    print(f"{'='*60}")

    model_path = fine_tuned_models[contamination_rate]
    # round() rather than int(): int() truncates float error (int(0.29 * 100) == 28).
    rate_pct = round(contamination_rate * 100)
    result_file = os.path.join(RESULTS_DIR, f"eval_results_contaminated_{rate_pct}pct.json")

    # sys.executable keeps eval.py in the same environment as this kernel;
    # a bare "python" may resolve to a different interpreter.
    cmd = [
        sys.executable, "eval.py",
        f"--base_model={BASE_MODEL}",
        f"--lora_weights={model_path}",
        f"--output_file={result_file}",
        "--max_new_tokens=512",
    ]

    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=False, text=True)

    if result.returncode == 0:
        # The result file is read as JSON and must provide an "accuracy" entry.
        with open(result_file, encoding="utf-8") as f:
            eval_results = json.load(f)
        evaluation_results[contamination_rate] = eval_results["accuracy"]
        print(f"\nAccuracy: {eval_results['accuracy']:.4f}")
    else:
        # Best effort: report and continue with the remaining models.
        print(f"\nERROR: Evaluation failed with return code {result.returncode}")


## Step 4: Summary of Results


In [None]:
# Display summary
print(f"\n{'='*60}")
print("EXPERIMENT SUMMARY")
print(f"{'='*60}")
print(f"\nBaseline Model: {INITIAL_LORA_WEIGHTS}")
if "baseline" in evaluation_results:
    baseline_accuracy = evaluation_results["baseline"]
    print(f"  Accuracy: {baseline_accuracy:.4f} ({baseline_accuracy*100:.2f}%)")

print(f"\nFine-tuned Models:")
for contamination_rate in CONTAMINATION_RATES:
    # round() rather than int(): int() truncates float error (int(0.29 * 100) == 28),
    # which would print a label that disagrees with the configured rate.
    rate_pct = round(contamination_rate * 100)
    if contamination_rate in evaluation_results:
        accuracy = evaluation_results[contamination_rate]
        print(f"  Contamination Rate {rate_pct}%: {accuracy:.4f} ({accuracy*100:.2f}%)")
    else:
        # Fine-tuning or evaluation did not produce a result for this rate.
        print(f"  Contamination Rate {rate_pct}%: Evaluation not completed")

print(f"\n{'='*60}")
print("Results saved in:")
print(f"  - Datasets: {DATASET_DIR}")
print(f"  - Model weights: {WEIGHTS_DIR}")
print(f"  - Evaluation results: {RESULTS_DIR}")
print(f"{'='*60}")


In [None]:
# Save summary to JSON
# Collect the settings the experiment was run with.
experiment_config = dict(
    base_model=BASE_MODEL,
    initial_lora_weights=INITIAL_LORA_WEIGHTS,
    contamination_rates=CONTAMINATION_RATES,
    contamination_type=CONTAMINATION_TYPE,
)

# Bundle the configuration with the accuracies and artifact locations.
# The per-rate dicts are keyed by the float rate; JSON stores those keys as strings.
summary = {
    "experiment_config": experiment_config,
    "results": evaluation_results,
    "model_paths": fine_tuned_models,
    "dataset_paths": datasets_generated,
}

summary_path = os.path.join(RESULTS_DIR, "experiment_summary.json")
serialized_summary = json.dumps(summary, indent=2)
with open(summary_path, "w") as f:
    f.write(serialized_summary)

print(f"\nExperiment summary saved to: {summary_path}")
