# Prepare 50 Shadow Models for MIA

This notebook generates one dataset per shadow model and trains the shadow models for use in membership inference attacks (MIA).

**Configuration:**
- 50 shadow models planned (seeds 100-149); the configuration cell currently sets `NUM_SHADOW_MODELS = 62`, i.e. seeds 100-161
- 10,000 training samples per model
- 2,000 test samples per model (1,000 members + 1,000 non-members)
- GPT-2 with LoRA (r=32, alpha=64)
- 3 epochs, learning rate 2e-4

**Estimated Runtime:**
- Dataset generation: ~5-10 minutes
- Training: 10-12 min per model on RTX 5070 Ti

## Cell 1: Check GPU and Environment

In [8]:
# Report the PyTorch / CUDA environment and pick the compute device
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
    # CPU fallback: everything still runs, just far more slowly
    print("‚ö†Ô∏è  No GPU available, training will be very slow!")
    device = torch.device("cpu")
else:
    # Describe GPU 0: name, CUDA toolkit version and total memory in GB (1e9 bytes)
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU Memory: {total_gb:.2f} GB")
    device = torch.device("cuda")

# `device` is read later by the training cell's status message
print(f"\nUsing device: {device}")

PyTorch version: 2.8.0+cu128
CUDA available: True
GPU: NVIDIA GeForce RTX 5070 Ti
CUDA version: 12.8
GPU Memory: 17.09 GB

Using device: cuda


## Cell 2: Imports

In [9]:
import os
# Set before `transformers` is imported below; `setdefault` keeps any value that
# is already exported in the environment.
# NOTE(review): presumably tells transformers to skip its optional torchvision
# integration — confirm against the installed transformers version.
os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1")

# Numerics, serialization and filesystem helpers
import torch
import numpy as np
import json
import random
from pathlib import Path
# Hugging Face data loading, tokenization, model and training utilities
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
# LoRA adapters (parameter-efficient fine-tuning)
from peft import LoraConfig, get_peft_model
# Progress bars and explicit garbage collection for GPU memory cleanup
from tqdm.auto import tqdm
import gc

print("‚úì All imports successful!")

‚úì All imports successful!


## Cell 3: Configuration

In [10]:
# ========== DATASET GENERATION CONFIG ==========
# Number of shadow datasets to build and shadow models to train
NUM_SHADOW_MODELS = 62
# Training samples drawn for each shadow model
TRAIN_PER_MODEL = 10_000
# Evaluation samples per shadow model (half members, half non-members)
TEST_SIZE = 2_000
# Texts with fewer tokens than this are dropped before sampling
MIN_TOKENS = 25
SHADOW_DATA_DIR = "./data/shadow_datasets"

# Shadow model i uses seed SEED_START + i,
# i.e. SEED_START ... SEED_START + NUM_SHADOW_MODELS - 1
SEED_START = 100

# ========== SHADOW MODEL TRAINING CONFIG ==========
MODEL_NAME = "gpt2"
BLOCK_SIZE = 512  # Sequence length used for padding/truncation during tokenization
EPOCHS = 3
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 1
LEARNING_RATE = 2e-4
# LoRA adapter hyper-parameters
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.05
SHADOW_MODEL_DIR = "./models/shadow"

# Echo the effective settings so they are captured in the notebook output
print("\n".join([
    "Configuration:",
    f"  Shadow models: {NUM_SHADOW_MODELS}",
    f"  Seeds: {SEED_START} to {SEED_START + NUM_SHADOW_MODELS - 1}",
    f"  Training samples per model: {TRAIN_PER_MODEL:,}",
    f"  Test samples per model: {TEST_SIZE:,}",
    f"  Model: {MODEL_NAME} with LoRA (r={LORA_R})",
    f"  Training: {EPOCHS} epochs, lr={LEARNING_RATE}, batch_size={BATCH_SIZE}",
]))

Configuration:
  Shadow models: 62
  Seeds: 100 to 161
  Training samples per model: 10,000
  Test samples per model: 2,000
  Model: gpt2 with LoRA (r=32)
  Training: 3 epochs, lr=0.0002, batch_size=8


## Cell 4: Helper Functions

In [11]:
def set_seed_all(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch; when CUDA
    is available, all CUDA devices are seeded as well.

    Args:
        seed: Value passed unchanged to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def basic_clean(ds: Dataset) -> Dataset:
    """Drop rows without usable text and normalize whitespace in the rest.

    A row is kept only when its "text" value is a string that is non-empty
    after stripping. Kept texts have every run of whitespace collapsed to a
    single space, with leading and trailing whitespace removed.

    Args:
        ds: Dataset whose rows are expected to carry a "text" field.

    Returns:
        The filtered dataset with the "text" column rewritten.
    """
    def _has_text(example):
        value = example.get("text", None)
        return isinstance(value, str) and len(value.strip()) > 0

    def _collapse_whitespace(example):
        words = example["text"].split()
        return {"text": " ".join(words)}

    non_empty = ds.filter(_has_text)
    return non_empty.map(_collapse_whitespace, batched=False)

def filter_by_tokens(ds: Dataset, tok, min_tokens: int) -> Dataset:
    """Keep only the rows whose text tokenizes to at least `min_tokens` tokens.

    Token counts are computed in batches with `tok` (special tokens are not
    added), stored in a temporary "_tok_len" column, used for filtering and
    then removed again.

    Args:
        ds: Dataset with a "text" column.
        tok: Tokenizer callable; called with a list of texts and expected to
            return a mapping that has an "input_ids" entry per text.
        min_tokens: Inclusive lower bound on the token count.

    Returns:
        The filtered dataset, without the helper column.
    """
    def _count_tokens(batch):
        encoded = tok(batch["text"], add_special_tokens=False)
        return {"_tok_len": list(map(len, encoded["input_ids"]))}

    def _long_enough(example):
        return example["_tok_len"] >= min_tokens

    with_lengths = ds.map(_count_tokens, batched=True)
    kept = with_lengths.filter(_long_enough)
    return kept.remove_columns(["_tok_len"])

def sample_n(ds: Dataset, n: int, seed: int):
    """Draw a reproducible random subset of `ds`.

    All row indices are shuffled with a private `random.Random(seed)` (the
    global RNG is not touched); the first `n` shuffled indices are taken and
    sorted so the subset keeps the dataset's original row order.

    Args:
        ds: Dataset supporting `len()` and `.select(indices)`.
        n: Requested sample size; capped at `len(ds)`.
        seed: Seed for the private shuffle.

    Returns:
        Tuple `(subset, indices)`: the selected rows and the set of their
        positions in `ds`.
    """
    total = len(ds)
    count = min(n, total)
    order = list(range(total))
    random.Random(seed).shuffle(order)
    chosen = sorted(order[:count])
    return ds.select(chosen), set(chosen)

def cleanup_gpu():
    """Release unreferenced Python objects and cached CUDA memory.

    Runs the garbage collector, asks PyTorch to empty its CUDA cache and,
    when CUDA is available, prints the memory still allocated in GiB.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    allocated_gib = torch.cuda.memory_allocated() / 1024**3
    print(f"GPU Memory: {allocated_gib:.2f} GB")

print("‚úì Helper functions defined!")

‚úì Helper functions defined!


## Cell 5: Load WikiText-103

In [12]:
# Load the raw WikiText-103 training split
print("Loading WikiText-103-raw-v1...")
wiki_raw = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1", split="train")

# Tokenizer used only to measure text length for the MIN_TOKENS filter.
# It is loaded from MODEL_NAME so the length filter always matches the
# tokenizer the shadow models are trained with.
print("Loading tokenizer for filtering...")
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Keep only the "text" column, normalize whitespace, drop short texts
print("Preprocessing dataset...")
extra_columns = [c for c in wiki_raw.column_names if c != "text"]
wiki = wiki_raw.remove_columns(extra_columns)
wiki = basic_clean(wiki)
wiki = filter_by_tokens(wiki, tok, MIN_TOKENS)

# Sample budget: what one shadow model needs, and what all of them would
# need if their datasets were fully disjoint
samples_per_shadow = TRAIN_PER_MODEL + TEST_SIZE
samples_needed = samples_per_shadow * NUM_SHADOW_MODELS

print("‚úì WikiText-103 loaded and filtered")
print(f"  Total samples available: {len(wiki):,}")
print(f"  Samples needed per shadow: {samples_per_shadow:,}")
print(f"  Total samples needed: {samples_needed:,}")

if len(wiki) < samples_needed:
    print("‚ö†Ô∏è  Warning: Not enough samples for non-overlapping datasets!")
else:
    print("‚úì Sufficient samples available")

Loading WikiText-103-raw-v1...
Loading tokenizer for filtering...
Preprocessing dataset...
‚úì WikiText-103 loaded and filtered
  Total samples available: 747,117
  Samples needed per shadow: 12,000
  Total samples needed: 744,000
‚úì Sufficient samples available


## Cell 6: Generate Shadow Datasets

In [13]:
print("="*70)
print("GENERATING SHADOW DATASETS")
print("="*70)

os.makedirs(SHADOW_DATA_DIR, exist_ok=True)


def _write_json(path, payload):
    """Write `payload` to `path` as UTF-8 JSON without leaving partial files.

    The data goes to a sibling "<name>.tmp" file first and is then moved into
    place, so an interrupted run cannot leave a truncated file that the
    "already exists" check below would later accept as complete. The encoding
    is explicit because `ensure_ascii=False` emits raw non-ASCII characters,
    which would otherwise be encoded with the platform's locale default.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, path)


for shadow_id in tqdm(range(NUM_SHADOW_MODELS), desc="Generating shadow datasets"):
    # One seed per shadow model: SEED_START, SEED_START + 1, ...
    seed = SEED_START + shadow_id
    set_seed_all(seed)

    print(f"\n{'='*70}")
    print(f"Shadow Dataset {shadow_id} (seed={seed})")
    print(f"{'='*70}")

    # Resume support: skip shadow datasets whose three files are all present
    shadow_dir = Path(SHADOW_DATA_DIR) / f"shadow_{shadow_id}"
    train_file = shadow_dir / "train_finetune.json"
    test_file = shadow_dir / "test.json"
    label_file = shadow_dir / "test_label.json"

    if train_file.exists() and test_file.exists() and label_file.exists():
        print(f"‚úì Shadow {shadow_id} already exists, skipping...")
        continue

    # Sample training data
    shadow_train, shadow_train_idx = sample_n(wiki, TRAIN_PER_MODEL, seed)

    # Sample test members (from training set)
    member_size = TEST_SIZE // 2
    member_test, _ = sample_n(shadow_train, member_size, seed + 1000)

    # Sample test non-members (from rows that are not in this training set)
    non_member_size = TEST_SIZE - member_size
    remaining_indices = [i for i in range(len(wiki)) if i not in shadow_train_idx]
    remaining_ds = wiki.select(remaining_indices)
    non_member_test, _ = sample_n(remaining_ds, non_member_size, seed + 2000)

    # Combine members (label 1) and non-members (label 0), then shuffle texts
    # and labels together so they stay aligned
    test_texts = [ex["text"] for ex in member_test] + [ex["text"] for ex in non_member_test]
    test_labels = [1] * len(member_test) + [0] * len(non_member_test)

    combined = list(zip(test_texts, test_labels))
    random.Random(seed + 3000).shuffle(combined)
    test_texts, test_labels = zip(*combined)

    # Save to disk; the label file is written last
    shadow_dir.mkdir(parents=True, exist_ok=True)

    train_json = [{"text": ex["text"]} for ex in shadow_train]
    _write_json(train_file, train_json)
    _write_json(test_file, list(test_texts))
    _write_json(label_file, list(test_labels))

    print(f"‚úì Saved: train={len(train_json):,}, test={len(test_texts):,} samples")
    print(f"  Location: {shadow_dir}")

print(f"\n{'='*70}")
print(f"‚úÖ All {NUM_SHADOW_MODELS} shadow datasets generated!")
print(f"üìÅ Location: {SHADOW_DATA_DIR}/")
print(f"{'='*70}")

GENERATING SHADOW DATASETS


Generating shadow datasets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 62/62 [00:00<00:00, 3843.89it/s]


Shadow Dataset 0 (seed=100)
‚úì Shadow 0 already exists, skipping...

Shadow Dataset 1 (seed=101)
‚úì Shadow 1 already exists, skipping...

Shadow Dataset 2 (seed=102)
‚úì Shadow 2 already exists, skipping...

Shadow Dataset 3 (seed=103)
‚úì Shadow 3 already exists, skipping...

Shadow Dataset 4 (seed=104)
‚úì Shadow 4 already exists, skipping...

Shadow Dataset 5 (seed=105)
‚úì Shadow 5 already exists, skipping...

Shadow Dataset 6 (seed=106)
‚úì Shadow 6 already exists, skipping...

Shadow Dataset 7 (seed=107)
‚úì Shadow 7 already exists, skipping...

Shadow Dataset 8 (seed=108)
‚úì Shadow 8 already exists, skipping...

Shadow Dataset 9 (seed=109)
‚úì Shadow 9 already exists, skipping...

Shadow Dataset 10 (seed=110)
‚úì Shadow 10 already exists, skipping...

Shadow Dataset 11 (seed=111)
‚úì Shadow 11 already exists, skipping...

Shadow Dataset 12 (seed=112)
‚úì Shadow 12 already exists, skipping...

Shadow Dataset 13 (seed=113)
‚úì Shadow 13 already exists, skipping...

Shadow Data




## Cell 7: Training Configuration

In [14]:
# Summarize the training setup before the (long) training run starts.
# The precision label mirrors the bf16/fp16 flags built in the training cell:
# mixed precision is only enabled when a CUDA device is present.
if torch.cuda.is_available():
    mixed_precision = "bf16" if torch.cuda.is_bf16_supported() else "fp16"
else:
    mixed_precision = "disabled (no CUDA device)"

print("="*70)
print("SHADOW MODEL TRAINING CONFIGURATION")
print("="*70)
print(f"Base model: {MODEL_NAME}")
print(f"LoRA config: r={LORA_R}, alpha={LORA_ALPHA}, dropout={LORA_DROPOUT}")
print(f"Training: {EPOCHS} epochs")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Block size: {BLOCK_SIZE}")
print(f"Mixed precision: {mixed_precision}")
# Directory range follows NUM_SHADOW_MODELS instead of a fixed 0-9
print(f"Output directory: {SHADOW_MODEL_DIR}/shadow_{{0-{NUM_SHADOW_MODELS - 1}}}/")
print("="*70)

SHADOW MODEL TRAINING CONFIGURATION
Base model: gpt2
LoRA config: r=32, alpha=64, dropout=0.05
Training: 3 epochs
Batch size: 8
Learning rate: 0.0002
Block size: 512
Mixed precision: bf16
Output directory: ./models/shadow/shadow_{0-9}/


## Cell 8: Training Function

In [15]:
def train_shadow_model(shadow_id, data_dir, output_dir):
    """
    Train a single shadow model with LoRA and save the adapter and tokenizer.

    Reads `train_finetune.json` from `data_dir`, tokenizes every text to a
    fixed length of BLOCK_SIZE tokens (padded/truncated), wraps the MODEL_NAME
    base model with LoRA adapters and fine-tunes it with the Hugging Face
    `Trainer`. Mixed precision (bf16, else fp16) is used only when CUDA is
    available.

    Args:
        shadow_id: Index of the shadow model; the seed is SEED_START + shadow_id.
        data_dir: Directory containing this model's `train_finetune.json`.
        output_dir: Directory the trained adapter and tokenizer are saved to
            (created if missing).

    Notes:
        Relies on the notebook globals defined in earlier cells: the
        configuration constants, `set_seed_all`, `cleanup_gpu` and `device`
        (the latter only for the status message).
    """
    # Local import: only needed for the Trainer keyword compatibility check below
    import inspect

    print(f"\n{'='*70}")
    print(f"Training Shadow Model {shadow_id}")
    print(f"{'='*70}")

    seed = SEED_START + shadow_id
    set_seed_all(seed)

    # Load training data. The encoding is explicit so the file is decoded the
    # same way on every platform (the texts contain non-ASCII characters).
    with open(Path(data_dir) / "train_finetune.json", "r", encoding="utf-8") as f:
        train_items = json.load(f)

    ds = Dataset.from_list(train_items)
    train_raw = ds.filter(lambda ex: ex.get("text") and len(ex["text"].strip()) > 0)

    # Load tokenizer and tokenize
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    def tokenize_fn(ex):
        # Blank texts are replaced by the EOS token so no example is empty
        texts = [(t if t and t.strip() else tokenizer.eos_token) for t in ex["text"]]
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=BLOCK_SIZE,
            return_attention_mask=True,
        )

    train_tok = train_raw.map(tokenize_fn, batched=True, remove_columns=train_raw.column_names)
    # mlm=False selects causal language modeling
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    print(f"Training samples: {len(train_tok):,}")

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))

    # Enable gradient checkpointing BEFORE applying LoRA
    model.gradient_checkpointing_enable()

    # Apply LoRA
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=["c_attn", "c_fc", "c_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Training arguments (GPU optimized)
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        logging_steps=50,
        save_strategy="no",  # no intermediate checkpoints; the adapter is saved once below
        seed=seed,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
        report_to="none",
        gradient_checkpointing=False,  # Already enabled on model
        optim="adamw_torch",
        dataloader_num_workers=2,
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # warn about the deprecated `tokenizer` keyword; older releases only know
    # `tokenizer`. Pick whichever name the installed Trainer accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Train
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tok,
        data_collator=collator,
        **{tokenizer_kwarg: tokenizer},
    )

    # The DataLoader workers are forked after the fast tokenizer has already
    # run in this process, which makes `tokenizers` print a fork warning for
    # every worker. Configuring the variable explicitly (as that warning asks)
    # silences it; `setdefault` keeps a value the user has already chosen.
    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

    print(f"\nüöÄ Starting training on {device}...")
    trainer.train()

    # Save model
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"‚úÖ Shadow model {shadow_id} trained and saved to {output_dir}")

    # Clear GPU memory
    del model, trainer
    cleanup_gpu()

print("‚úì Training function defined!")

‚úì Training function defined!


## Cell 9: Train All 50 Shadow Models

In [16]:
print("="*70)
print("TRAINING SHADOW MODELS")
print("="*70)
print(f"‚ö†Ô∏è  This will take approximately {NUM_SHADOW_MODELS * 0.5:.1f}-{NUM_SHADOW_MODELS * 0.8:.1f} hours on GPU")
print(f"   (~30-50 minutes per model)")
print("="*70)

os.makedirs(SHADOW_MODEL_DIR, exist_ok=True)

# Track training results (lists of shadow model ids)
trained = []
skipped = []
failed = []

for shadow_id in range(NUM_SHADOW_MODELS):
    data_dir = Path(SHADOW_DATA_DIR) / f"shadow_{shadow_id}"
    output_dir = Path(SHADOW_MODEL_DIR) / f"shadow_{shadow_id}"

    # Resume support: the adapter file is only written after a successful
    # training run, so its presence marks a finished model
    if (output_dir / "adapter_model.safetensors").exists():
        print(f"\n‚è≠Ô∏è  Shadow model {shadow_id} already exists, skipping...")
        skipped.append(shadow_id)
        continue

    # Require the training file itself, not just its directory, so a missing
    # or half-generated dataset is reported here instead of failing later
    if not (data_dir / "train_finetune.json").exists():
        print(f"\n‚ùå Dataset for shadow model {shadow_id} not found at {data_dir}")
        failed.append(shadow_id)
        continue

    try:
        train_shadow_model(shadow_id, data_dir, output_dir)
        trained.append(shadow_id)
    except Exception as e:
        # Best effort: record the failure and move on to the next model
        print(f"\n‚ùå Error training shadow model {shadow_id}: {e}")
        failed.append(shadow_id)
        # Try to recover GPU state; a failing cleanup is reported, not hidden,
        # and Ctrl-C still interrupts the loop
        try:
            cleanup_gpu()
        except Exception as cleanup_error:
            print(f"   GPU cleanup failed: {cleanup_error}")
        continue

print(f"\n{'='*70}")
print("TRAINING COMPLETE")
print(f"{'='*70}")
print(f"‚úÖ Trained: {len(trained)} models {trained if trained else ''}")
print(f"‚è≠Ô∏è  Skipped (already exist): {len(skipped)} models {skipped if skipped else ''}")
print(f"‚ùå Failed: {len(failed)} models {failed if failed else ''}")
print(f"{'='*70}")

ready = len(trained) + len(skipped)
if ready == NUM_SHADOW_MODELS:
    print(f"\nüéâ All {NUM_SHADOW_MODELS} shadow models ready!")
else:
    print(f"\n‚ö†Ô∏è  Only {len(trained) + len(skipped)}/{NUM_SHADOW_MODELS} models ready")
    if failed:
        print(f"   Failed models: {failed}")
        print(f"   You may need to retrain these manually")

TRAINING SHADOW MODELS
‚ö†Ô∏è  This will take approximately 31.0-49.6 hours on GPU
   (~30-50 minutes per model)

‚è≠Ô∏è  Shadow model 0 already exists, skipping...

‚è≠Ô∏è  Shadow model 1 already exists, skipping...

‚è≠Ô∏è  Shadow model 2 already exists, skipping...

‚è≠Ô∏è  Shadow model 3 already exists, skipping...

‚è≠Ô∏è  Shadow model 4 already exists, skipping...

‚è≠Ô∏è  Shadow model 5 already exists, skipping...

‚è≠Ô∏è  Shadow model 6 already exists, skipping...

‚è≠Ô∏è  Shadow model 7 already exists, skipping...

‚è≠Ô∏è  Shadow model 8 already exists, skipping...

‚è≠Ô∏è  Shadow model 9 already exists, skipping...

‚è≠Ô∏è  Shadow model 10 already exists, skipping...

‚è≠Ô∏è  Shadow model 11 already exists, skipping...

‚è≠Ô∏è  Shadow model 12 already exists, skipping...

‚è≠Ô∏è  Shadow model 13 already exists, skipping...

‚è≠Ô∏è  Shadow model 14 already exists, skipping...

‚è≠Ô∏è  Shadow model 15 already exists, skipping...

‚è≠Ô∏è  Shadow model 16 already exists, skipping

Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 323058.75 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [-1:59:59<00:00, -7110.35 examples/s]


Training samples: 10,000




trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,4.0264
100,3.7905
150,3.6158
200,3.635
250,3.56
300,3.5341
350,3.5504
400,3.503
450,3.4773
500,3.5295


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 50 trained and saved to models/shadow/shadow_50
GPU Memory: 0.02 GB

Training Shadow Model 51


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 586238.78 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 6802.89 examples/s]


Training samples: 10,000


  trainer = Trainer(


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.052
100,3.793
150,3.5769
200,3.6124
250,3.621
300,3.5311
350,3.5051
400,3.5294
450,3.4912
500,3.5267


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 51 trained and saved to models/shadow/shadow_51
GPU Memory: 0.02 GB

Training Shadow Model 52


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 536122.91 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 6780.54 examples/s]


Training samples: 10,000
trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0312
100,3.7468
150,3.6957
200,3.5977
250,3.5987
300,3.5329
350,3.5576
400,3.5
450,3.5001
500,3.4917


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 52 trained and saved to models/shadow/shadow_52
GPU Memory: 0.02 GB

Training Shadow Model 53


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 590472.60 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 6956.52 examples/s]


Training samples: 10,000


  trainer = Trainer(


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0479
100,3.7624
150,3.6459
200,3.5621
250,3.5701
300,3.5367
350,3.5945
400,3.5213
450,3.5202
500,3.5324


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 53 trained and saved to models/shadow/shadow_53
GPU Memory: 0.02 GB

Training Shadow Model 54


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 612262.46 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 6855.24 examples/s]


Training samples: 10,000


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533

üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0681
100,3.7748
150,3.6567
200,3.5956
250,3.5824
300,3.5398
350,3.5266
400,3.5681
450,3.514
500,3.5378


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling p

‚úÖ Shadow model 54 trained and saved to models/shadow/shadow_54
GPU Memory: 0.02 GB

Training Shadow Model 55


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 610622.37 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 6877.75 examples/s]


Training samples: 10,000


  trainer = Trainer(


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0003
100,3.7582
150,3.6728
200,3.6145
250,3.5551
300,3.5386
350,3.5276
400,3.5452
450,3.4867
500,3.5278


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 55 trained and saved to models/shadow/shadow_55
GPU Memory: 0.02 GB

Training Shadow Model 56


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 556067.24 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7017.48 examples/s]


Training samples: 10,000


  trainer = Trainer(


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0789
100,3.7946
150,3.63
200,3.6061
250,3.5721
300,3.5818
350,3.5305
400,3.5426
450,3.5193
500,3.5567


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 56 trained and saved to models/shadow/shadow_56
GPU Memory: 0.02 GB

Training Shadow Model 57


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 649062.07 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7605.02 examples/s]


Training samples: 10,000
trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0386
100,3.7359
150,3.6132
200,3.5956
250,3.5404
300,3.5605
350,3.5294
400,3.5036
450,3.4815
500,3.483


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 57 trained and saved to models/shadow/shadow_57
GPU Memory: 0.02 GB

Training Shadow Model 58


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 632501.02 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7550.08 examples/s]


Training samples: 10,000


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533

üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0242
100,3.7877
150,3.6563
200,3.5969
250,3.5346
300,3.526
350,3.5298
400,3.528
450,3.5018
500,3.5166


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 58 trained and saved to models/shadow/shadow_58
GPU Memory: 0.02 GB

Training Shadow Model 59


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 591772.23 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7424.75 examples/s]


Training samples: 10,000


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533

üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0747
100,3.7628
150,3.5961
200,3.5611
250,3.5836
300,3.5761
350,3.5773
400,3.5361
450,3.5195
500,3.4811


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 59 trained and saved to models/shadow/shadow_59
GPU Memory: 0.02 GB

Training Shadow Model 60


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 625828.71 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7570.03 examples/s]


Training samples: 10,000
trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.



üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0831
100,3.8057
150,3.6444
200,3.5991
250,3.5865
300,3.5743
350,3.5825
400,3.5264
450,3.5737
500,3.5231


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 60 trained and saved to models/shadow/shadow_60
GPU Memory: 0.02 GB

Training Shadow Model 61


Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:00<00:00, 645585.43 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [00:01<00:00, 7718.17 examples/s]


Training samples: 10,000


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


trainable params: 4,718,592 || all params: 129,158,400 || trainable%: 3.6533

üöÄ Starting training on cuda...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
50,4.0046
100,3.7776
150,3.6319
200,3.5672
250,3.5504
300,3.5814
350,3.5438
400,3.5788
450,3.5347
500,3.5096


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

‚úÖ Shadow model 61 trained and saved to models/shadow/shadow_61
GPU Memory: 0.02 GB

TRAINING COMPLETE
‚úÖ Trained: 12 models [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61]
‚è≠Ô∏è  Skipped (already exist): 50 models [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
‚ùå Failed: 0 models 

üéâ All 62 shadow models ready!


## Cell 10: Verification

In [17]:
# Verification cell: confirm that every shadow dataset and every trained LoRA
# adapter expected by NUM_SHADOW_MODELS exists on disk, list the status of each
# one, and print an overall summary.
#
# Relies on names defined in earlier cells: NUM_SHADOW_MODELS, SHADOW_DATA_DIR,
# SHADOW_MODEL_DIR, Path and json.
print("="*70)
print("VERIFICATION")
print("="*70)

# Check datasets
# A dataset counts as ready only when all three files are present. The counter
# is updated with the same predicate that drives the per-item listing, so the
# summary below can never disagree with what was printed here.
print("\nüìÅ Shadow Datasets:")
datasets_ready = 0
for shadow_id in range(NUM_SHADOW_MODELS):
    shadow_dir = Path(SHADOW_DATA_DIR) / f"shadow_{shadow_id}"
    train_file = shadow_dir / "train_finetune.json"
    test_file = shadow_dir / "test.json"
    label_file = shadow_dir / "test_label.json"

    if all(path.exists() for path in (train_file, test_file, label_file)):
        # The reported sizes are the lengths of the top-level JSON containers.
        with open(train_file) as f:
            train_count = len(json.load(f))
        with open(test_file) as f:
            test_count = len(json.load(f))
        datasets_ready += 1
        print(f"  ‚úì shadow_{shadow_id}: train={train_count:,}, test={test_count:,}")
    else:
        print(f"  ‚ùå shadow_{shadow_id}: MISSING FILES")

# Check models
# A model counts as trained only when both the adapter weights and the adapter
# config exist; again the counter follows the listing's predicate.
print("\nü§ñ Shadow Models:")
models_ready = 0
for shadow_id in range(NUM_SHADOW_MODELS):
    model_dir = Path(SHADOW_MODEL_DIR) / f"shadow_{shadow_id}"
    adapter_file = model_dir / "adapter_model.safetensors"
    config_file = model_dir / "adapter_config.json"

    if adapter_file.exists() and config_file.exists():
        size_mb = adapter_file.stat().st_size / (1024 * 1024)  # bytes -> MiB
        models_ready += 1
        print(f"  ‚úì shadow_{shadow_id}: {size_mb:.1f} MB")
    else:
        print(f"  ‚ùå shadow_{shadow_id}: NOT TRAINED")

print("\n" + "="*70)
print("Summary:")
print(f"  Datasets: {datasets_ready}/{NUM_SHADOW_MODELS} ready")
print(f"  Models: {models_ready}/{NUM_SHADOW_MODELS} trained")

if datasets_ready == NUM_SHADOW_MODELS and models_ready == NUM_SHADOW_MODELS:
    print(f"\nüéâ All {NUM_SHADOW_MODELS} shadow models are ready for MIA!")
    print("\n‚úÖ You can now use these in your reference attack:")
    # The doubled braces keep a literal `{i}` in the printed snippet.
    print(f"   shadow_model_dirs = [f'./models/shadow/shadow_{{i}}' for i in range({NUM_SHADOW_MODELS})]")
else:
    print("\n‚ö†Ô∏è  Setup incomplete. Check the status above.")

print("="*70)

VERIFICATION

üìÅ Shadow Datasets:
  ‚úì shadow_0: train=10,000, test=2,000
  ‚úì shadow_1: train=10,000, test=2,000
  ‚úì shadow_2: train=10,000, test=2,000
  ‚úì shadow_3: train=10,000, test=2,000
  ‚úì shadow_4: train=10,000, test=2,000
  ‚úì shadow_5: train=10,000, test=2,000
  ‚úì shadow_6: train=10,000, test=2,000
  ‚úì shadow_7: train=10,000, test=2,000
  ‚úì shadow_8: train=10,000, test=2,000
  ‚úì shadow_9: train=10,000, test=2,000
  ‚úì shadow_10: train=10,000, test=2,000
  ‚úì shadow_11: train=10,000, test=2,000
  ‚úì shadow_12: train=10,000, test=2,000
  ‚úì shadow_13: train=10,000, test=2,000
  ‚úì shadow_14: train=10,000, test=2,000
  ‚úì shadow_15: train=10,000, test=2,000
  ‚úì shadow_16: train=10,000, test=2,000
  ‚úì shadow_17: train=10,000, test=2,000
  ‚úì shadow_18: train=10,000, test=2,000
  ‚úì shadow_19: train=10,000, test=2,000
  ‚úì shadow_20: train=10,000, test=2,000
  ‚úì shadow_21: train=10,000, test=2,000
  ‚úì shadow_22: train=10,000, test=2,000
  ‚úì sh