In [2]:
# If you're on Mac, you generally do NOT want bitsandbytes.
# !pip -q install accelerate datasets transformers

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# ---- checkpoint ----
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# ---- device ----
# Use the Apple Metal (MPS) backend when PyTorch reports it, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)

# ---- tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batches are padded later, so a pad token has to exist; fall back to EOS
# when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---- model ----
# Weights are loaded as float16 on MPS and float32 on CPU, then moved to the
# selected device explicitly.
# NOTE(review): Trainer's fp16/bf16 flags are both off further down, so on MPS
# the float16 weights are presumably also updated in float16 -- if the loss
# turns NaN/inf, load with torch.float32 instead (costs about twice the memory).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32 if device != "mps" else torch.float16,
    attn_implementation="sdpa",  # drop this argument if the installed PyTorch rejects it
    low_cpu_mem_usage=True,
)
model.to(device)

# The generation KV cache is switched off for training, and activation
# checkpointing is turned on for the model.
model.config.use_cache = False
model.gradient_checkpointing_enable()

# ---- data ----
# Upper bounds on how many non-blank rows to keep from each split.
MAX_TRAIN_EXAMPLES = 5000
MAX_EVAL_EXAMPLES = 500


def take_nonempty(split, limit):
    """Return at most ``limit`` rows of ``split`` whose "text" is not blank.

    Args:
        split: A dataset split exposing ``filter``, ``select`` and ``len()``,
            with a string "text" column.
        limit: Maximum number of rows to keep, taken from the start of the
            filtered split.

    Returns:
        The filtered split truncated to ``min(limit, number of non-blank rows)``.
        Clamping the limit avoids the out-of-range error that a fixed
        ``select(range(limit))`` raises when fewer rows survive the filter.
    """
    nonempty = split.filter(lambda row: len(row["text"].strip()) > 0)
    return nonempty.select(range(min(limit, len(nonempty))))


dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

train_dataset = take_nonempty(dataset["train"], MAX_TRAIN_EXAMPLES)
eval_dataset = take_nonempty(dataset["validation"], MAX_EVAL_EXAMPLES)

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a "text" key holding the raw strings to encode.

    Returns:
        Whatever the tokenizer returns for those strings, truncated to at most
        256 tokens each and left unpadded (padding is deferred to batching).
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding=False, max_length=256, truncation=True)
    return encoded

# Tokenize both splits the same way (train first, then eval), dropping the
# original columns so only the tokenizer's outputs remain.
tokenized_train, tokenized_eval = [
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
]

# Causal-LM collation (mlm=False); the collator builds the batches from the
# unpadded examples.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# ---- training ----
training_args = TrainingArguments(
    # Where Trainer would write its outputs, and how long to train.
    output_dir="./deepseek-finetuned-mac",
    num_train_epochs=1,

    # Batch geometry: one example per step, gradients accumulated over
    # 8 steps before each optimizer update.
    gradient_accumulation_steps=8,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # Optimizer and schedule.
    optim="adamw_torch",      # plain PyTorch AdamW; considered the Mac-safe choice
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,

    # Precision -- IMPORTANT on a Mac: keep Trainer's mixed precision off.
    bf16=False,               # enable only if this Mac + torch build is known to handle it
    fp16=False,               # Trainer's fp16 path is not meant for MPS

    # Memory.
    gradient_checkpointing=True,

    # Logging and evaluation cadence.
    logging_first_step=True,
    logging_steps=10,
    eval_steps=200,
    eval_strategy="steps",

    # No checkpoints, no external experiment trackers, progress bars kept on.
    report_to="none",
    save_strategy="no",
    disable_tqdm=False,

    # Data loading.
    remove_unused_columns=False,
    dataloader_num_workers=0, # load in the main process; Macs are often happier this way
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_eval,
    train_dataset=tokenized_train,
)

trainer.train()


Using device: mps


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-12-22 16:48:02.966 python[99068:1389696] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-99068-2025-12-22_16_48_02-2638859746” because the volume “Macintosh HD” is out of space.
2025-12-22 16:48:02.983 python[99068:1389696] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-99068-2025-12-22_16_48_02-4206292096” because the volume “Macintosh HD” is out of space.
2025-12-22 16:48:02.989 python[99068:1389696] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-99068-2025-12-22_16_48_02-2429357442” because the volume “Macintosh HD” is out of space.
2025-12-22 16:48:03.054 python[99068:1389696] Error creating directory 
 The volume “Macintosh HD” is out of space. You can’t save the file “mpsgraph-99068-2025-12-22_16_48_03-716933158” because the volume “Macintosh HD” is out

KeyboardInterrupt: 