In [1]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.43.0" "accelerate>=0.33.0" datasets peft bitsandbytes safetensors evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m1.5/1.8 MB[0m [31m22.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## Mount Drive and paths

In [2]:
from google.colab import drive
drive.mount("/content/drive")

# === Update these paths ===
DATA_FILES = [
    "/content/drive/MyDrive/Fine-tuning/train_expanded.json"
]

OUTPUT_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora"
MERGED_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_merged"

MODEL_NAME = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
SEED = 42

Mounted at /content/drive


## Load and validate dataset

In [3]:
import json, os, hashlib, random
from datasets import load_dataset, Dataset, concatenate_datasets

random.seed(SEED)

def load_all(files):
    dsets = []
    for fp in files:
        assert os.path.exists(fp), f"Missing file: {fp}"
        ds = load_dataset("json", data_files=fp, split="train")
        dsets.append(ds)
    if len(dsets) == 1:
        return dsets[0]
    return concatenate_datasets(dsets)

raw = load_all(DATA_FILES)
print("Raw size:", len(raw))
print("Columns:", raw.column_names)
assert set(["instruction","input","output"]).issubset(raw.column_names), "Dataset needs instruction/input/output"

def make_prompt(example):
    instr = (example.get("instruction") or "").strip()
    inp   = (example.get("input") or "").strip()
    # Clear, consistent prompt template:
    if inp:
        prompt = (
            "### Instruction\n"
            f"{instr}\n\n"
            "### Component\n"
            f"{inp}\n\n"
            "### Unit test:\n"
        )
    else:
        prompt = "### Instruction\n" + instr + "\n\n### Unit test:\n"

    completion = (example.get("output") or "").rstrip() + "\n"
    return {"prompt": prompt, "completion": completion}

mapped = raw.map(make_prompt, remove_columns=[c for c in raw.column_names if c not in ["instruction","input","output"]])
# drop original columns to keep prompt/completion only
mapped = mapped.remove_columns(["instruction","input","output"])
print("Mapped columns:", mapped.column_names)
print(mapped[0])

Generating train split: 0 examples [00:00, ? examples/s]

Raw size: 229
Columns: ['instruction', 'input', 'output']


Map:   0%|          | 0/229 [00:00<?, ? examples/s]

Mapped columns: ['prompt', 'completion']
{'prompt': '### Instruction\nGenerate Jest tests ensuring the component behaves correctly with different props.\n\n### Component\nimport * as React from "react";\n\nexport interface AccordionItemProps {\n  title: string;\n  children: React.ReactNode;\n  isOpen?: boolean;\n  onToggle?: () => void;\n}\n\nexport interface AccordionProps extends React.HTMLAttributes<HTMLDivElement> {\n  items: AccordionItemProps[];\n  allowMultiple?: boolean;\n}\n\nconst AccordionItem: React.FC<AccordionItemProps> = ({ \n  title, \n  children, \n  isOpen = false, \n  onToggle \n}) => {\n  return (\n    <div className="accordion-item">\n      <button\n        className="accordion-header"\n        onClick={onToggle}\n        aria-expanded={isOpen}\n      >\n        {title}\n      </button>\n      {isOpen && (\n        <div className="accordion-content">\n          {children}\n        </div>\n      )}\n    </div>\n  );\n};\n\nexport const Accordion = React.forwardRef<H

## Deduplicate

In [4]:
def row_hash(p, c):
    h = hashlib.sha256()
    h.update(p.encode("utf-8", "ignore"))
    h.update(c.encode("utf-8", "ignore"))
    return h.hexdigest()

seen = set()
kept = []
for ex in mapped:
    h = row_hash(ex["prompt"], ex["completion"])
    if h not in seen:
        seen.add(h)
        kept.append(ex)

ds = Dataset.from_list(kept)
print("After dedupe:", len(ds))

After dedupe: 174


## Train/test/val

In [5]:
split = ds.train_test_split(test_size=0.1, seed=SEED)         # 90/10
tmp   = split["train"].train_test_split(test_size=0.1111, seed=SEED)  # 80/10/10 overall
train_ds, val_ds, test_ds = tmp["train"], tmp["test"], split["test"]
len(train_ds), len(val_ds), len(test_ds)

(138, 18, 18)

## Tokanizer

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)

# Ensure pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

MAX_LENGTH = 1536  # adjust if your components are very long (1024–2048 typical)

def tokenize_and_mask(batch):
    prompts = batch["prompt"]
    completions = batch["completion"]
    input_ids, attention_mask, labels = [], [], []

    for p, c in zip(prompts, completions):
        full = p + c
        tok_full = tokenizer(full, max_length=MAX_LENGTH, truncation=True, padding="max_length")
        tok_prompt = tokenizer(p, max_length=MAX_LENGTH, truncation=True, padding="max_length")

        # count actual prompt tokens (non-pad)
        prompt_len = sum(1 for t in tok_prompt["input_ids"] if t != tokenizer.pad_token_id)
        lab = [-100]*prompt_len + tok_full["input_ids"][prompt_len:MAX_LENGTH]
        if len(lab) < MAX_LENGTH: lab += [-100]*(MAX_LENGTH - len(lab))

        input_ids.append(tok_full["input_ids"])
        attention_mask.append(tok_full["attention_mask"])
        labels.append(lab)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_tok = train_ds.map(tokenize_and_mask, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(tokenize_and_mask, batched=True, remove_columns=val_ds.column_names)
test_tok  = test_ds.map(tokenize_and_mask, batched=True, remove_columns=test_ds.column_names)
for d in (train_tok, val_tok, test_tok):
    d.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

len(train_tok), len(val_tok), len(test_tok)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

(138, 18, 18)

## Local model in 4bit and attach LoRA

In [8]:
# FIXED: use BitsAndBytesConfig from transformers, not bitsandbytes.nn.quantization
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# (optional) sanity check versions
import transformers, bitsandbytes as bnb
print("Transformers:", transformers.__version__, "| bitsandbytes:", bnb.__version__)

torch.cuda.empty_cache()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,   # A100: bf16 is great
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,              # compute dtype
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# If you added a pad token earlier, keep tokenizer and resize embeddings accordingly:
model.resize_token_embeddings(len(tokenizer))

# For k-bit training
model = prepare_model_for_kbit_training(model)

# Recommended when using gradient checkpointing (saves memory)
model.gradient_checkpointing_enable()
if hasattr(model, "config"):
    model.config.use_cache = False  # important for checkpointing + training

# LoRA config
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

Transformers: 4.56.1 | bitsandbytes: 0.47.0


config.json: 0.00B [00:00, ?B/s]

configuration_deepseek.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_deepseek.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct:
- modeling_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00001-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00004-of-000004.safetensors:   0%|          | 0.00/5.64G [00:00<?, ?B/s]

model-00002-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

trainable params: 286,712,832 || all params: 15,983,440,384 || trainable%: 1.7938


## Training setup

In [11]:
%env WANDB_DISABLED=true

env: WANDB_DISABLED=true


In [13]:
from packaging import version
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

print("Transformers version:", transformers.__version__)

BATCH_SIZE = 4
GRAD_ACCUM = 8
EPOCHS = 3
LR = 2e-4

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

common_kwargs = dict(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    logging_steps=20,
    # these two are widely supported, even in older versions:
    save_steps=200,
    eval_steps=200,
    # safer defaults across versions:
    fp16=False,  # we'll set bf16 if supported below
)

# Prefer bf16 if available (A100). Older versions might not accept bf16.
if hasattr(TrainingArguments, "__init__") and "bf16" in TrainingArguments.__init__.__code__.co_varnames:
    common_kwargs["bf16"] = True
else:
    # if bf16 isn't supported, we can optionally use fp16
    common_kwargs["fp16"] = True

# Try new-style args first
args = None
try:
    args = TrainingArguments(
        **common_kwargs,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.05,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_total_limit=3,
        load_best_model_at_end=True,
        report_to="none",
        remove_unused_columns=False,
    )
except TypeError as e:
    print("Falling back to legacy-compatible TrainingArguments due to:", e)
    # Minimal set that exists in older versions
    args = TrainingArguments(**common_kwargs)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok if len(val_tok) else None,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Transformers version: 4.56.1
Falling back to legacy-compatible TrainingArguments due to: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


Step,Training Loss


Step,Training Loss


KeyboardInterrupt: 

In [15]:
# --- disable KV-cache completely for training ---
try:
    model.config.use_cache = False
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = False
except Exception as e:
    print("use_cache toggle note:", e)

In [17]:
# === SPEED + STABILITY TRAIN (cache OFF) ===
import os, shutil, gc, torch, transformers
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

print("Transformers:", transformers.__version__)
os.environ["WANDB_DISABLED"] = "true"

# make sure cache is OFF (prevents DynamicCache path)
try:
    model.config.use_cache = False
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = False
except Exception as e:
    print("use_cache toggle note:", e)

# speed knobs
LOCAL_OUT = "/content/ckpt_deepseek_lora"
FINAL_DRIVE_OUT = OUTPUT_DIR
BATCH_SIZE = 8
GRAD_ACCUM = 2
EPOCHS = 2
LR = 2e-4
LOG_STEPS = 200

# keep seq length modest for speed (optional)
try:
    MAX_LENGTH = min(1024, MAX_LENGTH)
except NameError:
    pass

# ensure GC off and we don't pay its overhead
try:
    model.gradient_checkpointing_disable()
except Exception:
    pass
if hasattr(model, "config"):
    model.config.gradient_checkpointing = False

gc.collect(); torch.cuda.empty_cache()

# tokenizer pad token safety
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
    try:
        model.resize_token_embeddings(len(tokenizer))
    except Exception:
        pass

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# fresh local out
if os.path.exists(LOCAL_OUT):
    shutil.rmtree(LOCAL_OUT)

base_kwargs = dict(
    output_dir=LOCAL_OUT,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.05,
    logging_steps=LOG_STEPS,
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=2,
)

# prefer bf16 on A100; fall back to fp16 if not supported
if "bf16" in TrainingArguments.__init__.__code__.co_varnames:
    base_kwargs["bf16"] = True
else:
    base_kwargs["fp16"] = True

# build TrainingArguments with fallbacks (avoid mid-run saves/eval)
try:
    args = TrainingArguments(
        **base_kwargs,
        evaluation_strategy="no",
        save_strategy="no",
        logging_first_step=True,
    )
except TypeError as e:
    print("Legacy TrainingArguments fallback:", e)
    args = TrainingArguments(
        **base_kwargs,
        save_steps=10_000_000,
        eval_steps=10_000_000,
    )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=None,
    data_collator=data_collator,
)

print("🚀 Training with cache disabled…")
out = trainer.train()
print(out)

trainer.save_model(LOCAL_OUT)
tokenizer.save_pretrained(LOCAL_OUT)
print("✅ Saved locally:", LOCAL_OUT)

# copy once to Drive
if os.path.exists(FINAL_DRIVE_OUT):
    shutil.rmtree(FINAL_DRIVE_OUT)
shutil.copytree(LOCAL_OUT, FINAL_DRIVE_OUT)
print("✅ Copied to Drive:", FINAL_DRIVE_OUT)

Transformers: 4.56.1
Legacy TrainingArguments fallback: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
🚀 Training with cache disabled…


OutOfMemoryError: CUDA out of memory. Tried to allocate 576.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 168.88 MiB is free. Process 6896 has 39.38 GiB memory in use. Of the allocated memory 38.74 GiB is allocated by PyTorch, and 140.81 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [18]:
# ONE-TIME per runtime
import os, gc, torch
os.environ["WANDB_DISABLED"] = "true"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # reduces fragmentation on long runs

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

gc.collect()
torch.cuda.empty_cache()
try:
    torch.cuda.reset_peak_memory_stats()
except Exception:
    pass

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [22]:
# Re-tokenize with smaller max length (e.g., 768). You can try 896 or 1024 if this fits.
NEW_MAX_LENGTH = 640

def tokenize_and_mask_small(batch):
    prompts = batch["prompt"]
    completions = batch["completion"]
    input_ids, attention_mask, labels = [], [], []
    pad_id = tokenizer.pad_token_id

    for p, c in zip(prompts, completions):
        full = p + c
        tok_full = tokenizer(full, max_length=NEW_MAX_LENGTH, truncation=True, padding="max_length")
        tok_prompt = tokenizer(p,    max_length=NEW_MAX_LENGTH, truncation=True, padding="max_length")

        prompt_len = sum(1 for t in tok_prompt["input_ids"] if t != pad_id)
        lab = [-100]*prompt_len + tok_full["input_ids"][prompt_len:NEW_MAX_LENGTH]
        if len(lab) < NEW_MAX_LENGTH:
            lab += [-100]*(NEW_MAX_LENGTH - len(lab))

        input_ids.append(tok_full["input_ids"])
        attention_mask.append(tok_full["attention_mask"])
        labels.append(lab)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Rebuild tokenized datasets
train_tok = train_ds.map(tokenize_and_mask_small, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(tokenize_and_mask_small,   batched=True, remove_columns=val_ds.column_names) if len(val_ds) else None
test_tok  = test_ds.map(tokenize_and_mask_small,  batched=True, remove_columns=test_ds.column_names) if len(test_ds) else None

for d in [train_tok, val_tok, test_tok]:
    if d is not None:
        d.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Tokenized with NEW_MAX_LENGTH:", NEW_MAX_LENGTH)

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Tokenized with NEW_MAX_LENGTH: 640


In [23]:
# Turn OFF KV-cache for training + ENABLE gradient checkpointing to save VRAM
try:
    model.config.use_cache = False
    if hasattr(model, "generation_config"):
        model.generation_config.use_cache = False
except Exception as e:
    print("use_cache toggle note:", e)

try:
    model.gradient_checkpointing_enable()
    if hasattr(model, "config"):
        model.config.gradient_checkpointing = True
except Exception as e:
    print("checkpointing note:", e)

In [24]:
import os, shutil, gc, transformers
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

print("Transformers:", transformers.__version__)

LOCAL_OUT = "/content/ckpt_memsafe"     # fast local dir
FINAL_DRIVE_OUT = OUTPUT_DIR            # your Drive path
if os.path.exists(LOCAL_OUT):
    shutil.rmtree(LOCAL_OUT)

# Small per-device batch + higher grad accumulation
PER_DEVICE = 1
GRAD_ACCUM = 16       # effective batch = 16
EPOCHS = 2
LR = 2e-4

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

base_kwargs = dict(
    output_dir=LOCAL_OUT,
    per_device_train_batch_size=PER_DEVICE,
    per_device_eval_batch_size=PER_DEVICE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    weight_decay=0.05,
    logging_steps=200,
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=2,
    group_by_length=True,   # less padding -> less memory
)

# Prefer bf16 on A100
if "bf16" in TrainingArguments.__init__.__code__.co_varnames:
    base_kwargs["bf16"] = True
else:
    base_kwargs["fp16"] = True

# Build TrainingArguments without mid-run eval/saves (with legacy fallback)
try:
    args = TrainingArguments(
        **base_kwargs,
        evaluation_strategy="no",
        save_strategy="no",
        logging_first_step=True,
    )
except TypeError as e:
    print("Legacy fallback:", e)
    args = TrainingArguments(
        **base_kwargs,
        save_steps=10_000_000,
        eval_steps=10_000_000,
    )

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=None,     # disable mid-run eval
    data_collator=collator,
)

import torch, gc
gc.collect(); torch.cuda.empty_cache()
print("🚀 Training (memory-safe)…")
train_out = trainer.train()
print(train_out)

# Save locally (fast)
trainer.save_model(LOCAL_OUT)
tokenizer.save_pretrained(LOCAL_OUT)
print("✅ Saved locally:", LOCAL_OUT)

# Copy once to Drive
if os.path.exists(FINAL_DRIVE_OUT):
    shutil.rmtree(FINAL_DRIVE_OUT)
shutil.copytree(LOCAL_OUT, FINAL_DRIVE_OUT)
print("✅ Copied to Drive:", FINAL_DRIVE_OUT)

Transformers: 4.56.1
Legacy fallback: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
🚀 Training (memory-safe)…


Step,Training Loss


Step,Training Loss




TrainOutput(global_step=18, training_loss=0.04769125580787659, metrics={'train_runtime': 3772.8066, 'train_samples_per_second': 0.073, 'train_steps_per_second': 0.005, 'total_flos': 1.67227951546368e+16, 'train_loss': 0.04769125580787659, 'epoch': 2.0})




✅ Saved locally: /content/ckpt_memsafe
✅ Copied to Drive: /content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora


## test

In [27]:
from pathlib import Path
import os

# 👇 change this to your real folder (absolute path!)
ADAPTER_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora"

print("Exists:", os.path.exists(ADAPTER_DIR))
print("Files:", os.listdir(ADAPTER_DIR)[:10])

Exists: True
Files: ['checkpoint-18', 'training_args.bin', 'README.md', 'adapter_model.safetensors', 'chat_template.jinja', 'tokenizer_config.json', 'tokenizer.json', 'adapter_config.json', 'special_tokens_map.json']


In [33]:
# 1) Make sure Drive is mounted
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# 2) Find your adapter folder automatically (looks for adapter_model.safetensors)
import os, glob, re
candidates = []
for root in [
    "/content",
    "/content/drive/MyDrive",     # default Drive mount
]:
    for p in glob.glob(root + "/**/adapter_model.safetensors", recursive=True):
        candidates.append(os.path.dirname(p))

print("Found adapter dirs:")
for i, c in enumerate(candidates):
    print(f"[{i}] {c}")

# Prefer the one that also contains adapter_config.json
def looks_good(d):
    return os.path.exists(os.path.join(d, "adapter_config.json"))

candidates_sorted = sorted(candidates, key=lambda d: (not looks_good(d), len(d)))

if not candidates_sorted:
    raise FileNotFoundError("Could not find any folder containing adapter_model.safetensors. "
                            "Check your OUTPUT_DIR path and that training finished successfully.")

ADAPTER_DIR = candidates_sorted[0]
print("\nUsing ADAPTER_DIR:", ADAPTER_DIR)
print("Dir listing:", os.listdir(ADAPTER_DIR)[:12])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


KeyboardInterrupt: 

In [32]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

# Use the tokenizer saved with your adapter (it includes pad token changes)
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, trust_remote_code=True, local_files_only=True)

# Load base, then apply adapter
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR, local_files_only=True)
model.eval()

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'llm-train/ckpts/deepseek_coder_v2_lite_lora'. Use `repo_type` argument if needed.

## Prepare model for Ollama

In [34]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from pathlib import Path

BASE_MODEL = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
ADAPTER_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora"   # <-- change to your folder
MERGED_DIR  = "/content/merged_deepseek_v2lite"                                      # output

Path(MERGED_DIR).mkdir(parents=True, exist_ok=True)

# Load tokenizer — prefer the one saved with your run (has your pad token if you added one)
try:
    tok = AutoTokenizer.from_pretrained(ADAPTER_DIR, trust_remote_code=True, local_files_only=True)
except:
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Merge LoRA into base
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="cpu", torch_dtype=torch.float16, trust_remote_code=True
)
ft = PeftModel.from_pretrained(base, ADAPTER_DIR, local_files_only=True)
merged = ft.merge_and_unload()

# Save merged HF model
merged.save_pretrained(MERGED_DIR, safe_serialization=True)  # .safetensors preferred
tok.save_pretrained(MERGED_DIR)

print("✅ Merged model saved to:", MERGED_DIR)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
	size mismatch for base_model.model.model.embed_tokens.weight: copying a param with shape torch.Size([100018, 2048]) from checkpoint, the shape in current model is torch.Size([102400, 2048]).
	size mismatch for base_model.model.lm_head.weight: copying a param with shape torch.Size([100018, 2048]) from checkpoint, the shape in current model is torch.Size([102400, 2048]).

### Run with Huggin face

In [35]:
from google.colab import drive
drive.mount('/content/drive')

ADAPTER_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora"  # 👈 change if needed
BASE_MODEL  = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# === Robust loader: 4-bit on GPU -> fallback to 8-bit with CPU offload ===
import os, gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

BASE_MODEL  = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
ADAPTER_DIR = "/content/drive/MyDrive/llm-train/ckpts/deepseek_coder_v2_lite_lora"  # <-- update

# Clean up VRAM / allocator fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    try: torch.cuda.reset_peak_memory_stats()
    except: pass

# Load tokenizer (prefer adapter dir if you added a pad token during training)
try:
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, trust_remote_code=True, local_files_only=True)
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

def load_4bit_full_gpu():
    bnb4 = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16,
    )
    # Force everything to GPU; avoid auto offload
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        quantization_config=bnb4,
        device_map={"": 0},
        low_cpu_mem_usage=True,
    )
    # Ensure vocab matches (pad token possibly added during training)
    try:
        base.resize_token_embeddings(len(tokenizer))
    except Exception:
        pass
    mdl = PeftModel.from_pretrained(base, ADAPTER_DIR, local_files_only=True)
    return mdl

def load_8bit_with_cpu_offload():
    # This allows some layers to stay on CPU in FP32 while the rest runs 8-bit on GPU
    bnb8 = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=6.0,
        llm_int8_enable_fp32_cpu_offload=True,
    )
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        quantization_config=bnb8,
        device_map="auto",                # balanced offload (GPU + CPU okay in int8)
        low_cpu_mem_usage=True,
    )
    try:
        base.resize_token_embeddings(len(tokenizer))
    except Exception:
        pass
    mdl = PeftModel.from_pretrained(base, ADAPTER_DIR, local_files_only=True, device_map="auto")
    return mdl

# Try 4-bit on GPU first; if it throws, fall back to 8-bit CPU-offload
model = None
try:
    model = load_4bit_full_gpu()
    print("✅ Loaded base in 4-bit on GPU + LoRA adapter")
except Exception as e:
    print("4-bit full-GPU load failed → falling back to 8-bit with CPU offload.\nReason:", repr(e))
    gc.collect();
    if torch.cuda.is_available(): torch.cuda.empty_cache()
    model = load_8bit_with_cpu_offload()
    print("✅ Loaded base in 8-bit with CPU offload + LoRA adapter")

# Inference helper
model.eval()
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

def build_prompt(component_code: str):
    return (
        "### Instruction\n"
        "Write unit tests using React Testing Library and Jest for this component. "
        "Use accessible queries (getByRole/findByRole), @testing-library/user-event for interactions, "
        "and jest-dom matchers.\n\n"
        "### Component\n" + component_code + "\n\n"
        "### Unit test:\n"
    )

def generate_test(component_code: str, max_new_tokens=480, temperature=0.2, top_p=0.95):
    prompt = build_prompt(component_code)
    out = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.05,
        eos_token_id=tokenizer.eos_token_id,
    )[0]["generated_text"]
    return out[len(prompt):] if out.startswith(prompt) else out

# Smoke test (short)
example_component = """import React from 'react';
export default function Button({ label, onClick }) {
  return <button onClick={onClick}>{label}</button>;
}"""
print(generate_test(example_component)[:600])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Loaded base in 4-bit on GPU + LoRA adapter


ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.

In [2]:
# --- Build a pipeline WITHOUT `device=` when model was loaded with accelerate/device_map ---
from transformers import pipeline

# keep cache ON for faster generation
if hasattr(model, "config"):
    model.config.use_cache = True

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # do NOT pass device=... when using device_map/accelerate
    # device_map is already handled by the model you loaded
    return_full_text=False,   # only return the completion (nicer for our prompt template)
)

def build_prompt(component_code: str):
    return (
        "### Instruction\n"
        "Write unit tests using React Testing Library and Jest for this component. "
        "Use accessible queries (getByRole/findByRole), @testing-library/user-event for interactions, "
        "and jest-dom matchers.\n\n"
        "### Component\n" + component_code + "\n\n"
        "### Unit test:\n"
    )

def generate_test(component_code: str, max_new_tokens=480, temperature=0.2, top_p=0.95):
    prompt = build_prompt(component_code)
    out = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.05,
        eos_token_id=tokenizer.eos_token_id,
    )[0]["generated_text"]
    # return_full_text=False already trims the prompt, but keep this guard:
    return out

# quick smoke test
example_component = """import React from 'react';
export default function Button({ label, onClick }) {
  return <button onClick={onClick}>{label}</button>;
}"""

print(generate_test(example_component)[:800])

Device set to use cuda:0


AttributeError: 'DynamicCache' object has no attribute 'seen_tokens'

In [3]:
# === Inference with fine-tuned DeepSeek (4-bit + LoRA) ===
import torch
from transformers import pipeline

# Make sure cache is enabled for generation (faster)
if hasattr(model, "config"):
    model.config.use_cache = True

# Create pipeline (⚠️ no device=... here, since model was loaded with accelerate/device_map)
gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,   # only return completion, not the input prompt
)

def build_prompt(component_code: str) -> str:
    return (
        "### Instruction\n"
        "Write unit tests using React Testing Library and Jest for this component. "
        "Use accessible queries (getByRole/findByRole), @testing-library/user-event for interactions, "
        "and jest-dom matchers.\n\n"
        "### Component\n" + component_code + "\n\n"
        "### Unit test:\n"
    )

def generate_test(component_code: str, max_new_tokens: int = 480,
                  temperature: float = 0.2, top_p: float = 0.95) -> str:
    prompt = build_prompt(component_code)
    outputs = gen_pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=1.05,
        eos_token_id=tokenizer.eos_token_id,
    )
    return outputs[0]["generated_text"]

# 🔹 Example usage
example_component = """import React from 'react';
export default function Button({ label, onClick }) {
  return <button onClick={onClick}>{label}</button>;
}"""

print(generate_test(example_component)[:800])

Device set to use cuda:0


AttributeError: 'DynamicCache' object has no attribute 'seen_tokens'

In [7]:
# === Faster inference (no pipeline, greedy decode, cache OFF) ===
import torch

# speed-friendly flags on A100
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# keep cache OFF to avoid DynamicCache errors
if hasattr(model, "config"):
    model.config.use_cache = False
if hasattr(model, "generation_config"):
    model.generation_config.use_cache = False

# make sure pad/eos are set
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

def build_prompt(component_code: str) -> str:
    return (
        "### Instruction\n"
        "Write unit tests using React Testing Library and Jest for this component. "
        "Use accessible queries (getByRole/findByRole), @testing-library/user-event for interactions, "
        "and jest-dom matchers.\n\n"
        "### Component\n" + component_code + "\n\n"
        "### Unit test:\n"
    )

@torch.inference_mode()
def generate_test_fast(component_code: str, max_new_tokens: int = 320) -> str:
    prompt = build_prompt(component_code)
    # If you loaded with device_map={"":0}, everything is on a single GPU:
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    out_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,          # greedy = faster & more deterministic
        temperature=0.0,
        top_p=1.0,
        repetition_penalty=1.05,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=False,          # critical for your model build
    )
    # strip the prompt portion
    gen_only = out_ids[0, inputs["input_ids"].shape[1]:]
    return tokenizer.decode(gen_only, skip_special_tokens=True)

# 🔹 Smoke test
example_component = """import React from 'react';
export default function Button({ label, onClick }) {
  return <button onClick={onClick}>{label}</button>;
}"""

print(generate_test_fast(example_component)[:800])

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


KeyboardInterrupt: 