In [None]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers>=4.49.0  # ← Updated (was 4.41.0)
!pip install -q datasets>=2.14.0
!pip install -q accelerate>=0.25.0
!pip install -q peft>=0.17.0
!pip install -q bitsandbytes>=0.41.3
!pip install -q trl>=0.17.0
!pip install -q sentencepiece>=0.1.99
!pip install -q einops>=0.7.0


print("Packages installed!")

Packages installed!


## Mounting Google Drive to Access Data

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the training
# data and write the adapter under /content/drive/MyDrive (see DATA_PATH and
# MODEL_DIR in the config cell). Colab-only: relies on the google.colab package.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    AutoConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
    TaskType
)
from trl import SFTTrainer

## Training Config

In [None]:
# Training Config
# Every tunable value for the run lives here; later cells read these constants.

# Model paths
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # Hub id of the base model to fine-tune
MODEL_DIR = "/content/drive/MyDrive/298b/Qwen_2.5_7b_instruct_LoRA_FT"  # where the trained model/tokenizer are saved and later reloaded from
DATA_PATH = "/content/drive/MyDrive/298b/data.json"  # JSON file of Q&A conversations

# LoRA settings (a higher rank was chosen because training runs on an A100)
LORA_R = 128  # passed as LoraConfig(r=...)
LORA_ALPHA = 256  # passed as LoraConfig(lora_alpha=...)
# NOTE(review): 0.5 is far above the LoRA dropout values commonly used
# (roughly 0.05-0.1) - confirm this is intentional.
LORA_DROPOUT = 0.5
# Names of the modules that receive LoRA adapters - presumably the attention
# and MLP projection layers of the Qwen blocks; verify against the model.
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]

# Training settings
NUM_EPOCHS = 1
BATCH_SIZE = 4  # per-device batch size, used for both training and evaluation
GRADIENT_ACCUMULATION = 2  # micro-batches accumulated per optimizer step
LEARNING_RATE = 2e-4
MAX_SEQ_LENGTH = 1024  # intended cap on sequence length, in tokens
WARMUP_RATIO = 0.03  # passed as TrainingArguments(warmup_ratio=...)

# When False, the model is loaded with attn_implementation="sdpa" instead of
# "flash_attention_2".
USE_FLASH_ATTENTION = False  # Set False if not using flash attention
OPTIMIZER = "adamw_torch_fused"  # passed as TrainingArguments(optim=...)

## Loading and Preparing the Data

In [None]:
# Loading and Preparing the data

def load_and_prepare_data(data_path):
    """Read the Q&A JSON file and return a train/validation split of ChatML text.

    Each record must carry a ``conversations`` list; the first two entries'
    ``value`` fields are used as the user question and the assistant answer.
    Records with fewer than two turns are skipped.

    Args:
        data_path: Path to the UTF-8 encoded JSON file.

    Returns:
        The result of ``Dataset.train_test_split`` (90% ``train`` / 10%
        ``test``, seed 42) over rows holding a single ``text`` column.
    """
    print(f"Loading data from {data_path}...")

    with open(data_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    def to_chatml(turns):
        # One user turn followed by one assistant turn, in Qwen's ChatML layout.
        user_text = turns[0]['value']
        assistant_text = turns[1]['value']
        return (
            f"<|im_start|>user\n{user_text}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_text}<|im_end|>"
        )

    examples = [
        {"text": to_chatml(record['conversations'])}
        for record in records
        if len(record['conversations']) >= 2
    ]
    print(f"Prepared {len(examples)} training examples")

    splits = Dataset.from_list(examples).train_test_split(test_size=0.1, seed=42)

    print(f"  Train: {len(splits['train'])} examples")
    print(f"  Validation: {len(splits['test'])} examples")

    return splits

dataset = load_and_prepare_data(DATA_PATH)

Loading data from /content/drive/MyDrive/298b/data.json...
Prepared 747 training examples
  Train: 672 examples
  Validation: 75 examples


## Setting up Qwen and the Tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def setup_model_and_tokenizer(model_name, use_flash_attention=False):
    """Load the causal LM and its tokenizer, ready for fine-tuning on CUDA.

    Args:
        model_name: Hub id or local path of the model to load.
        use_flash_attention: If True, load with ``flash_attention_2``;
            otherwise use PyTorch ``sdpa`` attention.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in bfloat16, on ``cuda``,
        with gradient checkpointing enabled.
    """
    print(f"Loading model: {model_name}")

    # Load tokenizer directly (no processor needed for Qwen)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Keep the tokenizer's own pad token when it has one. Here the eos token is
    # the end-of-turn marker (<|im_end|>). The causal-LM collator sets the label
    # of every token whose id equals pad_token_id to -100, so forcing
    # pad == eos would also remove every real end-of-turn token from the loss
    # and the model would never learn where to stop.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print("Warning: tokenizer has no pad token; falling back to eos_token "
              "(end-of-turn tokens may then be masked out of the loss)")
    tokenizer.padding_side = "right"

    print(f"Tokenizer loaded with eos_token: {tokenizer.eos_token}")

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2" if use_flash_attention else "sdpa"
    )

    # Move to cuda
    model = model.to("cuda")
    model.gradient_checkpointing_enable()

    print("Model loaded successfully!")
    print(f"Model device: {next(model.parameters()).device}")

    return model, tokenizer

# Load model and tokenizer
model, tokenizer = setup_model_and_tokenizer(MODEL_NAME, USE_FLASH_ATTENTION)

Loading model: Qwen/Qwen2.5-7B-Instruct
Tokenizer loaded with eos_token: <|im_end|>


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!
Model device: cuda:0


## LoRA Config

In [None]:
# Applies LoRA

def apply_lora(model, r, alpha, dropout, target_modules):
    """Wrap ``model`` with LoRA adapters and report the trainable parameters.

    Args:
        model: The base causal LM to adapt.
        r: LoRA rank.
        alpha: LoRA alpha value.
        dropout: Dropout applied inside the LoRA layers.
        target_modules: Names of the modules that receive adapters.

    Returns:
        The model returned by ``get_peft_model``.
    """
    print("Applying LoRA configuration...")

    adapter_settings = dict(
        r=r,
        lora_alpha=alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    peft_model = get_peft_model(model, LoraConfig(**adapter_settings))

    # Prints the trainable / total parameter counts.
    peft_model.print_trainable_parameters()

    return peft_model

# Applies LoRA
model = apply_lora(model, LORA_R, LORA_ALPHA, LORA_DROPOUT, TARGET_MODULES)

Applying LoRA configuration...
trainable params: 322,961,408 || all params: 7,938,577,920 || trainable%: 4.0683


## Trainer Args for Training

In [None]:
# Trainer Arguments for Training
# Hyperparameters come from the config cell; everything else is fixed here.
# No checkpoints are written during training (save_strategy="no"); the final
# model is saved explicitly in a later cell.

training_args = TrainingArguments(
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION,
    optim=OPTIMIZER,  # Using adamw_torch_fused for A100
    save_strategy="no",  # Don't save checkpoints during training
    logging_steps=21,  # Log training loss on the same cadence as evaluation
    learning_rate=LEARNING_RATE,
    weight_decay=0.1,
    fp16=False,
    bf16=True,  # Use bfloat16 for A100
    max_grad_norm=0.3,  # Gradient clipping threshold
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    warmup_ratio=WARMUP_RATIO,
    group_by_length=True,  # Group sequences of similar length
    lr_scheduler_type="cosine",  # Cosine decay of the learning rate after warmup
    eval_strategy="steps",  # Evaluate every N steps
    eval_steps=21,  # Will eval at steps 21, 42, 63, 84
    load_best_model_at_end=False,  # No checkpoints to load from
    gradient_checkpointing=True,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    report_to="wandb",  # wandb for tracking
    push_to_hub=False
)

## Tokenizing the Dataset

In [None]:
from transformers import Trainer, DataCollatorForLanguageModeling


def tokenize_function(examples):
    """Tokenize a batch of ChatML strings for causal-LM training.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with a ``text``
            column holding the formatted conversations.

    Returns:
        The tokenizer output as plain Python lists, truncated to
        ``MAX_SEQ_LENGTH`` tokens and left unpadded.
    """
    # Truncate to the configured limit instead of a hard-coded 1024, and do not
    # pad here: the data collator pads each batch to its own longest sequence.
    # Padding every example to the maximum length up front wastes compute and
    # makes `group_by_length=True` a no-op because all lengths become equal.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors=None,  # Return lists, not tensors
    )

# Tokenize both splits. The raw "text" column is dropped so only the
# tokenizer's output columns remain in the datasets handed to the Trainer.
print("Tokenizing datasets...")
tokenized_train = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
tokenized_eval = dataset["test"].map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)
print("Datasets tokenized")


# Collator that batches the tokenized examples for language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

# Trainer: wires together the LoRA-wrapped model, the training arguments, the
# two tokenized splits and the collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Rough estimate only: examples // (per-device batch * accumulation) * epochs.
# Assumes a single device; the Trainer's own step count may differ by rounding.
print(f"\nTotal training steps: ~{len(tokenized_train) // (BATCH_SIZE * GRADIENT_ACCUMULATION) * NUM_EPOCHS}")

Tokenizing datasets...


Map:   0%|          | 0/672 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Datasets tokenized

Total training steps: ~84


## Training Loop with Trainer

In [None]:
# The actual training
# Runs the fine-tuning loop on the `trainer` built in the previous cell.
# Because the training arguments use report_to="wandb", the first run in a
# fresh session prompts for a Weights & Biases API key.

print("STARTING TRAINING")

trainer.train()

print("TRAINING COMPLETE!")

STARTING TRAINING


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mthomas-dvorochkin[0m ([33mthomas-dvorochkin-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
21,2.324,2.203235
42,2.1472,2.170479
63,2.1371,2.136456
84,2.1451,2.143224


TRAINING COMPLETE!


## Saving the Model's Weights

In [None]:
import os
import time

# Persist the trained model and the tokenizer to MODEL_DIR on Google Drive.
# `model` is the PEFT-wrapped model, so this presumably writes only the LoRA
# adapter weights/config rather than the full base model - verify on disk.
print(f"Saving to {MODEL_DIR}...")
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# NOTE(review): a fixed sleep does not guarantee the Drive upload has finished.
time.sleep(2)  # Waits for Google Drive sync

# Lists the saved files. NOTE(review): `files` is not printed or used in this
# cell - confirm whether a verification step was meant to follow.
files = os.listdir(MODEL_DIR)

Saving to /content/drive/MyDrive/298b/Qwen_2.5_7b_instruct_LoRA_FT...


## Loading both FT and Baseline Models

In [None]:
from peft import AutoPeftModelForCausalLM
import torch
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer

# Both checkpoints are loaded with the same dtype / placement options so the
# comparison below runs them under identical conditions.
_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Fine-tuned model: adapter saved in MODEL_DIR
print("Loading fine-tuned model...")
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(MODEL_DIR, **_load_kwargs)

# Untouched base model, used as the baseline
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_load_kwargs)

# Tokenizer comes from the base model; fall back to eos only if it has no pad token
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Models loaded\n")

Loading fine-tuned model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Models loaded



## Testing Generation: FT vs. Baseline

In [None]:
# Qwen was ending in incomplete sentences, so we truncate to the last complete sentence.

def truncate_to_complete_sentence(text):
    """Truncate ``text`` so that it ends at its last complete sentence.

    A sentence end is one or more of ``.``, ``?`` or ``!``, optionally followed
    by closing quotes/brackets, and then whitespace or the end of the text.
    Requiring that boundary keeps decimal points such as the one in ``3.14``
    from being mistaken for a sentence end, and keeps a closing quote that
    belongs to the final sentence.

    Args:
        text: The generated text, possibly cut off mid-sentence.

    Returns:
        ``text`` up to and including the last sentence end, or ``text``
        unchanged when no sentence end is found.
    """
    import re  # local import: the rest of this cell does not need `re`

    sentence_end = re.compile(r'''[.!?]+["')\]]*(?=\s|$)''')

    cut = None
    for match in sentence_end.finditer(text):
        cut = match.end()  # keep the right-most sentence boundary

    if cut is None:
        return text
    return text[:cut]

# System prompt used for the fine-tuned model when use_personality_prompt=True.
_PERSONA_SYSTEM_PROMPT = (
    "You are Neil deGrasse Tyson, astrophysicist and director of the Hayden Planetarium. "
    "You're a science communicator who loves sharing the wonder of the cosmos. "
    "Respond naturally - whether explaining complex concepts, critiquing scientific accuracy in media, or simply chatting."
)


def _build_chatml_prompt(question, system_prompt=None):
    """Return a ChatML prompt for ``question`` that ends with an open assistant turn."""
    system_turn = ""
    if system_prompt:
        system_turn = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    return f"{system_turn}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"


def _generate_reply(model, prompt, eos_token_id, **sampling_kwargs):
    """Sample up to 300 new tokens from ``model`` and return the cleaned-up reply."""
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=eos_token_id,  # Stop at end of turn
            **sampling_kwargs,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_length = inputs['input_ids'].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    reply = reply.replace("<|im_end|>", "").strip()
    return truncate_to_complete_sentence(reply)


def _print_section(title):
    """Print a section title framed by 80-character rules."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _print_wrapped(text, wrapper):
    """Print ``text`` line by line through ``wrapper``, skipping blank lines."""
    for line in text.split('\n'):
        if line.strip():
            print('\n'.join(wrapper.wrap(line)))


def compare(question, use_personality_prompt=False):
    """Print the fine-tuned and base model answers to ``question`` one after the other.

    Args:
        question: The user question sent to both models.
        use_personality_prompt: If True, the fine-tuned model's prompt starts
            with the persona system prompt.

    NOTE(review): the persona system prompt is only ever given to the
    fine-tuned model, and the two models use different sampling settings
    (the fine-tuned model adds a repetition penalty and an n-gram block).
    Confirm this asymmetry is intended for the comparison.
    """
    wrapper = textwrap.TextWrapper(width=80, break_long_words=False, replace_whitespace=False)

    # Id of the end-of-turn token, used as the stop condition for both models.
    eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

    print(f"\nQUESTION: {question}")

    # Fine-tuned model
    _print_section("FINE-TUNED MODEL:")
    ft_prompt = _build_chatml_prompt(
        question,
        _PERSONA_SYSTEM_PROMPT if use_personality_prompt else None,
    )
    response = _generate_reply(
        finetuned_model,
        ft_prompt,
        eos_token_id,
        temperature=0.6,
        top_p=0.85,
        repetition_penalty=1.15,  # Issue with repeated phrases so we add a penalty
        no_repeat_ngram_size=3,
    )
    _print_wrapped(response, wrapper)

    # Base model
    _print_section("BASE MODEL:")
    base_prompt = _build_chatml_prompt(question)
    base_response = _generate_reply(
        base_model,
        base_prompt,
        eos_token_id,
        temperature=0.7,
        top_p=0.9,
    )
    _print_wrapped(base_response, wrapper)
# Prompts used to compare the two models; every one is run with the persona
# system prompt enabled for the fine-tuned model.
questions = [
    "What do you think about black holes?",
    "Can you tell me a bit about yourself and what you do?",
    "Why is space exploration important?",
    "What's scientifically wrong about Star Wars",
    "Can you critique the physics in Marvel movies?",
    "Explain moons to me.",
    "Hey neil, how are you?",
    "Can you tell me about sports cars?",
    "Can I run to the moon?"
]

print("TESTING: Fine-tuned vs Base")
total_questions = len(questions)
for test_number, prompt_text in enumerate(questions, start=1):
    print(f"TEST {test_number}/{total_questions}")
    compare(prompt_text, use_personality_prompt=True)

print("\nComplete")

TESTING: Fine-tuned vs Base
TEST 1/9

QUESTION: What do you think about black holes?

FINE-TUNED MODEL:
Black holes fascinate me because they represent the most extreme conditions we
know to exist within our universe. They defy common sense yet follow precise
physical laws. If you were to fall into one (which I don't recommend), time
would appear to slow down for everyone else watching from afar while your body
experiences no change until it reaches the event horizon where space itself
becomes infinitely compressed. It's not just theoretical; there's evidence that
black holes formed during the early days of the universe after massive stars
exploded as supernovas. And here's something mind-blowingly cool: if you could
somehow harness all the mass-energy contained inside a black hole, you'd have
more energy than all the nuclear power plants on Earth combined! The problem is,
we can't get out what went in without breaking the cosmic speed limit set by
Einstein's relativity theory. So even

## Uploading to HF

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os
from huggingface_hub import login, HfApi, create_repo

# Merge the LoRA adapter into the base model, save the merged checkpoint to
# Colab local disk, and upload that folder to the Hugging Face Hub.

# Login to HuggingFace (interactive prompt for an access token)
login()

# Use Colab local storage (not Google Drive)
# NOTE(review): these variables are set after transformers / huggingface_hub
# have already been imported, so they may not take effect in this session -
# confirm. The loads below pass cache_dir explicitly in any case.
os.environ['HF_HOME'] = '/content/hf_cache'
os.environ['TRANSFORMERS_CACHE'] = '/content/hf_cache'

# Create the repo first
repo_id = "tdvoroch/qwen25-ndt-ft_merged"
print(f"Creating repo: {repo_id}")
try:
    create_repo(repo_id, repo_type="model", exist_ok=True)
    print("Repo created (or already exists)")
except Exception as e:
    # Best-effort: a failure is only printed and the script carries on, so a
    # real problem (e.g. bad credentials) would surface later, at upload time.
    print(f"Repo creation note: {e}")

# Fresh copy of the base model in bfloat16; the adapter is applied on top of it.
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    cache_dir="/content/hf_cache"
)

# Note: this rebinds the global name `model` used by the training cells.
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(
    base_model,
    MODEL_DIR  # This pulls from your Drive where training saved it
)

# Fold the adapter into the base model and drop the PEFT wrapper.
print("Merging weights...")
merged_model = model.merge_and_unload()

# Save to Colab local storage
output_dir = "/content/Qwen25_NDT_Merged"
print(f"Saving merged model to {output_dir}...")
merged_model.save_pretrained(output_dir)

# The tokenizer is taken from the base model and saved next to the merged weights.
print("Saving tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    trust_remote_code=True,
    cache_dir="/content/hf_cache"
)
tokenizer.save_pretrained(output_dir)

print("Merge complete!")

# Cleanup to free memory before upload
# Drops this cell's references to the models, then releases cached CUDA memory.
del base_model
del model
del merged_model
import gc
gc.collect()
torch.cuda.empty_cache()

# Upload the whole output folder (merged weights + tokenizer files) to the repo.
print("Uploading to HuggingFace...")
api = HfApi()
api.upload_folder(
    folder_path=output_dir,
    repo_id=repo_id,
    repo_type="model"
)

print(f"Done! https://huggingface.co/{repo_id}")

Creating repo: tdvoroch/qwen25-ndt-ft_merged
Repo created (or already exists)
Loading base model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading LoRA adapter...
Merging weights...
Saving merged model to /content/Qwen25_NDT_Merged...
Saving tokenizer...
Merge complete!
Uploading to HuggingFace...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...NDT_Merged/tokenizer.json: 100%|##########| 11.4MB / 11.4MB            

  ...0002-of-00004.safetensors:   0%|          |  610kB / 4.93GB            

  ...0003-of-00004.safetensors:   0%|          |  608kB / 4.33GB            

  ...0004-of-00004.safetensors:   2%|2         | 25.2MB / 1.09GB            

  ...0001-of-00004.safetensors:   1%|1         | 50.3MB / 4.88GB            

Done! https://huggingface.co/tdvoroch/qwen25-ndt-ft_merged
