In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install hf_xet
!pip install trl
!pip install --upgrade transformers
!pip install tensorboard
!pip install -U bitsandbytes
!pip install transformers_stream_generator
!pip install rouge_score
!pip install sacrebleu

Collecting trl
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading 

### Import Packages

In [None]:
import os
from datasets import load_dataset, concatenate_datasets, Dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU

### Load Datasets

In [None]:
# load tuning sets for <Sunrise on the Reaping>
sunrise_train_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_train_data.json'
sunrise_val_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_val_data.json'
sunrise_test_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_test_data.json'

sunrise_train_datasets = Dataset.from_json(sunrise_train_path)
sunrise_val_datasets = Dataset.from_json(sunrise_val_path)
sunrise_test_datasets = Dataset.from_json(sunrise_test_path)

In [None]:
# load tuning sets for <All Fours>
allfours_train_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_train_data.json'
allfours_val_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_val_data.json'
allfours_test_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_test_data.json'

allfours_train_datasets = Dataset.from_json(allfours_train_path)
allfours_val_datasets = Dataset.from_json(allfours_val_path)
allfours_test_datasets = Dataset.from_json(allfours_test_path)

In [None]:
# Combine two books together
train_set = concatenate_datasets([sunrise_train_datasets, allfours_train_datasets])
val_set = concatenate_datasets([sunrise_val_datasets, allfours_val_datasets])
test_set = concatenate_datasets([sunrise_test_datasets, allfours_test_datasets])

In [None]:
train_set

Dataset({
    features: ['Question', 'Answer', 'Reasoning'],
    num_rows: 958
})

### Model Tuning: Qwen3-8B

Load a Reasoning LLM: Qwen 3 - 8B and its specified tokeniser.

In [None]:
# Load Reasoning LLM: Qwen3-8B
qwen_model = 'Qwen/Qwen3-8B'
# Load the specified tokenizer for the model to ensure consistency
tokenizer = AutoTokenizer.from_pretrained(qwen_model)

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Quantisation Config: This is an important for QLoRA, and the key difference from LoRA.

In [None]:
# defines the quantisation settings to reduce the memory footpoint and computational cost of large models and often with minimal impact on performance
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # a key parameter which instructs the system to load the model's weights using 4-bit precision thus reducing the memory usage
    bnb_4bit_quant_type="nf4", # specifies the exact type of 4-bit quantisation to use, which is NF4. This is a specific floating-point format designed for neural networks.
    bnb_4bit_compute_dtype=torch.bfloat16, # specifies the computations involving these weights shouodl be using bfloat16
    bnb_4bit_use_double_quant=False, # do not use double quantisation which quantise the quantisation constants
)

In [None]:
# load the quantised model
model = AutoModelForCausalLM.from_pretrained(
    qwen_model,
    quantization_config=bnb_config,
    device_map="auto"  # instructs the transformers library to automatically distribute the model's layer cross available decives to optimise memory usage
)
model.config.use_cache = False

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Ensure the quantised model can be properly fine tuned.

In [None]:
# this step handles the underlying complexities to make sure that even though the model's parameters have been compressed to a lower precision, gradients can be still computed and propagated to the LoRA adapters, thus enabling effective fine-tuning.
# without this step, attempting to directly fine-tune a quantised model with LoRA might lead to compatibility issues or an inability to train correctly
model = prepare_model_for_kbit_training(model)

LoRA Configuration

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
model.print_trainable_parameters()

trainable params: 43,646,976 || all params: 8,234,382,336 || trainable%: 0.5301


In [None]:
def formatting_prompts_func(examples):
    texts = []
    for question, answer, reasoning_list in zip(examples["Question"], examples["Answer"], examples["Reasoning"]):
        reasoning_str = "\n".join([f"Step {i+1}: {step}" for i, step in enumerate(reasoning_list)])
        formatted_text = (
            f"### Question:\n{question}\n\n"
            f"### Thought Process:\n"
            f"{reasoning_str}\n\n"
            f"### Answer:\n{answer}"
        )
        texts.append(formatted_text)
    return {"text": texts}

In [None]:
train_dataset_formatted = train_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=train_set.column_names
)
validation_dataset_formatted = val_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=val_set.column_names
)
test_dataset_formatted = test_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=test_set.column_names
)

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_strategy="steps",
    report_to="tensorboard",
    dataloader_num_workers=0,
    max_steps=-1,
)


trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_formatted,
    eval_dataset=validation_dataset_formatted,
    peft_config=lora_config,
    args=training_arguments,
)

print("\nTrainer Configuration Completed。")

Converting train dataset to ChatML:   0%|          | 0/958 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/120 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Trainer Configuration Completed。


In [None]:
print("Model Tuning Start...")
trainer.train()
print("Model Tuning Completed")

Model Tuning Start...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,2.7712,2.907349
20,2.7839,2.871433
30,2.6589,2.81197
40,2.6803,2.730389
50,2.7796,2.635788
60,2.5444,2.540861
70,2.5494,2.431639
80,2.2831,2.318348
90,2.3163,2.212548
100,2.1564,2.081156


  return fn(*args, **kwargs)


Model Tuning Completed


In [None]:
# Save LoAR Adapter
output_model_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/QLoRA_Qwen"
trainer.save_model(output_model_path)
print(f"Fine-tuned LoRA LoRA Adapter saved to: {output_model_path}")

### Evaluation

In [None]:
output_model_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/QLoRA_Qwen"
model = AutoModelForCausalLM.from_pretrained(output_model_path)
tokenizer = AutoTokenizer.from_pretrained(output_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU

def evaluate_model(model, tokenizer, dataset):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bleu = BLEU()

    all_rouge_scores = {'rouge1': {'fmeasure': 0, 'precision': 0, 'recall': 0},
                        'rouge2': {'fmeasure': 0, 'precision': 0, 'recall': 0},
                        'rougeL': {'fmeasure': 0, 'precision': 0, 'recall': 0}}
    all_bleu_scores = 0
    count = 0

    for example in dataset:
        prompt = example["text"].split("### Answer:")[0] + "### Answer:"
        reference = example["text"].split("### Answer:")[1].strip()

        input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = input_data.input_ids.to(model.device)
        attention_mask = input_data.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=100,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                attention_mask = attention_mask
            )

        generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True).strip()

        rouge_scores = scorer.score(reference, generated_text)
        for key in all_rouge_scores:
            all_rouge_scores[key]['fmeasure'] += rouge_scores[key].fmeasure
            all_rouge_scores[key]['precision'] += rouge_scores[key].precision
            all_rouge_scores[key]['recall'] += rouge_scores[key].recall

        all_bleu_scores += bleu.sentence_score(generated_text, [reference]).score

        count += 1

    avg_rouge_scores = {}
    for key in all_rouge_scores:
        avg_rouge_scores[key] = {metric: score / count for metric, score in all_rouge_scores[key].items()}
    avg_bleu_score = all_bleu_scores / count

    return avg_rouge_scores, avg_bleu_score

print("\nEvaluating on Test Set...")

test_dataset_formatted = test_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=test_set.column_names
)
test_rouge, test_bleu = evaluate_model(model, tokenizer, test_dataset_formatted)
print("Test Set Metrics:")
print("ROUGE Scores:", test_rouge)
print("BLEU Score:", test_bleu)


Evaluating on Test Set...




Test Set Metrics:
ROUGE Scores: {'rouge1': {'fmeasure': 0.37235693681367865, 'precision': 0.38859348254507936, 'recall': 0.37136418312583697}, 'rouge2': {'fmeasure': 0.20766283783381595, 'precision': 0.21929105212020264, 'recall': 0.2041120533113388}, 'rougeL': {'fmeasure': 0.3162744816674813, 'precision': 0.3314280068067702, 'recall': 0.31397758915064305}}
BLEU Score: 14.505540775232165


In [None]:
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

output_model_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/QLoRA_Qwen"

# Reload the model with the 4-bit quantization configuration
# Ensure bnb_config is defined (it should be from previous cells)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    output_model_path,
    quantization_config=bnb_config, # Apply quantization again for inference
    device_map="auto" # Let transformers handle device placement
)

tokenizer = AutoTokenizer.from_pretrained(output_model_path)

# The model should already be on the device because of device_map="auto"
# No need for an explicit model.to(device) call if device_map="auto" is used

def chat_with_model(model, tokenizer):
    print("Start chatting with the model. Type 'quit' to exit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            break

        # Format the user input as a prompt
        prompt = f"### Question:\n{user_input}\n\n### Thought Process:\n"

        # Ensure input_ids are on the correct device
        input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = input_data.input_ids.to(model.device)
        attention_mask = input_data.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=200, # Adjust based on desired response length
                num_return_sequences=1,
                attention_mask=attention_mask,
                pad_token_id=tokenizer.eos_token_id,
                temperature=0.7, # Adjust for creativity (lower is less creative)
                top_p=0.9,       # Adjust for sampling
                do_sample=True   # Enable sampling
            )

        generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True).strip()

        # Find the start of the Answer section to only display the answer
        answer_start_index = generated_text.find("### Answer:")
        if answer_start_index != -1:
            # Find the start of the next section (if any)
            next_section_index = generated_text.find("###", answer_start_index + len("### Answer:"))
            if next_section_index != -1:
                generated_answer = generated_text[answer_start_index + len("### Answer:"):next_section_index].strip()
            else:
                generated_answer = generated_text[answer_start_index + len("### Answer:"):].strip()
            print(f"Model: {generated_answer}")
        else:
            # If no explicit Answer section, print the generated text after the Thought Process
            thought_process_end_index = generated_text.find("\n\n### Answer:")
            if thought_process_end_index != -1:
                print(f"Model: {generated_text[:thought_process_end_index].strip()}")
            else:
                 print(f"Model: {generated_text.strip()}")


# Start the chat interface
chat_with_model(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Start chatting with the model. Type 'quit' to exit.
You: Please summarise the book <Sunrise on the Reaping>.
Model: Step 1: Recall the main plot of the book: District 12's tributes Haymitch and Maysilee are forced to kill the mockingjays, leading to a rebellion.
Step 2: Identify the key events: The bombing of the arena, the victory of the Newcomers, the Capitol's retaliation, and the final destruction of the arena.
Step 3: Summarise these elements into a concise overview.

### Answer (Summary):
'Sunrise on the Reaping'  is the final book in the Hunger Games series, detailing the aftermath of the Newcomers' rebellion. It covers the bombing of the arena, the brutal retaliation by the Capitol, and the ultimate destruction of the Games, leading to a new world order.
You: quit
