# Imports

In [None]:
# For Preprocessing
!pip install -q -U datasets

import json
import pandas as pd
import json
import random
import os
from datasets import Dataset, load_dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# For Training

!pip install -q -U torch torchvision torchaudio fastai
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U tokenizers
!pip install -q -U evaluate
!pip install -q -U rouge_score
!pip install -q -U loralib einops xformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

import bitsandbytes
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    BitsAndBytesConfig,
)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m126.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Seed every random number generator used in this notebook (Python's
# `random`, NumPy, and PyTorch on CPU and CUDA) with one shared value so
# that runs are repeatable.
RANDOM_SEED = 33
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(RANDOM_SEED)

# Preprocessing

In [None]:
# Prep for download.
%cd /content/
!rm -rf DS266-ugarcia-bjulve
!git clone https://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@github.com/bjulve-ischool/DS266-ugarcia-bjulve.git
%cd DS266-ugarcia-bjulve
!ls .

train_file = 'data/v1-3/train.jsonl'
dev_file = 'data/v1-3/dev.jsonl'
test_file = 'data/v1-3/test.jsonl'

/content
Cloning into 'DS266-ugarcia-bjulve'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 32 (delta 8), reused 4 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (32/32), 3.34 MiB | 4.44 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/DS266-ugarcia-bjulve
Baseline_Model_Evaluation.ipynb  QFS_Datasets.ipynb
data				 README.md
EDA2.ipynb			 Socratic_FT_Data_Augmentation.ipynb
EDA.ipynb			 Socratic_Pretrained_Sampler.ipynb
outputs				 T5Gemma_Sampler.ipynb


In [None]:
# Helper to load the data into memory.
def load_data(file_path):
    """Load a JSON-lines file into (document, question, response) triplets.

    Each non-blank line of the file must be a JSON object with a
    "document" string and a "questions" list. Every question carries a
    "question_text" string and a "responses" list whose items each have a
    "response_text" string. One triplet is emitted per response.

    Runs of whitespace (including newlines) inside every text field are
    collapsed to single spaces.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list of ``(document, question_text, response_text)`` tuples in
        file order.
    """
    document_question_response = []
    with open(file_path, encoding="utf-8") as f:
        # Iterate line by line instead of `read().split("\n")[:-1]`: the
        # latter silently drops the final record when the file does not
        # end with a newline.
        for line in f:
            if not line.strip():
                # Tolerate blank lines rather than crashing in json.loads.
                continue
            data = json.loads(line)
            # Remove extra white space. The tokenizer works on subwords,
            # not sentences, so dropping newlines is unlikely to change
            # the meaning of the text.
            document = " ".join(data["document"].split())
            for question in data["questions"]:
                question_text = " ".join(question["question_text"].split())
                for response in question["responses"]:
                    response_text = " ".join(response["response_text"].split())
                    document_question_response.append(
                        (document, question_text, response_text)
                    )

    return document_question_response


# Read each split from its own file so the original train/dev/test
# partition is kept, then report how many triplets each split holds.
train_triplets, dev_triplets, test_triplets = (
    load_data(split_path) for split_path in (train_file, dev_file, test_file)
)
for split_label, split_triplets in (
    ("Train:", train_triplets),
    ("Dev:", dev_triplets),
    ("Test:", test_triplets),
):
    print(split_label, len(split_triplets))

# Create a HF dataset. Shuffle the order
# before returning it.
def make_dataset(triplets):
    """Turn (document, question, response) triplets into a shuffled HF Dataset.

    The triplets are transposed into three parallel columns named
    "document", "question" and "response". The resulting dataset is
    shuffled with ``RANDOM_SEED`` before it is returned.
    """
    documents, questions, responses = (list(column) for column in zip(*triplets))
    columns = {"document": documents, "question": questions, "response": responses}
    return Dataset.from_dict(columns).shuffle(seed=RANDOM_SEED)

# Build one shuffled dataset per split.
train_dataset, dev_dataset, test_dataset = map(
    make_dataset, (train_triplets, dev_triplets, test_triplets)
)

# Print a sample: one randomly chosen training row, with the document
# truncated to its first 50 characters and the field names in bold
# (ANSI escape codes).
random_sample = random.choice(train_dataset)
random_document = random_sample["document"]
random_question = random_sample["question"]
random_response = random_sample["response"]
print("\nRANDOM SAMPLE:\n")
print(f"\033[1mDocument:\033[0m {random_document[:50]}", "\n")
print(f"\033[1mQuestion:\033[0m {random_question}", "\n")
print(f"\033[1mResponse:\033[0m {random_response}", "\n")

Train: 1000
Dev: 500
Test: 1040

RANDOM SAMPLE:

[1mDocument:[0m THE MAN OUTSIDE By EVELYN E. SMITH Illustrated by  

[1mQuestion:[0m What is the relationship between Martin and Ives? 

[1mResponse:[0m Cousin Ives enters Martin’s life when he is a little older, and is the third descendant to accompany him as his guardian. Out of all his descendants to assume guardianship, Martin forms the closest relationship with Ives. Rather than seeing Martin as a responsibility and duty, Ives sees Martin as an individual and seeks ways to connect and encourage his passions. For one, Ives buys a yacht named The Interregnum to which the pair take upon themselves to explore the current world in. They traveled across the waters and inland to see both the civilized and uncivilized world, with Martin taking it all in. When it was just the two of them, their relationship progressed further. Ives began to open up about the future world that he and his descendants come from and explain the nuances of 

In [None]:
# Get the pretrained model and prepare it for QLoRA.
# We'll use the quantized version of the model for
# PEFT.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # The keyword is `bnb_4bit_use_double_quant`. The previous spelling
    # (`load_4bit_use_double_quant`) is not a BitsAndBytesConfig argument,
    # so double quantization was never actually switched on.
    bnb_4bit_use_double_quant=True,
)

socratic_checkpoint_name = "Salesforce/squality-socratic-books-30M"
# Load the checkpoint in 4-bit, placing the whole model on GPU 0.
socratic_model_quantized = AutoModelForSeq2SeqLM.from_pretrained(
    socratic_checkpoint_name,
    quantization_config=bnb_config,
    device_map={"": 0})
socratic_tokenizer_quantized = AutoTokenizer.from_pretrained(socratic_checkpoint_name)
socratic_model_quantized = prepare_model_for_kbit_training(socratic_model_quantized)

config = LoraConfig(
    r=16,
    lora_alpha=32,
    # NOTE(review): this checkpoint appears to be BART-based, and BART
    # attention names its output projection `out_proj`; the previous
    # `o_proj` entry would match no module and so receive no adapter.
    # Confirm with `socratic_model_quantized.named_modules()`.
    target_modules=["k_proj", "v_proj", "q_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    # Keep the output head fully trainable and saved with the adapter.
    modules_to_save=["lm_head"]
)

# Wrap the quantized base model with the LoRA adapters defined above.
socratic_model_quantized = get_peft_model(socratic_model_quantized, config)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# Tokenize the training and eval datasets and prep them for fine tuning.

# Use the tokenizer's own maximum length as the padding/truncation length
# for both the inputs and the labels.
MAX_SEQUENCE_LENGTH = socratic_tokenizer_quantized.model_max_length
print("Max sequence length: " + str(MAX_SEQUENCE_LENGTH), "\n")

def make_question_document_pairs(dataset):
    """Format each (question, document) row as one model input string.

    Reads the parallel "document" and "question" columns of ``dataset``
    and returns a list of strings of the form
    ``"<ask&answer> {question} <qsep> {document}"``, one per row.
    """
    return [
        f"<ask&answer> {question} <qsep> {document}"
        for document, question in zip(dataset["document"], dataset["question"])
    ]

def preprocess_socratic_batch(dataset, tokenizer):
    """Tokenize a batch of rows into model inputs and training labels.

    Args:
        dataset: A batch with parallel "document", "question" and
            "response" columns (as passed by ``Dataset.map(batched=True)``).
        tokenizer: The tokenizer used to encode both inputs and labels.

    Returns:
        A dict with:
          * "input_ids": the encoded "<ask&answer> question <qsep> document"
            strings, padded/truncated to ``MAX_SEQUENCE_LENGTH``;
          * "attention_mask": the matching mask, so padded input positions
            are not attended to;
          * "labels": the encoded responses, padded/truncated to
            ``MAX_SEQUENCE_LENGTH``, with padding positions set to -100.
    """
    question_document_pairs = make_question_document_pairs(dataset)

    input_encoded = tokenizer(
        question_document_pairs,
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    labels_encoded = tokenizer(
        dataset["response"],
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Replace label padding with -100, the index the loss ignores.
    # Otherwise the loss is dominated by predicting pad tokens, since the
    # labels are padded all the way to MAX_SEQUENCE_LENGTH. The label
    # attention mask identifies padding regardless of the pad token's id.
    labels = labels_encoded['input_ids'].masked_fill(
        labels_encoded['attention_mask'] == 0, -100)

    return {'input_ids': input_encoded['input_ids'],
            'attention_mask': input_encoded['attention_mask'],
            'labels': labels}

# Tokenize the training split in batches.
train_encoded = train_dataset.map(
    preprocess_socratic_batch,
    batched=True,
    fn_kwargs=dict(tokenizer=socratic_tokenizer_quantized),
)

# Tokenize the dev split the same way; it is used for validation.
val_encoded = dev_dataset.map(
    preprocess_socratic_batch,
    batched=True,
    fn_kwargs=dict(tokenizer=socratic_tokenizer_quantized),
)

# Show the resulting dataset summaries.
print()
for encoded_title, encoded_split in (
    ("Train encoded:", train_encoded),
    ("Val encoded:", val_encoded),
):
    print(encoded_title, encoded_split, "\n")

Max sequence length: 1024 



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]


Train encoded: Dataset({
    features: ['document', 'question', 'response', 'input_ids', 'labels'],
    num_rows: 1000
}) 

Val encoded: Dataset({
    features: ['document', 'question', 'response', 'input_ids', 'labels'],
    num_rows: 500
}) 



In [None]:
# Inspect the token ids of the first encoded training example.
for encoded_column in ("input_ids", "labels"):
    print(train_encoded[0][encoded_column])

[0, 50269, 3394, 16, 8001, 1825, 219, 116, 653, 2594, 7, 123, 1328, 5, 527, 116, 1437, 50266, 47507, 154, 13391, 870, 226, 1723, 7831, 16286, 4979, 975, 252, 3559, 75, 1050, 4, 252, 58, 402, 55, 578, 463, 402, 540, 578, 10010, 58, 6, 11, 765, 6, 9187, 18, 1991, 13, 7967, 328, 646, 19163, 438, 44260, 18, 6068, 35, 152, 364, 29015, 21, 2622, 31, 36580, 9, 318, 4662, 35320, 6, 772, 20990, 4, 19188, 17355, 557, 222, 45, 20489, 143, 1283, 14, 5, 121, 4, 104, 4, 4857, 15, 42, 5362, 21, 7867, 21838, 20, 44792, 25504, 8633, 8435, 8, 11491, 22597, 25, 69, 6684, 18569, 5668, 25169, 352, 11, 5, 475, 19873, 4084, 9, 5, 11355, 232, 751, 4, 264, 2551, 7, 28, 36844, 7, 253, 69, 17275, 18, 22379, 463, 259, 6, 80, 6317, 1109, 107, 31, 5, 2445, 40570, 15, 3875, 4, 5997, 8173, 4204, 12957, 11901, 17770, 8, 13314, 149, 69, 5179, 5924, 4, 8977, 272, 20143, 37893, 8, 6387, 81, 6, 3970, 13, 39, 10317, 4, 91, 21, 10, 380, 6, 6087, 12573, 196, 313, 6, 6254, 24503, 131, 53, 2724, 107, 9, 2640, 56, 11224, 159, 3

# Fine Tuning

In [None]:
# Define the training args and other parameters.
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir="outputs",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluation: once per epoch, generating text so ROUGE can be computed.
    eval_strategy="epoch",
    predict_with_generate=True,
    per_device_eval_batch_size=8,
    label_names=["labels"],
    # Optimisation.
    optim="paged_adamw_8bit",  # used with QLoRA
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    # NOTE(review): fp16 is enabled here while the 4-bit config earlier in
    # this notebook uses a bfloat16 compute dtype -- confirm this mix is
    # intended for the target GPU.
    fp16=True,
    # Logging.
    logging_steps=10,
    report_to='none',
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(
    model=socratic_model_quantized,
    tokenizer=socratic_tokenizer_quantized,
)

# ROUGE scorer used by compute_metrics.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE scores for generated predictions against the labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            supplied by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated token ids.

    Returns:
        The dict produced by the ROUGE metric (computed with stemming).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some model outputs carry extra tensors; the token ids come first.
        preds = preds[0]

    # decode preds and labels
    # -100 marks ignored/padded positions and is not a valid token id, so
    # swap it for the pad token before decoding. Predictions need this as
    # well as labels: they can be padded with -100 when batches of
    # different generated lengths are concatenated, which would make
    # batch_decode fail.
    pad_token_id = socratic_tokenizer_quantized.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)
    decoded_preds = socratic_tokenizer_quantized.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = socratic_tokenizer_quantized.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# Assemble the trainer from the PEFT-wrapped model, the tokenized
# train/validation splits, the collator and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=socratic_model_quantized,
    processing_class=socratic_tokenizer_quantized,
    data_collator=data_collator,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    compute_metrics=compute_metrics,
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Train the model. Weights & Biases is disabled through its environment
# variable before the run starts.
os.environ.update({'WANDB_MODE': 'disabled'})
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,14.3374,16.056553,0.068407,0.02032,0.058269,0.065271
2,12.3108,14.552622,0.06817,0.02054,0.058046,0.064998
3,11.7395,13.481462,0.068712,0.021098,0.05855,0.065721
4,10.8983,12.833299,0.068514,0.020666,0.058558,0.065449
5,10.5827,12.60545,0.068662,0.02117,0.058759,0.06558


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=625, training_loss=12.49685390625, metrics={'train_runtime': 2044.4224, 'train_samples_per_second': 2.446, 'train_steps_per_second': 0.306, 'total_flos': 1.252572266496e+16, 'train_loss': 12.49685390625, 'epoch': 5.0})

In [None]:
#!rm -rf ./models/socraticpretraining_baseline-2025-07-26_215517/

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo

!pwd
!mkdir -p ./models

# Get the current time in the US Pacific time zone.
timezone_obj = ZoneInfo("America/Los_Angeles")
current_time = datetime.now(timezone_obj)
current_time = current_time.strftime("%Y-%m-%d_%H%M%S")

model_name = "socraticpretraining_baseline-" + str(current_time)
trainer.save_model(f"./models/{model_name}")

from google.colab import drive
drive.mount('/content/drive')

!mkdir -p "/content/drive/MyDrive/DS266/project/models/{model_name}"
!cp -r ./models/{model_name}/* "/content/drive/MyDrive/DS266/project/models/{model_name}"

/content/DS266-ugarcia-bjulve
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
