# Imports

In [None]:
# For Preprocessing
!pip install -q -U datasets

import json
import pandas as pd
import json
import random
import os
from datasets import Dataset, load_dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# For Training

!pip install -q -U torch torchvision torchaudio fastai
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U tokenizers
!pip install -q -U evaluate
!pip install -q -U rouge_score
!pip install -q -U loralib einops xformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

import bitsandbytes
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    BitsAndBytesConfig,
)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m571.0/571.0 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.2/200.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Set random seed for reproducibility.
# The same seed is pushed into every random number generator this notebook
# uses: Python's `random`, NumPy's global generator, and PyTorch (CPU and CUDA).
RANDOM_SEED = 33
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
#tf.random.set_seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

# Load Data

In [None]:
# Prep for download.
%cd /content/
!rm -rf DS266-ugarcia-bjulve
!git clone https://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@github.com/bjulve-ischool/DS266-ugarcia-bjulve.git
%cd DS266-ugarcia-bjulve
!ls .

train_file = 'data/v1-3/train.jsonl'
dev_file = 'data/v1-3/dev.jsonl'
test_file = 'data/v1-3/test.jsonl'

/content
Cloning into 'DS266-ugarcia-bjulve'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 32 (delta 8), reused 4 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (32/32), 3.34 MiB | 6.14 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/DS266-ugarcia-bjulve
Baseline_Model_Evaluation.ipynb  QFS_Datasets.ipynb
data				 README.md
EDA2.ipynb			 Socratic_FT_Data_Augmentation.ipynb
EDA.ipynb			 Socratic_Pretrained_Sampler.ipynb
outputs				 T5Gemma_Sampler.ipynb


In [None]:
# Helper to load the data into memory.
def load_data(file_path):
    """Load a JSONL file and flatten it into (document, question, response) triplets.

    Every non-blank line is parsed as one JSON object providing a "document"
    string and a "questions" list. Each question provides "question_text" and
    a "responses" list whose items provide "response_text". One triplet is
    produced per response.

    Runs of whitespace (including newlines) in all three texts are collapsed
    to single spaces.

    Args:
        file_path: Path to the JSON-lines file, read as UTF-8.

    Returns:
        A list of ``(document, question_text, response_text)`` tuples in file
        order.
    """
    document_question_response = []
    with open(file_path, encoding="utf-8") as f:
        # Iterate the file and skip blank lines instead of slicing off the
        # last element of split("\n"): that slice silently dropped the final
        # record whenever the file did not end with a newline.
        for line in f:
            if not line.strip():
                continue
            data = json.loads(line)
            document = " ".join(data["document"].split())
            for question in data["questions"]:
                question_text = " ".join(question["question_text"].split())
                for response in question["responses"]:
                    response_text = " ".join(response["response_text"].split())
                    document_question_response.append(
                        (document, question_text, response_text)
                    )

    return document_question_response


# Get the data. Preserve the original splits.
train_triplets = load_data(train_file)
dev_triplets = load_data(dev_file)
test_triplets =  load_data(test_file)
print("Train:", len(train_triplets))
print("Dev:", len(dev_triplets))
print("Test:", len(test_triplets))

# Create a HF dataset. Shuffle the order
# before returning it.
def make_dataset(triplets):
    """Turn (document, question, response) triplets into a shuffled HF Dataset.

    The triplets are transposed into three parallel columns named
    "document", "question" and "response", and the resulting dataset is
    shuffled with the notebook-wide RANDOM_SEED.
    """
    documents, questions, responses = map(list, zip(*triplets))
    columns = {
        "document": documents,
        "question": questions,
        "response": responses,
    }
    return Dataset.from_dict(columns).shuffle(seed=RANDOM_SEED)

train_dataset = make_dataset(train_triplets)
dev_dataset = make_dataset(dev_triplets)
test_dataset = make_dataset(test_triplets)

# Print a sample.
random_sample = random.choice(train_dataset)
random_document, random_question, random_response = random_sample["document"], random_sample["question"], random_sample["response"]
print("\nRANDOM SAMPLE:\n")
print(f"\033[1mDocument:\033[0m {random_document[:50]}", "\n")
print(f"\033[1mQuestion:\033[0m {random_question}", "\n")
print(f"\033[1mResponse:\033[0m {random_response}", "\n")

Train: 1000
Dev: 500
Test: 1040

RANDOM SAMPLE:

[1mDocument:[0m THE MAN OUTSIDE By EVELYN E. SMITH Illustrated by  

[1mQuestion:[0m What is the relationship between Martin and Ives? 

[1mResponse:[0m Cousin Ives enters Martin’s life when he is a little older, and is the third descendant to accompany him as his guardian. Out of all his descendants to assume guardianship, Martin forms the closest relationship with Ives. Rather than seeing Martin as a responsibility and duty, Ives sees Martin as an individual and seeks ways to connect and encourage his passions. For one, Ives buys a yacht named The Interregnum to which the pair take upon themselves to explore the current world in. They traveled across the waters and inland to see both the civilized and uncivilized world, with Martin taking it all in. When it was just the two of them, their relationship progressed further. Ives began to open up about the future world that he and his descendants come from and explain the nuances of 

# Data Augmentation (Socratic FT samples)

In [None]:
# First get a subset of the original training examples. Stratify it by the
# length of the responses so that the subset covers the full breadth of the
# response (summary) length distribution.

df = train_dataset.to_pandas()
# Three equal-width bins over the character length of each response.
df['response_length'] = pd.cut(df['response'].apply(len),
                                bins=3,
                                labels=['short', 'medium', 'long'])
df['response_count'] = df['response'].apply(len)
grouped_by_length = df.groupby('response_length', observed=True)

# Note that the frac value here is based on our use case: the Pagnoni
# paper suggested augmenting only 25% of your training set.
stratified_sample_proportion = grouped_by_length.apply(lambda x: x.sample(frac=0.25), include_groups = False)

# Get the remaining samples, i.e. the rows that were NOT drawn above. These
# stay as untouched original examples; the stratified sample is what gets
# augmented.
# apply() returns a (response_length, original row label) MultiIndex, so the
# sampled rows are identified by the last index level. This has to happen
# BEFORE reset_index: filtering against the reset 0..N-1 index would simply
# exclude the first N rows of df and leave sampled rows in the remainder.
sampled_row_labels = stratified_sample_proportion.index.get_level_values(-1)
remaining_samples = df.drop(index=sampled_row_labels)

stratified_sample_proportion = stratified_sample_proportion.reset_index(drop=True)

# Set back to HF DataSet.
stratified_sample_proportion = stratified_sample_proportion.drop('response_count', axis=1)
train_dataset_stratified = Dataset.from_pandas(stratified_sample_proportion)

remaining_samples = remaining_samples.drop('response_count', axis=1)
train_dataset_remaining = Dataset.from_pandas(remaining_samples)

print("Stratified Samples:", train_dataset_stratified.num_rows, "\n")
print("Remaining samples:", train_dataset_remaining.num_rows)

Stratified Samples: 250 

Remaining samples: 750


In [None]:
# Next, create the classes and functions to generate the augmented samples. These
# will have special masking tokens that need to be preprocessed accordingly before
# tokenization.
from sortedcontainers import SortedList
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
import evaluate
rouge = evaluate.load('rouge')
from nltk.tokenize import word_tokenize

# Create this data structure to process each document's
# sentences in one pass (one loop). See step 1 below.
class ScoredSentence:
    """A sentence together with its position index and its score."""

    def __init__(self, index, score, sentence):
        # index: position supplied by the caller; score: value used for
        # ordering; sentence: the sentence text itself.
        self.index, self.score, self.sentence = index, score, sentence

    def __lt__(self, other):
        # Ordering looks at the score only.
        return self.score < other.score

    def __repr__(self):
        # Show at most the first five characters of the sentence.
        if len(self.sentence) > 5:
            preview = self.sentence[:5] + "..."
        else:
            preview = self.sentence
        return f"ScoredSentence(index={self.index}, score={self.score}, sentence='{preview}')"

class TopScoredSentences(object):
    """Container that retains the ``m`` highest-scoring sentences offered to it.

    Sentences are held in ``scored_sentences``, ordered from highest to
    lowest score, so index 0 is the best sentence and index -1 the weakest
    one currently retained.
    """

    def __init__(self, m):
        # Maximum number of sentences to retain.
        self.m = m

        # Sorted in reverse order: keying on the negated score makes the
        # SortedList iterate from the highest score down to the lowest.
        self.scored_sentences = SortedList(
            [], key=lambda scored_sentence: -1 * scored_sentence.score
        )

    def __repr__(self):
        # join() avoids the dangling ", " the old string concatenation left
        # behind (its rstrip() result was never assigned).
        items = ", ".join(f"{scored_sentence}" for scored_sentence in self.scored_sentences)
        return f"TopScoredSentences([{items}])"

    def __len__(self):
        return len(self.scored_sentences)

    def get_indexes(self):
        """Return the ``index`` of every retained sentence, in ascending order."""
        return sorted(scored_sentence.index for scored_sentence in self.scored_sentences)

    def maintain_top_m(self, ScoredSentence):
        """Offer a scored sentence; keep it only if it belongs in the top ``m``.

        While fewer than ``m`` sentences are held the candidate is always
        added. Once full, the candidate replaces the weakest retained
        sentence only when it scores strictly higher.

        NOTE: the parameter name shadows the ``ScoredSentence`` class inside
        this method; it is kept unchanged for backward compatibility.
        """
        # Nothing can be retained when m is zero or negative. Without this
        # guard the "full" branch would index into an empty list.
        if self.m <= 0:
            return

        if len(self.scored_sentences) < self.m:
            self.scored_sentences.add(ScoredSentence)
        elif self.scored_sentences[-1] < ScoredSentence:
            self.scored_sentences.pop()
            self.scored_sentences.add(ScoredSentence)

    def get_pseudo_summary(self, truncate=False, truncate_length=256):
        """Join the retained sentences (best score first) into one string.

        Args:
            truncate: When true, word-tokenize the summary and keep at most
                ``truncate_length`` tokens, re-joined with single spaces.
            truncate_length: Token budget used when ``truncate`` is true.

        Returns:
            The pseudo summary text without trailing whitespace.
        """
        pseudo_summary = " ".join(
            scored_sentence.sentence for scored_sentence in self.scored_sentences
        ).rstrip()

        # Needed for the Pagnoni model which has limited input tokens.
        if truncate:
            tokens = word_tokenize(pseudo_summary)
            if len(tokens) > truncate_length:
                tokens = tokens[:truncate_length]
            pseudo_summary = " ".join(tokens)
        return pseudo_summary

# Now generate the masked document dataset. This will be used for
# fine tuning of the model. Note that the Pagnoni paper suggested
# augmenting only 25% of your training samples so as not to bias
# your model to generate questions. For the first test, we are
# going to have 1000 total training samples as in the original
# set, so we will need to generate 250 augmented examples.

# Algorithm:
# 1. Select the most salient sentences from the input document.
#     - Used the PEGASUS-style Gap Sentence Generation (GSG) approach to select sentences.
#     - Ensure a Gap Sentence Ratio (GSR) of 45%, meaning 45% of the sentences of a document
#       will appear in the pseudo summary.
#     - Following the Pagnoni paper suggestion, 80% of the selected sentences will appear
#       masked in the document, and 20% will remain unmasked. This is to encourage the model
#       to copy information at times from the input to the summary.
#     - Also following the Pagnoni paper, will truncate the unmasked document text. The paper
#       suggested 512 tokens, but in our case we will just grab the first 10% of the input
#       text as they are short stories on the order of 5k-10k words.
#     - At the end of step 1, we should have the following objects for each sample:
#         + Unmasked document.
#         + Masked document.
#         + Pseudo summary.
# 2. Generate questions for each sentence in the pseudo summary.
#     - Per Pagnoni et al., use MixQG to generate questions, with the pseudo summary as the
#       context and the sentence as the answer. This will encourage the model to consider
#       relevant questions given the specific context.
#     - With the questions, create new samples to add to the training set until the desired number is reached.
#         + Will need to generate 250 new samples.
#         + Each sample will consist of the masked document as the input, and a [question<qsep>pseudo_summary]
#           string as the output.
#     - At the end of step 2, we should have the following objects for each sample:
#         + Unmasked document.
#         + Masked document.
#         + Pseudo summary.
#         + A question.

# 1. Get salient sentences.
def get_sentences(text):
    """Split ``text`` into sentences with NLTK's sentence tokenizer."""
    sentences = nltk.sent_tokenize(text)
    return sentences

def truncate_tokens(text, max):
    """Shorten ``text`` to roughly its first ``max`` word tokens.

    The text is word-tokenized, at most ``max`` tokens are kept and re-joined
    with single spaces. The character length of that re-joined string is then
    used as the cut-off position in the ORIGINAL text, so the original
    spacing and characters are preserved in the result.

    NOTE(review): the re-joined string is not guaranteed to be the same
    length as the matching prefix of the original text, so the cut looks
    approximate rather than exact - confirm this is acceptable.
    """
    kept_tokens = word_tokenize(text)
    if len(kept_tokens) > max:
        kept_tokens = kept_tokens[:max]

    cutoff = len(" ".join(kept_tokens))
    if cutoff < len(text):
        return text[:cutoff]
    return text

def compute_rouge1(sentence, text):
    """Return the "rouge1" score of ``sentence`` (prediction) against ``text`` (reference)."""
    return rouge.compute(predictions=[sentence], references=[text])["rouge1"]

def select_salient_sentences(sentences, text, metric, gsr=.45):
    """Score every sentence and keep the highest-scoring ones.

    Args:
        sentences: The sentences to choose from; each sentence's position in
            this sequence becomes its index.
        text: Passed unchanged as the second argument to ``metric`` for every
            sentence.
        metric: Callable ``metric(sentence, text)`` returning the score.
        gsr: When exactly an ``int``, the number of sentences to keep;
            otherwise a ratio of ``len(sentences)``, truncated with ``int()``.

    Returns:
        A ``TopScoredSentences`` holding the selected sentences.
    """
    # Work out how many sentences the selection may hold.
    keep_count = gsr if type(gsr) == int else int(len(sentences) * gsr)

    # Single pass: score each sentence and let the container decide whether
    # it makes the cut.
    selection = TopScoredSentences(keep_count)
    for position, sentence in enumerate(sentences):
        selection.maintain_top_m(
            ScoredSentence(position, metric(sentence, text), sentence)
        )
    return selection

def generate_masked_document(unmasked_document_sentences, top_scored_sentences):
    """Remove a random 80% of the pseudo-summary sentences from the document.

    Per the Pagnoni paper, only about 80% of the pseudo summary sentences are
    masked; the remaining ~20% stay in the document so that the masked
    document and the pseudo summary overlap, which encourages the model to
    copy.

    Args:
        unmasked_document_sentences: The document's sentences, in order.
        top_scored_sentences: Object whose ``get_indexes()`` returns the
            positions of the pseudo-summary sentences.

    Returns:
        The sentences that were not removed, joined with single spaces.
    """
    pseudo_summary_indexes = top_scored_sentences.get_indexes()

    # int() truncates, so this can never exceed the number of candidates and
    # no clamping is needed before sampling.
    num_items_to_select = int(len(pseudo_summary_indexes) * .8)

    # A set gives constant-time membership tests in the filter below.
    masked_indexes = set(random.sample(pseudo_summary_indexes, num_items_to_select))

    kept_sentences = [
        sentence
        for i, sentence in enumerate(unmasked_document_sentences)
        if i not in masked_indexes
    ]
    return " ".join(kept_sentences)

# 2. Generate a salient question.

# MixQG Question Generation system.

from transformers import pipeline
mixQG = pipeline("text2text-generation", model='Salesforce/mixqg-base', tokenizer='Salesforce/mixqg-base')

# Remember that the DataSet format is: DataSet({ document:[], question:[], response:[]})
# For these augmented samples:
#   - Document is the masked document text.
#   - Question is the generated question from mixQG.
#   - Response is a string of the form: [question<qsep>pseudo_summary]
def generate_question_pseudo_summary_sample(masked_document_text, top_scored_sentences):
    """Build one augmented (document, question, response) sample.

    Args:
        masked_document_text: The document text after masking.
        top_scored_sentences: The selected sentences; its pseudo summary is
            the question-generation context and its first (highest-scoring)
            sentence is the answer.

    Returns:
        A 3-tuple of:
          - the masked document prefixed with "[Ask&Answer][Mask]",
          - the question generated by mixQG,
          - the string "<question>[QSep]<pseudo summary>".
    """
    context = top_scored_sentences.get_pseudo_summary()
    # Grab the most salient sentence to be the answer for the MixQG model. This is the first
    # item in the sorted list since it is sorted by score in descending order.
    answer = top_scored_sentences.scored_sentences[0].sentence
    # The answer-plus-context prompt can be longer than the question model
    # accepts (the run logged "564 > 512"), so ask the pipeline to truncate
    # the input instead of feeding an over-long sequence to the model.
    question = mixQG(f"{answer} \\n {context}", truncation=True)[0]["generated_text"]
    return (
        "[Ask&Answer][Mask]" + masked_document_text,
        question,
        f"{question}[QSep]{context}")

# Putting it all together.
from tqdm.notebook import tqdm

def socratic_augment(dataset):
    """Create one Socratic-augmented sample for every row of ``dataset``.

    For each row the document is truncated to 450 tokens, split into
    sentences, scored for salience with ROUGE-1, masked, and finally turned
    into a (masked document, generated question, question + pseudo summary)
    sample. The progress bar description names the step currently running.

    Returns:
        A new Dataset with "document", "question" and "response" columns.
    """
    print("Socratic Augment Samples:", dataset.num_rows)

    documents, questions, responses = [], [], []
    progress = tqdm(dataset)
    for row in progress:
        progress.set_description_str("Truncate tokens")
        text = truncate_tokens(row["document"], 450)

        progress.set_description_str("Get sentences")
        sentences = get_sentences(text)

        progress.set_description_str(
            f"Select by salience score (s={len(sentences)}, t={len(text)})")
        selection = select_salient_sentences(sentences, text, compute_rouge1)

        progress.set_description_str("Generate mask document")
        masked_text = generate_masked_document(sentences, selection)

        progress.set_description_str("Make pseudo summary samples")
        new_document, new_question, new_response = \
            generate_question_pseudo_summary_sample(masked_text, selection)

        documents.append(new_document)
        questions.append(new_question)
        responses.append(new_response)

    return Dataset.from_dict(
        {"document": documents, "question": questions, "response": responses})

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Downloading builder script: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [None]:
from datasets import concatenate_datasets, Dataset

# Generate the augmented samples from the stratified 250
# that we sampled.
train_dataset_stratified_augmented = socratic_augment(train_dataset_stratified)


Socratic Augment Samples: 250


  0%|          | 0/250 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# @title
# For dev. Sample code to add special tokens.
# from nltk.tokenize import word_tokenize

# text = train_dataset_stratified_augmented[0]["response"]
# print(text)

# test_tokenizer = AutoTokenizer.from_pretrained("Salesforce/squality-socratic-books-30M")

# special_tokens_dict = {"additional_special_tokens": ["[Ask&Answer]", "[Mask]", "[QSep]"]}
# num_added_toks = test_tokenizer.add_special_tokens(special_tokens_dict)
# print(f"Added {num_added_toks} tokens to the tokenizer.")

# tokens = test_tokenizer.tokenize(text)
# print(len(test_tokenizer))
# #print(len(socratic_tokenizer_quantized))
# qsep_index = test_tokenizer.all_special_tokens.index("[QSep]")
# print(test_tokenizer.all_special_tokens)
# print(test_tokenizer.all_special_ids)
# print(tokens)
# print(qsep_index, test_tokenizer.all_special_ids[qsep_index])
# encoded = test_tokenizer(text)
# print(encoded)
# find_i = encoded["input_ids"].index(test_tokenizer.all_special_ids[qsep_index])
# print(find_i)
# encoded["input_ids"][find_i] = -100
# print(encoded)

What was the reason for the short man standing next to the pair?[QSep]There was a short man standing next to the pair—young, as most men and women were in that time, thanks to the science which could stave off decay, though not death—but with no other apparent physical virtue, for plastic surgery had not fulfilled its bright promise of the twentieth century. Everyone in the room was aware of the big young man, and most of the humans present were resentful, for he handled himself consciously and arrogantly, as if his appearance alone were enough to make him superior to anyone. So did the light-haired girl at his side, and so did the nondescript man in the gray suit who was watching them from a booth in the corner. Now he was not only a rather ugly little man, but also a rather ridiculous one—or at least he felt he was, which was what mattered. "You must allow me to pay your cleaning bill," Gabe said, taking out his wallet and extracting several credit notes without seeming to look at th

In [None]:
# Add these back to the original 750 to get back 1000 samples in our training set.
train_dataset_original_plus_augmented = concatenate_datasets([train_dataset_remaining, train_dataset_stratified_augmented])

# Printed before shuffling: at this point the last row is still one of the
# augmented samples appended by the concatenation above.
print("Combined Samples:", train_dataset_original_plus_augmented.num_rows, "\n")
print("Augmented Sample:", train_dataset_original_plus_augmented[-1])

# Lastly, shuffle the dataset again. shuffle() returns a NEW dataset and
# leaves the original untouched, so the result has to be assigned back;
# otherwise every augmented sample stays grouped at the end.
train_dataset_original_plus_augmented = train_dataset_original_plus_augmented.shuffle(seed=RANDOM_SEED)


Combined Samples: 1000 

Augmented Sample: {'document': '[Ask&Answer][Mask]Extensive research did not uncover any evidence that the U.S. copyright on this publication was renewed.] Major Winship, after receiving the message, discussed precautions with the three other Americans. "Is Pinov," came the reply. "Help?" " Nyet ," said Major Winship, exhausting his Russian. "Count down. Progress. When—boom?" "Is Pinov," came the reply. "Boom! Boom!" said Major Winship in exasperation. "Boom!" said Pinov happily. "When?" "Boom—boom!" said Pinov. "Oh, nuts." Major Winship cut out the circuit. "The one that doesn\'t speak English." "He\'s done it deliberately," said Capt. Wilkins, the eldest of the four Americans. No one bothered to respond. Ultimately, Lt. Chandler said, "This is a little ridiculous. Rap if you want me." He sat transfixed for several minutes. "Ah, it\'s all Russian. Jabbering away. I can\'t tell a thing that\'s going on." "Static?" "Nope." "We\'ll get static on these things." Ma

Dataset({
    features: ['document', 'question', 'response', 'response_length', '__index_level_0__'],
    num_rows: 1000
})

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo

!pwd
!mkdir -p ./datasets

# Get the current time in the US Pacific time zone.
timezone_obj = ZoneInfo("America/Los_Angeles")
current_time = datetime.now(timezone_obj)
current_time = current_time.strftime("%Y-%m-%d_%H%M%S")

dataset_name = "models_socraticpretraining_augmented_train-" + str(current_time) + ".json"
train_dataset_original_plus_augmented.to_json(f"./datasets/{dataset_name}")

from google.colab import drive
drive.mount('/content/drive')

!mkdir -p "/content/drive/MyDrive/DS266/project/datasets"
!cp ./datasets/{dataset_name} "/content/drive/MyDrive/DS266/project/datasets/{dataset_name}"

/content/DS266-ugarcia-bjulve


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: reload the saved dataset from Drive and print a couple of samples.
recover_dataset = load_dataset("json", data_files=f"/content/drive/MyDrive/DS266/project/datasets/{dataset_name}", split="train")
print(dataset_name, "\n")
print(recover_dataset)

for i in range(2):
  sample = recover_dataset[i]
  print()
  print("D:", sample["document"][:10], "| Q:", sample["question"], "| R:", sample["response"])

models_socraticpretraining_augmented_train-2025-07-28_092936.json 

Dataset({
    features: ['document', 'question', 'response', 'response_length', '__index_level_0__'],
    num_rows: 1000
})

D: CALL HIM N | Q: What pattern does Stevenson notice in the crimes that makes him suspicious? | R: In all three incidents that take place in the story, the criminals were stopped and caught by the police. They all seemed to be mysteriously burned in one way or another: the tires on the car melted off, Higgins' hands were burned by the rifle, and the jackets and weapons of the gang members seemed to have the same effect. Additionally, all three events were tagged by "The Scorpion": the words were branded on the car, the rifle, and the jackets.

D: CALL HIM N | Q: What is the relationship between Stevenson and Hanks? | R: Hanks is the Precinct Captain, while Stevenson works under him as a Detective-Sergeant. Hanks and Stevenson share a good working relationship; however, Hank gets annoyed whenever

# Preprocessing

In [None]:
# Get the pretrained model and prepare it for QLoRA.
# We'll use the quantized version of the model for
# PEFT.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # The option is spelled bnb_4bit_use_double_quant. The previous spelling
    # (load_4bit_use_double_quant) is not a BitsAndBytesConfig argument, so
    # double quantization was never actually switched on.
    bnb_4bit_use_double_quant=True,
)

socratic_checkpoint_name = "Salesforce/squality-socratic-books-30M"
socratic_model_quantized = AutoModelForSeq2SeqLM.from_pretrained(
    socratic_checkpoint_name,
    quantization_config=bnb_config,
    device_map={"": 0})
socratic_tokenizer_quantized = AutoTokenizer.from_pretrained(socratic_checkpoint_name)
socratic_model_config_quantized = AutoConfig.from_pretrained(socratic_checkpoint_name)

# Add special tokens for Socratic FT.
special_tokens = ["[Ask&Answer]", "[Mask]", "[QSep]"]
num_added_tokens = socratic_tokenizer_quantized.add_special_tokens({'additional_special_tokens': special_tokens})
print(f"Added {num_added_tokens} tokens to the tokenizer.")

# Resize the model accordingly. The line below prints the checkpoint's
# configured vocab size followed by the tokenizer size after the additions;
# the embedding matrix is then resized to the larger of the two.
print("Resized model vocab:", socratic_model_config_quantized.vocab_size, len(socratic_tokenizer_quantized))
socratic_model_quantized.resize_token_embeddings(max(
    len(socratic_tokenizer_quantized),
    socratic_model_config_quantized.vocab_size))

# Prepare for QLoRA.
socratic_model_quantized = prepare_model_for_kbit_training(socratic_model_quantized)

# LoRA adapters go on the attention projections; only the embedding rows of
# the newly added special tokens are made trainable, and lm_head is trained
# and saved in full.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["k_proj", "v_proj", "q_proj", "o_proj"],
    trainable_token_indices={'embed_tokens': socratic_tokenizer_quantized.convert_tokens_to_ids(special_tokens)},
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
    modules_to_save=["lm_head"]
)

socratic_model_quantized = get_peft_model(socratic_model_quantized, config)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Added 3 tokens to the tokenizer.
Resized model vocab: 50274 50277


In [None]:
# Tokenize the training and eval datasets and prep them for fine tuning.

MAX_SEQUENCE_LENGTH = socratic_tokenizer_quantized.model_max_length
print(f"Max sequence length: {MAX_SEQUENCE_LENGTH}", "\n")

def make_question_document_pairs(dataset):
    """Format each (question, document) pair as one model input string.

    Reads the parallel "document" and "question" columns of ``dataset`` and
    returns one "<ask&answer> {question} <qsep> {document}" string per row.
    """
    return [
        f"<ask&answer> {question} <qsep> {document}"
        for document, question in zip(dataset["document"], dataset["question"])
    ]

def preprocess_socratic_batch(dataset, tokenizer):
    """Tokenize one batch of samples into model inputs and training labels.

    Args:
        dataset: Batch with parallel "document", "question" and "response"
            columns.
        tokenizer: Tokenizer used for both the inputs and the labels.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each padded
        or truncated to MAX_SEQUENCE_LENGTH. Label positions holding padding
        or the "[QSep]" token are set to -100.
    """
    question_document_pairs = make_question_document_pairs(dataset)

    input_encoded = tokenizer.batch_encode_plus(
        question_document_pairs,
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    labels_encoded = tokenizer.batch_encode_plus(
        dataset["response"],
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    labels = labels_encoded['input_ids']

    # Look the separator id up on the tokenizer that was passed in rather
    # than on a notebook global, so the function honours its own argument.
    # Skip the masking when the token is unknown to this tokenizer.
    qsep_token_id = tokenizer.convert_tokens_to_ids("[QSep]")
    pad_token_id = tokenizer.pad_token_id

    # Labels are padded to the full max length; without masking, the padding
    # positions would be scored by the loss like real target tokens.
    if pad_token_id is not None:
        labels[labels == pad_token_id] = -100

    # Need to discount the [QSep] token so it doesn't affect the
    # loss function.
    if qsep_token_id is not None and qsep_token_id != tokenizer.unk_token_id:
        labels[labels == qsep_token_id] = -100

    # The attention mask is returned too so that the padded input positions
    # can be told apart from real tokens downstream.
    return {'input_ids': input_encoded['input_ids'],
            'attention_mask': input_encoded['attention_mask'],
            'labels': labels}

train_encoded = train_dataset_original_plus_augmented.map(
    preprocess_socratic_batch,
    batched=True,
    fn_kwargs={
      'tokenizer': socratic_tokenizer_quantized
})

val_encoded = dev_dataset.map(
    preprocess_socratic_batch,
    batched=True,
    fn_kwargs={
      'tokenizer': socratic_tokenizer_quantized
})

print()
print("Train encoded:", train_encoded, "\n")
print("Val encoded:", val_encoded, "\n")

Max sequence length: 1024 



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]


Train encoded: Dataset({
    features: ['document', 'question', 'response', 'response_length', '__index_level_0__', 'input_ids', 'labels'],
    num_rows: 1000
}) 

Val encoded: Dataset({
    features: ['document', 'question', 'response', 'input_ids', 'labels'],
    num_rows: 500
}) 



In [None]:
print(len(train_encoded))
random_training_sample = random.choice(train_encoded)
print(random_training_sample["input_ids"])
print(random_training_sample["labels"])

1000
[0, 50269, 653, 21, 5, 1219, 13, 5, 765, 313, 2934, 220, 7, 5, 1763, 116, 1437, 50266, 1437, 50274, 50275, 40884, 17355, 557, 222, 45, 20489, 143, 1283, 14, 5, 121, 4, 104, 4, 4857, 15, 42, 5362, 21, 7867, 21838, 520, 8642, 4854, 16, 5861, 1455, 6, 1116, 768, 10, 313, 16, 7919, 7, 33, 10, 809, 12984, 4, 20, 39341, 21, 14, 37, 56, 7, 109, 24, 1003, 1666, 8, 39, 809, 74, 45, 11866, 328, 20, 313, 23, 5, 2003, 21, 20135, 19222, 6, 8, 37, 1467, 24, 4, 7632, 11, 5, 929, 21, 2542, 9, 5, 380, 664, 313, 6, 8, 144, 9, 5, 5868, 1455, 58, 31379, 2650, 6, 13, 37, 7521, 1003, 35561, 8, 46553, 12106, 6, 25, 114, 39, 2772, 1937, 58, 615, 7, 146, 123, 10295, 7, 1268, 4, 29757, 4021, 39, 3124, 1810, 11, 65, 9, 39, 21734, 18693, 4, 22, 31535, 6, 9896, 60, 29757, 26, 40154, 5846, 4, 22, 3684, 127, 7684, 4, 370, 531, 905, 162, 907, 47, 10, 5010, 72, 91, 18371, 4075, 7, 5, 33080, 4, 22, 21518, 9, 5, 276, 13, 127, 2598, 12, 397, 259, 72, 20, 11355, 313, 35454, 5134, 32628, 352, 23, 39, 38221, 20580, 19,

# Fine Tuning

In [None]:
# Define the training args and other parameters.
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Evaluate and checkpoint once per epoch, keep at most three checkpoints, and
# reload the best one when training finishes.
# NOTE(review): predict_with_generate is on but no generation length is set
# here, so evaluation presumably uses the model's default generation length -
# confirm it is long enough for these multi-sentence summaries.
# NOTE(review): fp16=True is used here while the quantization config requests
# bfloat16 as its compute dtype - confirm the mix is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="outputs",
    eval_strategy="epoch",
    save_strategy="epoch",
    optim="paged_adamw_8bit", # paged 8-bit AdamW, used with QLoRA
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    learning_rate=2e-5,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    logging_steps=10,
    load_best_model_at_end=True,
    report_to='none',
    label_names=["labels"]
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=socratic_tokenizer_quantized,
    model=socratic_model_quantized)

metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE between generated predictions and reference labels.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        The dict produced by the ROUGE metric (computed with stemming).
    """
    preds, labels = eval_preds
    # Predictions may arrive wrapped in a tuple; the token ids are the
    # first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # decode preds and labels
    # -100 marks ignored positions and is not a valid token id, so swap it
    # for the pad token (in both arrays) before decoding.
    pad_token_id = socratic_tokenizer_quantized.pad_token_id
    preds = np.where(preds != -100, preds, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)
    decoded_preds = socratic_tokenizer_quantized.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = socratic_tokenizer_quantized.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

trainer = Seq2SeqTrainer(
    model=socratic_model_quantized,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    processing_class=socratic_tokenizer_quantized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Train the model.
os.environ['WANDB_MODE'] = 'disabled'
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,9.5173,9.487814,0.071341,0.021392,0.060741,0.068141
2,5.3611,4.730658,0.076797,0.023324,0.063846,0.072565
3,2.0623,1.512324,0.079199,0.02426,0.065942,0.074802
4,1.3064,1.003995,0.081072,0.02508,0.06721,0.076498
5,0.9562,0.947386,0.08107,0.024393,0.067094,0.076504


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=625, training_loss=5.054167267608642, metrics={'train_runtime': 842.1414, 'train_samples_per_second': 5.937, 'train_steps_per_second': 0.742, 'total_flos': 1.252600578048e+16, 'train_loss': 5.054167267608642, 'epoch': 5.0})

In [None]:
#!rm -rf ./models/socraticpretraining_baseline-2025-07-26_215517/

In [None]:
from datetime import datetime
from zoneinfo import ZoneInfo

!pwd
!mkdir -p ./models

# Get the current time in the US Pacific time zone.
timezone_obj = ZoneInfo("America/Los_Angeles")
current_time = datetime.now(timezone_obj)
current_time = current_time.strftime("%Y-%m-%d_%H%M%S")

model_name = "socraticpretraining_augmented-" + str(current_time)
trainer.save_model(f"./models/{model_name}")

from google.colab import drive
drive.mount('/content/drive')

!mkdir -p "/content/drive/MyDrive/DS266/project/models/{model_name}"
!cp -r ./models/{model_name}/* "/content/drive/MyDrive/DS266/project/models/{model_name}"

/content/DS266-ugarcia-bjulve




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
