In [1]:
# Cell 1: Setup and Environment
from google.colab import drive
import os

# Mount your Google Drive
drive.mount('/content/drive')

# Install required libraries
# 'accelerate' is important for efficient training on GPUs
!pip install transformers datasets torch accelerate -q

print("✅ Setup complete. Your Google Drive is mounted and libraries are installed.")

Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━

In [6]:
# Cell 2: Configuration and Paths
import os

# --- ⚠️ IMPORTANT: EDIT THIS LINE -------------------------------------------------
# Root of the project folder inside the mounted Google Drive.
GDRIVE_PROJECT_PATH = '/content/drive/MyDrive/psychology_tutor_engine'
# ----------------------------------------------------------------------------------

# Locations derived from the project root: the training parquet (checked for
# existence below) and the directory that receives model artifacts.
TRAINING_DATA_FILE = os.path.join(
    GDRIVE_PROJECT_PATH,
    'data/4_training_sets/distractor_generation_training_data_DOMAIN_AWARE_FIXED.parquet',
)
OUTPUT_MODEL_DIR = os.path.join(
    GDRIVE_PROJECT_PATH,
    'models/distractor_generator_t5_domain_aware',
)

# Echo the working directory and the resolved data path so a wrong mount point
# or a typo in GDRIVE_PROJECT_PATH is easy to spot.
print(f"Current Working Directory: {os.getcwd()}")
print(f"Full path being checked for training data: {TRAINING_DATA_FILE}")

# Guard clause: stop immediately when the training data is missing.
if not os.path.exists(TRAINING_DATA_FILE):
    raise FileNotFoundError(f"FATAL: Training data file not found at '{TRAINING_DATA_FILE}'. Please check your GDRIVE_PROJECT_PATH.")
print(f"✅ Training data found at: {TRAINING_DATA_FILE}")

# Create the output directory; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)
print(f"✅ Model output will be saved to: {OUTPUT_MODEL_DIR}")

Current Working Directory: /content
Full path being checked for training data: /content/drive/MyDrive/psychology_tutor_engine/data/4_training_sets/distractor_generation_training_data_DOMAIN_AWARE_FIXED.parquet
✅ Training data found at: /content/drive/MyDrive/psychology_tutor_engine/data/4_training_sets/distractor_generation_training_data_DOMAIN_AWARE_FIXED.parquet
✅ Model output will be saved to: /content/drive/MyDrive/psychology_tutor_engine/models/distractor_generator_t5_domain_aware


In [22]:
# Cell 3: Final Production Training Script
import pandas as pd
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)

# --- Configuration ---
BASE_MODEL = "t5-small"
OUTPUT_MODEL_DIR_PROD = os.path.join(GDRIVE_PROJECT_PATH, 'models/distractor_generator_t5_production')

print(f"--- Starting PRODUCTION Fine-tuning of {BASE_MODEL} ---")

# 1. Read the training examples from the Parquet file
print("Loading Parquet file...")
df = pd.read_parquet(TRAINING_DATA_FILE)

# 2. Keep only rows whose distractor differs from the correct answer when the
#    two are compared case-insensitively
initial_rows = len(df)
answers_lower = df["correct_answer"].str.lower()
distractors_lower = df["distractor"].str.lower()
df_clean = df[answers_lower != distractors_lower].copy()
removed_rows = initial_rows - len(df_clean)
print(f"Data Cleaning: Removed {removed_rows} rows with identical answers/distractors.")
train_dataset = Dataset.from_pandas(df_clean)
print(f"Using {len(train_dataset)} clean examples for training.")

# 3. Tokenizer: register the answer delimiters used by the prompt format as
#    additional special tokens
tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL)
tokenizer.add_special_tokens({'additional_special_tokens': ['<ANS>', '</ANS>']})
print("Added <ANS> and </ANS> special tokens to the tokenizer.")

# 4. Model: load the pretrained weights, then resize the token embeddings to
#    len(tokenizer) so the newly added tokens have embedding rows
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL)
model.resize_token_embeddings(len(tokenizer))

# 5. Pre-processing function with the new prompt format
def preprocess(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Each prompt has the form
    ``generate distractor: question: <q> answer: <ANS> <a> </ANS>`` and is
    tokenized with truncation at ``max_length=512``. The distractor text is
    tokenized as the target with truncation at ``max_length=128`` and its
    ``input_ids`` are stored under ``"labels"``.

    Args:
        examples: Batch mapping with parallel ``"question"``,
            ``"correct_answer"`` and ``"distractor"`` sequences.

    Returns:
        The tokenizer output for the prompts, with an added ``"labels"`` entry.
    """
    prompts = []
    for question, answer in zip(examples["question"], examples["correct_answer"]):
        prompts.append(f"generate distractor: question: {question} answer: <ANS> {answer} </ANS>")

    encoded = tokenizer(prompts, max_length=512, truncation=True)
    targets = tokenizer(text_target=examples["distractor"], max_length=128, truncation=True)
    encoded["labels"] = targets["input_ids"]
    return encoded

# 6. Tokenize the whole dataset (dropping the raw columns) and build the collator
tokenized_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# 7. Training Arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_MODEL_DIR_PROD,
    report_to="none",
    # Batching: 32 examples per device, accumulated over 4 steps per update
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    # Logging and checkpointing
    logging_steps=250,
    save_strategy="epoch",
    save_total_limit=2,
)

# 8. Initialize and run the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
print("\n--- Starting Final Training Run ---")
trainer.train()
print("--- Training Complete ---")

# 9. Persist the fine-tuned model together with its tokenizer
print(f"\nSaving production model and tokenizer to {OUTPUT_MODEL_DIR_PROD}...")
trainer.save_model(OUTPUT_MODEL_DIR_PROD)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR_PROD)
print("✅ Production model saved successfully!")

--- Starting PRODUCTION Fine-tuning of t5-small ---
Loading Parquet file...
Data Cleaning: Removed 6 rows with identical answers/distractors.
Using 49793 clean examples for training.
Added <ANS> and </ANS> special tokens to the tokenizer.


Map:   0%|          | 0/49793 [00:00<?, ? examples/s]


--- Starting Final Training Run ---


Step,Training Loss
250,4.1678
500,3.6472
750,3.56
1000,3.5008
1250,3.4721
1500,3.4703


--- Training Complete ---

Saving production model and tokenizer to /content/drive/MyDrive/psychology_tutor_engine/models/distractor_generator_t5_production...
✅ Production model saved successfully!


In [23]:
# Cell 4: Testing the Production Model (with Advanced Inference)
import os
import random

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# --- LOAD THE MODEL ---
# Derive the model directory from GDRIVE_PROJECT_PATH (set in the configuration
# cell) instead of repeating the absolute Drive path, so that editing the
# project path in one place keeps training output and inference input in sync.
MODEL_PATH = os.path.join(GDRIVE_PROJECT_PATH, 'models/distractor_generator_t5_production')
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading production model from: {MODEL_PATH}")
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH).to(device)
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)
print(f"Using device: {device}")


# --- FINAL GENERATION FUNCTION (IMPLEMENTING YOUR RECOMMENDATIONS) ---
def _numeric_distractor(answer_text):
    """Return a nearby-integer distractor for an integer answer, else ``None``.

    Args:
        answer_text: Answer string with surrounding whitespace already removed.

    Returns:
        The answer shifted by one of -2, -1, +1, +2 as a string, or ``None``
        when ``answer_text`` is not an (optionally negative) integer.
    """
    magnitude = answer_text[1:] if answer_text.startswith("-") else answer_text
    # str.isdecimal() only accepts characters int() can parse. str.isdigit()
    # is also True for e.g. "²", for which int() raises ValueError.
    if not magnitude.isdecimal():
        return None

    value = int(answer_text)
    offsets = [-2, -1, 1, 2]
    if value >= 0:
        # A negative distractor for a non-negative answer (e.g. "-1" for "1")
        # is an obvious giveaway, so drop offsets that would go below zero.
        offsets = [delta for delta in offsets if value + delta >= 0]
    return str(value + random.choice(offsets))


def generate_distractor(question, correct_answer):
    """Generate one distractor (plausible wrong answer) for a question.

    Integer answers (optionally negative, surrounding whitespace ignored) are
    handled by a simple rule: the answer shifted by ±1 or ±2. All other answers
    are sent to the fine-tuned model using the same prompt format as training.

    Args:
        question: Question text.
        correct_answer: The correct answer as a string.

    Returns:
        The distractor text with surrounding whitespace stripped.
    """
    # Recommendation B: Post-process numerics for plausible distractors
    numeric = _numeric_distractor(correct_answer.strip())
    if numeric is not None:
        return numeric

    # If the answer is not an integer, use the fine-tuned model
    input_text = f"generate distractor: question: {question} answer: <ANS> {correct_answer} </ANS>"
    # Truncate the prompt the same way the training preprocessing does
    # (max_length=512) so inference never sees longer inputs than training did.
    input_ids = tokenizer(
        input_text, return_tensors="pt", max_length=512, truncation=True
    ).input_ids.to(device)

    # Recommendation A: Block the exact answer tokens at generation time.
    # An answer that tokenizes to nothing (empty / whitespace-only) would give
    # an empty bad-words entry, so the guardrail is omitted in that case.
    answer_token_ids = tokenizer(correct_answer, add_special_tokens=False).input_ids
    bad_words_ids = [answer_token_ids] if answer_token_ids else None

    outputs = model.generate(
        input_ids,
        max_length=60,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        no_repeat_ngram_size=2,
        bad_words_ids=bad_words_ids,  # <-- The extra guardrail
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()


# --- RUN TESTS ---
def _print_example(header, question, answer, distractor):
    """Print one question / answer / generated-distractor triple under a header."""
    print(header)
    print(f"Question: {question}")
    print(f"Correct Answer: {answer}")
    print(f"GENERATED DISTRACTOR: {distractor}")


# Example 1: free-text psychology question (handled by the fine-tuned model)
q1 = "How does behaviorism differ from psychoanalytic theory?"
a1 = "behaviorism suggests that behavior is learned through environmental stimuli, while psychoanalytic theory posits that behavior is driven by unconscious desires and conflicts"
distractor1 = generate_distractor(q1, a1)
_print_example("\n--- Psychology Example ---", q1, a1, distractor1)

# Example 2: word problem with a purely numeric answer
q2 = "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and sells four to her neighbor. How many eggs does she have left to sell at the market?"
a2 = "9"
distractor2 = generate_distractor(q2, a2)
_print_example("\n--- Logic/Math Example ---", q2, a2, distractor2)

Loading production model from: /content/drive/MyDrive/psychology_tutor_engine/models/distractor_generator_t5_production
Using device: cuda

--- Psychology Example ---
Question: How does behaviorism differ from psychoanalytic theory?
Correct Answer: behaviorism suggests that behavior is learned through environmental stimuli, while psychoanalytic theory posits that behavior is driven by unconscious desires and conflicts
GENERATED DISTRACTOR: Behaviorism is a theory that emphasizes the importance of social interaction, as it emphasize the impact of behaviors on the environment and the emotional environment.

--- Logic/Math Example ---
Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and sells four to her neighbor. How many eggs does she have left to sell at the market?
Correct Answer: 9
GENERATED DISTRACTOR: 7


In [26]:
# Cell 5: Data Integrity Report
# Re-reads the training Parquet file, reports rows whose distractor equals the
# correct answer (case-insensitively), re-applies the training filter, and
# checks the resulting counts before declaring success.
import pandas as pd

print("--- Generating Data Integrity Report ---")

# --- Step 1: Analyze the ORIGINAL source file on disk ---
print("\n[1/2] Analyzing the source Parquet file...")

try:
    # Load the original dataframe that was used as input for training
    df_original = pd.read_parquet(TRAINING_DATA_FILE)
    total_original_rows = len(df_original)
    print(f"Loaded original file with {total_original_rows} rows.")

    # Find rows where the distractor is a case-insensitive match for the correct answer
    duplicates = df_original[df_original["correct_answer"].str.lower() == df_original["distractor"].str.lower()]
    num_dupes = len(duplicates)

    if num_dupes > 0:
        print(f"\n  [!] INFO: Found {num_dupes} rows ({num_dupes/total_original_rows:.2%}) where the distractor is identical to the correct answer.")
        print("  This is expected. These rows are correctly filtered out by the training script.")
        print("\n  First 5 duplicates found in source file:")
        print(duplicates.head())
    else:
        print(f"\n  [✅] INFO: The source file contains 0 duplicate answer/distractor pairs.")

    # --- Step 2: Verify the CLEANED data used in the last training run ---
    print("\n\n[2/2] Verifying the in-memory data that the model was actually trained on...")

    # Re-create the cleaned DataFrame with the same filter expression the
    # training script uses
    df_cleaned_for_training = df_original[df_original["correct_answer"].str.lower() != df_original["distractor"].str.lower()]
    final_training_rows = len(df_cleaned_for_training)
    rows_removed = total_original_rows - final_training_rows

    print(f"  - Original rows in source file: {total_original_rows}")
    print(f"  - Rows removed by cleaning process: {rows_removed}")
    print(f"  - Final clean rows used for training: {final_training_rows}")

    # The success banner below is only printed when these checks pass.
    report_ok = True

    # Check 1: the filter must remove exactly the rows reported as duplicates.
    if rows_removed != num_dupes:
        report_ok = False
        print(f"  [!] WARNING: Rows removed ({rows_removed}) does not match duplicates found ({num_dupes}).")

    # Check 2: compare against the frame the training cell actually built, when
    # that cell has been run in this session (it defines `df_clean`).
    trained_frame = globals().get("df_clean")
    if trained_frame is None:
        print("  - In-memory training frame 'df_clean' not found in this session; comparison skipped.")
    elif len(trained_frame) != final_training_rows:
        report_ok = False
        print(f"  [!] WARNING: In-memory training frame has {len(trained_frame)} rows, expected {final_training_rows}.")
    else:
        print(f"  - In-memory training frame matches: {len(trained_frame)} rows.")

    if report_ok:
        print("\n\n✅ REPORT COMPLETE: The data pipeline is correctly identifying and removing bad data before training.")
    else:
        print("\n\n❌ REPORT COMPLETE: Inconsistencies were found; see the warnings above.")

except FileNotFoundError:
    print(f"ERROR: Could not find the training data file at {TRAINING_DATA_FILE}. Please ensure paths are correct.")
except Exception as e:
    # Re-raise after the summary line so an unexpected failure still shows its
    # traceback and cannot be mistaken for a completed report.
    print(f"An unexpected error occurred: {e}")
    raise

--- Generating Data Integrity Report ---

[1/2] Analyzing the source Parquet file...
Loaded original file with 49799 rows.

  [!] INFO: Found 6 rows (0.01%) where the distractor is identical to the correct answer.
  This is expected. These rows are correctly filtered out by the training script.

  First 5 duplicates found in source file:
                                                question  \
5636   Which of the following extrapyramidal effect i...   
8496   Magical thinking is seen in which type of pers...   
11798                      Molecular scissors refers to:   
17028  A 80 cm long wire is to be cut into two pieces...   
43136  A 39-year-old woman presents to the emergency ...   

                     correct_answer                    distractor  \
5636         ['Tardive dyskinesia']        ['tardive dyskinesia']   
8496                ['schizotypal']               ['Schizotypal']   
11798  ['Restriction Endonuclease']  ['Restriction endonuclease']   
17028                  