# Quiz Generator – Comprehensive Model Training (Colab)

Train every required model (2 ML, 2 DL, 1 Transformer) for the quiz-classification task directly in Colab using **your own uploaded CSV**.

**Pipeline**
1. Install dependencies
2. Upload & inspect `quiz_data.csv`
3. Classical ML (Random Forest, SVM) with TF‑IDF
4. Deep Learning (LSTM, CNN) with Word embeddings
5. Transformer fine-tuning (DistilBERT)

> Switch Colab to **GPU** (`Runtime ▸ Change runtime type ▸ GPU`) before running the DL/Transformer sections.


In [None]:
#@title Install Dependencies
!pip install -q pandas numpy scikit-learn matplotlib seaborn tqdm joblib torch torchvision torchtext tensorflow transformers datasets sentencepiece sentence-transformers


In [None]:
#@title Import Libraries & Set Config
import os
import re
import json
import numpy as np
import pandas as pd
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

tqdm.pandas(disable=False)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                        Trainer, TrainingArguments)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)


Using device: cuda


In [None]:
#@title Upload quiz_data.csv
from google.colab import files
import io

print("Upload your quiz_data.csv (or any CSV with quiz text + labels)")
uploaded = files.upload()
if not uploaded:
    raise ValueError("Upload cancelled. Please upload a CSV file.")

FILE_NAME = list(uploaded.keys())[0]
with open("quiz_data.csv", "wb") as f:
    f.write(uploaded[FILE_NAME])

print(f"Saved as quiz_data.csv (original name: {FILE_NAME})")


Upload your quiz_data.csv (or any CSV with quiz text + labels)


Saving quiz_data.csv to quiz_data.csv
Saved as quiz_data.csv (original name: quiz_data.csv)


In [None]:
#@title Inspect Dataset & Choose Target
TEXT_COLUMN = "question"  #@param {type:"string"}
TARGET_COLUMN = "subject"  #@param {type:"string"}

raw_df = pd.read_csv("quiz_data.csv").dropna(subset=[TEXT_COLUMN, TARGET_COLUMN])
raw_df[TEXT_COLUMN] = raw_df[TEXT_COLUMN].astype(str)
raw_df[TARGET_COLUMN] = raw_df[TARGET_COLUMN].astype(str)

print("Rows:", len(raw_df))
print("Columns:", raw_df.columns.tolist())
raw_df[[TEXT_COLUMN, TARGET_COLUMN]].head()


Rows: 100000
Columns: ['id', 'subject', 'topic', 'year', 'exam_type', 'question_type', 'difficulty', 'question']


Unnamed: 0,question,subject
0,The following program segment has been written...,Programming Fundamentals
1,"For each of the following program segments, sp...",Programming Fundamentals
2,Trace the following program and specify the ou...,Programming Fundamentals
3,WRITING C++ PROGRAMS \nInternational travelers...,Programming Fundamentals
4,The BMI (Body Mass Index) is calculated using ...,Programming Fundamentals


In [None]:
#@title Prepare Train/Val/Test Splits
import numpy as np

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-z0-9 ?!.,]", "", text)
    return text.strip()

raw_df["clean_text"] = raw_df[TEXT_COLUMN].progress_apply(clean_text)

label_encoder = LabelEncoder()
raw_df["label_id"] = label_encoder.fit_transform(raw_df[TARGET_COLUMN])
num_classes = len(label_encoder.classes_)

train_df, temp_df = train_test_split(
    raw_df,
    test_size=0.3,
    random_state=42,
    stratify=raw_df["label_id"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["label_id"],
)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print(f"Classes ({num_classes}):", label_encoder.classes_)


100%|██████████| 100000/100000 [00:02<00:00, 37447.93it/s]


Train: 70000 | Val: 15000 | Test: 15000
Classes (17): ['Applied Physics' 'Automata' 'COAL' 'Calculus' 'Cloud Computing'
 'Computer Networks' 'Data Structures and Algorithms'
 'Digital Image Processing' 'ICT' 'Machine Learning and Operations' 'NLP'
 'Object Oriented Programming' 'Operating System'
 'Programming Fundamentals' 'Software Design and Analysis'
 'Software for machine and devices' 'Theory of Computation and Automata']


In [None]:
#@title Prepare Seq2Seq Dataset for T5
from datasets import Dataset as HFDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

t5_model_name = "t5-small"  #@param {type:"string"}
t5_max_input = 256  #@param {type:"integer"}
t5_max_output = 128  #@param {type:"integer"}

assert "question" in raw_df.columns, "Dataset must contain a 'question' column for targets."

def build_prompt(row):
    subject = row.get("subject", "general")
    topic = row.get("topic", "topic")
    difficulty = row.get("difficulty", "medium")
    qtype = row.get("question_type", "MCQ")
    return f"Generate {difficulty} {qtype} question for {subject} topic: {topic}"

seq_train = train_df.copy()
seq_val = val_df.copy()

seq_train["prompt"] = seq_train.apply(build_prompt, axis=1)
seq_val["prompt"] = seq_val.apply(build_prompt, axis=1)

hf_seq_train = HFDataset.from_pandas(seq_train[["prompt", "question"]])
hf_seq_val = HFDataset.from_pandas(seq_val[["prompt", "question"]])

t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

def preprocess_t5(batch):
    model_inputs = t5_tokenizer(
        batch["prompt"],
        max_length=t5_max_input,
        padding="max_length",
        truncation=True,
    )
    with t5_tokenizer.as_target_tokenizer():
        labels = t5_tokenizer(
            batch["question"],
            max_length=t5_max_output,
            padding="max_length",
            truncation=True,
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

hf_seq_train = hf_seq_train.map(preprocess_t5, batched=True, remove_columns=["prompt", "question"])
hf_seq_val = hf_seq_val.map(preprocess_t5, batched=True, remove_columns=["prompt", "question"])

hf_seq_train.set_format(type="torch")
hf_seq_val.set_format(type="torch")

t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(DEVICE)
data_collator = DataCollatorForSeq2Seq(t5_tokenizer, model=t5_model)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/70000 [00:00<?, ? examples/s]



Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
#@title Fine-Tune T5 Question Generator
T5_EPOCHS = 3  #@param {type:"integer"}
T5_BATCH = 4  #@param {type:"integer"}
T5_LR = 3e-4  #@param {type:"number"}
T5_OUTPUT_DIR = "./results/t5-question-generator"

seq_args = Seq2SeqTrainingArguments(
    output_dir=T5_OUTPUT_DIR,
    per_device_train_batch_size=T5_BATCH,
    per_device_eval_batch_size=T5_BATCH,
    num_train_epochs=T5_EPOCHS,
    learning_rate=T5_LR,
    logging_steps=100,
    save_total_limit=2,
    predict_with_generate=True,
)

seq_trainer = Seq2SeqTrainer(
    model=t5_model,
    args=seq_args,
    train_dataset=hf_seq_train,
    eval_dataset=hf_seq_val,
    tokenizer=t5_tokenizer,
    data_collator=data_collator,
)

seq_trainer.train()
seq_metrics = seq_trainer.evaluate()
print(seq_metrics)


  seq_trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahmad5116492[0m ([33mahmad5116492_[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
100,1.233
200,0.4259
300,0.2766
400,0.1649
500,0.1258
600,0.0991
700,0.0782
800,0.0694
900,0.0487
1000,0.0564


{'eval_loss': 0.0108442772179842, 'eval_runtime': 130.4053, 'eval_samples_per_second': 115.026, 'eval_steps_per_second': 28.756, 'epoch': 3.0}


In [None]:
from google.colab import files
!zip -r t5_only.zip results/t5-question-generator
files.download('t5_only.zip')

  adding: results/t5-question-generator/ (stored 0%)
  adding: results/t5-question-generator/runs/ (stored 0%)
  adding: results/t5-question-generator/runs/Nov26_14-45-59_b096871a15c4/ (stored 0%)
  adding: results/t5-question-generator/runs/Nov26_14-45-59_b096871a15c4/events.out.tfevents.1764175792.b096871a15c4.247.1 (deflated 26%)
  adding: results/t5-question-generator/runs/Nov26_14-45-59_b096871a15c4/events.out.tfevents.1764168362.b096871a15c4.247.0 (deflated 70%)
  adding: results/t5-question-generator/checkpoint-52000/ (stored 0%)
  adding: results/t5-question-generator/checkpoint-52000/spiece.model (deflated 48%)
  adding: results/t5-question-generator/checkpoint-52000/training_args.bin (deflated 54%)
  adding: results/t5-question-generator/checkpoint-52000/special_tokens_map.json (deflated 85%)
  adding: results/t5-question-generator/checkpoint-52000/rng_state.pth (deflated 26%)
  adding: results/t5-question-generator/checkpoint-52000/model.safetensors (deflated 8%)
  adding: r

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>