<a href="https://colab.research.google.com/github/arpitsaxena27/AI-Powered-Question-Generation/blob/main/Question_Generation_from_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets torch accelerate

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Fetch the SQuAD 2.0 dataset by its Hub identifier
print("Loading SQuAD 2.0 dataset...")
squad_dataset = load_dataset(path="squad_v2")

# Reformat the dataset for question generation
def reformat_for_question_generation(example):
    """Turn one SQuAD record into a text-to-text pair for question generation.

    Args:
        example: Mapping that provides at least "context" and "question".

    Returns:
        A dict with "input_text" (the context prefixed by a fixed instruction)
        and "target_text" (the record's question).
    """
    prompt = "Generate a question from the context: {}".format(example["context"])
    return {"input_text": prompt, "target_text": example["question"]}

print("Reformatting dataset...")
train_data = squad_dataset["train"].map(reformat_for_question_generation)
validation_data = squad_dataset["validation"].map(reformat_for_question_generation)

# Everything except the two text columns added above is dropped from both splits
columns_to_remove = [
    column
    for column in train_data.features.keys()
    if column not in ("input_text", "target_text")
]

train_data = train_data.remove_columns(columns_to_remove)
validation_data = validation_data.remove_columns(columns_to_remove)

# Checkpoint name shared by the tokenizer and the model
model_name = "t5-small"  # Use "t5-small" for faster training, upgrade to "t5-base" or "t5-large" for better performance
print("Loading tokenizer and model...")
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

# Tokenize the dataset
def preprocess_data(example):
    """Tokenize input/target text into model features.

    Args:
        example: Mapping with "input_text" and "target_text". Each value may
            be a single string or a list of strings (a list is what
            ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the inputs (truncated/padded to 512 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 128 tokens). Padding positions in the labels are
        replaced by -100.
    """
    inputs = tokenizer(example["input_text"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(example["target_text"], max_length=128, truncation=True, padding="max_length")

    # Labels equal to -100 are ignored by the model's cross-entropy loss.
    # Without this, every padded position is treated as a token the model
    # must predict, which dominates the loss for short questions.
    pad_id = tokenizer.pad_token_id
    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of token ids
        inputs["labels"] = [tok if tok != pad_id else -100 for tok in label_ids]
    else:
        # Batched call: one list of token ids per example
        inputs["labels"] = [
            [tok if tok != pad_id else -100 for tok in sequence]
            for sequence in label_ids
        ]
    return inputs

print("Tokenizing dataset...")
train_data = train_data.map(preprocess_data, batched=True, remove_columns=train_data.column_names)
validation_data = validation_data.map(preprocess_data, batched=True, remove_columns=validation_data.column_names)

# Expose the tokenized columns of both splits as PyTorch tensors
for split in (train_data, validation_data):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])




Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m480.6/480.6 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Reformatting dataset...


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Make Google Drive available under /content/drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Drive folder used to keep the fine-tuned model between sessions
drive_dir = "/content/drive/My Drive/t5-question-generator"

# Start from the base checkpoint unless a saved model already exists in Drive
if not os.path.exists(drive_dir):
    print("No model found in Google Drive. Initializing a new model...")
    model_name = "t5-small"  # Choose "t5-small" or another base model
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
else:
    print("Model found in Google Drive. Loading...")
    tokenizer = T5Tokenizer.from_pretrained(drive_dir)
    model = T5ForConditionalGeneration.from_pretrained(drive_dir)

# Use the GPU when one is present, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./t5-question-generator",  # Directory to save outputs and checkpoints
    evaluation_strategy="steps",          # Evaluate on a step schedule (must match save_strategy for load_best_model_at_end)
    eval_steps=500,                       # Evaluation every 500 steps
    learning_rate=3e-5,                   # Learning rate for the optimizer
    per_device_train_batch_size=16,       # Training batch size per device
    per_device_eval_batch_size=16,        # Evaluation batch size per device
    num_train_epochs=3,                   # Number of passes over the training split
    weight_decay=0.01,                    # Regularization to prevent overfitting
    save_total_limit=3,                   # Limit how many checkpoints are kept on disk
    logging_dir="./logs",                 # Directory for logging
    logging_steps=100,                    # Log training metrics every 100 steps
    save_strategy="steps",                # Save checkpoints on a step schedule
    save_steps=500,                       # Save every 500 steps (aligned with eval_steps)
    warmup_steps=500,                     # Warmup phase for stable training
    gradient_accumulation_steps=2,        # Accumulate gradients over 2 steps (effective batch of 32 per device)
    load_best_model_at_end=True,          # Load the best-performing checkpoint at the end
    fp16=torch.cuda.is_available(),       # Mixed precision needs a CUDA GPU; use full precision on CPU instead of failing
    report_to="all",                      # Report metrics to all installed integrations (like TensorBoard)
)


# Build the Trainer from the model, the training arguments and the tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
)

# Say whether this run continues from the Drive copy or starts from scratch
print(
    "Fine-tuning the existing model..."
    if os.path.exists(drive_dir)
    else "Training a fresh model..."
)

trainer.train()

# Write the result back to Google Drive, creating the folder on first use
drive_dir_missing = not os.path.exists(drive_dir)
if drive_dir_missing:
    print("Creating directory in Google Drive...")
    os.makedirs(drive_dir)

print("Saving the model to Google Drive...")
trainer.save_model(drive_dir)  # Model weights and config
tokenizer.save_pretrained(drive_dir)  # Tokenizer files

print(f"Model and tokenizer saved/updated in {drive_dir}.")


Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model found in Google Drive. Loading...


  trainer = Trainer(


Fine-tuning the existing model...


Step,Training Loss,Validation Loss
500,0.4834,0.231962
1000,0.4821,0.231225
1500,0.4914,0.231227
2000,0.4873,0.230664
2500,0.4923,0.230482
3000,0.4844,0.229838
3500,0.4762,0.229446
4000,0.473,0.229094
4500,0.4717,0.229276
5000,0.4812,0.228597


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from google.colab import drive
import os

# Make sure Google Drive is mounted
drive.mount('/content/drive')

# Location of the saved model inside Google Drive
drive_model_path = "/content/drive/My Drive/t5-question-generator"

# Stop early with a clear message when the saved model is missing
model_dir_found = os.path.exists(drive_model_path)
if not model_dir_found:
    raise FileNotFoundError(f"The model directory '{drive_model_path}' does not exist in Google Drive.")

# Read the tokenizer first, then the fine-tuned model, from the same folder
print(f"Loading model and tokenizer from {drive_model_path}...")
tokenizer, model = (
    T5Tokenizer.from_pretrained(drive_model_path),
    T5ForConditionalGeneration.from_pretrained(drive_model_path),
)

print("Model and tokenizer loaded successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading model and tokenizer from /content/drive/My Drive/t5-question-generator...
Model and tokenizer loaded successfully!


GEMINAI

In [None]:
!pip install PyPDF2
import PyPDF2
import random
import re

# Function to preprocess a paragraph
def preprocess_paragraph(paragraph, max_length=500):
    """Clean a paragraph of extracted text for use as model input.

    Steps, in order:
    - Unicode dashes and curly quotes are mapped to their ASCII forms.
    - Characters other than word characters, whitespace and common prose
      punctuation (, . ! ? ; : ' " ( ) -) are removed.
    - Runs of whitespace are collapsed to a single space and the ends trimmed.
    - The result is cut to ``max_length`` characters, with "..." appended
      when anything was cut.

    Args:
        paragraph: The raw paragraph text.
        max_length: Maximum number of characters (not tokens) to keep before
            the "..." marker. Defaults to 500.

    Returns:
        The cleaned, possibly truncated paragraph.
    """
    # Normalize typographic dashes/quotes first so they survive the filter below
    paragraph = re.sub("[\u2010-\u2015\u2212]", "-", paragraph)
    paragraph = re.sub("[\u2018\u2019]", "'", paragraph)
    paragraph = re.sub("[\u201c\u201d]", '"', paragraph)
    # Hyphens and apostrophes are kept on purpose: stripping them fuses number
    # ranges and compound words (e.g. "93-112" would become "93112")
    paragraph = re.sub(r"[^\w\s,.!?;:'\"()\-]", '', paragraph)
    # Collapse whitespace after the removal so deleted symbols do not leave double spaces
    paragraph = re.sub(r'\s+', ' ', paragraph).strip()
    if len(paragraph) > max_length:
        paragraph = paragraph[:max_length] + "..."
    return paragraph

# Function to extract and preprocess paragraphs from PDF
def extract_paragraphs_from_pdf(file_path):
    """Read a PDF and return its text as a list of preprocessed paragraphs.

    The extracted text of every page is concatenated (each page followed by a
    newline), split on blank lines ("\\n\\n"), and every non-empty piece is
    passed through ``preprocess_paragraph``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of preprocessed paragraph strings (possibly empty).
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Pages are read while the file is still open
        text = "".join(page.extract_text() + "\n" for page in reader.pages)

    paragraphs = []
    for chunk in text.split("\n\n"):
        chunk = chunk.strip()
        if chunk:
            paragraphs.append(preprocess_paragraph(chunk))
    return paragraphs

# Function to select random paragraphs from the extracted and preprocessed list
def select_random_paragraphs_from_pdf(file_path, count=5):
    """Pick a random subset of the paragraphs found in a PDF.

    Args:
        file_path: Path of the PDF file to read.
        count: Number of paragraphs wanted; fewer are returned when the PDF
            yields fewer paragraphs than this.

    Returns:
        A list of randomly chosen paragraphs, or None when the PDF yields no
        paragraphs at all.
    """
    candidates = extract_paragraphs_from_pdf(file_path)
    if candidates:
        sample_size = min(count, len(candidates))
        return random.sample(candidates, sample_size)
    return None

# Main script
if __name__ == "__main__":
    pdf_path = "/content/drive/MyDrive/Operating_System_Concepts_8th_EditionA4.pdf"  # Replace with your PDF file path

    # Draw up to five random paragraphs from the PDF
    random_paragraphs = select_random_paragraphs_from_pdf(pdf_path, count=5)

    if not random_paragraphs:
        print("No paragraphs found in the PDF.")
    else:
        print("Selected Random Paragraphs:\n")
        for number, text in enumerate(random_paragraphs, start=1):
            print(f"Paragraph {number}:\n{text}\n")


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Selected Random Paragraphs:

Paragraph 1:
Accetta et al. 1986 M. Accetta, R. Baron, W. Bolosky, D. B. Golub, R. Rashid, A. Tevanian, and M. Young, Mach A New Kernel Foundation for UNIX Development, Proceedings of the Summer USENIX Conference 1986, pages 93112. AdlTabatabai et al. 2007 A.R. AdlTabatabai, C. Kozyrakis, and B. Saha, Unlocking Concur rency, Queue, Volume 4, Number 10 2007, pages 2433. Agrawal and Abbadi 1991 D. P. Agrawal and A. E. Abbadi, An Efficient and FaultTolerant Solution of Distributed Mutual Exclusion, ACM Transact...

Paragraph 2:
12.1 The file system can be v

In [None]:
# Generate questions for every paragraph in `random_paragraphs`
# (the paragraphs selected from the PDF in the previous cell)

QUESTIONS_PER_PARAGRAPH = 1  # Number of sampled questions to produce for each paragraph

questions = []

# `random_paragraphs` is None when the PDF yielded no paragraphs; treat that as an empty list
for idx, context in enumerate(random_paragraphs or [], start=1):
    print(f"Generating questions for Paragraph {idx}...\n")

    # Same prompt format the model was fine-tuned on
    input_text = f"Generate a question from the context: {context}"

    # Tokenize the input and place it on the device that holds the model's weights,
    # so generation works whether the model is on the CPU or the GPU
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(model.device)

    # Sample all questions for this paragraph in a single generate call.
    # `early_stopping` is not passed: it only applies to beam search.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_return_sequences=QUESTIONS_PER_PARAGRAPH,
        do_sample=True,  # Enable sampling
        top_k=50,        # Limit sampling to top 50 tokens
        top_p=0.95,      # Nucleus sampling (use top 95% of probability mass)
        temperature=0.7, # Below 1.0: sharper distribution, less random output
    )
    paragraph_questions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Append questions for this paragraph to the main questions list
    questions.append({
        "paragraph": context,
        "generated_questions": paragraph_questions
    })

# Print the generated questions for all paragraphs
print("Generated Questions for All Paragraphs:")
for i, item in enumerate(questions, 1):
    print(f"Paragraph {i}:\n")
    for j, q in enumerate(item['generated_questions'], 1):
        print(f"  Question {j}: {q}")
    print("\n")


Generating questions for Paragraph 1...





Generating questions for Paragraph 2...

Generating questions for Paragraph 3...

Generating questions for Paragraph 4...

Generating questions for Paragraph 5...

Generated Questions for All Paragraphs:
Paragraph 1:

  Question 1: Who was the name of the USENIX Conference?


Paragraph 2:

  Question 1: What is the minimum level of the file system?


Paragraph 3:

  Question 1: What is a distribution system?


Paragraph 4:

  Question 1: What is the name of the process that communicates with one another through various communication networks?


Paragraph 5:

  Question 1: What was the simplest method to execute a program?


