In [None]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9099c5853405acf62d6901ed02faf8738f9e4e69440f38289b496be1bc55e3b0
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
pip install nltk



In [None]:
# Import here so this cell works on a fresh kernel, before the main import cell runs.
import nltk

nltk.download('punkt_tab')

NameError: name 'nltk' is not defined

In [None]:
# Import here so this cell works on a fresh kernel, before the main import cell runs.
import nltk

nltk.download('punkt')

NameError: name 'nltk' is not defined

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import logging
from rouge_score import rouge_scorer
import random
from tqdm import tqdm

In [None]:
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

In [None]:
# Seed every random number generator used in this notebook with one shared value.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
"""
This class is used to prepare the data for the question generation task.
"""
class QuestionGeneration(Dataset):
    """Tokenized (context, difficulty) -> question examples for seq2seq fine-tuning.

    Each item encodes the prompt
    ``"generate question: <context> difficulty: <difficulty>"`` as the model
    input and the reference question as the labels.

    Args:
        contexts: Sequence of passage strings, one per example.
        questions: Sequence of reference questions aligned with ``contexts``.
        difficulties: Sequence of difficulty labels aligned with ``contexts``.
        tokenizer: Callable tokenizer accepting ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors``, and exposing ``pad_token_id``.
        max_length: Padded/truncated length of the encoded prompt.
        max_target_length: Padded/truncated length of the encoded question.

    Raises:
        ValueError: If the three input sequences differ in length.
    """

    def __init__(self, contexts, questions, difficulties, tokenizer,
                 max_length=512, max_target_length=128):
        if not (len(contexts) == len(questions) == len(difficulties)):
            raise ValueError(
                "contexts, questions and difficulties must have the same length"
            )
        self.contexts = contexts
        self.questions = questions
        self.difficulties = difficulties
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.contexts)

    def __getitem__(self, idx):
        """Return the encoded prompt, the labels and the raw fields of example ``idx``."""
        input_text = f"generate question: {self.contexts[idx]} difficulty: {self.difficulties[idx]}"

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            self.questions[idx],
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Drop only the leading batch axis: a bare squeeze() would also collapse
        # a sequence axis of length 1 and hand back a 0-d tensor.
        input_ids = input_encoding["input_ids"].squeeze(0)
        attention_mask = input_encoding["attention_mask"].squeeze(0)
        target_ids = target_encoding["input_ids"].squeeze(0)

        # Replace padding positions with -100, the label value the model's loss ignores.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "context": self.contexts[idx],
            "question": self.questions[idx],
            "difficulty": self.difficulties[idx]
        }

In [None]:
"""
    Preprocess the dataset from a JSON file and return parallel lists of contexts, questions, and difficulties.
"""
def preprocess_dataset(file):
    """Load a question-generation dataset from JSON and flatten it into parallel lists.

    The file must contain a list of items, each with a ``"context"`` string and
    a ``"qa_pairs"`` list whose entries carry ``"question"`` and
    ``"difficulty"``. The context is repeated once per QA pair so that the
    three returned lists line up index by index.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A ``(contexts, questions, difficulties)`` tuple of equally long lists.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            reported on stdout and then re-raised unchanged.
        KeyError: If an item or QA pair lacks one of the expected keys.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error loading JSON data: {e}")
        raise

    contexts, questions, difficulties = [], [], []

    for item in data:
        context = item["context"]
        for qa_pair in item["qa_pairs"]:
            contexts.append(context)
            questions.append(qa_pair["question"])
            difficulties.append(qa_pair["difficulty"])

    return contexts, questions, difficulties

In [None]:
"""
    Freeze every encoder/decoder block except the last 2 (by default), plus embedding and relative-attention-bias parameters, in the T5 model.
"""
def freeze_model_layers(model, layers_to_train=2):
    """Freeze all but the last ``layers_to_train`` encoder and decoder blocks.

    Block numbers are read from parameter names containing ``encoder.block.<n>.``
    or ``decoder.block.<n>.``. Parameters whose name contains ``embed`` or
    ``relative_attention_bias`` are frozen as well. A stack that has no more
    than ``layers_to_train`` blocks is left fully trainable.

    NOTE(review): a tied token embedding registered under a name without
    "embed" (e.g. ``shared.weight``) would not match the substring check and
    would stay trainable. Confirm whether that is intended.

    Args:
        model: Module whose parameters are frozen in place.
        layers_to_train: Number of trailing blocks per stack to keep trainable.
            ``0`` freezes every block.

    Returns:
        The same ``model`` instance, for call chaining.

    Raises:
        ValueError: If ``layers_to_train`` is negative.
    """
    if layers_to_train < 0:
        raise ValueError("layers_to_train must be non-negative")

    param_names = [name for name, _ in model.named_parameters()]

    def _block_numbers(stack):
        # Sorted, de-duplicated block indices found in one stack's parameter names.
        marker = f"{stack}.block."
        return sorted({
            int(name.split(marker)[1].split('.')[0])
            for name in param_names
            if marker in name
        })

    def _blocks_to_freeze(block_numbers):
        # Use an explicit end index: `block_numbers[:-0]` would be empty and
        # freeze nothing when layers_to_train == 0.
        if len(block_numbers) > layers_to_train:
            return block_numbers[:len(block_numbers) - layers_to_train]
        return []

    frozen_markers = tuple(
        f"{stack}.block.{layer_num}."
        for stack in ("encoder", "decoder")
        for layer_num in _blocks_to_freeze(_block_numbers(stack))
    )

    for name, param in model.named_parameters():
        in_frozen_block = any(marker in name for marker in frozen_markers)
        if in_frozen_block or 'embed' in name or 'relative_attention_bias' in name:
            param.requires_grad = False

    return model

In [None]:
"""
    Train the T5 model for question generation.
"""
def train_model(train_dataloader, val_dataloader, model, tokenizer,
                num_epochs=3, learning_rate=5e-5, warmup_steps=100,
                checkpoint_path="best_model.pt"):
    """Fine-tune ``model`` and checkpoint the weights with the lowest validation loss.

    Batches are moved to the notebook-level ``device``. After each epoch the
    mean validation loss is computed, and the model's ``state_dict`` is saved
    to ``checkpoint_path`` whenever that loss improves.

    Args:
        train_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_dataloader: Same batch format, used for validation only.
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` and returning an object with a ``loss`` attribute.
        tokenizer: Unused here; kept so existing callers keep working.
        num_epochs: Number of passes over ``train_dataloader``.
        learning_rate: Peak AdamW learning rate.
        warmup_steps: Number of optimizer steps over which the learning rate
            ramps linearly from ``learning_rate / warmup_steps`` up to
            ``learning_rate``. ``0`` disables warm-up.
        checkpoint_path: File that receives the best ``state_dict``.

    Returns:
        ``model`` as it stands after the final epoch. This is not necessarily
        the best checkpoint; reload ``checkpoint_path`` for that.

    Raises:
        ValueError: If either dataloader has no batches.
    """
    if len(train_dataloader) == 0 or len(val_dataloader) == 0:
        raise ValueError(
            "train_dataloader and val_dataloader must each contain at least one batch"
        )

    # Frozen parameters never receive gradients, so keep them out of the optimizer.
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_parameters, lr=learning_rate)

    # Honour warmup_steps, which was previously accepted but ignored.
    scheduler = None
    if warmup_steps and warmup_steps > 0:
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps),
        )

    def _batch_loss(batch):
        # Forward pass for one batch; returns the loss tensor.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        return outputs.loss

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for batch in train_progress_bar:
            loss = _batch_loss(batch)
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

            train_progress_bar.set_postfix({"loss": loss.item()})

        avg_train_loss = total_train_loss / len(train_dataloader)

        model.eval()
        total_val_loss = 0

        with torch.no_grad():
            val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
            for batch in val_progress_bar:
                loss = _batch_loss(batch)
                total_val_loss += loss.item()
                val_progress_bar.set_postfix({"loss": loss.item()})

        avg_val_loss = total_val_loss / len(val_dataloader)

        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print("Model saved!")

    return model


In [None]:
"""
    Calculate BLEU score for generated questions with the reference questions.
"""
def calculate_bleu_score(references, candidates):
    """Return the mean smoothed sentence-level BLEU of candidates against references.

    Both strings of each pair are lower-cased and tokenized with
    ``nltk.word_tokenize``. Each candidate is scored against its single
    reference using ``SmoothingFunction().method1``.

    Args:
        references: Iterable of reference question strings.
        candidates: Iterable of generated question strings, aligned with
            ``references``.

    Returns:
        The mean BLEU score, or ``0.0`` when there are no pairs to score.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    references = list(references)
    candidates = list(candidates)

    # zip() would silently drop the unmatched tail and skew the average.
    if len(references) != len(candidates):
        raise ValueError(
            f"references ({len(references)}) and candidates ({len(candidates)}) must have the same length"
        )
    # np.mean([]) is nan and emits a RuntimeWarning; report an explicit zero instead.
    if not references:
        return 0.0

    smoothie = SmoothingFunction().method1
    scores = [
        sentence_bleu(
            [nltk.word_tokenize(ref.lower())],
            nltk.word_tokenize(cand.lower()),
            smoothing_function=smoothie,
        )
        for ref, cand in zip(references, candidates)
    ]

    return np.mean(scores)

"""
    Evaluate the model on the test set and calculate BLEU score.
"""
def evaluate_model(model, test_dataloader, tokenizer, max_length=128, num_beams=4):
    """Generate a question for every test example and report the mean BLEU score.

    The model is switched to eval mode and questions are produced with beam
    search. The BLEU score is printed as well as returned.

    Args:
        model: Model exposing ``generate``.
        test_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and the raw reference strings under
            ``"question"``.
        tokenizer: Tokenizer used to decode the generated ids.
        max_length: Maximum length passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        ``(bleu_score, generated_questions, reference_questions)``.
    """
    model.eval()
    all_generated_questions = []
    all_reference_questions = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            outputs = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

            all_generated_questions.extend(
                tokenizer.batch_decode(outputs, skip_special_tokens=True)
            )
            all_reference_questions.extend(batch["question"])

    bleu_score = calculate_bleu_score(all_reference_questions, all_generated_questions)
    print(f"BLEU Score: {bleu_score:.4f}")

    return bleu_score, all_generated_questions, all_reference_questions


In [None]:
"""
    Calculate ROUGE scores on test data.
"""
def calculate_rouge_score(model, tokenizer, test_dataloader, max_length=150, num_beams=4):
    """Generate questions for the test set and report ROUGE-1/2/L F1 scores.

    Per-difficulty averages for ``easy``, ``medium`` and ``hard`` are printed.
    The overall averages are returned as percentages rounded to two decimals.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to decode the generated ids.
        test_dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and the raw ``"question"`` and ``"difficulty"``
            strings.
        max_length: Maximum length passed to ``generate``.
        num_beams: Beam width passed to ``generate``.

    Returns:
        Dict with keys ``rouge1_f1``, ``rouge2_f1`` and ``rougeL_f1``. All are
        ``0.0`` when the dataloader yields no examples.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Generation must run in eval mode so dropout does not perturb the outputs.
    model.eval()

    predictions = []
    references = []
    original_difficulties = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            outputs = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

            predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
            references.extend(batch["question"])
            original_difficulties.extend(batch["difficulty"])

    # Averaging an empty list would give nan plus a RuntimeWarning.
    if not predictions:
        return {"rouge1_f1": 0.0, "rouge2_f1": 0.0, "rougeL_f1": 0.0}

    rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)

    rouge_results = {
        "rouge1_f1": round(np.mean(rouge_scores["rouge1"]) * 100, 2),
        "rouge2_f1": round(np.mean(rouge_scores["rouge2"]) * 100, 2),
        "rougeL_f1": round(np.mean(rouge_scores["rougeL"]) * 100, 2),
    }

    results_df = pd.DataFrame({
        "difficulty": original_difficulties,
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"]
    })

    print("\nROUGE Scores by Difficulty Level:")
    for difficulty in ['easy', 'medium', 'hard']:
        mask = results_df['difficulty'] == difficulty
        if mask.any():
            print(f"\n{difficulty.capitalize()} Questions:")
            print(f"  ROUGE-1: {round(results_df.loc[mask, 'rouge1'].mean() * 100, 2)}")
            print(f"  ROUGE-2: {round(results_df.loc[mask, 'rouge2'].mean() * 100, 2)}")
            print(f"  ROUGE-L: {round(results_df.loc[mask, 'rougeL'].mean() * 100, 2)}")

    return rouge_results

In [None]:
"""
    Generate a question based on a context and difficulty.
"""
def generate_question(model, tokenizer, context, difficulty, top=5):
    """Sample ``top`` candidate questions for a context at a given difficulty.

    The prompt uses the same ``"generate question: <context> difficulty:
    <difficulty>"`` template as ``QuestionGeneration.__getitem__``, so the
    fine-tuned model sees the format it was trained on. Each candidate is
    printed as well as returned.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the outputs.
        context: Passage text. A list/tuple of strings is joined with spaces
            so its Python ``repr`` (brackets, quotes) does not reach the prompt.
        difficulty: Difficulty label inserted into the prompt.
        top: Number of sequences to sample.

    Returns:
        List of ``top`` decoded question strings.
    """
    if isinstance(context, (list, tuple)):
        context = " ".join(str(part) for part in context)

    # Match the training prompt. The earlier free-form instruction embedded the
    # requested count, which then showed up inside the generated questions.
    input_text = f"generate question: {context} difficulty: {difficulty}"

    # A single prompt needs no padding; truncation still caps it at 512 tokens.
    input_encoding = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Disable dropout while generating.
    model.eval()

    # early_stopping is a beam-search option and does not apply to sampling, so it is omitted.
    output = model.generate(
        input_ids=input_encoding["input_ids"],
        attention_mask=input_encoding["attention_mask"],
        max_length=128,
        num_return_sequences=top,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
    )

    questions = tokenizer.batch_decode(output, skip_special_tokens=True)
    for i, question in enumerate(questions, start=1):
        print(f"Generated Question {i}: {question}")

    return questions

In [None]:
file_path = "Json_merged_with_difficulty.json"
contexts, questions, difficulties = preprocess_dataset(file_path)

print(f"Dataset loaded: {len(contexts)} examples")

# Hold out 20% of the examples for testing, then 10% of the remainder for validation.
# NOTE(review): the split is over individual QA pairs, so pairs that share one
# context can land in different splits. Confirm this is intended.
train_contexts, test_contexts, train_questions, test_questions, train_difficulties, test_difficulties = train_test_split(
    contexts, questions, difficulties, test_size=0.2, random_state=42
)

train_contexts, val_contexts, train_questions, val_questions, train_difficulties, val_difficulties = train_test_split(
    train_contexts, train_questions, train_difficulties, test_size=0.1, random_state=42
)

print(f"Train set: {len(train_contexts)} examples")
print(f"Validation set: {len(val_contexts)} examples")
print(f"Test set: {len(test_contexts)} examples")

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# freeze_model_layers takes `layers_to_train`; the former `num_layers_to_train`
# keyword does not exist and raised a TypeError.
model = freeze_model_layers(model, layers_to_train=2)
model = model.to(device)

train_dataset = QuestionGeneration(
    train_contexts, train_questions, train_difficulties, tokenizer
)
val_dataset = QuestionGeneration(
    val_contexts, val_questions, val_difficulties, tokenizer
)
test_dataset = QuestionGeneration(
    test_contexts, test_questions, test_difficulties, tokenizer
)

# Only the training loader is shuffled; evaluation order stays deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model = train_model(
    train_dataloader, val_dataloader, model, tokenizer,
    num_epochs=8, learning_rate=5e-5
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/1 [Train]:   0%|          | 0/1709 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/1 [Train]: 100%|██████████| 1709/1709 [08:34<00:00,  3.32it/s, loss=2.03]
Epoch 1/1 [Val]: 100%|██████████| 190/190 [00:24<00:00,  7.90it/s, loss=2.2]


In [None]:
# Requires the setup/training cell above to have run in this kernel (model, tokenizer, test_dataloader).
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
bleu_score, generated_questions, reference_questions = evaluate_model(model, test_dataloader, tokenizer)
# Variable name kept as-is (sic) in case later cells refer to it.
rogue_score = calculate_rouge_score(model, tokenizer, test_dataloader)
print(f"BLEU Score: {bleu_score:.4f}")
# The aggregate ROUGE figures were computed but never displayed.
print(f"ROUGE Scores: {rogue_score}")

NameError: name 'model' is not defined

In [None]:
sample_context_1 = ["Medical imaging has revolutionized the way we diagnose and treat diseases. Techniques such as X-rays, CT scans, MRI scans, and ultrasound allow us to visualize the inside of the body without surgery. X-rays are used to visualize bones and detect fractures. CT scans provide detailed images of the body's internal organs and tissues. MRI scans use magnetic fields and radio waves to create images of soft tissues, such as the brain and spinal cord. Ultrasound uses sound waves to create images of organs and tissues. These imaging techniques are essential for diagnosing a wide range of medical conditions, from broken bones to cancer. They also play a crucial role in guiding surgical procedures and monitoring the effectiveness of treatments. Advances in medical imaging technology are constantly improving the resolution and accuracy of these techniques, allowing for earlier and more accurate diagnoses. The development of new contrast agents is also enhancing the ability to visualize specific tissues and organs. Medical imaging continues to be an indispensable tool in modern medicine, providing invaluable information for diagnosis, treatment planning, and monitoring disease progression."]
# sample_context_1 is a one-element list. Use the string itself so the prompt
# receives plain text and the preview slice cuts characters, not list items.
context_text_1 = sample_context_1[0]
for difficulty_1 in ["easy", "medium", "hard"]:
    generated_question = generate_question(model, tokenizer, context_text_1, difficulty_1,top=3)
    print(f"Difficulty: {difficulty_1}")
    print(f"Context (truncated): {context_text_1[:100]}...")
    print(f"Generated Question: {generated_question}")
    print("-" * 50)



Generated Question 1: How do MRI scans work?
Generated Question 2: What are the most important aspects of medical imaging?
Generated Question 3: How do MRI scans be used?
Difficulty: easy
Context (truncated): ["Medical imaging has revolutionized the way we diagnose and treat diseases. Techniques such as X-rays, CT scans, MRI scans, and ultrasound allow us to visualize the inside of the body without surgery. X-rays are used to visualize bones and detect fractures. CT scans provide detailed images of the body's internal organs and tissues. MRI scans use magnetic fields and radio waves to create images of soft tissues, such as the brain and spinal cord. Ultrasound uses sound waves to create images of organs and tissues. These imaging techniques are essential for diagnosing a wide range of medical conditions, from broken bones to cancer. They also play a crucial role in guiding surgical procedures and monitoring the effectiveness of treatments. Advances in medical imaging technology are co

In [None]:
# Rebuild the tokenizer and a fresh t5-small model, then restore the fine-tuned weights.
output = []
tokenizer = T5Tokenizer.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Deserialize onto the CPU first; the model is moved to `device` afterwards.
model.load_state_dict(
    torch.load("/content/model_T5_epochs_8.pt", map_location="cpu")
)
model.to(device)
sample_context = ["Artificial Intelligence (AI) and Machine Learning (ML) are among the most influential technological advances of the 21st century. These fields involve the development of algorithms that allow machines to learn from data and make decisions. AI is widely used in industries such as healthcare, finance, and autonomous driving. For example, AI models can now diagnose diseases like cancer with accuracy rivaling that of human doctors. Natural Language Processing (NLP), a subfield of AI, powers virtual assistants like Siri and ChatGPT, enabling seamless human-computer interaction."]
# sample_context is a one-element list. Use the string itself so the prompt
# receives plain text and the preview slice cuts characters, not list items.
context_text = sample_context[0]
for difficulty in ["easy", "medium", "hard"]:
    generated_question = generate_question(model, tokenizer, context_text, difficulty,top=7)
    print(f"Difficulty: {difficulty}")
    print(f"Context (truncated): {context_text[:100]}...")
    output.append(generated_question)
    print("-" * 50)



Generated Question 1: What are the two fields used in AI and Machine Learning?
Generated Question 2: What are the seven main fields of AI?
Generated Question 3: What are some of the influential technological advances of the 21st century?
Generated Question 4: What are the two most influential technological advances of the 21st century?
Generated Question 5: What are the two main fields of AI?
Generated Question 6: What are some applications of AI and Machine Learning?
Generated Question 7: What are the two main fields of AI?
Difficulty: easy
Context (truncated): ['Artificial Intelligence (AI) and Machine Learning (ML) are among the most influential technological advances of the 21st century. These fields involve the development of algorithms that allow machines to learn from data and make decisions. AI is widely used in industries such as healthcare, finance, and autonomous driving. For example, AI models can now diagnose diseases like cancer with accuracy rivaling that of human doctor

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def removeSimilarQuestions(generated_questions, threshold=0.6, embedding_model=None):
    """Drop questions that are near-duplicates of an earlier, kept question.

    Questions are embedded and compared by cosine similarity. Going through
    them in order, a question is removed when its similarity to an earlier
    question that was itself kept exceeds ``threshold``. A question that has
    already been removed never causes later questions to be removed.

    Args:
        generated_questions: Flat sequence of question strings.
        threshold: Similarity above which two questions count as duplicates.
        embedding_model: Object with an ``encode(list_of_str)`` method that
            returns one vector per input. When ``None``, the
            ``distilbert-base-nli-stsb-mean-tokens`` SentenceTransformer is
            loaded. Pass a preloaded model to avoid reloading it on every call.

    Returns:
        The kept questions, in their original order.
    """
    generated_questions = list(generated_questions)
    if not generated_questions:
        return []

    if embedding_model is None:
        embedding_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    embeddings = np.asarray(embedding_model.encode(generated_questions), dtype=float)

    # Cosine similarity = dot product of L2-normalized rows. Zero vectors keep
    # a norm of 1 so they yield a similarity of 0 instead of dividing by zero.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = embeddings / norms
    cosine_similarities = unit_vectors @ unit_vectors.T

    to_remove = set()
    question_count = len(generated_questions)

    for i in range(question_count):
        # A question that was itself dropped must not evict others: if A~B and
        # B~C but A and C differ, C should be kept.
        if i in to_remove:
            continue
        for j in range(i + 1, question_count):
            if cosine_similarities[i][j] > threshold:
                to_remove.add(j)

    unique_questions = [q for i, q in enumerate(generated_questions) if i not in to_remove]
    return unique_questions
print(f"output:{output}")
# `output` holds one list of questions per difficulty level. Flatten it so
# every question is embedded and compared on its own.
all_generated_questions = [question for group in output for question in group]
uniqueQuestions = removeSimilarQuestions(all_generated_questions)
print(f"Unique Questions :{uniqueQuestions}")

output:[['What are the two fields used in AI and Machine Learning?', 'What are the seven main fields of AI?', 'What are some of the influential technological advances of the 21st century?', 'What are the two most influential technological advances of the 21st century?', 'What are the two main fields of AI?', 'What are some applications of AI and Machine Learning?', 'What are the two main fields of AI?'], ['What are the five most influential technological advancements of the 21st century?', 'What are the three most influential technological advancements of the 21st century?', 'What are the potential applications of AI for?', 'What are the seven fields of AI and Machine Learning?', 'What are the two fields of AI and Machine Learning (ML)?', 'How do Artificial Intelligence and Machine Learning (ML) contribute to the 21st century?', 'What are the two most influential technological advances of the 21st century?'], ['What is a subfield of AI?', 'What are the most influential technological ad