<a href="https://colab.research.google.com/github/capGoblin/QA_Generation_System/blob/main/QA_Generation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain
!pip install PyPDF2
!pip install pypdf
!pip install tiktoken
!pip install openai
!pip install faiss-gpu

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
from langchain.chains import LLMChain
import os
import re
import json
import time
from PyPDF2 import PdfReader
import csv

In [8]:
# Colab-only module for moving files between the browser and the runtime.
from google.colab import files

# Prompt for a file upload in the notebook; the call's result is kept in
# `uploaded` (presumably a mapping of file name to content — confirm in the
# Colab docs before relying on its shape).
uploaded = files.upload()

Saving Big Mac Index.pdf to Big Mac Index.pdf


In [None]:
import os

from google.colab import userdata

# Export the key stored in Colab's secrets so the OpenAI client used by
# ChatOpenAI can find it in the environment. Assigning the value (instead of
# leaving `userdata.get(...)` as the cell's last expression) also prevents the
# secret from being echoed into the notebook output.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [90]:
def generate_true_false_questions(text):
    """
    Generate true/false questions based on the provided text.

    The text is inserted into a fixed prompt, sent to gpt-3.5-turbo through an
    LLMChain, and every "<number>. <statement>" item found in the reply is
    collected.

    Args:
        text (str): The text content to base the questions on.

    Returns:
        list: Strings of the form "<number>. <statement>", in the order they
            appear in the model reply. Empty if the reply has no numbered items.
    """
    # One model instance, shared with the chain below (temperature=0.3)
    llm_ques_gen = ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo")

    # Define the prompt template for true/false questions
    true_false_prompt_template = """
    You are an expert at creating true/false questions based on the provided text.
    Your goal is to test the knowledge of coders or programmers on the content below:

    ------------
    {text}
    ------------

    Create true/false questions that will assess understanding.
    Ensure questions are clear and concise.

    QUESTIONS:
    """

    # Create a prompt template with the specified input variable
    prompt = PromptTemplate(input_variables=["text"], template=true_false_prompt_template)

    # Reuse the model created above instead of instantiating a second one
    question_chain = LLMChain(llm=llm_ques_gen, prompt=prompt)

    # Generate questions using the language model
    raw_output = question_chain.run(text)

    # Each match is a (number, statement) pair; '.' does not cross newlines,
    # so a statement ends at the end of its line.
    numbered_items = re.findall(r'(\d+)\.\s+(.*)', raw_output)
    return [f"{number}. {statement}" for number, statement in numbered_items]

def generate_multiple_choice_questions(text):
    """
    Generate multiple-choice questions based on the provided text.

    The model reply is split into blocks on blank lines. Each block is
    flattened to one line: the question header followed by its option lines,
    separated by single spaces.

    Args:
        text (str): The text content to base the questions on.

    Returns:
        list: One flattened string per non-empty block of the model reply.
            Blocks whose first line has no "<number>. " prefix are kept as-is
            (lines joined by spaces) instead of raising.
    """
    # Define the prompt template for multiple-choice questions
    prompt_template = """
    You are preparing multiple-choice questions based on the following text chunk:
    ------------
    {text}
    ------------

    Generate multiple-choice questions that cover important concepts.
    Provide clear and relevant options.

    QUESTIONS:
    """

    # Create a prompt template with the specified input variable
    prompt = PromptTemplate(input_variables=["text"], template=prompt_template)

    # Initialize an LLMChain for question generation
    llmChain = LLMChain(llm=ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo"), prompt=prompt)

    # Generate questions using the language model
    questions = llmChain.run(text)

    question_array = []
    for block in questions.strip().split('\n\n'):
        block = block.strip()
        if not block:
            # An empty reply or a run of extra blank lines yields empty blocks;
            # there is nothing to format, so skip them.
            continue

        header, *option_lines = block.split('\n')
        options = ' '.join(line.strip() for line in option_lines)

        # partition never raises, unlike unpacking the result of split('. ', 1)
        question_number, separator, question_content = header.partition('. ')
        if separator:
            question_array.append(f"{question_number}. {question_content} {options}")
        else:
            # No "<number>. " prefix on the first line: keep the block text
            # rather than failing on the whole reply.
            question_array.append(f"{header} {options}".strip())

    return question_array

def generate_one_word_answer_questions(text):
    """
    Generate one-word answer questions based on the provided text.

    Args:
        text (str): The text content to base the questions on.

    Returns:
        list: The non-blank lines of the model reply, each with surrounding
            whitespace removed.
    """
    # Define the prompt template for one-word answer questions
    prompt_template = """
    You are preparing one-word answer questions based on the following text chunk:
    ------------
    {text}
    ------------

    Generate one-word answer questions that target key information.
    Keep questions precise and focused.

    QUESTIONS:
    """

    # Create a prompt template with the specified input variable
    prompt = PromptTemplate(input_variables=["text"], template=prompt_template)

    # Initialize an LLMChain for question generation
    llmChain = LLMChain(llm=ChatOpenAI(temperature=0.3, model="gpt-3.5-turbo"), prompt=prompt)

    # Generate questions using the language model
    questions = llmChain.run(text)

    # Drop blank lines so callers never receive an empty "question" (which
    # would otherwise be sent to the model for answering).
    return [line.strip() for line in questions.split('\n') if line.strip()]


In [93]:
def process_text_and_generate_questions(text, chunk_size=10000, chunk_overlap=200):
    """
    Split the text into token-based chunks and generate questions for each chunk.

    Args:
        text (str): The full text to generate questions from.
        chunk_size (int): Maximum number of tokens per chunk passed to the
            splitter. Defaults to 10000, the value previously hard-coded.
        chunk_overlap (int): Number of tokens shared between consecutive
            chunks. Defaults to 200, the value previously hard-coded.

    Returns:
        dict: Keys "true_false", "multiple_choice" and "one_word_answer", each
            mapping to the list of questions accumulated over all chunks.
    """
    # NOTE(review): each chunk is embedded in a prompt, so chunk_size presumably
    # has to leave room in the model's context window — confirm for the
    # gpt-3.5-turbo variant in use.
    splitter = TokenTextSplitter(
        model_name='gpt-3.5-turbo',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    text_chunks = splitter.split_text(text)

    all_questions = {
        "true_false": [],
        "multiple_choice": [],
        "one_word_answer": []
    }

    for chunk in text_chunks:
        if not chunk.strip():
            # Whitespace-only chunks carry nothing to ask about
            continue

        # Generate questions for each chunk and append to respective lists
        all_questions["true_false"].extend(generate_true_false_questions(chunk))
        all_questions["multiple_choice"].extend(generate_multiple_choice_questions(chunk))
        all_questions["one_word_answer"].extend(generate_one_word_answer_questions(chunk))

    return all_questions

# Read the PDF from the Colab working directory
loader = PyPDFLoader('/content/Big Mac Index.pdf')
data = loader.load()

# Text of every loaded page, in order
question_gen = [page.page_content for page in data]

# Merge the pages into one newline-separated string
generated_text = '\n'.join(question_gen)

# Run question generation over the whole document
generated_questions = process_text_and_generate_questions(generated_text)

# Show each question list under its heading
print("True/False Questions:", generated_questions["true_false"], sep="\n")
print("\nMultiple-Choice Questions:", generated_questions["multiple_choice"], sep="\n")
print("\nOne-Word Answer Questions:", generated_questions["one_word_answer"], sep="\n")

True/False Questions:
['1. The Big Mac Index was introduced in The Economist in September 1986 by Pam Woodall.', '2. The Big Mac Index compares the relative price worldwide to purchase a Whopper, a hamburger sold at Burger King restaurants.', '3. The purpose of the Big Mac Index is to calculate an implied exchange rate between two currencies.', "4. The Big Mac Index is limited by geographical coverage due to the presence of the McDonald's franchise.", '5. The Big Mac Index methodology is not affected by the social status of eating at fast food restaurants in a local market.']

Multiple-Choice Questions:
["1. What is the purpose of The Big Mac Index? A. To calculate the price of a Big Mac in different countries B. To measure purchasing power parity between two currencies C. To determine the nutritional value of a Big Mac D. To analyze the market share of McDonald's restaurants worldwide", '2. How is the implied exchange rate calculated in The Big Mac Index? A. By dividing the price of a

In [125]:
def generate_true_false_answers(text, llm_model):
    """
    Ask a language model for a true/false verdict on a statement.

    Args:
        text (str): The statement to be judged.
        llm_model: The language model the answer chain runs on.

    Returns:
        str: The model's reply with surrounding whitespace removed and
            converted to lowercase. The reply is not otherwise validated.
    """
    # Prompt sent to the model; {text} is replaced by the statement
    template = """
    You are an expert at answering true/false questions based on the provided text.
    Please provide a true or false answer for the following statement:

    ------------
    {text}
    ------------

    ANSWER:
    """

    # Build the chain in one step: prompt template plus the supplied model
    answer_chain = LLMChain(
        llm=llm_model,
        prompt=PromptTemplate(template=template, input_variables=["text"]),
    )

    # Run the chain and normalise the reply
    reply = answer_chain.run(text=text)
    return reply.strip().lower()

def generate_multiple_choice_answers(text, llm_model):
    """
    Ask a language model to pick an option for a multiple-choice question.

    Args:
        text (str): The question together with its options.
        llm_model: The language model the answer chain runs on.

    Returns:
        str: The model's reply with surrounding whitespace removed and
            converted to uppercase. The reply is not otherwise validated.
    """
    # Prompt sent to the model; {text} is replaced by the question
    template = """
    You are an expert at answering multiple-choice questions based on the provided text.
    Please select the correct option (A, B, C, etc.) for the following question:

    ------------
    {text}
    ------------

    ANSWER:
    """

    # Build the chain in one step: prompt template plus the supplied model
    answer_chain = LLMChain(
        llm=llm_model,
        prompt=PromptTemplate(template=template, input_variables=["text"]),
    )

    # Run the chain and normalise the reply
    reply = answer_chain.run(text=text)
    return reply.strip().upper()

def generate_one_word_answers(text, llm_model):
    """
    Generate a one-word answer based on the provided text using a language model.

    Args:
        text (str): The text containing the question requiring a one-word answer.
        llm_model: The language model used for generating answers.

    Returns:
        str: The first whitespace-delimited word of the model's reply, or an
            empty string if the reply is empty or whitespace-only.
    """
    # Define the prompt template for one-word answer generation
    prompt_template = """
    You are an expert at providing concise one-word answers based on the provided text.
    Please provide a one-word answer to the following question:

    ------------
    {text}
    ------------

    ANSWER:
    """

    # Create a prompt template with the specified input variable
    PROMPT_ANSWER = PromptTemplate(template=prompt_template, input_variables=["text"])

    # Initialize an LLMChain for answer generation
    ques_gen_chain = LLMChain(llm=llm_model, prompt=PROMPT_ANSWER)

    # Generate an answer using the language model
    answer = ques_gen_chain.run(text=text)

    # split() with no arguments ignores leading/trailing whitespace and returns
    # [] for an empty reply, so guard before taking the first word.
    words = answer.split()
    return words[0] if words else ""


In [128]:
def generate_ans_for_generated_ques(generated_questions):
    """
    Generate answers for the generated questions and write them to a CSV file.

    The output file 'question_answer_mapping.csv' has the columns 'Question'
    and 'Answer'. Each section starts with a header row, followed by one row
    per question of that section.

    Args:
        generated_questions (dict): Must contain the keys "true_false",
            "multiple_choice" and "one_word_answer", each mapping to a list of
            question strings.

    Returns:
        None

    Raises:
        KeyError: If one of the three expected keys is missing.
    """
    # Initialize a language model for generating answers
    llm_model = ChatOpenAI(model="gpt-3.5-turbo")

    # (key in generated_questions, section header text, answer function)
    sections = (
        ("true_false", "=== TRUE/FALSE QUESTIONS ===", generate_true_false_answers),
        ("multiple_choice", "=== MULTIPLE CHOICE QUESTIONS ===", generate_multiple_choice_answers),
        ("one_word_answer", "=== ONE-WORD ANSWER QUESTIONS ===", generate_one_word_answers),
    )

    # A list of rows (not a dict keyed by question) keeps duplicate questions
    # from silently overwriting one another.
    rows = []
    for key, header, answer_question in sections:
        # The header says "ANSWERS BELOW", so it must precede its section
        rows.append({'Question': header, 'Answer': "=== ANSWERS BELOW ==="})

        for question in generated_questions[key]:
            if not question.strip():
                # Blank entries are not questions; do not spend a model call on them
                continue
            rows.append({
                'Question': question,
                'Answer': answer_question(question, llm_model),
            })

    # Write question-answer rows to a CSV file
    with open('question_answer_mapping.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['Question', 'Answer'])
        writer.writeheader()
        writer.writerows(rows)


In [126]:
# Answer every generated question and write the pairs to question_answer_mapping.csv
generate_ans_for_generated_ques(generated_questions)