In [19]:

import os
from pathlib import Path
import requests
import json
import re


In [12]:
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def chunk_text(text, chunk_size=1024):
    chunks = []
    current_chunk = ""
    words = text.split()
    
    for word in words:
        if len(current_chunk) + len(word) + 1 > chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += word + " "
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

def process_data_folder(folder_path):
    all_chunks = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.txt'):
                file_path = Path(root) / file
                text = read_text_file(file_path)
                chunks = chunk_text(text)
                all_chunks.extend(chunks)
    return all_chunks

data_folder = 'data'
text_chunks = process_data_folder(data_folder)

In [13]:
len(text_chunks)

526

In [53]:
def generate_qa_pairs(context, num_questions=5):
    # Define the prompt to instruct the model to generate questions based on the context
    prompt = f"""
    Read the following context and generate {num_questions} possible question-answer pairs:
    EACH QUESTION SHOULD BE UNIQUE AND INDEPENDENT OF THE OTHER QUESTIONS.
    DO NOT ASK QUESTIONS RELATED TO THE PREVIOUS QUESTIONS.
    DO NOT ASK MANY QUESTIONS WHICH ARE NOT IN THE CONTEXT.
    ASK THE MOST RELEVANT QUESTIONS TO THE CONTEXT.
    ASK QUESTIONS THAT ARE RELEVANT TO THE CONTEXT LIKE WHAT IS THE NAME OF THE EVENT, WHO IS PERFORMING, WHEN IS THE EVENT, WHERE IS THE EVENT, ETC IF APPLICABLE.
    
    Example Questions:
    
    To help you get started, here are some example questions,

    - Questions that could be answered by just prompting a LLM
        Q: When was Carnegie Mellon University founded?
        A: 1900
    - Questions that can be better answered by augmenting LLM with relevant documents
        Q: What is the name of the annual pickle festival held in Pittsburgh?
        A: Picklesburgh
    - Questions that are likely answered only through augmentation
        Q: When was the Pittsburgh Soul Food Festival established?
        A: 2019
    - Questions that are sensitive to temporal signals(Who is performing at X venue on Y date?)
        Q: What musical artist is performing at PPG Arena on October 13?
        A: Billie Eilish
    
    Context:
    {context}

    Provide the output in the following format:
    Q1: <Question>
    A1: <Answer>
    Q2: <Question>
    A2: <Answer>
    ...
    QUESTIONS SHOULD BE LIKE HOW A NORMAL HUMAN WOULD ASK.  
    QUESTIONS SHOULD BE CONCISE AND STRAIGHTFORWARD.
    QUESTIONS SHOULD NOT BE REPETITIVE.
    QUESTIONS SHOULD NOT BE TOO GENERAL.
    QUESTIONS SHOULD NOT BE TOO LONG.
    QUESTIONS SHOULD NOT BE TOO EASY (FOR EXAMPLE, "What is the name of the event featuring Handel's Messiah?" IS A BAD QUESTION BECAUSE THE ANSWER CAN BE FOUND IN THE QUESTION ITSELF).
    DO NOT USE TOO MANY PRONOUNS(it, they, these, etc).
    
    DO NOT ASK QUESTIONS THAT DO NOT HAVE A CLEAR AND CONCISE ANSWER.
    
    THE QUESTION SHOULD BE A SINGLE LINE, SHORT AND CONCISE.
    
    THE ANSWER TO THE QUESTION SHOULD BE A SINGLE LINE, SHORT AND CONCISE.
    
    ASK QUESTIONS THAT ARE RELEVANT TO THE CONTEXT LIKE WHAT IS THE NAME OF THE EVENT, WHO IS PERFORMING, WHEN IS THE EVENT, WHERE IS THE EVENT, ETC IF APPLICABLE.
    BASED ON THE TOPIC OF THE CONTEXT, ASK QUESTIONS THAT ARE RELEVANT TO THE TOPIC LIKE WHAT IS THE HISTORY OF THE EVENT, WHAT IS THE PURPOSE OF THE EVENT, WHAT IS THE SIGNIFICANCE OF THE EVENT, ETC IF APPLICABLE.
    
    USE SIMPLE WORDS AND LANGUAGE.
    
    """

    # Send the prompt to the Ollama server
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": "llama3.2",
        "prompt": prompt,
        "temperature": 0.5,
        "top_p": 0.9,
        "stream": False 
    }
    
    try:
        response = requests.post(url, json=payload)
        response.raise_for_status()  # Raise an error for bad responses
        result = response.json().get('response', "")
        
        # Parse the output to separate questions and answers
        questions, answers = [], []
        lines = result.strip().split("\n")
        
        for line in lines:
            if line.startswith("Q"):
                questions.append(line.split(": ", 1)[1].strip())
            elif line.startswith("A"):
                answers.append(line.split(": ", 1)[1].strip())
        
        return questions, answers

    except requests.ConnectionError:
        print("Connection error: Could not connect to the Ollama server.")
        return [], []
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return [], []
    except Exception as err:
        print(f"An error occurred: {err}")
        return [], []
# for chunk in text_chunks:
#     qa_pairs = generate_qa_pairs(chunk)
#     print(qa_pairs)


In [48]:
len(text_chunks)

526

In [49]:
print(text_chunks[:10])

['Oct 18 - Nov 8 221 N Main St Greensburg, Pennsylvania 15601 Oct 18, 2024 - Mar 30 2025 The Westmoreland Museum of American Art 221 N. Main St. Greensburg, Pennsylvania 15601 Oct 18 - Nov 2 City-County Building 414 Grant Street Pittsburgh, Pennsylvania 15219 Oct 18, 2024 - Jan 20 2025 The Andy Warhol Museum 117 Sandusky Street Pittsburgh, Pennsylvania 15212 Oct 18, 2024 - Jan 5 2025 Carnegie Science Center One Allegheny Avenue Pittsburgh, Pennsylvania 15212 Oct 17 Pittsburgh Shrine Center 1877 Shriners Way Cheswick, Pennsylvania 15024 Oct 17 Pittsburgh Shrine Center 1877 Shriners Way Cheswick, Pennsylvania 15024 Oct 18 - Dec 8 221 N Main St. Greensburg, Pennsylvania 15601 Oct 18, 2024 - Mar 2 2025 The Westmoreland Museum of American Art 221 N. Main St. Greensburg, Pennsylvania 15601 Oct 18 - Nov 8 ISC Pop-Up Gallery 623 Smithfield St Pittsburgh, Pennsylvania 15222 Oct 18, 2024 - Sep 11 2025 Puzzling Adventures Start & Finish Location 629 Smithfield St Pittsburgh, Pennsylvania 15222 Oc

In [44]:
# Example Usage
context = text_chunks[2]
questions, answers = generate_qa_pairs(context)
print(questions)
print(answers)

['What is the name of the event?', 'Where is the walk taking place?', 'Who is performing at Hofbrauhaus Pittsburgh Brewery & Restaurant?', 'When does the event start?', 'What is the duration of the event for Walk the Burgh Tours?', 'Who is performing at the Carnegie Museum of Natural History?', 'What type of event is Bike the Burgh Tours?', 'Where is Mixtape located?', 'Who performed at Divine Mercy Academy Lourdes Center from October 18 to November 3?', 'How long does the event run for The Westmoreland Museum of American Art?']
['Walk the Burgh Tours', 'Pittsburgh, Pennsylvania', 'Not specified (Hofbrauhaus is a brewery and restaurant)', 'October 18, 2024', 'From October 18 to April 5, 2025', 'Not specified (no performance listed)', 'A bike tour', '4907 Penn Ave, Pittsburgh, PA', 'Not specified (event information not provided)', 'From October 18 to December 31, 2024']


True

In [58]:
qa_pairs = []
for i in range(len(text_chunks)):
    questions, answers = generate_qa_pairs(text_chunks[i])
    if len(questions) == len(answers):
        for j in range(len(questions)):
            qa_pairs.append({"question": questions[j], "answer": answers[j]})
    else:
        print("Questions and answers do not match", i)

Questions and answers do not match 23
Questions and answers do not match 40
Questions and answers do not match 148
Questions and answers do not match 183
Questions and answers do not match 187
Questions and answers do not match 190
Questions and answers do not match 211
Questions and answers do not match 230
Questions and answers do not match 234
Questions and answers do not match 269
Questions and answers do not match 336
Questions and answers do not match 371
Questions and answers do not match 379
Questions and answers do not match 380
Questions and answers do not match 417
Questions and answers do not match 426
Questions and answers do not match 460
Questions and answers do not match 487


In [60]:
print(len(qa_pairs))

2472


In [61]:
import json

with open('qa_pairs.json', 'w') as f:
    json.dump(qa_pairs, f, indent=2)

In [2]:
import pandas as pd

df = pd.read_json('qa_pairs_new.json')
df.to_excel('qa_pairs.xlsx', index=False)