# Generate Question & Answer Pairs

The idea is to generate question-and-answer prompts in the following format:

```
[
    {
        "question": "What are the features of the property at 123 Main St?",
        "answer": "The property at 123 Main St has 3 bedrooms, 2 bathrooms, a pool, and a garage."
    },
    {
        "question": "Show me properties with at least 3 bedrooms and 2 bathrooms.",
        "answer": "123 Main St, 456 Oak St, and 789 Pine St are properties with at least 3 bedrooms and 2 bathrooms."
    }
]
```

In [9]:
# Input CSV of mock Zillow listings, and the JSON file that receives the
# generated question/answer records. Both are relative paths.
DATASET_FILE_PATH = "../data/mocks/zillow_dataset.csv"
OUTPUT_FILE_PATH = "temp/zillow_qa_dataset.json"

In [16]:
from pathlib import Path

import pandas as pd

# Load the listings CSV into a DataFrame, one listing per row.
zillow_data = pd.read_csv(filepath_or_buffer=DATASET_FILE_PATH)

def generate_qa_pairs(data):
    """Build templated question/answer pairs for every listing in *data*.

    Args:
        data: DataFrame with one listing per row. The columns ``Address``,
            ``Price``, ``Bedrooms``, ``Bathrooms``, ``Features`` and ``City``
            are read; ``Price`` must support integer arithmetic.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in row order.
        Each row contributes four pairs, plus a pool-and-garden pair when the
        row's ``Features`` text mentions both amenities.
    """
    qa_pairs = []

    for _index, row in data.iterrows():
        address = row['Address']
        price = row['Price']
        bedrooms = row['Bedrooms']
        bathrooms = row['Bathrooms']
        features = row['Features']
        city = row['City']

        # The "list properties within" question and its answer share one window.
        low, high = price - 50000, price + 50000

        # Keep each question next to its answer so the two cannot drift apart.
        pairs = [
            (
                f"What are the features of the property at {address}?",
                f"The property at {address} has {bedrooms} bedrooms, "
                f"{bathrooms} bathrooms, and features {features}.",
            ),
            (
                f"Show me properties with at least {bedrooms} bedrooms "
                f"and {bathrooms} bathrooms.",
                f"{address} is a property with at least {bedrooms} bedrooms "
                f"and {bathrooms} bathrooms.",
            ),
            (
                # NOTE(review): this range is synthesized as the row's own
                # price +/- 100000, not derived from the city's listings.
                f"What is the price range for properties in {city}?",
                f"Properties in {city} range from "
                f"${price - 100000} to ${price + 100000}.",
            ),
        ]

        # Only state that a listing has a pool and a garden when its features
        # say so; otherwise the dataset would contain false answers.
        # Assumes Features is free text naming the amenities — TODO confirm.
        features_text = str(features).lower()
        if "pool" in features_text and "garden" in features_text:
            pairs.append((
                "Which properties have a pool and a garden?",
                f"{address} has a pool and a garden.",
            ))

        pairs.append((
            f"List properties within ${low} to ${high}.",
            f"{address} is within the price range of ${low} to ${high}.",
        ))

        qa_pairs.extend(
            {"question": question, "answer": answer}
            for question, answer in pairs
        )

    return qa_pairs

# Generate the Q&A pairs for every listing and persist them as a JSON array
# of {"question", "answer"} records.
qa_pairs = generate_qa_pairs(zillow_data)

qa_df = pd.DataFrame(qa_pairs)

# to_json does not create missing parent directories, so make sure the
# output folder exists before writing.
Path(OUTPUT_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)

# orient='records' never writes the index, so index=False is redundant here;
# some older pandas releases also reject that combination with a ValueError.
qa_df.to_json(OUTPUT_FILE_PATH, orient='records')