# Generate Question & Answer Pairs

The goal is to generate question-and-answer pairs in the following format:

```
[
    {
        "question": "What are the features of the property at 123 Main St?",
        "answer": "The property at 123 Main St has 3 bedrooms, 2 bathrooms, a pool, and a garage."
    },
    {
        "question": "Show me properties with at least 3 bedrooms and 2 bathrooms.",
        "answer": "123 Main St, 456 Oak St, and 789 Pine St are properties with at least 3 bedrooms and 2 bathrooms."
    }
]
```

In [1]:
# Input: pickled table that the next cell loads with pd.read_pickle.
DATASET_FILE_PATH = "../data/zillow/Trimmed_Metro_median_sale_price_all_homes_raw_monthly.csv.pkl"

# Output: JSON file that receives the generated question/answer records.
OUTPUT_FILE_PATH = "temp/zillow_qa_dataset.json"

In [2]:
import pandas as pd

# Load the pickled table, keep only the region name and the '2024-03-31'
# column, and relabel those two columns (in that order) as Location / Price.
# NOTE(review): '2024-03-31' is presumably the month-end snapshot of the
# median sale price -- confirm against the dataset.
df = (
    pd.read_pickle(DATASET_FILE_PATH)[['RegionName', '2024-03-31']]
    .set_axis(['Location', 'Price'], axis=1)
)

# Preview 20 random rows; each row is a metropolitan area.
df.sample(20)

Unnamed: 0,Location,Price
289,"Ottawa, IL",149750.0
284,"Jefferson City, MO",210000.0
253,"Greenville, NC",216500.0
580,"Bellefontaine, OH",210000.0
669,"Dixon, IL",147500.0
581,"Seymour, IN",187950.0
362,"Whitewater, WI",357500.0
381,"Shelby, NC",222750.0
59,"Bridgeport, CT",505000.0
604,"Muscatine, IA",154950.0


In [None]:
from pathlib import Path

# Build one "median home price" question/answer record per location.
#
# Rows whose Price is missing are skipped: formatting NaN with ':.0f' would
# otherwise produce an answer that literally reads "$nan".
#
# TODO: income-based questions need a household-income column; `df` only
# carries Location and Price, so they are not generated here.
qa_pairs = [
    {
        "question": f"What is the median home price in {location}?",
        "answer": f"The median home price in {location} is ${price:.0f}.",
    }
    for location, price in zip(df['Location'], df['Price'])
    if pd.notna(price)
]

qa_df = pd.DataFrame(qa_pairs)

# Make sure the output directory exists before writing; to_json does not
# create missing parent directories.
Path(OUTPUT_FILE_PATH).parent.mkdir(parents=True, exist_ok=True)

# orient='records' never writes the index, so no `index` argument is passed.
# Passing index=False together with orient='records' raises ValueError on
# older pandas releases.
qa_df.to_json(OUTPUT_FILE_PATH, orient='records')