In [None]:
import json
import os
import time
from typing import List, Literal

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError, validator

# 1. Pydantic model for validation
class ElderlySentence(BaseModel):
    """One generated sentence together with its topic label.

    ``label`` is restricted to the three categories the prompt asks the model
    to use, so rows with a misspelled or invented label fail validation
    instead of silently ending up in the dataset.
    """

    # The sentence itself.
    text: str
    # Topic category; must be one of the labels defined in the prompt.
    label: Literal["healthcare", "long-term", "short-term"]

# 2. Config
# The API key is taken only from the environment (load_dotenv() also loads a
# local .env file if present). Credentials must never be hard-coded in source.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("GOOGLE_API_KEY not found in environment")

genai.configure(api_key=api_key)

# 3. Prompt template
# Sent unchanged as the prompt of every batch request. It defines the three
# labels ("healthcare", "long-term", "short-term") and asks for a JSON array of
# objects with "text" and "label" keys, which is the shape the validation step
# in this file parses.
prompt_template = """
Generate a dataset of short conversational sentences suitable for elderly people in Singapore.
- Each sentence should be natural, realistic, and reflect daily life in Singapore (e.g., HDB, kopitiam, MRT, wet market, CNY, angbao, getai, hawker centres, local shops, public transport, etc.).
- Include a variety of situations: asking questions, making statements, sharing memories, commenting on local events, talking about health, family, food, or hobbies.
- Each output must be exactly one sentence – do not add extra context, explanations, or follow-up sentences.
- Categorise each sentence into one of three labels:
  * "healthcare": health-related, medical visits, physical condition, medicine.
  * "long-term": stable aspects of life like family members, hobbies, owning a car/house, or other things that don’t change quickly.
  * "short-term": day-to-day or situational things like meals, weather, appointments, errands, public transport, or current events.
- Output the dataset as a JSON array of objects with two keys: "text" and "label".
- Generate up to 1000 examples per request.

Example output:
[
    {"text": "I need to collect my medicine from the polyclinic tomorrow.", "label": "healthcare"},
    {"text": "My grandson just started secondary school this year.", "label": "long-term"},
    {"text": "Shall we eat chicken rice at the hawker centre later?", "label": "short-term"}
]

"""

# 4. Function to parse JSON output and validate
def clean_json_text(text: str) -> str:
    """Return the JSON payload of a model reply with markdown fences removed.

    Handles replies that are entirely wrapped in a fenced block
    (```json ... ```), replies where the fenced block is surrounded by extra
    prose, and single-line fenced replies. When the text does not contain a
    complete pair of fences it is returned unchanged apart from stripping
    surrounding whitespace.

    Args:
        text: Raw text returned by the model.

    Returns:
        The content of the first fenced block, or the stripped input when no
        complete fenced block is present.
    """
    fence = "```"
    text = text.strip()

    opening = text.find(fence)
    if opening == -1:
        return text
    closing = text.find(fence, opening + len(fence))
    if closing == -1:
        # Only one fence marker: nothing reliable to extract.
        return text

    body_start = opening + len(fence)
    newline = text.find("\n", body_start, closing)
    if newline != -1:
        # Skip the remainder of the opening line only when it is empty or a
        # language tag such as "json"; otherwise it already holds payload.
        info = text[body_start:newline].strip()
        if not info or info.isalnum():
            body_start = newline + 1
    return text[body_start:closing].strip()
    
def parse_and_validate_json(json_text: str) -> List[dict]:
    """Parse model output as JSON and keep only rows that pass validation.

    Args:
        json_text: Raw text returned by the model, optionally wrapped in
            markdown code fences.

    Returns:
        A list of dicts produced by ``ElderlySentence.model_dump()``, one per
        valid item. An empty list is returned when the text is not valid JSON
        or the decoded payload is neither a list nor a single object.
    """
    json_text = clean_json_text(json_text)

    try:
        data = json.loads(json_text)
    except json.JSONDecodeError as e:
        print("JSON decoding error:", e)
        return []

    # A lone object is treated as a one-item batch; any other non-list payload
    # (string, number, null, ...) cannot contain rows.
    if isinstance(data, dict):
        data = [data]
    elif not isinstance(data, list):
        print("Unexpected JSON payload, expected a list:", type(data).__name__)
        return []

    valid_rows = []
    for item in data:
        try:
            # model_validate reports non-dict items as ValidationError, where
            # ElderlySentence(**item) would raise an uncaught TypeError.
            validated = ElderlySentence.model_validate(item)
        except ValidationError:
            print("Validation error, skipping item:", item)
        else:
            valid_rows.append(validated.model_dump())
    return valid_rows

# 5. Generate multiple batches
all_data = []
num_batches = 5  # Adjust as needed

# The model handle and sampling settings are the same for every batch, so they
# are created once rather than on each iteration.
model = genai.GenerativeModel("gemini-2.5-flash")
generation_config = genai.types.GenerationConfig(
    temperature=1.2,   # raised to get more varied sentences
    top_p=0.9,         # nucleus sampling
    top_k=40,          # restricts to top-k tokens
)

for i in range(num_batches):
    print(f"Generating batch {i+1}/{num_batches}...")

    try:
        response = model.generate_content(
            prompt_template,
            generation_config=generation_config,
        )
        # Accessing .text can raise when the response carries no usable text
        # part (for example a blocked or empty candidate).
        generated_text = response.text
    except Exception as e:
        # Script-level boundary: one failed request must not discard the
        # batches already collected, so report it and move on.
        print(f"Batch {i+1} failed, skipping: {e}")
    else:
        batch_data = parse_and_validate_json(generated_text)
        all_data.extend(batch_data)

    time.sleep(1)  # Polite delay

# 6. Convert to pandas DataFrame
# Naming the columns explicitly keeps the frame well-formed (with "text" and
# "label" columns) even when no rows were generated.
df = pd.DataFrame(all_data, columns=["text", "label"])

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 20)

print("Total rows generated:", len(df))
print(df.head())

# 7. Save to CSV
csv_path = "elderly_topical_conversational_sentences.csv"

if df.empty:
    # Writing an empty frame would create or rewrite the CSV without usable
    # data and can break the next run's read_csv, so leave the file alone.
    print(f"No new rows generated; {csv_path} left unchanged")
elif os.path.exists(csv_path):
    df_existing = pd.read_csv(csv_path)
    df_combined = pd.concat([df_existing, df], ignore_index=True)

    # Count duplicates before dropping
    num_duplicates = df_combined.duplicated(subset=['text']).sum()
    print(f"Number of duplicate rows to be removed: {num_duplicates}")

    df_combined = df_combined.drop_duplicates(subset=['text'])
    df_combined.to_csv(csv_path, index=False)
    print(f"Appended new data. Dataset now has {len(df_combined)} rows, saved to {csv_path}")
else:
    # Apply the same de-duplication on the first write so the file never
    # starts out with repeated sentences from different batches.
    df_unique = df.drop_duplicates(subset=['text'])
    df_unique.to_csv(csv_path, index=False)
    print(f"Saved dataset to {csv_path} with {len(df_unique)} rows")

