In [1]:
import json
import random
from datetime import datetime
from openai import OpenAI
from typing import Dict, List
from faker import Faker
from faker.providers import company, person, address, phone_number, internet
from pathlib import Path
from rich import print

In [2]:
class FinancialAdvisoryDataGenerator:
    def __init__(self, openai_client: OpenAI, output_dir: str = "dataset"):
        """Store the OpenAI client, configure Faker and prepare the output folders.

        Args:
            openai_client: Client used for the chat-completion calls made by
                the transcript-generation and extraction methods.
            output_dir: Root folder of the dataset. The folder and its
                ``transcripts``, ``structured_data`` and ``raw_profiles``
                sub-folders are created when missing. Nested paths such as
                ``"data/run1"`` are supported.
        """
        self.client = openai_client

        self.fake = Faker("en_US")
        for provider in (company, person, address, phone_number, internet):
            self.fake.add_provider(provider)

        # Directory layout of the dataset
        self.output_dir = Path(output_dir)
        self.transcripts_dir = self.output_dir / "transcripts"
        self.structured_dir = self.output_dir / "structured_data"
        self.raw_profiles_dir = self.output_dir / "raw_profiles"

        # parents=True: without it a nested output_dir whose parent does not
        # exist yet makes the very first mkdir raise FileNotFoundError.
        for directory in (
            self.output_dir,
            self.transcripts_dir,
            self.structured_dir,
            self.raw_profiles_dir,
        ):
            directory.mkdir(parents=True, exist_ok=True)

        # Start with an empty index, then pick up entries from a previous run
        # if dataset_index.json is already present in output_dir.
        self.dataset_index = []
        self.load_existing_index()

    def load_existing_index(self):
        """Load existing dataset index if it exists"""
        index_file = self.output_dir / "dataset_index.json"
        if index_file.exists():
            with open(index_file) as f:
                self.dataset_index = json.load(f)
            print(f"Loaded {len(self.dataset_index)} existing samples from index")

    def save_index(self):
        """Save the current dataset index"""
        with open(self.output_dir / "dataset_index.json", "w") as f:
            json.dump(self.dataset_index, f, indent=2)

    def generate_realistic_salary(self, occupation: str) -> int:
        """Generate realistic salary based on occupation"""
        salary_ranges = {
            "Software Engineer": (70000, 150000),
            "Teacher": (35000, 65000),
            "Nurse": (50000, 85000),
            "Sales Manager": (45000, 95000),
            "Accountant": (45000, 85000),
            "Construction Manager": (55000, 95000),
            "Marketing Director": (70000, 130000),
            "Government Employee": (40000, 80000),
            "Restaurant Manager": (35000, 60000),
            "Logistics Manager": (40000, 70000),
            "Doctor": (120000, 300000),
            "Lawyer": (80000, 200000),
            "Police Officer": (45000, 80000),
            "Firefighter": (45000, 85000),
            "Real Estate Agent": (30000, 120000),
            "Insurance Agent": (35000, 85000),
            "Bank Manager": (60000, 120000),
            "HR Manager": (55000, 95000),
            "Project Manager": (65000, 115000),
            "Electrician": (45000, 80000),
            "Plumber": (40000, 75000),
            "Mechanic": (35000, 65000),
            "Chef": (30000, 70000),
            "Pharmacist": (90000, 140000),
            "Financial Advisor": (50000, 120000),
            "Retired": (0, 0),  # For retirees
        }

        min_sal, max_sal = salary_ranges.get(occupation, (40000, 80000))
        if min_sal == 0 and max_sal == 0:
            return 0
        return random.randint(min_sal, max_sal)

    def generate_client_variations(self) -> List[Dict]:
        """Generate diverse client profile variations using Faker"""

        # Expanded occupation list
        occupations = [
            "Software Engineer",
            "Teacher",
            "Nurse",
            "Sales Manager",
            "Accountant",
            "Construction Manager",
            "Marketing Director",
            "Government Employee",
            "Restaurant Manager",
            "Logistics Manager",
            "Doctor",
            "Lawyer",
            "Police Officer",
            "Firefighter",
            "Real Estate Agent",
            "Insurance Agent",
            "Bank Manager",
            "HR Manager",
            "Project Manager",
            "Electrician",
            "Plumber",
            "Mechanic",
            "Chef",
            "Pharmacist",
            "Financial Advisor",
            "Retired",  # For older single clients
        ]

        # Age and retirement scenarios
        age_scenarios = [
            (58, 62, "Early retirement planning"),
            (61, 65, "Traditional retirement"),
            (64, 67, "Delayed retirement"),
            (55, 60, "Emergency retirement planning"),
            (62, 65, "Standard transition"),
            (59, 62, "Health-related early retirement"),
            (63, 66, "Phased retirement planning"),
            (60, 64, "Corporate buyout retirement"),
            (45, 65, "Mid-career planning"),
            (35, 65, "Early career planning"),
            (50, 62, "Accelerated retirement planning"),
        ]

        # Risk tolerance options
        risk_tolerances = [
            "Conservative",
            "Moderate",
            "Moderately Aggressive",
            "Aggressive",
        ]

        # Marital status options
        marital_statuses = ["Married", "Single", "Divorced", "Widowed"]

        variations = []
        for i in range(1000):  # Generate pool of 1000 variations
            # Determine if single or couple (70% couples, 30% single)
            is_couple = random.random() < 0.7
            marital_status = (
                "Married"
                if is_couple
                else random.choice(["Single", "Divorced", "Widowed"])
            )

            # Generate client 1 data
            client1_first = self.fake.first_name()
            client1_last = self.fake.last_name()
            client1_occupation = random.choice(occupations)
            client1_employer = (
                self.fake.company() if client1_occupation != "Retired" else "Retired"
            )
            client1_income = self.generate_realistic_salary(client1_occupation)

            # Age scenario
            ages = random.choice(age_scenarios)
            client1_age = ages[0]

            # Generate date of birth
            client1_dob = self.fake.date_of_birth(
                minimum_age=client1_age, maximum_age=client1_age
            )

            # Generate contact information
            address = self.fake.address().replace("\n", ", ")
            phone = self.fake.phone_number()
            email1 = f"{client1_first.lower()}.{client1_last.lower()}@{self.fake.domain_name()}"

            # Initialize base variation
            variation = {
                "id": f"client_profile_{i + 1}",
                "is_couple": is_couple,
                "client1": {
                    "first_name": client1_first,
                    "last_name": client1_last,
                    "date_of_birth": client1_dob.isoformat(),
                    "marital_status": marital_status,
                    "occupation": client1_occupation,
                    "employer": client1_employer,
                    "income": client1_income,
                    "age": client1_age,
                    "retirement_age": ages[1],
                    "phone": phone,
                    "email": email1,
                },
                "address": address,
                "scenario": ages[2],
            }

            if is_couple:
                # Generate client 2 data for couples
                client2_first = self.fake.first_name()
                client2_last = client1_last  # Same last name for married couples
                client2_occupation = random.choice(occupations)
                client2_employer = (
                    self.fake.company()
                    if client2_occupation != "Retired"
                    else "Retired"
                )
                client2_income = self.generate_realistic_salary(client2_occupation)
                client2_age = client1_age + random.randint(-3, 3)  # Age difference
                client2_dob = self.fake.date_of_birth(
                    minimum_age=client2_age, maximum_age=client2_age
                )
                email2 = f"{client2_first.lower()}.{client2_last.lower()}@{self.fake.domain_name()}"

                variation["client2"] = {
                    "first_name": client2_first,
                    "last_name": client2_last,
                    "date_of_birth": client2_dob.isoformat(),
                    "marital_status": "Married",
                    "occupation": client2_occupation,
                    "employer": client2_employer,
                    "income": client2_income,
                    "age": client2_age,
                    "retirement_age": ages[1],
                    "email": email2,
                }

                total_household_income = client1_income + client2_income
            else:
                # Single client
                total_household_income = client1_income
                variation["client2"] = None

            # Financial profile generation
            years_worked = random.randint(15, 40)

            if is_couple:
                client1_401k = (
                    int(client1_income * 0.1 * years_worked * random.uniform(0.8, 1.5))
                    if client1_income > 0
                    else 0
                )
                client1_ira = (
                    int(client1_income * 0.05 * years_worked * random.uniform(0.5, 2.0))
                    if client1_income > 0
                    else random.randint(0, 100000)
                )
                client2_401k = (
                    int(client2_income * 0.08 * years_worked * random.uniform(0.6, 1.3))
                    if client2_income > 0
                    else 0
                )
                pension_values = [client1_401k, client1_ira, client2_401k]
            else:
                client1_401k = (
                    int(client1_income * 0.12 * years_worked * random.uniform(0.8, 1.8))
                    if client1_income > 0
                    else 0
                )
                client1_ira = (
                    int(client1_income * 0.08 * years_worked * random.uniform(0.5, 2.5))
                    if client1_income > 0
                    else random.randint(0, 150000)
                )
                pension_values = [client1_401k, client1_ira]

            # Generate realistic expenses based on household size and income
            if is_couple:
                annual_expenses = int(total_household_income * random.uniform(0.3, 0.6))
            else:
                annual_expenses = int(
                    total_household_income * random.uniform(0.4, 0.7)
                )  # Single people often have higher expense ratios

            target_income = int(annual_expenses * random.uniform(1.1, 1.4))

            # Generate additional assets
            home_value = random.randint(150000, 750000)
            mortgage_balance = random.randint(0, min(home_value, 400000))
            savings = random.randint(5000, 150000)
            investments = random.randint(0, 200000)

            variation["financial_profile"] = {
                "pension_values": pension_values,
                "annual_expenses": annual_expenses,
                "target_income": target_income,
                "risk_tolerance": random.choice(risk_tolerances),
                "years_to_retirement": ages[1] - client1_age,
                "household_income": total_household_income,
            }

            variation["assets"] = {
                "home_value": home_value,
                "mortgage_balance": mortgage_balance,
                "savings": savings,
                "investments": investments,
            }

            variations.append(variation)

        return variations

    def generate_transcript_variation(self, client_profile: Dict) -> str:
        """Generate a new transcript based on client profile"""

        c1 = client_profile["client1"]
        fp = client_profile["financial_profile"]
        is_couple = client_profile["is_couple"]

        if is_couple:
            c2 = client_profile["client2"]
            client_info = f"""
CLIENT PROFILES:
• {c1["first_name"]} {c1["last_name"]}: {c1["age"]} years old, {c1["occupation"]} at {c1["employer"]}
  - Annual income: ${c1["income"]:,}
  - Desired retirement age: {c1["retirement_age"]}
• {c2["first_name"]} {c2["last_name"]}: {c2["age"]} years old, {c2["occupation"]} at {c2["employer"]}
  - Annual income: ${c2["income"]:,}

FINANCIAL SNAPSHOT:
• Retirement accounts: ${fp["pension_values"][0]:,} (401k), ${fp["pension_values"][1]:,} (IRA), ${fp["pension_values"][2]:,} (spouse's retirement savings)"""

            conversation_participants = f"Use the actual names {c1['first_name']} and {c2['first_name']} in the conversation."
            speaker_format = (
                f"ADVISOR, {c1['first_name'].upper()}, {c2['first_name'].upper()}"
            )

        else:
            client_info = f"""
CLIENT PROFILE:
• {c1["first_name"]} {c1["last_name"]}: {c1["age"]} years old, {c1["marital_status"]}, {c1["occupation"]} at {c1["employer"]}
  - Annual income: ${c1["income"]:,}
  - Desired retirement age: {c1["retirement_age"]}

FINANCIAL SNAPSHOT:
• Retirement accounts: ${fp["pension_values"][0]:,} (401k), ${fp["pension_values"][1]:,} (IRA)"""

            conversation_participants = (
                f"Use the actual name {c1['first_name']} in the conversation."
            )
            speaker_format = f"ADVISOR, {c1['first_name'].upper()}"

        prompt = f"""
        You are generating a realistic financial advisory meeting transcript. Create an authentic consultation between a professional financial advisor and {
            "married clients" if is_couple else "a single client"
        }, incorporating natural conversation patterns and industry-specific language.

{client_info}
• Current liquid savings: ${client_profile["assets"]["savings"]:,}
• Primary residence: ${client_profile["assets"]["home_value"]:,} (current value), ${client_profile["assets"]["mortgage_balance"]:,} remaining mortgage
• Annual {"household" if is_couple else "personal"} expenses: ${fp["annual_expenses"]:,}
• Target retirement income need: ${fp["target_income"]:,} annually
• Risk tolerance profile: {fp["risk_tolerance"]}
• Years until planned retirement: {fp["years_to_retirement"]}

CONVERSATION REQUIREMENTS:
Create a 15-20 minute meeting transcript that feels authentic and professional. Include:

STRUCTURAL ELEMENTS:
1. Opening pleasantries and agenda setting (2-3 minutes)
2. Income and employment review (3-4 minutes)
3. Asset and liability assessment (4-5 minutes)
4. Retirement goal analysis and gap identification (3-4 minutes)
5. Risk assessment and strategy discussion (4-5 minutes)
6. Action items and next steps (2-3 minutes)

{"COUPLE-SPECIFIC DYNAMICS:" if is_couple else "SINGLE CLIENT DYNAMICS:"}
{
            '''• Show natural interaction between spouses with different risk tolerances or priorities
• Include moments where one spouse defers to the other on certain topics
• Demonstrate collaborative decision-making and occasional disagreements
• Show advisor managing couple dynamics and ensuring both voices are heard'''
            if is_couple
            else '''• Focus on individual concerns about retirement independence
• Address single-person specific challenges (no spousal income backup, healthcare considerations)
• Show client's personal decision-making process
• Include concerns about legacy planning or family support'''
        }

CONVERSATION DYNAMICS:
• Use realistic dialogue with natural interruptions, clarifications, and follow-up questions
• Include advisor's probing questions to gather missing information
• Show {
            "clients" if is_couple else "client"
        } expressing genuine concerns, hesitations, and priorities
• Demonstrate advisor explaining complex concepts in accessible terms
• Include moments where {
            "clients" if is_couple else "client"
        } ask for clarification or express uncertainty

TECHNICAL AUTHENTICITY:
• Use proper financial planning terminology (asset allocation, withdrawal rates, sequence of returns risk, etc.)
• Reference specific investment vehicles and strategies relevant to their situation
• Include realistic fee discussions and product comparisons
• Mention compliance considerations and documentation needs
• Reference industry benchmarks and planning rules of thumb

FORMAT SPECIFICATIONS:
• Use [MM:SS] timestamps starting from [00:00]
• Label speakers as: {speaker_format}
• Include natural speech patterns: "um," "let's see," "actually," brief pauses
• Show overlapping dialogue with [interrupting] or [continuing] tags when appropriate
• Include advisor's note-taking moments: [reviewing documents], [calculating]

TONE AND STYLE:
• Professional but warm and approachable
• Advisor demonstrates expertise while remaining consultative, not pushy
• {
            "Clients" if is_couple else "Client"
        } should sound like real people with varying levels of financial literacy
• Include appropriate humor or light moments to break tension
• Show genuine concern for {'clients if is_couple else "client"s'} wellbeing and goals

{conversation_participants}
The conversation should feel like you're listening to an actual financial planning meeting, complete with the natural flow of discovery, education, and collaborative planning that characterizes quality advisory relationships."""

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at generating realistic financial advisory meeting transcripts with natural dialogue and specific financial details.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.8,
            max_tokens=4000,
        )

        return response.choices[0].message.content

    def extract_structured_data(self, transcript: str) -> Dict:
        """Extract structured information from transcript"""

        prompt = f"""
        Extract structured financial information from this advisory meeting transcript and format as JSON matching this template structure:
        
        {{
            "personal_details": {{
                "client1": {{
                    "first_name": "",
                    "last_name": "",
                    "date_of_birth": "",
                    "marital_status": "",
                    "phone": "",
                    "email": ""
                }},
                "client2": {{
                    "first_name": "",
                    "last_name": "",
                    "date_of_birth": "",
                    "marital_status": "",
                    "email": ""
                }}
            }},
            "current_address": {{
                "ownership_status": "",
                "full_address": ""
            }},
            "employment": {{
                "client1": {{
                    "occupation": "",
                    "employer": "",
                    "desired_retirement_age": 0,
                    "employment_status": ""
                }},
                "client2": {{
                    "occupation": "",
                    "employer": "",
                    "employment_status": ""
                }}
            }},
            "incomes": [
                {{
                    "owner": "",
                    "name": "",
                    "amount": 0,
                    "frequency": "",
                    "net_gross": ""
                }}
            ],
            "expenses": {{
                "housing_expenses": [{{
                    "name": "",
                    "amount": 0,
                    "frequency": "",
                    "notes": ""
                }}],
                "personal_expenses": [{{
                    "name": "",
                    "amount": 0,
                    "frequency": "",
                    "notes": ""
                }}]
            }},
            "pensions": [
                {{
                    "owner": "",
                    "type": "",
                    "provider": "",
                    "value": 0,
                    "contributions": ""
                }}
            ],
            "other_assets": [
                {{
                    "owner": "",
                    "description": "",
                    "current_value": 0
                }}
            ],
            "loans_mortgages": [
                {{
                    "owner": "",
                    "type": "",
                    "provider": "",
                    "outstanding_value": 0,
                    "monthly_cost": 0
                }}
            ],
            "objectives": {{
                "retirement_date": "",
                "target_annual_income": 0,
                "risk_tolerance": "",
                "key_goals": []
            }}
        }}
        
        IMPORTANT NOTES:
        - If this is a single client meeting, set client2 fields to null
        - If client2 exists, extract their information; if not, set all client2 fields to null
        - For single clients, use "Client1" as the owner for incomes, expenses, pensions, etc.
        - For couples, use "Client1" and "Client2" or their actual names as appropriate
        
        Transcript:
        {transcript}
        
        Extract all relevant information mentioned in the conversation and return valid JSON only.
        Use null for missing values or non-existent second client, not empty strings.
        Ensure all numerical values are properly extracted as numbers, not strings.
        """

        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at extracting structured data from financial advisory transcripts. Handle both single clients and couples appropriately. Return only valid JSON with accurate numerical values.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Lower temperature for more consistent extraction
            max_tokens=3000,
        )

        try:
            return json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            # Fallback if JSON parsing fails
            return {
                "error": "Failed to parse JSON",
                "raw_response": response.choices[0].message.content,
            }

    def save_sample(self, sample: Dict, profile: Dict):
        """Save a single sample and update the index"""
        sample_id = profile["id"]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save transcript
        transcript_file = f"transcript_{sample_id}_{timestamp}.txt"
        with open(self.transcripts_dir / transcript_file, "w") as f:
            f.write(sample["input"])

        # Save structured data
        structured_file = f"structured_{sample_id}_{timestamp}.json"
        with open(self.structured_dir / structured_file, "w") as f:
            json.dump(sample["output"], f, indent=2)

        # Save raw profile
        profile_file = f"profile_{sample_id}_{timestamp}.json"
        with open(self.raw_profiles_dir / profile_file, "w") as f:
            json.dump(profile, f, indent=2)

        # Update index
        if profile["is_couple"]:
            client_names = f"{profile['client1']['first_name']} & {profile['client2']['first_name']} {profile['client1']['last_name']}"
            occupations = f"{profile['client1']['occupation']} & {profile['client2']['occupation']}"
        else:
            client_names = f"{profile['client1']['first_name']} {profile['client1']['last_name']} (Single)"
            occupations = profile["client1"]["occupation"]

        index_entry = {
            "id": sample_id,
            "timestamp": timestamp,
            "transcript_file": str(self.transcripts_dir / transcript_file),
            "structured_file": str(self.structured_dir / structured_file),
            "profile_file": str(self.raw_profiles_dir / profile_file),
            "client_names": client_names,
            "occupations": occupations,
            "household_income": profile["financial_profile"]["household_income"],
            "is_couple": profile["is_couple"],
            "marital_status": profile["client1"]["marital_status"],
            "generation_status": "success",
        }

        self.dataset_index.append(index_entry)
        self.save_index()

        return index_entry

    def generate_complete_dataset(self, num_samples: int = 50):
        """Generate samples one at a time and save immediately"""

        existing_count = len(self.dataset_index)
        print(f"Starting generation from sample {existing_count + 1}")

        client_variations = self.generate_client_variations()

        for i, profile in enumerate(
            client_variations[existing_count : existing_count + num_samples]
        ):
            current_sample = i + existing_count + 1
            print(f"\n{'=' * 60}")
            print(f"Generating sample {current_sample}/{existing_count + num_samples}")

            if profile["is_couple"]:
                print(
                    f"Clients: {profile['client1']['first_name']} & {profile['client2']['first_name']} {profile['client1']['last_name']} (Couple)"
                )
                print(
                    f"Occupations: {profile['client1']['occupation']} & {profile['client2']['occupation']}"
                )
            else:
                print(
                    f"Client: {profile['client1']['first_name']} {profile['client1']['last_name']} (Single - {profile['client1']['marital_status']})"
                )
                print(f"Occupation: {profile['client1']['occupation']}")

            print(f"{'=' * 60}")

            try:
                # Generate transcript
                print("🔄 Generating transcript...")
                transcript = self.generate_transcript_variation(profile)
                print(f"✅ Transcript generated ({len(transcript)} characters)")

                # Extract structured data
                print("🔄 Extracting structured data...")
                structured_data = self.extract_structured_data(transcript)

                if "error" in structured_data:
                    print(
                        f"❌ Error in structured data extraction: {structured_data['error']}"
                    )
                    continue

                print("✅ Structured data extracted successfully")

                # Create and save sample
                sample = {
                    "id": profile["id"],
                    "input": transcript,
                    "output": structured_data,
                    "metadata": {
                        "client_profile": profile,
                        "generated_at": datetime.now().isoformat(),
                        "is_couple": profile["is_couple"],
                        "household_income": profile["financial_profile"][
                            "household_income"
                        ],
                    },
                }

                # Save immediately
                index_entry = self.save_sample(sample, profile)
                print(f"💾 Sample {current_sample} saved successfully")

                # Print some stats
                print(f"📊 Stats:")
                print(
                    f"   Household Income: ${profile['financial_profile']['household_income']:,}"
                )
                print(
                    f"   Risk Tolerance: {profile['financial_profile']['risk_tolerance']}"
                )
                print(
                    f"   Years to Retirement: {profile['financial_profile']['years_to_retirement']}"
                )
                print(
                    f"   Total Assets: ${sum([profile['assets']['home_value'], profile['assets']['savings'], profile['assets']['investments']]):,}"
                )

            except Exception as e:
                print(f"❌ Error generating sample {current_sample}: {str(e)}")
                # Log the error
                client_name = f"{profile['client1']['first_name']} {profile['client1']['last_name']}"
                if profile["is_couple"]:
                    client_name += f" & {profile['client2']['first_name']}"

                error_entry = {
                    "id": profile["id"],
                    "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
                    "client_names": client_name,
                    "is_couple": profile["is_couple"],
                    "error": str(e),
                    "generation_status": "failed",
                }
                self.dataset_index.append(error_entry)
                self.save_index()
                continue

        # Print final statistics
        self.print_dataset_statistics()

    def print_dataset_statistics(self):
        """Print summary statistics of the generated dataset"""
        if not self.dataset_index:
            print("No samples in dataset")
            return

        successful_samples = [
            s for s in self.dataset_index if s.get("generation_status") == "success"
        ]
        failed_samples = [
            s for s in self.dataset_index if s.get("generation_status") == "failed"
        ]

        # Count couples vs singles
        couples = [s for s in successful_samples if s.get("is_couple", False)]
        singles = [s for s in successful_samples if not s.get("is_couple", False)]

        print(f"\n{'=' * 60}")
        print("📈 FINAL DATASET STATISTICS")
        print(f"{'=' * 60}")
        print(f"Total samples generated: {len(self.dataset_index)}")
        print(f"Successful: {len(successful_samples)}")
        print(f"Failed: {len(failed_samples)}")
        print(
            f"Couples: {len(couples)} ({len(couples) / len(successful_samples) * 100:.1f}%)"
        )
        print(
            f"Singles: {len(singles)} ({len(singles) / len(successful_samples) * 100:.1f}%)"
        )

        if successful_samples:
            incomes = [
                sample["household_income"]
                for sample in successful_samples
                if "household_income" in sample
            ]
            if incomes:
                print(f"Average household income: ${sum(incomes) / len(incomes):,.0f}")
                print(f"Income range: ${min(incomes):,} - ${max(incomes):,}")

        print(f"\nFiles saved in: {self.output_dir}")
        print(f"Index file: {self.output_dir}/dataset_index.json")

In [3]:
from dotenv import load_dotenv

# Run configuration: how many samples to attempt and where to write them.
num_samples = 10
output_dir = "financial_dataset"

# Load variables from the local .env file before the client is created
# (presumably this is where the OpenAI credentials come from; verify).
load_dotenv(".env")
client = OpenAI()

print("\nInitializing generator...")
print(f"Output directory: {output_dir}")
print(f"Target samples: {num_samples}")

In [4]:
# Build the generator; its constructor creates the output folders and reloads
# any index left by a previous run.
generator = FinancialAdvisoryDataGenerator(client, output_dir=output_dir)

# Run the generation loop; every sample is written to disk as soon as it is ready.
print("\nStarting generation process...")
generator.generate_complete_dataset(num_samples=num_samples)

print("\n🎉 Generation complete!")