In [None]:
import sqlite3
import re
import json

def clean_mysql_dump_for_sqlite(sql_content):
    """
    Rewrite the text of a MySQL dump so that SQLite can execute it.

    The following MySQL-only constructs are removed or translated:

    * versioned comment blocks such as ``/*!40101 ... */;``
    * ``LOCK TABLES`` / ``UNLOCK TABLES`` statements
    * table options after the closing parenthesis (``ENGINE=...`` up to ``;``)
    * ``datetime(3)`` -> ``DATETIME`` and ``tinyint(1)`` -> ``INTEGER``
    * secondary index definitions inside ``CREATE TABLE`` (``KEY``,
      ``UNIQUE KEY``, ``FULLTEXT KEY``, ``SPATIAL KEY``), including indexes
      with prefix lengths such as ``KEY `k` (`name`(10))``

    ``PRIMARY KEY (...)`` and ``FOREIGN KEY (...)`` clauses are left alone.

    Args:
        sql_content (str): Full text of the MySQL dump.

    Returns:
        str: The cleaned SQL script.
    """
    # Remove MySQL-specific versioned comment blocks like /*!40101 ... */;
    sql_content = re.sub(r'/\*!.*?\*/;?', '', sql_content, flags=re.DOTALL)

    # Remove LOCK/UNLOCK TABLES statements (any case, optional leading whitespace).
    sql_content = re.sub(r'^\s*(lock|unlock)\s+tables.*?;', '', sql_content,
                         flags=re.IGNORECASE | re.MULTILINE)

    # Remove MySQL-specific table options (ENGINE, CHARSET, COLLATE, etc.)
    sql_content = re.sub(r'\)\s*ENGINE=.*?;', ');', sql_content, flags=re.DOTALL | re.IGNORECASE)

    # Convert MySQL `datetime(3)` to SQLite-compatible DATETIME
    sql_content = sql_content.replace('datetime(3)', 'DATETIME')

    # Convert MySQL `tinyint(1)` to SQLite-compatible INTEGER
    sql_content = sql_content.replace('tinyint(1)', 'INTEGER')

    # Remove secondary index definitions. The optional UNIQUE/FULLTEXT/SPATIAL
    # prefix must be consumed together with KEY, otherwise a dangling "UNIQUE"
    # is left behind. The column list is matched as balanced parentheses (one
    # nesting level) so prefix lengths like (`name`(10)) are handled.
    # PRIMARY KEY / FOREIGN KEY never match because KEY must be followed by a
    # backtick-quoted index name.
    sql_content = re.sub(
        r',?\s+(?:(?:UNIQUE|FULLTEXT|SPATIAL)\s+)?KEY\s+`[^`\n]*`\s*'
        r'\((?:[^()\n]|\([^()\n]*\))*\)',
        '',
        sql_content,
    )

    return sql_content

def convert_sql_to_lookup_json(sql_file_path, json_file_path):
    """
    Load a MySQL dump into an in-memory SQLite database and export every row
    that has a ``breed_name`` into one JSON object keyed by that name.

    Each exported record is the row as a dict (minus ``breed_name``) plus a
    ``pet_type`` field derived from the table name: ``'dog'`` if the name
    contains "dog", ``'cat'`` if it contains "cat", otherwise ``'unknown'``.

    Problems are reported by printing and returning early; no JSON file is
    written in that case.

    Args:
        sql_file_path (str): Path of the MySQL dump to read (UTF-8).
        json_file_path (str): Path of the JSON file to write.

    Returns:
        None
    """
    print(f"Reading SQL dump from '{sql_file_path}'...")
    try:
        with open(sql_file_path, 'r', encoding='utf-8') as f:
            sql_dump = f.read()
    except FileNotFoundError:
        print(f"ERROR: The file '{sql_file_path}' was not found. Please make sure it's in the same directory.")
        return

    print("Cleaning SQL for SQLite compatibility...")
    cleaned_sql = clean_mysql_dump_for_sqlite(sql_dump)

    conn = sqlite3.connect(':memory:')
    # Set before any cursor is created so every query yields name-addressable rows.
    conn.row_factory = sqlite3.Row
    pet_lookup_db = {}

    # try/finally guarantees the connection is closed on every exit path,
    # including exceptions raised while reading the tables.
    try:
        print("Executing SQL script in-memory...")
        try:
            conn.executescript(cleaned_sql)
            conn.commit()
        except sqlite3.Error as e:
            print(f"An error occurred while executing the SQL script: {e}")
            print("This likely means the cleaning function missed some MySQL-specific syntax.")
            return

        # Skip SQLite's own bookkeeping tables (e.g. sqlite_sequence).
        tables = [
            row[0]
            for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
            if not row[0].startswith('sqlite_')
        ]
        if not tables:
            print("No tables were created. The SQL script might be empty or failed silently.")
            return
        print(f"Found tables: {tables}")

        for table_name in tables:
            pet_type = 'unknown'
            if 'dog' in table_name:
                pet_type = 'dog'
            elif 'cat' in table_name:
                pet_type = 'cat'

            print(f"Extracting and mapping data from table '{table_name}'...")
            # Quote the identifier so names containing '-', spaces or reserved
            # words still form a valid statement.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            rows = conn.execute(f"SELECT * FROM {quoted_name}").fetchall()

            for row in rows:
                pet_data = dict(row)
                breed_name = pet_data.pop('breed_name', None)
                if not breed_name:
                    continue

                if breed_name in pet_lookup_db:
                    # Later rows win, as before; just make the overwrite visible.
                    print(f"WARNING: duplicate breed name '{breed_name}' in table '{table_name}' overwrites an earlier entry.")

                pet_data['pet_type'] = pet_type
                pet_lookup_db[breed_name] = pet_data
    finally:
        conn.close()

    print(f"Writing all data to lookup JSON file '{json_file_path}'...")
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(pet_lookup_db, f, indent=4, ensure_ascii=False)

    print("\nConversion complete!")
    print(f"Output saved to '{json_file_path}'")

if __name__ == "__main__":
    # Input dump (using your file name) and the lookup JSON to generate.
    sql_input_file, json_output_file = 'breed_details_backup.sql', 'pet_database.json'
    convert_sql_to_lookup_json(sql_file_path=sql_input_file, json_file_path=json_output_file)

Reading SQL dump from 'breed_details_backup.sql'...
Cleaning SQL for SQLite compatibility...
Executing SQL script in-memory...
Found tables: ['dog_breed_details', 'cat_breed_details']
Extracting and mapping data from table 'dog_breed_details'...
Extracting and mapping data from table 'cat_breed_details'...
Writing all data to lookup JSON file 'pet_database.json'...

Conversion complete!
Output saved to 'pet_database.json'


In [None]:
import json

def simplify_pet_data(pet_name, data):
    """
    Return a simplified copy of one breed record for chatbot use.

    Numeric min/max columns are collapsed into human-readable summary fields
    (``price_range``, ``weight``, ``height``, ``life_span``, ``litter_size``);
    the source columns and bookkeeping/photo columns are then dropped. The
    input dict is not modified.

    Columns that are missing, ``None`` (SQL NULL) or non-numeric are treated
    as "no value" when computing the price range and litter size, so a
    partially filled row no longer raises ``TypeError``.

    Args:
        pet_name (str): Breed name. Currently unused; kept so existing callers
            keep working.
        data (dict): Raw breed record.

    Returns:
        dict: The simplified record.
    """

    # 1. Keys to be removed outright
    keys_to_remove = [
        'created_at', 'updated_at', 'deleted_at', 'id',
        'full_front', 'full_left', 'full_right', 'face_front', 'photo5'
    ]

    # 2. Keys that are consolidated into simpler fields and removed afterwards
    price_keys = [
        'male_min_price', 'male_max_price', 'female_min_price', 'female_max_price',
        'low_quality_male_min_price', 'low_quality_male_max_price', 'medium_quality_male_min_price',
        'medium_quality_male_max_price', 'show_quality_male_min_price', 'show_quality_male_max_price',
        'champion_quality_male_min_price', 'champion_quality_male_max_price', 'low_quality_female_min_price',
        'low_quality_female_max_price', 'medium_quality_female_min_price', 'medium_quality_female_max_price',
        'show_quality_female_min_price', 'show_quality_female_max_price', 'champion_quality_female_min_price',
        'champion_quality_female_max_price'
    ]
    weight_keys = [
        'male_min_full_grown_weight', 'male_max_full_grown_weight',
        'female_min_full_grown_weight', 'female_max_full_grown_weight'
    ]
    height_keys = [
        'male_min_full_grown_height', 'male_max_full_grown_height',
        'female_min_full_grown_height', 'female_max_full_grown_height'
    ]
    lifespan_keys = ['min_life_span', 'max_life_span']
    litter_keys = ['min_litter_size', 'max_litter_size']

    def _is_positive_number(value):
        # NULL columns arrive as None; comparing None (or a string) with 0
        # raises TypeError, so check the type before comparing.
        return isinstance(value, (int, float)) and value > 0

    # Work on a copy so the caller's record stays intact
    clean_data = data.copy()

    # 3. Create simplified, human-readable fields

    # --- Simplify Price: overall min/max across every positive price column ---
    all_prices = [data[k] for k in price_keys if _is_positive_number(data.get(k))]
    if all_prices:
        min_price = min(all_prices)
        max_price = max(all_prices)
        # Rupee symbol with comma thousands separators
        clean_data['price_range'] = f"₹{min_price:,} - ₹{max_price:,}"

    # --- Simplify Weight ---
    if data.get('male_max_full_grown_weight'):
        m_min_w = data['male_min_full_grown_weight']
        m_max_w = data['male_max_full_grown_weight']
        f_min_w = data['female_min_full_grown_weight']
        f_max_w = data['female_max_full_grown_weight']
        clean_data['weight'] = f"Male: {m_min_w}-{m_max_w} kg, Female: {f_min_w}-{f_max_w} kg"

    # --- Simplify Height ---
    if data.get('male_max_full_grown_height'):
        m_min_h = data['male_min_full_grown_height']
        m_max_h = data['male_max_full_grown_height']
        f_min_h = data['female_min_full_grown_height']
        f_max_h = data['female_max_full_grown_height']
        clean_data['height'] = f"Male: {m_min_h}-{m_max_h} cm, Female: {f_min_h}-{f_max_h} cm"

    # --- Simplify Life Span ---
    if data.get('max_life_span'):
        clean_data['life_span'] = f"{data['min_life_span']} - {data['max_life_span']} years"

    # --- Simplify Litter Size ---
    if _is_positive_number(data.get('max_litter_size')):
        # Anything that is not a dog is labelled "kittens" (unchanged behavior).
        unit = "puppies" if data.get('pet_type') == 'dog' else "kittens"
        clean_data['litter_size'] = f"{data['min_litter_size']} - {data['max_litter_size']} {unit}"

    # 4. Remove all processed and unnecessary keys
    keys_to_purge = keys_to_remove + price_keys + weight_keys + height_keys + lifespan_keys + litter_keys
    for key in keys_to_purge:
        clean_data.pop(key, None)  # default avoids KeyError for absent columns

    return clean_data


def main():
    """Read the raw breed lookup JSON, simplify every record, and write the chatbot dataset."""
    input_file, output_file = 'pet_database.json', 'chatbot_training_data.json'

    print(f"Loading data from '{input_file}'...")
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            full_db = json.load(source)
    except FileNotFoundError:
        print(f"ERROR: Input file '{input_file}' not found. Please run the previous script first.")
        return

    print("Processing and cleaning data for chatbot...")

    # One simplified record per breed, keyed exactly like the input.
    cleaned_db = {
        breed: simplify_pet_data(breed, record)
        for breed, record in full_db.items()
    }

    print(f"Saving cleaned data to '{output_file}'...")
    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(cleaned_db, target, indent=4, ensure_ascii=False)

    print("\nProcess complete!")
    print(f"A clean dataset for your chatbot has been saved to '{output_file}'.")

if __name__ == '__main__':
    main()

Loading data from 'pet_database.json'...
Processing and cleaning data for chatbot...
Saving cleaned data to 'chatbot_training_data.json'...

Process complete!
A clean dataset for your chatbot has been saved to 'chatbot_training_data.json'.


In [None]:
pip install beautifulsoup4 requests google-generativeai



In [None]:
#!/usr/bin/env python3
"""
Pet Dataset Enrichment Pipeline - 2-Level Data Enhancement System
==================================================================

This notebook implements a sophisticated 2-level pipeline to enrich pet breed data:
Level 1: Web Scraping from authoritative sources (AKC, Wikipedia)
Level 2: LLM enrichment using Google's Gemini REST API for intelligent data generation

Features Added: 22 new comprehensive features including health, training, history, and lifestyle data

IMPROVEMENTS:
- Enhanced rate limiting with exponential backoff
- Better JSON parsing with fallback mechanisms
- Retry logic for failed requests
- Progress saving and resume capability
"""

# =============================================================================
# SETUP AND INSTALLATION
# =============================================================================

# Install required packages
# !pip install -q beautifulsoup4 requests lxml html5lib

import json
import time
import os
import re
import random
from typing import Dict, List, Optional, Any
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

# Files read and written by the pipeline
INPUT_FILE = "chatbot_training_data.json"
OUTPUT_FILE = "final_enriched_training_data.json"
PROGRESS_FILE = "pipeline_progress.json"

# Gemini model used for the enrichment requests
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"

# The 22 feature keys requested from the LLM, grouped by theme.
# The concatenation order is the order of the keys in each enriched record.
NEW_FEATURES_ADDED = (
    # Health & Wellness (5)
    [
        "common_health_concerns",
        "health_disclaimer",
        "recommended_health_tests",
        "general_dietary_needs",
        "average_exercise_needs",
    ]
    # Training & Behavior (6)
    + [
        "training_difficulty",
        "training_tips",
        "socialization_needs",
        "common_behavioral_issues",
        "mental_stimulation_needs",
        "prey_drive_level",
    ]
    # Breed History & Characteristics (5)
    + [
        "breed_history",
        "breed_group",
        "puppy_availability",
        "distinguishing_features",
        "celebrity_owners",
    ]
    # Lifestyle & Home Compatibility (6)
    + [
        "good_for_first_time_owners",
        "ideal_living_conditions",
        "tolerance_to_being_alone",
        "weather_tolerance_details",
        "grooming_frequency_and_tips",
        "cost_of_ownership_summary",
    ]
)

# Browser-like User-Agent sent with every scraping request
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Throttling / retry settings for the Gemini calls
RATE_LIMIT_CONFIG = dict(
    base_delay=3,      # seconds between requests when nothing has failed
    max_delay=60,      # upper bound for the backed-off delay (seconds)
    max_retries=5,     # attempts per breed
    backoff_factor=2,  # multiplier applied per consecutive failure
)

# =============================================================================
# ENHANCED UTILITIES
# =============================================================================

class ProgressTracker:
    """Persist which breeds have been processed so an interrupted run can resume.

    State is a JSON document with the keys ``completed_breeds``,
    ``failed_breeds``, ``last_updated`` and ``total_processed``. It is loaded
    on construction and rewritten after every state change.
    """
    def __init__(self, progress_file: str):
        self.progress_file = progress_file
        self.data = self._load_progress()

    @staticmethod
    def _empty_progress() -> Dict:
        """Return the state used when no usable progress file exists."""
        return {
            'completed_breeds': [],
            'failed_breeds': [],
            'last_updated': None,
            'total_processed': 0
        }

    def _load_progress(self) -> Dict:
        """Load existing progress, or start fresh if the file is missing or unusable.

        A file that is not valid JSON (for example, truncated by an interrupted
        write) or that does not hold a JSON object is reported and ignored
        instead of crashing the pipeline. Keys missing from an older file are
        back-filled with their defaults.
        """
        fresh = self._empty_progress()
        try:
            with open(self.progress_file, 'r') as f:
                loaded = json.load(f)
        except FileNotFoundError:
            return fresh
        except ValueError as e:  # json.JSONDecodeError / UnicodeDecodeError
            print(f"  ⚠️ Progress file '{self.progress_file}' is unreadable ({e}); starting fresh.")
            return fresh

        if not isinstance(loaded, dict):
            print(f"  ⚠️ Progress file '{self.progress_file}' has an unexpected format; starting fresh.")
            return fresh

        for key, default in fresh.items():
            loaded.setdefault(key, default)
        return loaded

    def save_progress(self):
        """Write the current state to disk, stamping ``last_updated``.

        The state is written to a sibling temp file and then moved into place,
        so an interruption mid-write cannot leave a half-written progress file.
        """
        self.data['last_updated'] = datetime.now().isoformat()
        tmp_path = f"{self.progress_file}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(self.data, f, indent=2)
        os.replace(tmp_path, self.progress_file)

    def mark_completed(self, breed_name: str):
        """Record a breed as completed (no-op if it already is) and save."""
        if breed_name in self.data['completed_breeds']:
            return
        self.data['completed_breeds'].append(breed_name)
        # A breed that failed earlier and has now succeeded is no longer failed.
        if breed_name in self.data['failed_breeds']:
            self.data['failed_breeds'].remove(breed_name)
        self.data['total_processed'] += 1
        self.save_progress()

    def mark_failed(self, breed_name: str):
        """Record a breed as failed (no-op if already recorded) and save."""
        if breed_name not in self.data['failed_breeds']:
            self.data['failed_breeds'].append(breed_name)
            self.save_progress()

    def is_completed(self, breed_name: str) -> bool:
        """Return True if the breed is recorded as completed."""
        return breed_name in self.data['completed_breeds']

    def get_remaining_breeds(self, all_breeds: List[str]) -> List[str]:
        """Return the breeds from ``all_breeds`` not yet completed, in input order."""
        return [breed for breed in all_breeds if not self.is_completed(breed)]

class RateLimiter:
    """Spaces out requests; the required gap grows exponentially while failures pile up."""
    def __init__(self, config: Dict):
        self.config = config
        self.last_request_time = 0
        self.consecutive_failures = 0

    def wait_if_needed(self):
        """Sleep until the required gap since the previous request has passed, then stamp the time."""
        cfg = self.config
        elapsed = time.time() - self.last_request_time

        # No failures: plain base delay. Otherwise base * factor**failures, capped at max_delay.
        required_gap = cfg['base_delay']
        if self.consecutive_failures > 0:
            backed_off = cfg['base_delay'] * (cfg['backoff_factor'] ** self.consecutive_failures)
            required_gap = min(backed_off, cfg['max_delay'])

        if elapsed < required_gap:
            wait_time = required_gap - elapsed
            print(f"  ⏳ Rate limiting: waiting {wait_time:.1f} seconds...")
            time.sleep(wait_time)

        self.last_request_time = time.time()

    def record_success(self):
        """Reset the failure streak after a successful request."""
        self.consecutive_failures = 0

    def record_failure(self):
        """Extend the failure streak by one."""
        self.consecutive_failures += 1

# =============================================================================
# GEMINI API KEY SETUP
# =============================================================================

def get_gemini_api_key():
    """Return the Gemini API key from ``GOOGLE_API_KEY`` or an interactive prompt.

    The prompt uses ``getpass`` instead of ``input`` so the typed key is not
    echoed into the notebook's saved cell output.

    Returns:
        str: The API key with surrounding whitespace removed.

    Raises:
        ValueError: If no key is available from the environment or the prompt.
    """
    import getpass  # only needed here; keeps the secret out of cell output

    print("🔧 Getting Gemini API key...")
    api_key = (os.environ.get("GOOGLE_API_KEY") or "").strip()
    if not api_key:
        print("⚠️ GOOGLE_API_KEY not found in environment variables.")
        print("Please get your free API key from: https://makersuite.google.com/app/apikey")
        api_key = getpass.getpass("Enter your Gemini API key: ").strip()
        if not api_key:
            raise ValueError("API key is required to proceed")
    print("✅ Gemini API key loaded.")
    return api_key

# =============================================================================
# LEVEL 1: WEB SCRAPING FUNCTIONS (Enhanced with better error handling)
# =============================================================================

class WebScraper:
    """Level-1 scraper: collects free-text breed information from AKC and Wikipedia.

    All requests go through one ``requests.Session`` (carrying HEADERS) and one
    RateLimiter. Both scrape methods return a dict of extracted text; on any
    failure they print a warning and return an empty dict instead of raising.
    """

    # Heading keywords used to locate each free-text section on an AKC breed page.
    _AKC_SECTIONS = {
        'health': ['health', 'care'],
        'training': ['training', 'personality', 'temperament'],
        'history': ['history', 'origin'],
        'grooming': ['grooming', 'coat'],
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(HEADERS)
        self.rate_limiter = RateLimiter({'base_delay': 1, 'max_delay': 10, 'backoff_factor': 1.5, 'max_retries': 3})

    @staticmethod
    def _akc_slug(breed_name: str) -> str:
        """Turn a breed name into the hyphenated lower-case slug used in AKC URLs.

        Apostrophes are dropped; every other run of non-alphanumeric characters
        (spaces, dots, parentheses, ...) becomes a single hyphen, so
        "St. Bernard" yields "st-bernard" rather than "st.-bernard".
        """
        lowered = breed_name.lower().replace("'", "")
        return re.sub(r'[\W_]+', '-', lowered).strip('-')

    def scrape_akc_data(self, breed_name: str) -> Dict[str, str]:
        """Scrape the American Kennel Club page for a breed.

        Returns:
            Dict with any of ``breed_group``, ``health``, ``training``,
            ``history``, ``grooming`` (section text capped at 1000 characters);
            empty if the page could not be fetched or parsed.
        """
        print(f"  🔍 [AKC Scraper] Searching for '{breed_name}'...")
        url = f"https://www.akc.org/dog-breeds/{self._akc_slug(breed_name)}/"

        self.rate_limiter.wait_if_needed()

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"  ⚠️ [AKC] Network error for '{breed_name}': {e}")
            self.rate_limiter.record_failure()
            return {}

        # The HTTP exchange itself worked, whatever the parser makes of the page.
        self.rate_limiter.record_success()

        try:
            soup = BeautifulSoup(response.content, 'lxml')
            scraped_data = {}

            group_elem = soup.find('span', class_='breed-group')
            if group_elem:
                scraped_data['breed_group'] = group_elem.get_text(strip=True)

            for section_key, keywords in self._AKC_SECTIONS.items():
                content = self._find_section_content(soup, keywords)
                if content:
                    scraped_data[section_key] = content[:1000]
        except Exception as e:
            # Same best-effort policy as the Wikipedia scraper: a page we cannot
            # parse must not abort processing of the whole breed.
            print(f"  ⚠️ [AKC] Could not parse page for '{breed_name}': {e}")
            return {}

        print(f"  ✅ [AKC] Found {len(scraped_data)} sections for '{breed_name}'")
        return scraped_data

    def scrape_wikipedia_data(self, breed_name: str) -> Dict[str, str]:
        """Scrape the lead paragraphs of the breed's Wikipedia article.

        Returns:
            Dict with ``wikipedia_info`` (bracketed reference markers removed,
            capped at 1500 characters) when intro text was found; otherwise an
            empty dict.
        """
        print(f"  🔍 [Wikipedia] Searching for '{breed_name}'...")
        url = f"https://en.wikipedia.org/wiki/{quote_plus(breed_name.replace(' ', '_'))}"

        self.rate_limiter.wait_if_needed()

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'lxml')
            scraped_data = {}

            content_div = soup.find('div', {'class': 'mw-parser-output'})
            if content_div:
                # Only the first three top-level paragraphs (the article lead).
                paragraphs = content_div.find_all('p', recursive=False)[:3]
                general_info = ' '.join([p.get_text(strip=True) for p in paragraphs])
                if general_info:
                    scraped_data['wikipedia_info'] = re.sub(r'\[.*?\]', '', general_info)[:1500]

            # Report what was actually extracted instead of always claiming success.
            if scraped_data:
                print(f"  ✅ [Wikipedia] Found general info for '{breed_name}'")
            else:
                print(f"  ⚠️ [Wikipedia] Page loaded but no intro text found for '{breed_name}'")
            self.rate_limiter.record_success()
            return scraped_data

        except Exception as e:
            print(f"  ⚠️ [Wikipedia] Error for '{breed_name}': {e}")
            self.rate_limiter.record_failure()
            return {}

    def _find_section_content(self, soup: BeautifulSoup, keywords: List[str]) -> Optional[str]:
        """Return the paragraph text following the first heading that matches a keyword.

        Keywords are tried in order against h1-h4 headings (case-insensitive).
        For the first matching heading, the text of its following sibling
        ``<p>`` elements is collected up to the next heading and joined with
        spaces. Returns None if no keyword yields any text.
        """
        heading_tags = ['h1', 'h2', 'h3', 'h4']
        for keyword in keywords:
            header = soup.find(heading_tags, string=re.compile(keyword, re.IGNORECASE))
            if header:
                content_elements = []
                for sibling in header.find_next_siblings():
                    if sibling.name in heading_tags:
                        break
                    if sibling.name == 'p':
                        content_elements.append(sibling.get_text(strip=True))
                if content_elements:
                    return ' '.join(content_elements)
        return None

# =============================================================================
# LEVEL 2: ENHANCED GEMINI LLM ENRICHMENT
# =============================================================================

class GeminiEnricher:
    """Level-2 enrichment: asks the Gemini REST API for the NEW_FEATURES_ADDED fields.

    Requests are throttled by a RateLimiter and retried up to
    RATE_LIMIT_CONFIG['max_retries'] times. If every attempt fails, a record of
    placeholder values is returned so the pipeline can carry on.
    """
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL_NAME}:generateContent"
        self.rate_limiter = RateLimiter(RATE_LIMIT_CONFIG)

    def enrich_breed_data(self, breed_name: str, original_data: Dict, scraped_data: Dict) -> Dict[str, Any]:
        """Generate the enrichment fields for one breed, retrying on failure.

        Args:
            breed_name: Breed being enriched.
            original_data: The breed's existing record (included in the prompt).
            scraped_data: Text gathered by the web scraper (included in the prompt).

        Returns:
            Dict containing every key in NEW_FEATURES_ADDED, either from the
            model's reply or from the fallback defaults.
        """
        print(f"  🤖 [Gemini] Enriching data for '{breed_name}'...")
        max_retries = RATE_LIMIT_CONFIG['max_retries']

        for attempt in range(max_retries):
            try:
                self.rate_limiter.wait_if_needed()

                prompt = self._create_enrichment_prompt(breed_name, original_data, scraped_data)
                payload = {"contents": [{"parts": [{"text": prompt}]}]}
                # The key travels in a header, never in the URL: requests puts the
                # full URL into HTTPError messages, and those are printed below.
                headers = {
                    'Content-Type': 'application/json',
                    'x-goog-api-key': self.api_key,
                }

                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=60
                )

                if response.status_code == 429:
                    print(f"  ⚠️ [Gemini] Rate limit hit (attempt {attempt + 1}/{max_retries})")
                    self.rate_limiter.record_failure()
                    if attempt < max_retries - 1:
                        wait_time = RATE_LIMIT_CONFIG['base_delay'] * (RATE_LIMIT_CONFIG['backoff_factor'] ** attempt)
                        print(f"  ⏳ Waiting {wait_time} seconds before retry...")
                        time.sleep(wait_time)
                        continue
                    else:
                        raise requests.exceptions.RequestException("Max retries exceeded for rate limiting")

                response.raise_for_status()

                response_json = response.json()
                # A reply without candidates raises here and is handled as a failed attempt.
                content_text = response_json['candidates'][0]['content']['parts'][0]['text']

                enriched_data = self._parse_gemini_response(content_text)
                if enriched_data:
                    print(f"  ✅ [Gemini] Successfully enriched '{breed_name}' with {len(enriched_data)} features")
                    self.rate_limiter.record_success()
                    return enriched_data
                else:
                    raise ValueError("Parsed data is empty")

            except Exception as e:
                print(f"  ❌ [Gemini] Attempt {attempt + 1} failed for '{breed_name}': {e}")
                self.rate_limiter.record_failure()
                if attempt < max_retries - 1:
                    wait_time = RATE_LIMIT_CONFIG['base_delay'] * (RATE_LIMIT_CONFIG['backoff_factor'] ** attempt)
                    print(f"  ⏳ Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"  ❌ [Gemini] All attempts failed for '{breed_name}', using fallback data")
                    return self._create_fallback_data(breed_name)

        return self._create_fallback_data(breed_name)

    def _create_enrichment_prompt(self, breed_name: str, original_data: Dict, scraped_data: Dict) -> str:
        """Build the prompt: breed name, both data dicts as JSON, and the required output keys."""
        return f"""You are an expert pet data analyst creating a comprehensive breed profile.

BREED: {breed_name}

EXISTING DATA: {json.dumps(original_data, indent=2)}
SCRAPED WEB DATA: {json.dumps(scraped_data, indent=2)}

Create a JSON object with these exact keys: {json.dumps(NEW_FEATURES_ADDED)}

CRITICAL REQUIREMENTS:
1. Return ONLY valid JSON - no markdown, no extra text
2. Use "health_disclaimer": "This information is not a substitute for professional veterinary advice. Please consult a vet for any health issues."
3. For boolean fields, use true/false (not strings)
4. For arrays, provide actual arrays like ["item1", "item2"] or [] if empty
5. Keep responses concise but informative

OUTPUT FORMAT: Return only the JSON object, nothing else."""

    def _parse_gemini_response(self, response_text: str) -> Optional[Dict]:
        """Extract the JSON object from the model's reply.

        Markdown code fences are stripped and the text between the first ``{``
        and the last ``}`` is parsed. Missing feature keys are filled with
        defaults and unexpected keys are dropped.

        Returns:
            Dict with exactly the NEW_FEATURES_ADDED keys, or None if the reply
            is not valid JSON or is not a JSON object.
        """
        cleaned_text = response_text.strip()

        # Remove markdown code fences if present
        if cleaned_text.startswith('```json'):
            cleaned_text = cleaned_text[7:]
        if cleaned_text.startswith('```'):
            cleaned_text = cleaned_text[3:]
        if cleaned_text.endswith('```'):
            cleaned_text = cleaned_text[:-3]

        cleaned_text = cleaned_text.strip()

        # Narrow to the outermost {...} span in case the model added extra prose
        json_start = cleaned_text.find('{')
        json_end = cleaned_text.rfind('}')

        if json_start != -1 and json_end != -1 and json_end > json_start:
            cleaned_text = cleaned_text[json_start:json_end+1]

        try:
            data = json.loads(cleaned_text)
        except json.JSONDecodeError as e:
            print(f"  ❌ [Parser] JSON parsing failed: {e}")
            print(f"  📄 Response preview: {cleaned_text[:200]}...")
            return None

        # Valid JSON that is not an object (list, string, number) is unusable.
        if not isinstance(data, dict):
            print(f"  ❌ [Parser] Expected a JSON object but got {type(data).__name__}")
            return None

        # Ensure all required keys are present
        missing_keys = set(NEW_FEATURES_ADDED) - set(data.keys())
        if missing_keys:
            print(f"  ⚠️ [Parser] Missing keys: {missing_keys}. Filling with defaults.")
            for key in missing_keys:
                data[key] = self._get_default_value(key)

        # Keep only the expected keys, in the canonical order
        return {key: data[key] for key in NEW_FEATURES_ADDED}

    def _get_default_value(self, key: str) -> Any:
        """Return the placeholder value used when ``key`` has no generated value."""
        if key == "health_disclaimer":
            return "This information is not a substitute for professional veterinary advice. Please consult a vet for any health issues."
        elif key in ["recommended_health_tests", "common_health_concerns"]:
            return []
        elif key == "good_for_first_time_owners":
            return "Unknown - consult with breed experts"
        else:
            return "Information not available"

    def _create_fallback_data(self, breed_name: str) -> Dict[str, Any]:
        """Return a record of placeholder values for every feature key."""
        return {key: self._get_default_value(key) for key in NEW_FEATURES_ADDED}

# =============================================================================
# ENHANCED MAIN PIPELINE CLASS
# =============================================================================

class PetDatasetPipeline:
    """Orchestrates scraping and LLM enrichment for every breed, with resume support."""
    def __init__(self):
        self.scraper = WebScraper()
        self.api_key = get_gemini_api_key()
        self.enricher = GeminiEnricher(self.api_key)
        self.progress_tracker = ProgressTracker(PROGRESS_FILE)
        self.processed_count = 0
        self.failed_count = 0

    def load_input_data(self, file_path: str) -> Dict:
        """Load the input JSON; returns an empty dict (after printing) if the file is missing."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            print(f"📂 Loaded {len(data)} breeds from '{file_path}'")
            return data
        except FileNotFoundError:
            print(f"❌ Input file '{file_path}' not found! Please upload it.")
            return {}

    def process_breed(self, breed_name: str, original_data: Dict) -> Dict:
        """Run one breed through scraping and enrichment.

        Returns:
            The original record merged with the enrichment fields, or the
            unmodified original record if an unexpected error occurs (the
            breed is then recorded as failed).
        """
        print(f"\n🔄 Processing: {breed_name}")
        print("=" * 50)

        try:
            print("📡 LEVEL 1: Web Scraping")
            if original_data.get('pet_type') == 'cat':
                # The AKC URL is under /dog-breeds/; looking up a cat there only
                # produces a failed request (or text about a similarly named dog).
                print("  ⏭️ [AKC] Skipped - AKC pages cover dog breeds only")
                akc_data = {}
            else:
                akc_data = self.scraper.scrape_akc_data(breed_name)
            wikipedia_data = self.scraper.scrape_wikipedia_data(breed_name)
            all_scraped_data = {**akc_data, **wikipedia_data}

            print("🤖 LEVEL 2: LLM Enrichment")
            enriched_features = self.enricher.enrich_breed_data(
                breed_name, original_data, all_scraped_data
            )

            final_data = {**original_data, **enriched_features}
            self.processed_count += 1
            self.progress_tracker.mark_completed(breed_name)
            print(f"✅ Successfully processed '{breed_name}'")
            return final_data

        except Exception as e:
            print(f"❌ Top-level failure processing '{breed_name}': {e}")
            self.failed_count += 1
            self.progress_tracker.mark_failed(breed_name)
            return original_data

    def _load_existing_results(self, output_file: str):
        """Load previously saved results.

        Returns:
            ``(dataset, status_message)``. ``dataset`` is empty when the file is
            missing, is not valid JSON, or does not hold a JSON object.
        """
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                existing = json.load(f)
        except FileNotFoundError:
            return {}, "📂 Starting fresh - no existing results found"
        except ValueError as e:  # json.JSONDecodeError, e.g. after an interrupted write
            return {}, f"⚠️ Existing results in '{output_file}' are unreadable ({e}) - starting fresh"

        if not isinstance(existing, dict):
            return {}, f"⚠️ Existing results in '{output_file}' have an unexpected format - starting fresh"
        return existing, f"📂 Loaded existing results: {len(existing)} breeds"

    def run_pipeline(self, input_file: str, output_file: str, limit: Optional[int] = None):
        """Run the pipeline over the input breeds, resuming from earlier work.

        A breed is skipped only if the progress tracker reports it completed
        AND its result is present in ``output_file``. Breeds marked completed
        whose results are missing from this output file (an interrupted run,
        or a run that wrote to a different output file) are processed again.

        Args:
            input_file: JSON file mapping breed name to record.
            output_file: JSON file the enriched dataset is written to.
            limit: If set, only the first ``limit`` breeds are considered.
        """
        print("🚀 STARTING PET DATASET ENRICHMENT PIPELINE")
        print("=" * 60)

        input_data = self.load_input_data(input_file)
        if not input_data:
            return

        all_breeds = list(input_data.keys())
        if limit:
            all_breeds = all_breeds[:limit]

        # Existing results are needed up front to decide what can be skipped.
        final_dataset, results_status = self._load_existing_results(output_file)

        remaining_breeds = [
            breed for breed in all_breeds
            if not (self.progress_tracker.is_completed(breed) and breed in final_dataset)
        ]
        already_completed = len(all_breeds) - len(remaining_breeds)

        if already_completed > 0:
            print(f"📋 Resuming pipeline: {already_completed} breeds already completed")
            print(f"📋 Will process {len(remaining_breeds)} remaining breeds")
        else:
            print(f"📋 Will process {len(remaining_breeds)} breeds")
        print(results_status)

        start_time = time.time()

        for i, breed_name in enumerate(remaining_breeds, 1):
            print(f"\n📊 Progress: {i}/{len(remaining_breeds)} (Total: {already_completed + i}/{len(all_breeds)})")
            final_dataset[breed_name] = self.process_breed(breed_name, input_data[breed_name])

            # Save after every breed so the results file never lags behind the
            # progress file; announce the checkpoint every 5 breeds.
            self._save_results(final_dataset, output_file)
            if i % 5 == 0:
                print(f"💾 Intermediate save completed ({i} breeds processed)")

        self._save_results(final_dataset, output_file)
        elapsed_time = time.time() - start_time

        print("\n🎉 PIPELINE COMPLETED!")
        print("=" * 60)
        print(f"⏱️  Total time: {elapsed_time:.1f} seconds")
        print(f"✅ Successfully processed: {self.processed_count} breeds")
        print(f"❌ Failed: {self.failed_count} breeds")
        print(f"💾 Final dataset saved to: '{output_file}'")

    def _save_results(self, dataset: Dict, output_file: str):
        """Write the dataset to ``output_file`` as indented UTF-8 JSON."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=4, ensure_ascii=False)

# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    # Banner and menu; same text as before, emitted with a single print.
    print("\n".join([
        "\n" + "=" * 60,
        "READY TO START PIPELINE",
        "=" * 60,
        "\nChoose an option:",
        "1. Run demo (3 breeds) - Recommended for testing",
        "2. Run full pipeline (all breeds)",
        "3. Resume interrupted pipeline",
    ]))

    choice = input("\nEnter your choice (1, 2, or 3): ").strip()

    pipeline = PetDatasetPipeline()

    if choice in ("2", "3"):
        # Full run and resume are the same call: run_pipeline resumes on its own.
        pipeline.run_pipeline(INPUT_FILE, OUTPUT_FILE)
    else:
        # "1" runs the demo; anything unrecognised falls back to the demo too.
        if choice != "1":
            print("❌ Invalid choice. Running demo by default.")
        pipeline.run_pipeline(INPUT_FILE, 'demo_' + OUTPUT_FILE, limit=3)

    print("\n🎉 Script completed! Check the output files for results.")


READY TO START PIPELINE

Choose an option:
1. Run demo (3 breeds) - Recommended for testing
2. Run full pipeline (all breeds)
3. Resume interrupted pipeline

Enter your choice (1, 2, or 3): 2
🔧 Getting Gemini API key...
⚠️ GOOGLE_API_KEY not found in environment variables.
Please get your free API key from: https://makersuite.google.com/app/apikey
Enter your Gemini API key: [REDACTED]
✅ Gemini API key loaded.
🚀 STARTING PET DATASET ENRICHMENT PIPELINE
📂 Loaded 45 breeds from 'chatbot_training_data.json'
📋 Will process 45 breeds
📂 Starting fresh - no existing results found

📊 Progress: 1/45 (Total: 1/45)

🔄 Processing: Rottweiler
📡 LEVEL 1: Web Scraping
  🔍 [AKC Scraper] Searching for 'Rottweiler'...
  ✅ [AKC] Found 0 sections for 'Rottweiler'
  🔍 [Wikipedia] Searching for 'Rottweiler'...
  ⏳ Rate limiting: waiting 0.8 seconds...
  ✅ [Wikipedia] Found general info for 'Rottweiler'
🤖 LEVEL 2: LLM Enrichment
  🤖 [Gemini] Enriching data for 'Rottweiler'...
  ✅

In [None]:
#!/usr/bin/env python3
"""
Pet Dataset Finalization - Missing Value Imputation using a Groq-hosted LLM
===========================================================================
This script scans the final enriched dataset for any missing values (empty strings or lists)
and uses the Groq LLM API (via its OpenAI-compatible endpoint) to intelligently fill them
based on the existing context for each breed.

This is the final step to create a production-ready dataset.
"""

# =============================================================================
# SETUP AND INSTALLATION
# =============================================================================

# Grok's API is OpenAI-compatible, so we use the openai library
!pip install -q openai

import ast
import copy
import getpass
import json
import os
import random
import time
from typing import Dict, List, Any

from openai import OpenAI

# =============================================================================
# CONFIGURATION AND CONSTANTS
# =============================================================================

# File paths
INPUT_FILE = 'final_enriched_training_data.json'  # JSON dataset that run() loads
OUTPUT_FILE = 'Fluffyn.json'  # file that run() writes the filled dataset to

# Groq API configuration (the OpenAI client below is pointed at this base URL)
GROQ_API_BASE_URL = "https://api.groq.com/openai/v1"
GROQ_MODEL_NAME = "llama3-8b-8192" # Using the fast 8b model for this task

# =============================================================================
# GROK API SETUP
# =============================================================================

def get_grok_api_key():
    """Return the Groq API key from the environment or an interactive prompt.

    The ``GROQ_API_KEY`` environment variable is checked first. When it is unset
    or blank, the user is prompted through ``getpass`` so the secret is not
    echoed back and does not end up stored in the notebook's cell output.

    Returns:
        str: The API key with surrounding whitespace removed.

    Raises:
        ValueError: If neither the environment nor the prompt yields a key.
    """
    print("🔧 Getting Groq API key...")
    # A whitespace-only value is as unusable as a missing one.
    api_key = (os.environ.get("GROQ_API_KEY") or "").strip()
    if not api_key:
        print("⚠️ GROQ_API_KEY not found in environment variables.")
        print("Please get your free API key from: https://console.groq.com/keys")
        # getpass hides the typed value; input() would leave the key visible
        # in the saved output of this cell.
        api_key = getpass.getpass("Enter your Groq API key: ").strip()
        if not api_key:
            raise ValueError("Groq API key is required to proceed")
    print("✅ Groq API key loaded.")
    return api_key

# =============================================================================
# CORE LOGIC: Grok-Powered Data Filler
# =============================================================================

class GrokDataFiller:
    """
    Fills missing values in a breed dataset using an LLM served through
    Groq's OpenAI-compatible API.

    Class attributes (override on an instance or subclass to tune behavior):
        MAX_RETRIES: Number of API attempts per field when rate limited.
        BASE_DELAY_SECONDS: Base of the exponential backoff between attempts.
        FIELD_DELAY_SECONDS: Pause after each field request in process_dataset.
    """
    MAX_RETRIES = 3
    BASE_DELAY_SECONDS = 5
    FIELD_DELAY_SECONDS = 1

    def __init__(self, api_key: str, model: str = ""):
        """Create the API client.

        Args:
            api_key: Groq API key.
            model: Model name to request; defaults to GROQ_MODEL_NAME when empty.
        """
        # The OpenAI client is pointed at Groq's endpoint through base_url.
        self.client = OpenAI(
            api_key=api_key,
            base_url=GROQ_API_BASE_URL,
        )
        self.model = model or GROQ_MODEL_NAME
        print(f"🧠 Grok client initialized with model: {self.model}")

    def _create_fill_prompt(self, breed_name: str, field_to_fill: str, existing_data: Dict) -> str:
        """Build the prompt asking the model for one missing field.

        Args:
            breed_name: Name of the breed the record describes.
            field_to_fill: Key of the field whose value is missing.
            existing_data: The breed's current record, used as context.

        Returns:
            The prompt text.
        """

        # Only fields with truthy values are sent as context.
        clean_context = {k: v for k, v in existing_data.items() if v}

        return f"""
        You are a pet data expert. Your task is to fill in a single missing piece of information for a pet breed profile based on the provided context.

        BREED: "{breed_name}"

        EXISTING DATA CONTEXT:
        {json.dumps(clean_context, indent=2)}

        MISSING FIELD TO FILL: "{field_to_fill}"

        Based on all the provided context about the "{breed_name}", generate a concise and accurate value for the missing field.

        IMPORTANT: Output ONLY the value for the field. Do not say "The value for the field is...". Just provide the direct answer. If you are generating a list, provide a Python-style list of strings e.g., ["item1", "item2"].
        """

    @staticmethod
    def _parse_response(raw: Any) -> Any:
        """Turn the model's raw text into a list when it looks like one.

        The prompt asks for a "Python-style list", so besides JSON the text is
        also tried as a Python literal (e.g. single-quoted strings). Anything
        that cannot be parsed into a list is returned as the stripped string.
        """
        text = (raw or "").strip()
        if not (text.startswith('[') and text.endswith(']')):
            return text

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return text  # Return as string if parsing fails
        return parsed if isinstance(parsed, list) else text

    @staticmethod
    def _is_rate_limit_error(error: BaseException) -> bool:
        """Return True when the error carries HTTP status 429.

        Checks a ``status_code`` attribute when the exception has one and
        falls back to looking for '429' in the error message.
        """
        return getattr(error, "status_code", None) == 429 or '429' in str(error)

    def fill_missing_field(self, breed_name: str, field_to_fill: str, existing_data: Dict) -> Any:
        """Ask the model for one field's value, retrying on rate limits.

        Rate-limit errors are retried with exponential backoff plus jitter;
        any other error aborts immediately.

        Args:
            breed_name: Name of the breed the record describes.
            field_to_fill: Key of the field whose value is missing.
            existing_data: The breed's current record, used as context.

        Returns:
            A list when the reply parses as one, otherwise the reply string;
            None when the request failed or every attempt was rate limited.
        """
        prompt = self._create_fill_prompt(breed_name, field_to_fill, existing_data)

        max_retries = self.MAX_RETRIES
        base_delay = self.BASE_DELAY_SECONDS  # seconds

        for attempt in range(max_retries):
            try:
                print(f"    🤖 Calling Groq for field: '{field_to_fill}'...")
                chat_completion = self.client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model=self.model,
                    temperature=0.5, # Be more factual
                    max_tokens=256,
                )
                return self._parse_response(chat_completion.choices[0].message.content)

            except Exception as e:
                if not self._is_rate_limit_error(e):
                    print(f"    ❌ An unexpected error occurred: {e}")
                    return None # Fail on other errors

                if attempt == max_retries - 1:
                    # No attempt follows, so waiting would only delay the failure.
                    break

                wait_time = base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(f"    ⚠️ Rate limit hit. Retrying in {wait_time:.1f} seconds...")
                time.sleep(wait_time)

        print(f"    ❌ Failed to get data for '{field_to_fill}' after {max_retries} retries.")
        return None

    def process_dataset(self, data: Dict) -> Dict:
        """Fill every empty-string or empty-list field of every breed.

        Args:
            data: Mapping of breed name to that breed's record (a dict).

        Returns:
            A new mapping with the filled values. The input is deep-copied
            first, so neither ``data`` nor its nested records are modified.
        """
        # A shallow copy would share the per-breed dicts with the caller's data.
        final_data = copy.deepcopy(data)
        total_breeds = len(final_data)

        for i, (breed_name, breed_data) in enumerate(final_data.items(), 1):
            print(f"\n🔄 Processing breed {i}/{total_breeds}: {breed_name}")

            # Find fields with missing values (empty string or empty list)
            missing_fields = [k for k, v in breed_data.items() if v == "" or v == []]

            if not missing_fields:
                print("  ✅ No missing values found. Skipping.")
                continue

            print(f"  🔍 Found {len(missing_fields)} missing fields: {missing_fields}")

            for field in missing_fields:
                # breed_data already includes fields filled earlier in this loop,
                # so later prompts benefit from them as context.
                filled_value = self.fill_missing_field(breed_name, field, breed_data)

                if filled_value:
                    print(f"    ✅ Filled '{field}' successfully.")
                    breed_data[field] = filled_value
                else:
                    print(f"    ❌ Failed to fill '{field}'. Leaving as is.")

                time.sleep(self.FIELD_DELAY_SECONDS) # Add a small delay between field requests

        return final_data

# =============================================================================
# MAIN EXECUTION
# =============================================================================
def run():
    """Drive the imputation workflow: obtain the key, load, fill, then save.

    Any exception raised along the way is caught and reported with a single
    printed message; nothing is re-raised.
    """
    divider = "=" * 60

    def _load_input():
        # Read the dataset that is about to be completed.
        print(f"📂 Loading data from '{INPUT_FILE}'...")
        with open(INPUT_FILE, 'r', encoding='utf-8') as source:
            return json.load(source)

    def _save_output(dataset):
        # Persist the completed dataset as indented, non-ASCII-escaped JSON.
        print(f"💾 Saving fully enriched dataset to '{OUTPUT_FILE}'...")
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as sink:
            json.dump(dataset, sink, indent=4, ensure_ascii=False)
        print("✅ Success! Your final dataset is ready.")

    print("🚀 STARTING FINAL DATASET IMPUTATION PIPELINE", divider, sep="\n")

    try:
        # The key is resolved first, then handed to the filler's constructor.
        filler = GrokDataFiller(api_key=get_grok_api_key())
        completed = filler.process_dataset(_load_input())

        print("\n🎉 PIPELINE COMPLETED!", divider, sep="\n")

        _save_output(completed)
    except Exception as error:
        print(f"\n❌ A critical error occurred during the pipeline: {error}")

# Entry point: start the pipeline only when this code runs as the main module.
if __name__ == "__main__":
    run()

🚀 STARTING FINAL DATASET IMPUTATION PIPELINE
🔧 Getting Groq API key...
⚠️ GROQ_API_KEY not found in environment variables.
Please get your free API key from: https://console.groq.com/keys
Enter your Groq API key: [REDACTED]
✅ Groq API key loaded.
🧠 Grok client initialized with model: llama3-8b-8192
📂 Loading data from 'final_enriched_training_data.json'...

🔄 Processing breed 1/45: Rottweiler
  🔍 Found 1 missing fields: ['celebrity_owners']
    🤖 Calling Groq for field: 'celebrity_owners'...
    ✅ Filled 'celebrity_owners' successfully.

🔄 Processing breed 2/45: Lhasa Apso
  🔍 Found 1 missing fields: ['celebrity_owners']
    🤖 Calling Groq for field: 'celebrity_owners'...
    ✅ Filled 'celebrity_owners' successfully.

🔄 Processing breed 3/45: Beagle
  🔍 Found 1 missing fields: ['celebrity_owners']
    🤖 Calling Groq for field: 'celebrity_owners'...
    ✅ Filled 'celebrity_owners' successfully.

🔄 Processing breed 4/45: American Bulldog
  🔍 