# Company Metadata Enrichment with GPT-5

This notebook enriches ASX company metadata using GPT-5 with Deep Research capabilities.

## Features:
- Fetch existing metadata from Payload CMS
- Generate comprehensive company profiles using GPT-5
- Extract company logos from Google Cloud Storage
- Fetch annual reports from ASX and company websites
- Store enriched data in main Postgres database

## Processing:
- Supports subset processing for testing
- Checkpoint-based resumption
- Comprehensive error handling and retry logic


In [None]:
# Cell 1: Dependencies and Setup
import httpx
import pandas as pd
from openai import OpenAI
import json
from sqlalchemy import create_engine, text
import os
from tqdm import tqdm
from datetime import datetime
import time
from typing import Dict, List, Optional, Any
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import re
from urllib.parse import urljoin

# Load environment variables via python-dotenv before any os.getenv() lookup
# in the configuration cell runs (presumably from a local .env file, as the
# OPENAI_API_KEY error message suggests — confirm the file location).
load_dotenv()

# Visible confirmation that this setup cell has executed
print("‚úì Dependencies loaded successfully")


In [None]:
# Cell 2: Configuration

# OpenAI Configuration
# The API key is mandatory: fail fast here rather than on the first request.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError('OPENAI_API_KEY environment variable is required. Please set it in your .env file.')
client = OpenAI(api_key=OPENAI_API_KEY)

# Database Configuration
# DATABASE_URL is the target for enriched data; CMS_DATABASE_URL is the
# source of the existing metadata.
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://admin:password@localhost:5432/shorts')
CMS_DATABASE_URL = os.getenv('CMS_DATABASE_URL', 'postgresql://admin:password@localhost:5432/cms')

# GCS Configuration
GCS_BUCKET = os.getenv('GCS_BUCKET', 'shorted-company-logos')
GCS_LOGO_BASE_URL = os.getenv('GCS_LOGO_BASE_URL', 'https://storage.googleapis.com/shorted-company-logos/logos')

# Processing Configuration
# PROCESS_SUBSET is true only for the (case-insensitive) literal "true".
PROCESS_SUBSET = os.getenv('PROCESS_SUBSET', 'True').lower() == 'true'
SUBSET_SIZE = int(os.getenv('SUBSET_SIZE', '10'))
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '50'))
CHECKPOINT_INTERVAL = int(os.getenv('CHECKPOINT_INTERVAL', '50'))

# API Rate Limiting
MAX_RETRIES = int(os.getenv('MAX_RETRIES', '3'))
RETRY_DELAY = int(os.getenv('RETRY_DELAY', '5'))  # seconds, passed to time.sleep

# Checkpoint file
CHECKPOINT_FILE = 'data/enrichment_checkpoint.json'
RESULTS_FILE = 'data/enriched_metadata_results.csv'

print("Configuration loaded:")
print(f"  - Process Subset: {PROCESS_SUBSET}")
print(f"  - Subset Size: {SUBSET_SIZE}")
# Split on the LAST '@' so that a password containing '@' is never echoed:
# only the host/port/database part after the credentials is printed.
print(f"  - Database: {DATABASE_URL.rsplit('@', 1)[1] if '@' in DATABASE_URL else 'localhost'}")
print(f"  - GCS Base URL: {GCS_LOGO_BASE_URL}")


In [None]:
# Cell 3: Data Fetching

def fetch_existing_metadata() -> pd.DataFrame:
    """
    Fetch existing company metadata from the Payload CMS database.

    Connects with ``CMS_DATABASE_URL``, reads every row of the ``metadata``
    table whose ``stock_code`` is not null (ordered by ``company_name``) and
    adds a ``logo_gcs_url`` column built from ``GCS_LOGO_BASE_URL`` and the
    upper-cased stock code.

    Returns:
        DataFrame with the selected CMS columns plus ``logo_gcs_url``.

    Raises:
        Any exception raised by SQLAlchemy/pandas while connecting or
        querying; the engine is disposed before the exception propagates.
    """
    query = """
    SELECT 
        stock_code,
        company_name,
        industry,
        market_cap,
        listing_date,
        address,
        summary,
        details,
        website,
        company_logo_link
    FROM metadata
    WHERE stock_code IS NOT NULL
    ORDER BY company_name
    """

    engine = create_engine(CMS_DATABASE_URL)
    try:
        df = pd.read_sql(query, engine)
    finally:
        # Release pooled connections even when the query fails
        engine.dispose()

    # Add logo GCS URLs; str() guards against codes that were loaded as a
    # non-string type (e.g. purely numeric codes)
    df['logo_gcs_url'] = df['stock_code'].apply(
        lambda code: f"{GCS_LOGO_BASE_URL}/{str(code).upper()}.svg"
    )

    print(f"‚úì Fetched {len(df)} companies from Payload CMS")
    return df

def load_checkpoint() -> Dict[str, Any]:
    """
    Load checkpoint data to resume processing.

    Returns:
        The dictionary stored in ``CHECKPOINT_FILE``, guaranteed to contain
        ``processed_codes`` (list) and ``processed_count`` (int). When the
        file does not exist a fresh, empty checkpoint is returned.

    Raises:
        ValueError: if the file does not contain a JSON object.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return {'processed_count': 0, 'processed_codes': []}

    with open(CHECKPOINT_FILE, 'r') as f:
        checkpoint = json.load(f)

    if not isinstance(checkpoint, dict):
        raise ValueError(
            f"Checkpoint file {CHECKPOINT_FILE} must contain a JSON object, "
            f"got {type(checkpoint).__name__}"
        )

    # Older or hand-edited files may lack one of the keys; fill them in so
    # callers can index both without a KeyError.
    processed_codes = checkpoint.setdefault('processed_codes', [])
    checkpoint.setdefault('processed_count', len(processed_codes))

    print(f"‚úì Loaded checkpoint: {checkpoint['processed_count']} companies processed")
    return checkpoint

def save_checkpoint(checkpoint: Dict[str, Any]):
    """
    Save checkpoint data for resumption.

    The checkpoint is serialised to a temporary sibling file first and then
    moved over ``CHECKPOINT_FILE`` with ``os.replace``, so an interrupted
    write cannot leave a truncated checkpoint behind.

    Args:
        checkpoint: JSON-serialisable checkpoint dictionary.
    """
    directory = os.path.dirname(CHECKPOINT_FILE)
    if directory:
        # os.makedirs('') raises, so only create a directory when the
        # checkpoint path actually has one
        os.makedirs(directory, exist_ok=True)

    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(checkpoint, f, indent=2)
    os.replace(tmp_path, CHECKPOINT_FILE)

# Fetch data
# Runs at cell execution time: queries the CMS database and reads the
# checkpoint file (if any). Both names are used by the pipeline cell below.
df_metadata = fetch_existing_metadata()
checkpoint = load_checkpoint()

print(f"\nDataFrame shape: {df_metadata.shape}")
print(f"Columns: {list(df_metadata.columns)}")
# Last expression of the cell, so the notebook renders the first rows
df_metadata.head()


In [None]:
# Cell 4: GPT-5 Schema Definition

# JSON-Schema-style description of the enrichment payload expected from the
# model. Only "tags" and "enhanced_summary" are required at the top level;
# the word counts and formats in the "description" values are guidance text,
# not constraints expressed in the schema itself.
ENRICHMENT_SCHEMA = {
    "type": "object",
    "properties": {
        "tags": {
            "type": "array",
            "items": {"type": "string"},
            "description": "3-7 specific specialty tags describing the company's focus (e.g., 'lithium mining', 'rare earth magnets', 'renewable energy', 'biotech oncology')"
        },
        "enhanced_summary": {
            "type": "string",
            "description": "Comprehensive company overview (500-1000 words) covering business model, market position, key operations, and strategic focus"
        },
        "company_history": {
            "type": "string",
            "description": "Historical timeline with key milestones, founding story, major acquisitions, pivots, and evolution (300-500 words)"
        },
        # Each person needs at least a name and a role; bio/linkedin are optional
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "role": {"type": "string"},
                    "bio": {"type": "string", "description": "2-3 sentence biography with relevant experience"},
                    "linkedin": {"type": "string", "description": "LinkedIn profile URL if available"}
                },
                "required": ["name", "role"]
            },
            "description": "Key executives, board members, and senior leadership"
        },
        # Each report needs at least a type (one of the enum values) and a url
        "financial_reports": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "type": {"type": "string", "enum": ["annual_report", "quarterly_report", "half_year_report"]},
                    "date": {"type": "string", "description": "Report date in YYYY-MM-DD format"},
                    "url": {"type": "string", "description": "Direct URL to the report PDF"},
                    "title": {"type": "string"}
                },
                "required": ["type", "url"]
            },
            "description": "Links to recent annual and quarterly reports"
        },
        "competitive_advantages": {
            "type": "string",
            "description": "Unique strengths, market position, competitive moats, and strategic advantages (200-400 words)"
        },
        "risk_factors": {
            "type": "string",
            "description": "Key business risks including operational, market, regulatory, and financial risks (200-400 words)"
        },
        "recent_developments": {
            "type": "string",
            "description": "Recent news, announcements, contracts, or developments from the last 12 months (200-400 words)"
        },
        # All four platform keys are optional
        "social_media_links": {
            "type": "object",
            "properties": {
                "twitter": {"type": "string"},
                "linkedin": {"type": "string"},
                "facebook": {"type": "string"},
                "youtube": {"type": "string"}
            },
            "description": "Official social media profile URLs"
        }
    },
    "required": ["tags", "enhanced_summary"]
}

# System message used for the enrichment request: sets the analyst persona,
# the research guidelines and the requirement to answer with a JSON object.
SYSTEM_PROMPT = """
You are a financial research analyst specializing in Australian Stock Exchange (ASX) listed companies.
Your task is to provide comprehensive, accurate, and well-researched company profiles.

Guidelines:
1. Use Deep Research to gather accurate, up-to-date information
2. Focus on factual, verifiable information
3. Provide specific details rather than generic descriptions
4. Include relevant industry context and market positioning
5. Cite recent developments and concrete examples
6. Maintain professional, objective tone
7. For tags, use specific, searchable terms that accurately describe the company's specialty
8. Ensure all URLs are valid and publicly accessible

Return your response as a valid JSON object matching the provided schema.
"""

# Confirmation output for the notebook user
print("‚úì Schema and prompts defined")
print(f"  Required fields: {ENRICHMENT_SCHEMA['required']}")


In [None]:
# Cell 5: GPT-5 Deep Research Function

def enrich_company_with_gpt5(company: pd.Series, use_deep_research: bool = True) -> Dict[str, Any]:
    """
    Enrich company metadata with a chat-completion request in JSON mode.

    The request is retried up to ``MAX_RETRIES`` times. Between attempts the
    function sleeps ``RETRY_DELAY`` seconds (doubled when the error text
    mentions a rate limit); it does not sleep after the final attempt.

    Args:
        company: Pandas Series with existing company metadata. Must contain
            ``stock_code`` and ``company_name``; ``industry``, ``website``,
            ``summary``, ``address`` and ``logo_gcs_url`` are read if present.
        use_deep_research: Accepted for interface compatibility; the current
            implementation does not read it.

    Returns:
        On success, the parsed model response extended with ``stock_code``,
        ``enrichment_date``, ``enrichment_status='completed'`` and
        ``logo_gcs_url``. After all attempts fail, a minimal dictionary with
        ``enrichment_status='failed'`` and an ``enrichment_error`` message.
    """
    stock_code = company['stock_code']
    company_name = company['company_name']

    # Prepare context from existing metadata
    context = f"""
    Company: {company_name}
    ASX Code: {stock_code}
    Industry: {company.get('industry', 'N/A')}
    Website: {company.get('website', 'N/A')}
    Existing Summary: {company.get('summary', 'N/A')}
    Address: {company.get('address', 'N/A')}
    """

    user_prompt = f"""
    Research and provide a comprehensive profile for the following ASX-listed company:
    
    {context}
    
    Please provide detailed, accurate information following the schema. Use Deep Research to find:
    - Current company operations and business model
    - Recent announcements and developments
    - Key leadership team members
    - Links to recent annual and quarterly reports
    - Company's competitive positioning
    - Known risk factors
    - Official social media presence
    
    Focus on factual, verifiable information. For the enhanced_summary, provide a comprehensive
    overview that would be suitable for investors and analysts.
    """

    # Both prompts refer to "the provided schema", so the schema has to be
    # part of the request; otherwise the model cannot know the field names
    # the resolvers and the database update expect.
    system_prompt = (
        f"{SYSTEM_PROMPT}\n"
        f"JSON schema for the response:\n{json.dumps(ENRICHMENT_SCHEMA, indent=2)}\n"
    )

    for attempt in range(MAX_RETRIES):
        try:
            # JSON mode guarantees syntactically valid JSON, not schema conformance
            response = client.chat.completions.create(
                model="gpt-4o",  # Will be updated to gpt-5 when available
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0.3,
                max_tokens=4000
            )

            enriched_data = json.loads(response.choices[0].message.content)
            if not isinstance(enriched_data, dict):
                # Treated like any other failed attempt by the handler below
                raise ValueError(
                    f"expected a JSON object, got {type(enriched_data).__name__}"
                )

            # Add metadata
            enriched_data['stock_code'] = stock_code
            enriched_data['enrichment_date'] = datetime.now().isoformat()
            enriched_data['enrichment_status'] = 'completed'
            enriched_data['logo_gcs_url'] = company.get('logo_gcs_url')

            return enriched_data

        except json.JSONDecodeError as e:
            print(f"‚ö† JSON decode error for {stock_code} (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
            delay = RETRY_DELAY

        except Exception as e:
            print(f"‚ö† Error enriching {stock_code} (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
            # Back off longer when the provider reports rate limiting
            delay = RETRY_DELAY * 2 if "rate_limit" in str(e).lower() else RETRY_DELAY

        # No point in sleeping when there is no further attempt
        if attempt < MAX_RETRIES - 1:
            time.sleep(delay)

    # Return minimal data on failure
    return {
        'stock_code': stock_code,
        'enrichment_status': 'failed',
        'enrichment_date': datetime.now().isoformat(),
        'enrichment_error': 'Failed after maximum retries'
    }

print("‚úì GPT-5 enrichment function defined")


In [None]:
# Cell 6: Annual Report Fetcher

# Title fragments that mark an ASX announcement as a periodic report
_ASX_REPORT_KEYWORDS = ('annual report', 'full year', 'quarterly', 'half year')
# Link-text fragments that mark a PDF on an investor page as a report
_IR_LINK_KEYWORDS = ('annual', 'report', 'financial')
# Common investor relations paths
_IR_PATHS = ('/investors', '/investor-relations', '/investor-centre', '/about/investors')
_REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"}


def _classify_report(title: str) -> str:
    """Map a lower-cased announcement title to a report type; 'half year' wins."""
    if 'half year' in title:
        return 'half_year_report'
    if 'annual' in title or 'full year' in title:
        return 'annual_report'
    return 'quarterly_report'


def _fetch_asx_reports(stock_code: str) -> List[Dict[str, str]]:
    """Return report records found in the first 20 ASX announcements."""
    asx_url = f"https://cdn-api.markitdigital.com/apiman-gateway/ASX/asx-research/1.0/companies/{stock_code}/announcements"
    response = httpx.get(
        asx_url,
        headers=_REQUEST_HEADERS,
        timeout=10.0,
        follow_redirects=True
    )
    if response.status_code != 200:
        return []

    announcements = response.json().get('data', [])
    if isinstance(announcements, dict):
        # Tolerate the list being nested under an 'items' key
        # TODO confirm the actual response shape of this endpoint
        announcements = announcements.get('items', [])

    found = []
    for announcement in announcements[:20]:  # Check last 20 announcements
        header = announcement.get('header') or ''
        title = header.lower()
        url = announcement.get('url') or ''
        if not url or not any(keyword in title for keyword in _ASX_REPORT_KEYWORDS):
            continue
        found.append({
            'type': _classify_report(title),
            'date': announcement.get('documentDate') or '',
            'url': url,
            'title': header
        })
    return found


def _scrape_ir_reports(website: str, known_urls: set) -> List[Dict[str, str]]:
    """Return PDF report links from the first reachable investor relations page."""
    if not website.startswith(('http://', 'https://')):
        # urljoin() needs an absolute base; bare domains would otherwise
        # yield a relative URL that httpx rejects
        website = f"https://{website}"

    for path in _IR_PATHS:
        found = []
        try:
            ir_url = urljoin(website, path)
            response = httpx.get(
                ir_url,
                headers=_REQUEST_HEADERS,
                timeout=10.0,
                follow_redirects=True
            )
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find PDF links that look like reports
            for link in soup.find_all('a', href=True):
                href = link['href']
                link_text = link.get_text().lower()
                if not href.lower().endswith('.pdf'):
                    continue
                if not any(keyword in link_text for keyword in _IR_LINK_KEYWORDS):
                    continue

                full_url = urljoin(ir_url, href)
                # Avoid duplicates
                if full_url in known_urls:
                    continue
                known_urls.add(full_url)
                found.append({
                    'type': 'annual_report',
                    'date': '',  # not available from the link itself
                    'url': full_url,
                    'title': link.get_text().strip()
                })
        except Exception:
            # Best effort: an unreachable or malformed page just means we
            # try the next candidate path. (Exception, not a bare except,
            # so KeyboardInterrupt still stops the notebook.)
            continue

        return found  # Found a working IR page

    return []


def fetch_annual_reports(company: pd.Series) -> List[Dict[str, str]]:
    """
    Fetch annual reports from ASX announcements and company website.

    Both sources are best effort: any error is reported with a warning and
    the function carries on with whatever was collected so far.

    Args:
        company: Pandas Series with company metadata. ``stock_code`` is
            required; ``website`` is used when it is a non-empty string
            other than ``'N/A'``.

    Returns:
        At most 5 report dictionaries with ``type``, ``date``, ``url`` and
        ``title`` keys, ASX announcements first, in the order they were found.
    """
    stock_code = company['stock_code']
    reports = []

    # Try ASX announcements API
    try:
        reports.extend(_fetch_asx_reports(stock_code))
    except Exception as e:
        print(f"  ‚ö† Could not fetch ASX announcements for {stock_code}: {e}")

    # Try company website investor relations page
    website = company.get('website')
    # isinstance() filters out NaN/None coming from missing database values
    if isinstance(website, str) and website.strip() and website != 'N/A':
        try:
            known_urls = {report['url'] for report in reports}
            reports.extend(_scrape_ir_reports(website.strip(), known_urls))
        except Exception as e:
            print(f"  ‚ö† Could not scrape website for {stock_code}: {e}")

    return reports[:5]  # Return max 5 reports

print("‚úì Annual report fetcher defined")


In [None]:
# Cell 7: Resolver Pattern Implementation

def resolve_tags(enriched_data: Dict[str, Any]) -> List[str]:
    """
    Validate and clean tags.

    Tags are stripped and lower-cased first; the length limits are applied to
    the cleaned text, and repeated tags are dropped while keeping the order
    of first appearance.

    Args:
        enriched_data: Enrichment payload; its ``tags`` entry is read.

    Returns:
        Up to 10 unique, lower-cased tags of 3 to 49 characters. An empty
        list when ``tags`` is missing or not a list.
    """
    tags = enriched_data.get('tags', [])

    if not isinstance(tags, list):
        return []

    cleaned_tags = []
    seen = set()
    for tag in tags:
        if not isinstance(tag, str):
            continue
        normalized = tag.strip().lower()
        # Measure the cleaned tag so padding cannot smuggle in short tags
        if 2 < len(normalized) < 50 and normalized not in seen:
            seen.add(normalized)
            cleaned_tags.append(normalized)

    return cleaned_tags[:10]  # Max 10 tags

def resolve_key_people(enriched_data: Dict[str, Any]) -> List[Dict[str, str]]:
    """
    Validate and structure key people data.

    Args:
        enriched_data: Enrichment payload; its ``key_people`` entry is read.

    Returns:
        One ``{'name', 'role', 'bio', 'linkedin'}`` dictionary of strings per
        entry that has a non-empty string ``name`` and ``role``. Missing,
        null or non-string ``bio``/``linkedin`` values become ``''``. An
        empty list when ``key_people`` is missing or not a list.
    """
    def as_text(value: Any) -> str:
        # The model may emit null or non-string values for any field
        return value.strip() if isinstance(value, str) else ''

    people = enriched_data.get('key_people', [])

    if not isinstance(people, list):
        return []

    validated_people = []
    for person in people:
        if not isinstance(person, dict):
            continue
        name = as_text(person.get('name'))
        role = as_text(person.get('role'))
        if not name or not role:
            continue
        validated_people.append({
            'name': name,
            'role': role,
            'bio': as_text(person.get('bio')),
            'linkedin': as_text(person.get('linkedin'))
        })

    return validated_people

def resolve_financial_reports(enriched_data: Dict[str, Any], company: pd.Series) -> List[Dict[str, str]]:
    """
    Combine model-provided reports with scraped reports.

    Args:
        enriched_data: Enrichment payload; its ``financial_reports`` entry is
            read and ignored unless it is a list.
        company: Company row passed on to ``fetch_annual_reports``.

    Returns:
        Up to 10 report dictionaries with a non-empty string ``url``,
        de-duplicated by URL, model-provided entries first.
    """
    gpt_reports = enriched_data.get('financial_reports')
    if not isinstance(gpt_reports, list):
        # A null or otherwise malformed value must not break the list
        # concatenation below
        gpt_reports = []
    scraped_reports = fetch_annual_reports(company)

    all_reports = []
    seen_urls = set()

    # Add both sources, avoiding duplicates
    for report in gpt_reports + scraped_reports:
        if not isinstance(report, dict):
            continue
        url = report.get('url')
        # Only string URLs are usable (and hashable for the seen-set)
        if isinstance(url, str) and url and url not in seen_urls:
            seen_urls.add(url)
            all_reports.append(report)

    return all_reports[:10]  # Max 10 reports

def resolve_social_media_links(enriched_data: Dict[str, Any]) -> Dict[str, str]:
    """
    Keep only social media entries whose value is an http(s) URL string.

    Returns an empty dictionary when ``social_media_links`` is not a dict.
    """
    raw_links = enriched_data.get('social_media_links', {})
    if not isinstance(raw_links, dict):
        return {}

    http_url = re.compile(r'^https?://')
    return {
        name: address
        for name, address in raw_links.items()
        if isinstance(address, str) and http_url.match(address)
    }

def apply_resolvers(enriched_data: Dict[str, Any], company: pd.Series) -> Dict[str, Any]:
    """
    Run every resolver and store its result back into ``enriched_data``.

    The dictionary is modified in place and also returned.
    """
    # Evaluated lazily and in this order, one field at a time
    resolvers = (
        ('tags', lambda: resolve_tags(enriched_data)),
        ('key_people', lambda: resolve_key_people(enriched_data)),
        ('financial_reports', lambda: resolve_financial_reports(enriched_data, company)),
        ('social_media_links', lambda: resolve_social_media_links(enriched_data)),
    )
    for field_name, resolve in resolvers:
        enriched_data[field_name] = resolve()

    return enriched_data

print("‚úì Resolver functions defined")


In [None]:
# Cell 8: Batch Processing with Subset Support

def process_companies(df: pd.DataFrame, checkpoint: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Process companies with checkpoint support and progress tracking.

    Companies whose ``stock_code`` is already listed in the checkpoint are
    skipped. A company is added to the checkpoint only when its enrichment
    completed, so every kind of failure (returned status or raised
    exception) is retried on the next run.

    NOTE: the returned results exist only in memory; the checkpoint stores
    stock codes, not the enriched data itself.

    Args:
        df: Company metadata; needs ``stock_code`` and ``company_name``.
        checkpoint: Checkpoint dictionary; updated in place and written with
            ``save_checkpoint`` every ``CHECKPOINT_INTERVAL`` companies and
            once more at the end.

    Returns:
        One result dictionary per attempted company, including failed ones.
    """
    processed_codes = checkpoint.setdefault('processed_codes', [])
    checkpoint.setdefault('processed_count', len(processed_codes))

    # Filter companies to process
    companies_to_process = df[~df['stock_code'].isin(processed_codes)]

    if PROCESS_SUBSET:
        companies_to_process = companies_to_process.head(SUBSET_SIZE)
        print(f"\nüìã Processing subset of {len(companies_to_process)} companies")
    else:
        print(f"\nüìã Processing {len(companies_to_process)} companies (full dataset)")

    results = []

    # Process with progress bar
    for idx, (_, company) in enumerate(tqdm(companies_to_process.iterrows(), total=len(companies_to_process))):
        stock_code = company['stock_code']

        try:
            print(f"\nüîç Processing {stock_code} - {company['company_name']}")

            # Enrich with GPT-5
            enriched_data = enrich_company_with_gpt5(company)

            # Apply resolvers
            enriched_data = apply_resolvers(enriched_data, company)

            results.append(enriched_data)

            # Update checkpoint: only successful enrichments count as done,
            # otherwise an outage would mark every company as processed
            if enriched_data.get('enrichment_status') == 'completed':
                processed_codes.append(stock_code)
                checkpoint['processed_count'] += 1

            # Save checkpoint periodically
            if (idx + 1) % CHECKPOINT_INTERVAL == 0:
                save_checkpoint(checkpoint)
                print(f"\nüíæ Checkpoint saved: {checkpoint['processed_count']} companies")

            # Rate limiting
            time.sleep(1)  # Be respectful to APIs

        except Exception as e:
            print(f"‚ùå Failed to process {stock_code}: {e}")
            results.append({
                'stock_code': stock_code,
                'enrichment_status': 'failed',
                'enrichment_error': str(e)
            })

    # Final checkpoint save
    save_checkpoint(checkpoint)

    return results

print("‚úì Batch processing function defined")
print(f"  Ready to process companies")


In [None]:
# Cell 9: Data Validation

def validate_enriched_data(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Summarise an enrichment run: status counts, per-field coverage among
    completed records, and tag frequencies across all records.
    """
    from collections import Counter

    def has_status(record: Dict[str, Any], wanted: str) -> bool:
        return record.get('enrichment_status') == wanted

    total = len(results)
    completed = len([r for r in results if has_status(r, 'completed')])
    failed = len([r for r in results if has_status(r, 'failed')])

    # Field coverage: how many completed records carry a truthy value
    tracked_fields = (
        'tags', 'enhanced_summary', 'company_history', 'key_people',
        'financial_reports', 'competitive_advantages', 'risk_factors',
        'recent_developments', 'social_media_links',
    )
    field_coverage = {}
    for field in tracked_fields:
        count = len([r for r in results if r.get(field) and has_status(r, 'completed')])
        percentage = (count / completed * 100) if completed > 0 else 0
        field_coverage[field] = {'count': count, 'percentage': percentage}

    # Tag statistics, counted over every record that has tags
    tag_freq = Counter(tag for r in results if r.get('tags') for tag in r['tags'])

    success_rate = (completed / total * 100) if total > 0 else 0

    return {
        'total_processed': total,
        'completed': completed,
        'failed': failed,
        'success_rate': success_rate,
        'field_coverage': field_coverage,
        'unique_tags': len(tag_freq),
        'most_common_tags': tag_freq.most_common(20)
    }

def print_validation_report(report: Dict[str, Any]):
    """
    Write a validation report (as produced by ``validate_enriched_data``)
    to standard output in a fixed-width layout.
    """
    rule = "="*60

    print("\n" + rule)
    print("üìä ENRICHMENT VALIDATION REPORT")
    print(rule)

    print(f"\n‚úÖ Completed: {report['completed']}/{report['total_processed']} ({report['success_rate']:.1f}%)")
    print(f"‚ùå Failed: {report['failed']}")

    print("\nüìà Field Coverage:")
    coverage = report['field_coverage']
    for field in coverage:
        stats = coverage[field]
        print(f"  {field:.<30} {stats['count']:>3} ({stats['percentage']:>5.1f}%)")

    print(f"\nüè∑Ô∏è  Unique Tags: {report['unique_tags']}")
    print("\nMost Common Tags:")
    # Only the ten most frequent tags are shown
    top_tags = report['most_common_tags'][:10]
    for tag, count in top_tags:
        print(f"  {tag:.<40} {count:>3}")

    print("\n" + rule)

print("‚úì Validation functions defined")


In [None]:
# Cell 10: Database Update

def update_database(results: List[Dict[str, Any]]):
    """
    Update existing rows of "company-metadata" with enriched data.

    Each result is applied with an UPDATE keyed on ``stock_code`` and
    committed individually. Results whose status is ``'failed'`` only update
    the enrichment status/date/error columns, so a failed run does not wipe
    content written by an earlier successful one. Results without a
    ``stock_code`` are skipped.

    Args:
        results: Result dictionaries as produced by ``process_companies``.

    Prints a summary of updated, unmatched and failed records; per-record
    database errors are reported and do not abort the remaining updates.
    """
    # CAST(... AS type) is used instead of the ':name::type' shorthand:
    # SQLAlchemy's text() does not treat a name followed by ':' as a bind
    # parameter, so the shorthand reaches the driver unbound.
    # (note: table name has hyphen, needs quotes)
    full_query = text("""
        UPDATE "company-metadata"
        SET
            tags = :tags,
            enhanced_summary = :enhanced_summary,
            company_history = :company_history,
            key_people = CAST(:key_people AS jsonb),
            financial_reports = CAST(:financial_reports AS jsonb),
            competitive_advantages = :competitive_advantages,
            risk_factors = :risk_factors,
            recent_developments = :recent_developments,
            social_media_links = CAST(:social_media_links AS jsonb),
            logo_gcs_url = :logo_gcs_url,
            enrichment_status = :enrichment_status,
            enrichment_date = CAST(:enrichment_date AS timestamp),
            enrichment_error = :enrichment_error,
            updated_at = CURRENT_TIMESTAMP
        WHERE stock_code = :stock_code
    """)

    status_query = text("""
        UPDATE "company-metadata"
        SET
            enrichment_status = :enrichment_status,
            enrichment_date = CAST(:enrichment_date AS timestamp),
            enrichment_error = :enrichment_error,
            updated_at = CURRENT_TIMESTAMP
        WHERE stock_code = :stock_code
    """)

    engine = create_engine(DATABASE_URL)

    updated_count = 0
    missing_count = 0
    failed_count = 0

    print(f"\nüíæ Updating database with {len(results)} records...")

    try:
        with engine.connect() as conn:
            for result in tqdm(results):
                stock_code = result.get('stock_code')

                if not stock_code:
                    continue

                try:
                    status = result.get('enrichment_status', 'completed')
                    params = {
                        'stock_code': stock_code,
                        'enrichment_status': status,
                        'enrichment_date': result.get('enrichment_date', datetime.now().isoformat()),
                        'enrichment_error': result.get('enrichment_error')
                    }

                    if status == 'failed':
                        query = status_query
                    else:
                        query = full_query
                        params.update({
                            'tags': result.get('tags', []),
                            'enhanced_summary': result.get('enhanced_summary'),
                            'company_history': result.get('company_history'),
                            'key_people': json.dumps(result.get('key_people', [])),
                            'financial_reports': json.dumps(result.get('financial_reports', [])),
                            'competitive_advantages': result.get('competitive_advantages'),
                            'risk_factors': result.get('risk_factors'),
                            'recent_developments': result.get('recent_developments'),
                            'social_media_links': json.dumps(result.get('social_media_links', {})),
                            'logo_gcs_url': result.get('logo_gcs_url')
                        })

                    outcome = conn.execute(query, params)
                    conn.commit()

                except Exception as e:
                    # Leave the connection usable for the next record; after
                    # an error PostgreSQL rejects further statements until
                    # the transaction is rolled back
                    conn.rollback()
                    print(f"\n‚ùå Failed to update {stock_code}: {e}")
                    failed_count += 1
                    continue

                # An UPDATE that matches no row is not an error, but nothing
                # was written either
                if outcome.rowcount == 0:
                    missing_count += 1
                else:
                    updated_count += 1
    finally:
        engine.dispose()

    print(f"\n‚úÖ Database update complete:")
    print(f"  - Updated: {updated_count}")
    print(f"  - No matching row: {missing_count}")
    print(f"  - Failed: {failed_count}")

print("‚úì Database update function defined")


In [None]:
# Cell 11: Export Results

def export_results(results: List[Dict[str, Any]], validation_report: Dict[str, Any]):
    """
    Export enriched data to CSV and generate summary.

    Writes the results to ``RESULTS_FILE``, the validation report to a
    sibling ``*_validation_report.json`` file, and prints one sample record
    (the first completed one, or the first record if none completed).

    Args:
        results: Result dictionaries as produced by ``process_companies``.
        validation_report: JSON-serialisable report from
            ``validate_enriched_data``.

    Returns:
        DataFrame built from ``results``.
    """
    # Convert to DataFrame
    df_results = pd.DataFrame(results)

    # Save to CSV
    directory = os.path.dirname(RESULTS_FILE)
    if directory:
        # os.makedirs('') raises, so skip it for a bare file name
        os.makedirs(directory, exist_ok=True)
    df_results.to_csv(RESULTS_FILE, index=False)
    print(f"\nüíæ Results exported to: {RESULTS_FILE}")

    # Save validation report
    report_file = RESULTS_FILE.replace('.csv', '_validation_report.json')
    with open(report_file, 'w') as f:
        json.dump(validation_report, f, indent=2)
    print(f"üìä Validation report saved to: {report_file}")

    # Generate sample output
    if len(df_results) > 0:
        print("\nüìÑ Sample Enriched Record:")
        print("="*60)

        completed_rows = df_results[df_results['enrichment_status'] == 'completed']
        sample = completed_rows.iloc[0] if len(completed_rows) > 0 else df_results.iloc[0]

        print(f"Stock Code: {sample.get('stock_code')}")
        print(f"Status: {sample.get('enrichment_status')}")
        print(f"Tags: {sample.get('tags', [])}")
        print(f"\nEnhanced Summary (first 200 chars):")
        summary = sample.get('enhanced_summary', '')
        if not isinstance(summary, str):
            # Records without a summary hold NaN/None in the DataFrame,
            # which has no len()
            summary = ''
        print(summary[:200] + '...' if len(summary) > 200 else summary)
        print("\n" + "="*60)

    return df_results

print("‚úì Export functions defined")


In [None]:
# Cell 12: Execute Pipeline

# Runs the whole pipeline at cell execution time, in this order:
# enrich -> validate -> write to database -> export to files.
print("\n" + "="*60)
print("üöÄ STARTING ENRICHMENT PIPELINE")
print("="*60)

# Process companies
# Uses df_metadata and checkpoint created by the data-fetching cell
results = process_companies(df_metadata, checkpoint)

# Validate results
validation_report = validate_enriched_data(results)
print_validation_report(validation_report)

# Update database
# Every result is passed on, including those with a failed status
update_database(results)

# Export results
df_results = export_results(results, validation_report)

print("\n" + "="*60)
print("‚úÖ PIPELINE COMPLETE")
print("="*60)
print(f"\nProcessed: {len(results)} companies")
print(f"Success Rate: {validation_report['success_rate']:.1f}%")
print(f"\nNext steps:")
print("  1. Review the validation report")
print("  2. Check sample records for quality")
print("  3. If satisfied, run with PROCESS_SUBSET=False for full dataset")
print("  4. Monitor API costs and adjust as needed")


In [None]:
# Cell 13: Optional - View Results

# Display results summary
# Guarded so the cell can be run even when the pipeline cell has not
# produced df_results yet.
if 'df_results' in locals():
    print("\nüìä Results Summary:")
    print(f"Total records: {len(df_results)}")
    print(f"\nStatus distribution:")
    print(df_results['enrichment_status'].value_counts())
    
    # Show companies with most tags
    if 'tags' in df_results.columns:
        # Non-list values (e.g. NaN for records without tags) count as 0
        df_results['tag_count'] = df_results['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
        print("\nüè∑Ô∏è  Companies with most tags:")
        print(df_results[['stock_code', 'tag_count', 'tags']].sort_values('tag_count', ascending=False).head(10))
    
    # Display the first rows of the DataFrame. A bare expression inside an
    # `if` body is not rendered by the notebook, so print it explicitly.
    print(df_results.head(10))
