# Company Metadata Enrichment with GPT-5

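This notebook enriches ASX company metadata using the OpenAI chat-completions API. The code currently calls `gpt-4o` as a placeholder and is intended to switch to GPT-5 with Deep Research capabilities once that model is available.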

## Features:
- Fetch existing metadata from Payload CMS
- Generate comprehensive company profiles using GPT-5
- Build company logo URLs that point to Google Cloud Storage
- Fetch annual reports from ASX and company websites
- Store enriched data in main Postgres database

## Processing:
- Supports subset processing for testing
- Checkpoint-based resumption
- Comprehensive error handling and retry logic


In [30]:
# Cell 1: Dependencies and Setup
import httpx
import pandas as pd
from openai import OpenAI
import json
from sqlalchemy import create_engine, text
import os
from tqdm import tqdm
from datetime import datetime
import time
from typing import Dict, List, Optional, Any
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import re
from urllib.parse import urljoin

# Load environment variables
load_dotenv()

print("‚úì Dependencies loaded successfully")


‚úì Dependencies loaded successfully


In [31]:
# OpenAI Configuration
# Fail fast: every enrichment call needs the key, so stop here rather than at the first API call.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError('OPENAI_API_KEY environment variable is required. Please set it in your .env file.')
client = OpenAI(api_key=OPENAI_API_KEY)

# Database Configuration
# NOTE(review): the fallback URLs embed a default admin/password login; presumably meant
# for local development only - confirm these defaults are never reachable in production.
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://admin:password@localhost:5432/shorts')
CMS_DATABASE_URL = os.getenv('CMS_DATABASE_URL', 'postgresql://admin:password@localhost:5432/cms')

# GCS Configuration
GCS_BUCKET = os.getenv('GCS_BUCKET', 'shorted-company-logos')
GCS_LOGO_BASE_URL = os.getenv('GCS_LOGO_BASE_URL', 'https://storage.googleapis.com/shorted-company-logos/logos')

# Processing Configuration
# PROCESS_SUBSET is true only when the variable equals "true" (case-insensitive).
PROCESS_SUBSET = os.getenv('PROCESS_SUBSET', 'True').lower() == 'true'
SUBSET_SIZE = int(os.getenv('SUBSET_SIZE', '10'))
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '50'))
CHECKPOINT_INTERVAL = int(os.getenv('CHECKPOINT_INTERVAL', '50'))

# API Rate Limiting
MAX_RETRIES = int(os.getenv('MAX_RETRIES', '3'))
RETRY_DELAY = int(os.getenv('RETRY_DELAY', '5'))  # passed to time.sleep(), i.e. seconds

# Checkpoint file
CHECKPOINT_FILE = 'data/enrichment_checkpoint.json'
RESULTS_FILE = 'data/enriched_metadata_results.csv'

# Show only the part of the database URL after the credentials. Split on the LAST '@':
# a password that itself contains '@' would otherwise be partially printed.
database_location = DATABASE_URL.rsplit('@', 1)[-1] if '@' in DATABASE_URL else 'localhost'

print("Configuration loaded:")
print(f"  - Process Subset: {PROCESS_SUBSET}")
print(f"  - Subset Size: {SUBSET_SIZE}")
print(f"  - Database: {database_location}")
print(f"  - GCS Base URL: {GCS_LOGO_BASE_URL}")


Configuration loaded:
  - Process Subset: True
  - Subset Size: 3
  - Database: aws-0-ap-southeast-2.pooler.supabase.com:5432/postgres
  - GCS Base URL: https://storage.googleapis.com/shorted-company-logos/logos


In [None]:
# Cell 3: Data Fetching

def fetch_existing_metadata() -> pd.DataFrame:
    """
    Fetch existing company metadata from Payload CMS database, including investor links.

    Reads every row of the `metadata` table that has a stock code, reads all rows
    of `metadata_links`, and attaches each company's links (in `_order` order) as
    a list in an `investor_links` column. Companies without links get an empty
    list. A `logo_gcs_url` column is built from GCS_LOGO_BASE_URL and the
    upper-cased stock code.

    Returns:
        DataFrame with the selected `metadata` columns plus `investor_links`
        and `logo_gcs_url`.
    """
    # Fetch base metadata
    query = """
    SELECT
        m.id,
        m.stock_code,
        m.company_name,
        m.industry,
        m.market_cap,
        m.listing_date,
        m.address,
        m.summary,
        m.details,
        m.website,
        m.company_logo_link
    FROM metadata m
    WHERE m.stock_code IS NOT NULL
    ORDER BY m.company_name
    """

    # Fetch investor links
    links_query = """
    SELECT
        ml._parent_id,
        ml.link,
        ml._order
    FROM metadata_links ml
    ORDER BY ml._parent_id, ml._order
    """

    engine = create_engine(CMS_DATABASE_URL)
    try:
        df = pd.read_sql(query, engine)
        df_links = pd.read_sql(links_query, engine)
    finally:
        # Release pooled connections even when one of the queries raises.
        engine.dispose()

    # Aggregate links per company
    if not df_links.empty:
        df_links_agg = df_links.groupby('_parent_id')['link'].apply(list).reset_index()
        df_links_agg.columns = ['id', 'investor_links']
        df = df.merge(df_links_agg, on='id', how='left')
    else:
        df['investor_links'] = None

    # The left merge leaves NaN for companies without links; normalise to empty lists
    df['investor_links'] = df['investor_links'].apply(lambda x: x if isinstance(x, list) else [])

    # Add logo GCS URLs
    df['logo_gcs_url'] = df['stock_code'].apply(
        lambda code: f"{GCS_LOGO_BASE_URL}/{code.upper()}.svg"
    )

    link_counts = df['investor_links'].str.len()
    has_links = link_counts > 0
    companies_with_links = int(has_links.sum())
    # "avg ... links each" describes the companies that have links, so average over
    # those rows only; companies with zero links would otherwise drag the figure down.
    avg_links = float(link_counts[has_links].mean()) if companies_with_links else 0.0

    print(f"‚úì Fetched {len(df)} companies from Payload CMS")
    print(f"‚úì {companies_with_links} companies have investor links (avg {avg_links:.1f} links each)")

    return df

def load_checkpoint() -> Dict[str, Any]:
    """
    Return the saved progress from CHECKPOINT_FILE, or a fresh checkpoint.

    When the file exists its JSON content is returned as-is and the stored
    `processed_count` is printed. When it does not exist, a new checkpoint with
    a zero count and an empty list of processed codes is returned.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return {'processed_count': 0, 'processed_codes': []}

    with open(CHECKPOINT_FILE, 'r') as checkpoint_fh:
        saved_state = json.load(checkpoint_fh)

    print(f"‚úì Loaded checkpoint: {saved_state['processed_count']} companies processed")
    return saved_state

def save_checkpoint(checkpoint: Dict[str, Any]):
    """
    Save checkpoint data for resumption.

    The JSON is written to a temporary file next to CHECKPOINT_FILE and then
    moved into place with os.replace, so an interrupted run cannot leave a
    half-written checkpoint that load_checkpoint would fail to parse.

    Args:
        checkpoint: JSON-serialisable progress dictionary.
    """
    checkpoint_dir = os.path.dirname(CHECKPOINT_FILE)
    # dirname is '' for a bare filename, and os.makedirs('') raises.
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(checkpoint, f, indent=2)

    # Swap the complete file in over the previous checkpoint.
    os.replace(tmp_path, CHECKPOINT_FILE)

# Fetch data
# Load the company table from the CMS database, then any saved progress from CHECKPOINT_FILE.
df_metadata = fetch_existing_metadata()
checkpoint = load_checkpoint()

# Quick sanity check of what was loaded; the trailing expression renders a preview of the first rows.
print(f"\nDataFrame shape: {df_metadata.shape}")
print(f"Columns: {list(df_metadata.columns)}")
df_metadata.head()


‚úì Fetched 1954 companies from Payload CMS
‚úì Loaded checkpoint: 12 companies processed

DataFrame shape: (1954, 11)
Columns: ['stock_code', 'company_name', 'industry', 'market_cap', 'listing_date', 'address', 'summary', 'details', 'website', 'company_logo_link', 'logo_gcs_url']


Unnamed: 0,stock_code,company_name,industry,market_cap,listing_date,address,summary,details,website,company_logo_link,logo_gcs_url
0,MML,MCLAREN MINING LIMITED,Materials,3972048.0,02/05/2022,"C/- Argus Corporate Partners, Level 4, 225 St ...","Mining, exploration and development in WA; Tit...",McLaren Mining Limited (ASX:MML formerly Allup...,https://mclarenminerals.com.au,\thttps://mclarenminerals.com.au/wp-content/th...,https://storage.googleapis.com/shorted-company...
1,14D,1414 DEGREES LIMITED,Capital Goods,15480954.0,12/09/2018,"136 Daws Road, MELROSE PARK, SA, AUSTRALIA, 5039","Commercialising energy storage technology, the...",1414 Degrees Limited (ASX:14D) is an innovativ...,https://www.1414degrees.com.au,https://1414degrees.com.au/wp-content/uploads/...,https://storage.googleapis.com/shorted-company...
2,29M,29METALS LIMITED,Materials,371900596.0,02/07/2021,"Level 2,150 Collins Street, MELBOURNE, VIC, AU...","Mineral exploration, development and productio...",29Metals Limited (ASX:29M) is a copper-focused...,https://www.29metals.com,https://companieslogo.com/img/orig/29M.AX-866b...,https://storage.googleapis.com/shorted-company...
3,T3D,333D LIMITED,Commercial & Professional Services,836115.0,27/12/2006,"Level 23, 525 Collins Street, MELBOURNE, VIC, ...",T3D is a digital asset company merging NFTs an...,333D Limited (ASX:T3D) is Australia's 3D print...,https://www.333d.co,https://pbs.twimg.com/profile_images/153184445...,https://storage.googleapis.com/shorted-company...
4,TGP,360 CAPITAL GROUP,Financial Services,153103307.0,26/07/2005,"SUITE 3701 LEVEL 37, 1 MACQUARIE PLACE, SYDNEY...",Real estate investment and funds management.,360 Capital Group (ASX:TGP formerly Trafalgar ...,https://www.360capital.com.au,https://www.360capital.com.au/hubfs/MicrosoftT...,https://storage.googleapis.com/shorted-company...


In [33]:
# Cell 4: GPT-5 Schema Definition

# JSON-Schema-style description of one enriched company profile.
# The top-level property names 'tags', 'key_people', 'financial_reports' and
# 'social_media_links' are the same keys the resolve_* functions read from the
# enrichment result. Only 'tags' and 'enhanced_summary' are marked as required.
ENRICHMENT_SCHEMA = {
    "type": "object",
    "properties": {
        "tags": {
            "type": "array",
            "items": {"type": "string"},
            "description": "3-7 specific specialty tags describing the company's focus (e.g., 'lithium mining', 'rare earth magnets', 'renewable energy', 'biotech oncology')"
        },
        "enhanced_summary": {
            "type": "string",
            "description": "Comprehensive company overview (500-1000 words) covering business model, market position, key operations, and strategic focus"
        },
        "company_history": {
            "type": "string",
            "description": "Historical timeline with key milestones, founding story, major acquisitions, pivots, and evolution (300-500 words)"
        },
        # Each person needs at least a name and a role; bio and linkedin are optional.
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "role": {"type": "string"},
                    "bio": {"type": "string", "description": "2-3 sentence biography with relevant experience"},
                    "linkedin": {"type": "string", "description": "LinkedIn profile URL if available"}
                },
                "required": ["name", "role"]
            },
            "description": "Key executives, board members, and senior leadership"
        },
        # Each report needs at least a type (one of the three enum values) and a url.
        "financial_reports": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "type": {"type": "string", "enum": ["annual_report", "quarterly_report", "half_year_report"]},
                    "date": {"type": "string", "description": "Report date in YYYY-MM-DD format"},
                    "url": {"type": "string", "description": "Direct URL to the report PDF"},
                    "title": {"type": "string"}
                },
                "required": ["type", "url"]
            },
            "description": "Links to recent annual and quarterly reports"
        },
        "competitive_advantages": {
            "type": "string",
            "description": "Unique strengths, market position, competitive moats, and strategic advantages (200-400 words)"
        },
        "risk_factors": {
            "type": "string",
            "description": "Key business risks including operational, market, regulatory, and financial risks (200-400 words)"
        },
        "recent_developments": {
            "type": "string",
            "description": "Recent news, announcements, contracts, or developments from the last 12 months (200-400 words)"
        },
        # All four platform entries are optional.
        "social_media_links": {
            "type": "object",
            "properties": {
                "twitter": {"type": "string"},
                "linkedin": {"type": "string"},
                "facebook": {"type": "string"},
                "youtube": {"type": "string"}
            },
            "description": "Official social media profile URLs"
        }
    },
    "required": ["tags", "enhanced_summary"]
}

# System message for the enrichment request; enrich_company_with_gpt5 sends it
# as the "system" role message ahead of the per-company user prompt.
SYSTEM_PROMPT = """
You are a financial research analyst specializing in Australian Stock Exchange (ASX) listed companies.
Your task is to provide comprehensive, accurate, and well-researched company profiles.

Guidelines:
1. Use Deep Research to gather accurate, up-to-date information
2. Focus on factual, verifiable information
3. Provide specific details rather than generic descriptions
4. Include relevant industry context and market positioning
5. Cite recent developments and concrete examples
6. Maintain professional, objective tone
7. For tags, use specific, searchable terms that accurately describe the company's specialty
8. Ensure all URLs are valid and publicly accessible

Return your response as a valid JSON object matching the provided schema.
"""

print("‚úì Schema and prompts defined")
print(f"  Required fields: {ENRICHMENT_SCHEMA['required']}")


‚úì Schema and prompts defined
  Required fields: ['tags', 'enhanced_summary']


In [34]:
# Cell 5: GPT-5 Deep Research Function

def enrich_company_with_gpt5(company: pd.Series, use_deep_research: bool = True) -> Dict[str, Any]:
    """
    Enrich company metadata with a chat-completions request in JSON mode.

    The request is retried up to MAX_RETRIES times. ENRICHMENT_SCHEMA is
    embedded in the user prompt because both prompts tell the model to follow
    "the schema", and JSON mode alone does not convey which keys are expected.

    Args:
        company: Pandas Series with existing company metadata. Must contain
            'stock_code' and 'company_name'; 'industry', 'website', 'summary',
            'address' and 'logo_gcs_url' are read when present.
        use_deep_research: Accepted for interface compatibility; currently not
            used by the request (the call goes to the "gpt-4o" placeholder model).

    Returns:
        On success, the parsed JSON object with 'stock_code', 'enrichment_date',
        'enrichment_status' ('completed') and 'logo_gcs_url' added.
        On failure, a minimal dict with 'stock_code', 'enrichment_status'
        ('failed'), 'enrichment_date' and 'enrichment_error'.
    """
    stock_code = company['stock_code']
    company_name = company['company_name']

    # Prepare context from existing metadata
    context = f"""
    Company: {company_name}
    ASX Code: {stock_code}
    Industry: {company.get('industry', 'N/A')}
    Website: {company.get('website', 'N/A')}
    Existing Summary: {company.get('summary', 'N/A')}
    Address: {company.get('address', 'N/A')}
    """

    # Serialise the schema so the model can see the exact keys it should return.
    schema_json = json.dumps(ENRICHMENT_SCHEMA, indent=2)

    user_prompt = f"""
    Research and provide a comprehensive profile for the following ASX-listed company:

    {context}

    Please provide detailed, accurate information following the schema. Use Deep Research to find:
    - Current company operations and business model
    - Recent announcements and developments
    - Key leadership team members
    - Links to recent annual and quarterly reports
    - Company's competitive positioning
    - Known risk factors
    - Official social media presence

    Focus on factual, verifiable information. For the enhanced_summary, provide a comprehensive
    overview that would be suitable for investors and analysts.

    Respond with a single JSON object that conforms to this JSON schema:
    {schema_json}
    """

    last_error = None  # text of the most recent failure, reported if every attempt fails

    for attempt in range(MAX_RETRIES):
        try:
            # JSON mode: the reply is syntactically valid JSON but is not validated against the schema
            response = client.chat.completions.create(
                model="gpt-4o",  # Will be updated to gpt-5 when available
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0.3,
                max_tokens=4000
            )

            enriched_data = json.loads(response.choices[0].message.content)
            if not isinstance(enriched_data, dict):
                # The keys below can only be attached to a JSON object; treat anything else as a failed attempt.
                raise ValueError(f"expected a JSON object, got {type(enriched_data).__name__}")

            # Add metadata
            enriched_data['stock_code'] = stock_code
            enriched_data['enrichment_date'] = datetime.now().isoformat()
            enriched_data['enrichment_status'] = 'completed'
            enriched_data['logo_gcs_url'] = company.get('logo_gcs_url')

            return enriched_data

        except json.JSONDecodeError as e:
            print(f"‚ö† JSON decode error for {stock_code} (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
            last_error = f"JSON decode error: {e}"
            delay = RETRY_DELAY

        except Exception as e:
            print(f"‚ö† Error enriching {stock_code} (attempt {attempt + 1}/{MAX_RETRIES}): {e}")
            last_error = str(e)
            # Back off twice as long when the API reports a rate limit.
            delay = RETRY_DELAY * 2 if "rate_limit" in str(e).lower() else RETRY_DELAY

        # Sleeping after the final attempt would only delay the failure result.
        if attempt < MAX_RETRIES - 1:
            time.sleep(delay)

    # Return minimal data on failure
    error_message = 'Failed after maximum retries'
    if last_error is not None:
        error_message = f"{error_message}: {last_error}"

    return {
        'stock_code': stock_code,
        'enrichment_status': 'failed',
        'enrichment_date': datetime.now().isoformat(),
        'enrichment_error': error_message
    }

print("‚úì GPT-5 enrichment function defined")


‚úì GPT-5 enrichment function defined


In [None]:
# Cell 6: Smart Financial Report Crawler

def crawl_for_reports(start_url: str, max_depth: int = 2, max_pages: int = 20) -> List[Dict[str, str]]:
    """
    Crawl pages on start_url's host and collect links that look like financial report PDFs.

    The crawl is best-first: every candidate link gets a priority score from its
    URL and anchor text, and the highest-scoring page is fetched next (ties are
    served in discovery order). Only http(s) links on the same host as start_url
    are considered, so reports hosted on another host are not collected.

    A link is recorded as a report when it looks like a PDF (by URL, or by a
    "download ... pdf" hint in the href/text) AND its text or href contains a
    report-related keyword and none of the avoid keywords.

    Args:
        start_url: Starting URL (usually investor relations page)
        max_depth: Maximum crawl depth (default: 2 levels)
        max_pages: Maximum pages to request, including requests that fail (default: 20)

    Returns:
        List of report dictionaries with keys 'type', 'url', 'title', 'date',
        'source' and 'depth', one per distinct normalized PDF URL, sorted by
        'date' descending. 'date' is '' when no year could be detected.
    """
    import heapq
    import itertools
    import re
    from urllib.parse import urlparse, urljoin, urlunparse, parse_qs, urlencode

    # HIGH priority keywords (strong signals for financial reports)
    high_priority_keywords = [
        'annual-report', 'annual_report', 'annualreport',
        'financial-report', 'financial_report',
        'interim-report', 'quarterly-report',
        'investor-reports', 'investor/report'
    ]

    # Medium priority keywords
    report_keywords = [
        'report', 'annual', 'financial', 'investor',
        'result', 'presentation', 'disclosure'
    ]

    # Avoid these completely
    avoid_keywords = [
        'login', 'signup', 'register', 'cart', 'checkout',
        'subscribe', 'unsubscribe', 'cookie', 'privacy',
        'terms', 'condition', 'policy'
    ]

    def normalize_url(url: str) -> str:
        """Normalize URL for deduplication: drop the fragment and listed tracking params."""
        parsed = urlparse(url)
        query = parse_qs(parsed.query)
        cleaned_query = {k: v for k, v in query.items() if k not in ['utm_source', 'utm_medium', 'ei', 'ref']}
        new_query = urlencode(cleaned_query, doseq=True)
        return urlunparse((parsed.scheme, parsed.netloc, parsed.path, '', new_query, ''))

    def extract_year_from_text(text: str) -> str:
        """Return the largest 20xx year found in text, or '' when there is none."""
        years = re.findall(r'20\d{2}', text)
        return max(years) if years else ''

    def is_financial_report_link(text: str, href: str) -> bool:
        """Determine if link text/href indicates a financial report."""
        combined = (text + ' ' + href).lower()

        has_report_keyword = any(kw in combined for kw in [
            'annual', 'report', 'financial', 'quarter', 'interim',
            'full year', 'half year', 'result'
        ])
        has_avoid = any(kw in combined for kw in avoid_keywords)

        return has_report_keyword and not has_avoid

    def get_link_priority(url: str, text: str) -> int:
        """Calculate priority score for following a link (base 5, +10 / +5 / -5 adjustments)."""
        combined = (url + ' ' + text).lower()
        score = 5

        if any(kw in combined for kw in high_priority_keywords):
            score += 10
        if any(kw in combined for kw in report_keywords):
            score += 5
        if any(kw in combined for kw in ['news', 'media', 'blog']):
            score -= 5

        return score

    def classify_report(text_lower: str) -> str:
        """Pick the report type from the anchor text; annual is the default."""
        if any(kw in text_lower for kw in ['quarterly', 'quarter', 'q1', 'q2', 'q3', 'q4']):
            return 'quarterly_report'
        if any(kw in text_lower for kw in ['half year', 'interim', 'half-year']):
            return 'half_year_report'
        return 'annual_report'

    reports = []
    visited = set()        # normalized URLs of pages already requested
    seen_pdf_urls = set()  # normalized URLs of PDFs already recorded
    base_domain = urlparse(start_url).netloc.lower()

    # Min-heap of (-priority, discovery order, url, depth): highest priority first,
    # first-discovered first among equal priorities, without re-sorting on every pop.
    discovery_order = itertools.count()
    frontier = [(-10, next(discovery_order), start_url, 0)]

    def handle_link(a_tag, page_url: str, depth: int) -> None:
        """Record the link as a report, queue it for crawling, or ignore it."""
        href = a_tag['href']
        text = a_tag.get_text().strip()
        text_lower = text.lower()

        # Resolve relative URLs
        full_url = urljoin(page_url, href)
        parsed = urlparse(full_url)

        # mailto:, javascript:, tel: etc. can never be fetched; queuing them would
        # only use up the max_pages budget on requests that are certain to fail.
        if parsed.scheme not in ('http', 'https'):
            return

        # Stay on the starting host (host names are case-insensitive)
        if parsed.netloc.lower() != base_domain:
            return

        url_lower = full_url.lower()
        href_lower = href.lower()
        is_pdf_file = parsed.path.lower().endswith('.pdf')
        is_pdf = (
            is_pdf_file
            or url_lower.endswith('.pdf')
            or '.pdf?' in url_lower
            or ('download' in href_lower and 'pdf' in (text_lower + href_lower))
        )

        if is_pdf and is_financial_report_link(text, href):
            norm_pdf_url = normalize_url(full_url)
            if norm_pdf_url in seen_pdf_urls:
                return
            seen_pdf_urls.add(norm_pdf_url)

            year = extract_year_from_text(text + ' ' + full_url)
            report_type = classify_report(text_lower)
            clean_title = re.sub(r'\s+', ' ', text).strip()[:100]

            reports.append({
                'type': report_type,
                'url': full_url,
                'title': clean_title if clean_title else f"{year} {report_type}".strip(),
                # Only the year is detected; 30 June is a placeholder day/month
                # (presumably the Australian financial year end - TODO confirm).
                'date': f"{year}-06-30" if year else '',
                'source': 'smart_crawler',
                'depth': depth
            })
            return

        # A PDF that is not a report is still a document, not a page with links:
        # fetching it would spend page budget and bandwidth for nothing.
        if is_pdf_file or depth >= max_depth:
            return

        has_avoid = any(kw in url_lower or kw in text_lower for kw in avoid_keywords)
        # `visited` holds normalized URLs, so compare in normalized form.
        if has_avoid or normalize_url(full_url) in visited:
            return

        link_priority = get_link_priority(full_url, text)
        # Only follow if priority is decent
        if link_priority >= 5:
            heapq.heappush(frontier, (-link_priority, next(discovery_order), full_url, depth + 1))

    while frontier and len(visited) < max_pages:
        _, _, current_url, depth = heapq.heappop(frontier)

        norm_url = normalize_url(current_url)
        if norm_url in visited or depth > max_depth:
            continue
        visited.add(norm_url)

        try:
            response = httpx.get(
                current_url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
                },
                timeout=15.0,
                follow_redirects=True
            )
            if response.status_code != 200:
                continue
            anchors = BeautifulSoup(response.content, 'html.parser').find_all('a', href=True)
        except Exception:
            # Best-effort crawl: an unreachable or unparseable page is skipped, not fatal.
            continue

        for a_tag in anchors:
            try:
                handle_link(a_tag, current_url, depth)
            except Exception:
                # A single malformed href must not discard the remaining links on the page.
                continue

    # `seen_pdf_urls` already guarantees one entry per normalized URL, so only sorting is left.
    # Sort by date string (most recent first; undated entries last)
    reports.sort(key=lambda r: r.get('date', ''), reverse=True)

    return reports

def fetch_annual_reports(company: pd.Series) -> List[Dict[str, str]]:
    """
    Fetch annual reports from PayloadCMS investor links, ASX, and company website.

    Sources are tried in this order, each one only when the earlier ones did
    not yield enough:
    1. The first three `investor_links`, each crawled with crawl_for_reports
       (social media links are skipped).
    2. The ASX announcements endpoint, when fewer than 3 reports were found.
    3. Common investor-relations paths on the company website, when fewer than
       2 reports were found and the company has no investor links.

    Every source is best-effort: a failure in one source is swallowed so the
    others can still contribute. KeyboardInterrupt is not swallowed.

    Args:
        company: Pandas Series with company metadata. Must contain 'stock_code';
            'investor_links' and 'website' are read when present.

    Returns:
        At most 10 report dictionaries, unique by 'url'. Keys vary by source;
        'type', 'url', 'title' and 'source' are always present.
    """
    stock_code = company['stock_code']
    reports = []
    seen_urls = set()

    def add_report(report_dict):
        """Append the report unless its URL is empty or already collected."""
        url = report_dict.get('url', '')
        if url and url not in seen_urls:
            seen_urls.add(url)
            reports.append(report_dict)
            return True
        return False

    # 1. PRIORITY: PayloadCMS investor links, traversed with the crawler
    investor_links = company.get('investor_links', [])
    if isinstance(investor_links, list) and investor_links:
        for link in investor_links[:3]:  # first 3 links only, since each one is crawled in depth
            # Skip empty/non-text entries and links that are obviously social media
            if not isinstance(link, str) or not link:
                continue
            if any(x in link.lower() for x in ['facebook', 'twitter', 'linkedin', 'youtube']):
                continue

            try:
                crawled_reports = crawl_for_reports(link, max_depth=2, max_pages=15)
            except Exception:
                # Best-effort: move on to the next link
                continue

            for report in crawled_reports:
                add_report(report)

            # If we found enough reports, stop
            if len(reports) >= 10:
                break

    # 2. Try ASX announcements API (backup)
    if len(reports) < 3:  # Only if we don't have enough yet
        try:
            asx_url = f"https://cdn-api.markitdigital.com/apiman-gateway/ASX/asx-research/1.0/companies/{stock_code}/announcements"
            response = httpx.get(
                asx_url,
                headers={"User-Agent": "Mozilla/5.0"},
                timeout=10.0,
                follow_redirects=True
            )

            if response.status_code == 200:
                data = response.json()
                api_data = data.get('data') if isinstance(data, dict) else None
                announcements = api_data.get('items') if isinstance(api_data, dict) else None

                if isinstance(announcements, list):
                    for announcement in announcements:
                        if not isinstance(announcement, dict):
                            continue

                        # A null/non-text headline must skip this item only,
                        # not abort the remaining announcements.
                        headline = announcement.get('headline')
                        if not isinstance(headline, str):
                            continue

                        title = headline.lower()
                        if any(kw in title for kw in ['annual report', 'full year', 'quarterly', 'half year']):
                            report_type = 'annual_report' if 'annual' in title or 'full year' in title else 'quarterly_report'
                            if 'half year' in title:
                                report_type = 'half_year_report'

                            add_report({
                                'type': report_type,
                                'date': announcement.get('date', ''),
                                'url': announcement.get('url', ''),
                                'title': headline,
                                'source': 'asx_api'
                            })
        except Exception:
            pass  # Best-effort: other sources may still provide reports

    # 3. Fallback: Try company website
    if len(reports) < 2 and not investor_links:  # Only if we really need it
        website = company.get('website')
        # Missing values can arrive as None or NaN; only a real URL string is usable.
        if isinstance(website, str) and website.strip() and website != 'N/A':
            website = website.strip()
            ir_paths = ['/investors', '/investor-relations', '/investor-centre', '/about/investors', '/annual-reports']

            for path in ir_paths:
                if len(reports) >= 5:
                    break

                try:
                    ir_url = urljoin(website, path)
                    response = httpx.get(ir_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10.0, follow_redirects=True)

                    if response.status_code == 200:
                        soup = BeautifulSoup(response.content, 'html.parser')

                        for link in soup.find_all('a', href=True):
                            href = link['href']
                            text = link.get_text().lower()

                            # Case-insensitive so '.PDF' links are found too
                            if href.lower().endswith('.pdf') and any(kw in text for kw in ['annual', 'report', 'financial']):
                                add_report({
                                    'type': 'annual_report',
                                    'url': urljoin(ir_url, href),
                                    'title': link.get_text().strip(),
                                    'source': 'website_scrape'
                                })

                        # The first investor page that loads is the only one scraped
                        break
                except Exception:
                    continue

    return reports[:10]  # Return max 10 reports

print("‚úì Smart financial report crawler defined")
print("  - Crawls up to 2 levels deep")
print("  - Intelligently follows report-related links")
print("  - Extracts PDFs and metadata")


‚úì Annual report fetcher defined


In [36]:
# Cell 7: Resolver Pattern Implementation

def resolve_tags(enriched_data: Dict[str, Any]) -> List[str]:
    """
    Validate and clean tags.

    Each string tag is stripped and lower-cased, kept only if the cleaned text
    is between 3 and 49 characters long, and de-duplicated (first occurrence
    wins, original order kept). Non-string entries are dropped.

    Args:
        enriched_data: Enrichment result; its 'tags' entry is read.

    Returns:
        Up to 10 cleaned tags, or [] when 'tags' is missing or not a list.
    """
    tags = enriched_data.get('tags', [])

    if not isinstance(tags, list):
        return []

    cleaned_tags = []
    seen = set()
    for tag in tags:
        if not isinstance(tag, str):
            continue
        # Clean first, then measure: padding must not let a too-short tag through.
        cleaned = tag.strip().lower()
        if 2 < len(cleaned) < 50 and cleaned not in seen:
            seen.add(cleaned)
            cleaned_tags.append(cleaned)

    return cleaned_tags[:10]  # Max 10 tags

def resolve_key_people(enriched_data: Dict[str, Any]) -> List[Dict[str, str]]:
    """
    Keep the well-formed entries of 'key_people' and reduce them to four fields.

    An entry is kept when it is a dict containing both 'name' and 'role'.
    'bio' and 'linkedin' default to '' and any other keys are dropped.
    Returns [] when 'key_people' is missing or not a list.
    """
    raw_people = enriched_data.get('key_people', [])

    if isinstance(raw_people, list):
        return [
            {
                'name': entry['name'],
                'role': entry['role'],
                'bio': entry.get('bio', ''),
                'linkedin': entry.get('linkedin', ''),
            }
            for entry in raw_people
            if isinstance(entry, dict) and all(field in entry for field in ('name', 'role'))
        ]

    return []

def resolve_financial_reports(enriched_data: Dict[str, Any], company: pd.Series) -> List[Dict[str, str]]:
    """
    Combine GPT-5 results with scraped reports.

    Model-provided reports come first, followed by the reports returned by
    fetch_annual_reports. Entries are de-duplicated by URL and entries without
    a usable URL are dropped.

    Args:
        enriched_data: Enrichment result; its 'financial_reports' entry is read.
        company: Pandas Series with company metadata, passed to fetch_annual_reports.

    Returns:
        At most 10 report dictionaries with unique 'url' values.
    """
    gpt_reports = enriched_data.get('financial_reports')
    # The model may return null or a non-list here; concatenating that would raise TypeError.
    if not isinstance(gpt_reports, list):
        gpt_reports = []

    scraped_reports = fetch_annual_reports(company)

    all_reports = []
    seen_urls = set()

    # Add both sources, avoiding duplicates
    for report in [*gpt_reports, *scraped_reports]:
        if not isinstance(report, dict):
            continue
        url = report.get('url')
        # Require a non-empty string so a malformed value cannot break the set lookup.
        if isinstance(url, str) and url and url not in seen_urls:
            seen_urls.add(url)
            all_reports.append(report)

    return all_reports[:10]  # Max 10 reports

def resolve_social_media_links(enriched_data: Dict[str, Any]) -> Dict[str, str]:
    """
    Keep only the social media entries whose value is an http(s) URL string.

    Returns {} when 'social_media_links' is missing or not a dict. Platform
    names are passed through unchanged.
    """
    candidate_links = enriched_data.get('social_media_links', {})

    if not isinstance(candidate_links, dict):
        return {}

    accepted_prefixes = ('http://', 'https://')
    return {
        platform: url
        for platform, url in candidate_links.items()
        if isinstance(url, str) and url.startswith(accepted_prefixes)
    }

def apply_resolvers(enriched_data: Dict[str, Any], company: pd.Series) -> Dict[str, Any]:
    """
    Run every resolver over one company's enrichment payload.

    The payload is modified in place: ``tags``, ``key_people``,
    ``financial_reports`` and ``social_media_links`` are each replaced by the
    output of their resolver, in that order. The same dict is returned.
    """
    # Resolvers that only need the payload itself.
    for field, resolver in (('tags', resolve_tags), ('key_people', resolve_key_people)):
        enriched_data[field] = resolver(enriched_data)

    # Financial reports additionally need the company row for scraping.
    enriched_data['financial_reports'] = resolve_financial_reports(enriched_data, company)
    enriched_data['social_media_links'] = resolve_social_media_links(enriched_data)

    return enriched_data

print("‚úì Resolver functions defined")


‚úì Resolver functions defined


In [37]:
# Cell 8: Batch Processing with Subset Support

def process_companies(df: pd.DataFrame, checkpoint: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Process companies with checkpoint support and progress tracking.

    Rows whose ``stock_code`` is already listed in
    ``checkpoint['processed_codes']`` are skipped. When ``PROCESS_SUBSET`` is
    set, only the first ``SUBSET_SIZE`` remaining rows are processed.

    Each company is enriched via ``enrich_company_with_gpt5`` and cleaned via
    ``apply_resolvers``. A failure for one company is recorded as a
    ``{'stock_code', 'enrichment_status': 'failed', 'enrichment_error'}``
    entry and does not stop the run; failed companies are not added to the
    checkpoint.

    Args:
        df: Company metadata; must have ``stock_code`` and ``company_name``.
        checkpoint: Mutable dict with ``processed_codes`` (list) and
            ``processed_count`` (int); updated in place and persisted via
            ``save_checkpoint``.

    Returns:
        One result dict per attempted company, in processing order.
    """
    # Filter companies to process
    companies_to_process = df[~df['stock_code'].isin(checkpoint['processed_codes'])]

    if PROCESS_SUBSET:
        companies_to_process = companies_to_process.head(SUBSET_SIZE)
        print(f"\nüìã Processing subset of {len(companies_to_process)} companies")
    else:
        print(f"\nüìã Processing {len(companies_to_process)} companies (full dataset)")

    results = []

    # Process with progress bar
    for idx, (_, company) in enumerate(tqdm(companies_to_process.iterrows(), total=len(companies_to_process))):
        stock_code = company['stock_code']

        try:
            print(f"\nüîç Processing {stock_code} - {company['company_name']}")

            # Enrich with GPT-5
            enriched_data = enrich_company_with_gpt5(company)

            # Apply resolvers
            enriched_data = apply_resolvers(enriched_data, company)

            results.append(enriched_data)

            # Update checkpoint
            checkpoint['processed_codes'].append(stock_code)
            checkpoint['processed_count'] += 1

            # Save checkpoint periodically
            if (idx + 1) % CHECKPOINT_INTERVAL == 0:
                save_checkpoint(checkpoint)
                print(f"\nüíæ Checkpoint saved: {checkpoint['processed_count']} companies")

        except Exception as e:
            print(f"‚ùå Failed to process {stock_code}: {e}")
            results.append({
                'stock_code': stock_code,
                'enrichment_status': 'failed',
                'enrichment_error': str(e)
            })

        # Rate limiting. Kept outside the try block so that a failed request
        # (often itself a rate-limit error) is also followed by a pause
        # instead of immediately hitting the API again.
        time.sleep(1)  # Be respectful to APIs

    # Final checkpoint save
    save_checkpoint(checkpoint)

    return results

print("‚úì Batch processing function defined")
print(f"  Ready to process companies")


‚úì Batch processing function defined
  Ready to process companies


In [38]:
# Cell 9: Data Validation

def validate_enriched_data(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Summarise an enrichment run.

    Produces status totals, the success rate, per-field coverage among
    completed records (a field counts when its value is truthy), and tag
    frequencies across all records.

    Returns:
        Dict with keys ``total_processed``, ``completed``, ``failed``,
        ``success_rate``, ``field_coverage``, ``unique_tags`` and
        ``most_common_tags`` (up to 20 ``(tag, count)`` pairs).
    """
    from collections import Counter

    completed_records = [rec for rec in results if rec.get('enrichment_status') == 'completed']
    n_total = len(results)
    n_completed = len(completed_records)
    n_failed = len([rec for rec in results if rec.get('enrichment_status') == 'failed'])

    # Coverage: how many completed records carry a non-empty value per field.
    tracked_fields = (
        'tags', 'enhanced_summary', 'company_history', 'key_people',
        'financial_reports', 'competitive_advantages', 'risk_factors',
        'recent_developments', 'social_media_links',
    )
    field_coverage = {}
    for name in tracked_fields:
        populated = len([rec for rec in completed_records if rec.get(name)])
        share = (populated / n_completed * 100) if n_completed > 0 else 0
        field_coverage[name] = {'count': populated, 'percentage': share}

    # Tag frequencies are taken over every record, whatever its status.
    flat_tags = [tag for rec in results if rec.get('tags') for tag in rec['tags']]
    tag_freq = Counter(flat_tags)

    return {
        'total_processed': n_total,
        'completed': n_completed,
        'failed': n_failed,
        'success_rate': (n_completed / n_total * 100) if n_total > 0 else 0,
        'field_coverage': field_coverage,
        'unique_tags': len(tag_freq),
        'most_common_tags': tag_freq.most_common(20)
    }

def print_validation_report(report: Dict[str, Any]):
    """
    Write a human-readable rendering of a validation report to stdout.

    Expects the keys produced by ``validate_enriched_data``. Only the first
    10 entries of ``most_common_tags`` are shown.
    """
    rule = "=" * 60

    def _lines():
        # Lines are produced lazily so each one is printed as soon as it has
        # been formatted.
        yield "\n" + rule
        yield "üìä ENRICHMENT VALIDATION REPORT"
        yield rule

        yield f"\n‚úÖ Completed: {report['completed']}/{report['total_processed']} ({report['success_rate']:.1f}%)"
        yield f"‚ùå Failed: {report['failed']}"

        yield "\nüìà Field Coverage:"
        for field, stats in report['field_coverage'].items():
            yield f"  {field:.<30} {stats['count']:>3} ({stats['percentage']:>5.1f}%)"

        yield f"\nüè∑Ô∏è  Unique Tags: {report['unique_tags']}"
        yield "\nMost Common Tags:"
        for tag, count in report['most_common_tags'][:10]:
            yield f"  {tag:.<40} {count:>3}"

        yield "\n" + rule

    for line in _lines():
        print(line)

print("‚úì Validation functions defined")


‚úì Validation functions defined


In [39]:
# Cell 10: Database Update

def update_database(results: List[Dict[str, Any]]):
    """
    Write enriched data to the main Postgres database.

    Issues one ``UPDATE "company-metadata" ... WHERE stock_code = :stock_code``
    per result and commits after each row. This is an UPDATE, not an upsert:
    a stock code with no existing row is left untouched and is reported under
    "Not found" instead of being counted as updated.

    Results without a ``stock_code`` are skipped. A failure on one row is
    printed, rolled back and counted; processing continues with the next row.

    NOTE(review): failed enrichment records carry only status/error fields,
    so this UPDATE also overwrites that row's content columns with
    NULL/empty values. Confirm that is the intended behavior.

    Args:
        results: Result dicts as produced by ``process_companies``.
    """
    engine = create_engine(DATABASE_URL)

    updated_count = 0
    missing_count = 0
    failed_count = 0

    print(f"\nüíæ Updating database with {len(results)} records...")

    # Table name has a hyphen and therefore needs quoting.
    # Uses :param style placeholders for SQLAlchemy text().
    # Note: updated_at column doesn't exist in this table, using enrichment_date instead
    query = text("""
        UPDATE "company-metadata"
        SET
            tags = :tags,
            enhanced_summary = :enhanced_summary,
            company_history = :company_history,
            key_people = :key_people,
            financial_reports = :financial_reports,
            competitive_advantages = :competitive_advantages,
            risk_factors = :risk_factors,
            recent_developments = :recent_developments,
            social_media_links = :social_media_links,
            logo_gcs_url = :logo_gcs_url,
            enrichment_status = :enrichment_status,
            enrichment_date = :enrichment_date,
            enrichment_error = :enrichment_error
        WHERE stock_code = :stock_code
    """)

    try:
        with engine.connect() as conn:
            for result in tqdm(results):
                stock_code = result.get('stock_code')

                if not stock_code:
                    continue

                try:
                    # Lists go to the array column as-is; JSONB columns are
                    # passed as JSON strings.
                    tags_array = result.get('tags', [])
                    risk_factors = result.get('risk_factors')

                    update_data = {
                        'tags': tags_array if tags_array else None,
                        'enhanced_summary': result.get('enhanced_summary'),
                        'company_history': result.get('company_history'),
                        'key_people': json.dumps(result.get('key_people', [])),
                        'financial_reports': json.dumps(result.get('financial_reports', [])),
                        'competitive_advantages': result.get('competitive_advantages'),
                        'risk_factors': risk_factors if isinstance(risk_factors, str) else json.dumps(result.get('risk_factors', [])),
                        'recent_developments': result.get('recent_developments'),
                        'social_media_links': json.dumps(result.get('social_media_links', {})),
                        'logo_gcs_url': result.get('logo_gcs_url'),
                        'enrichment_status': result.get('enrichment_status', 'completed'),
                        'enrichment_date': result.get('enrichment_date', datetime.now().isoformat()),
                        'enrichment_error': result.get('enrichment_error'),
                        'stock_code': stock_code
                    }

                    outcome = conn.execute(query, update_data)
                    conn.commit()

                    if outcome.rowcount == 0:
                        # The WHERE clause matched nothing: no row was written.
                        missing_count += 1
                        print(f"\nNo existing row for {stock_code}; nothing updated")
                    else:
                        updated_count += 1

                except Exception as e:
                    # Without a rollback the PostgreSQL transaction stays in
                    # the aborted state and every following row fails too.
                    conn.rollback()
                    print(f"\n‚ùå Failed to update {stock_code}: {e}")
                    failed_count += 1
    finally:
        # Release pooled connections even if something above propagated.
        engine.dispose()

    print(f"\n‚úÖ Database update complete:")
    print(f"  - Updated: {updated_count}")
    print(f"  - Not found: {missing_count}")
    print(f"  - Failed: {failed_count}")

print("‚úì Database update function defined")


‚úì Database update function defined


In [40]:
# Cell 11: Export Results

def export_results(results: List[Dict[str, Any]], validation_report: Dict[str, Any]):
    """
    Export enriched data to CSV and print a sample record.

    Writes ``results`` to ``RESULTS_FILE`` as CSV, writes
    ``validation_report`` as JSON next to it (same base name with a
    ``_validation_report.json`` suffix), then prints one sample record,
    preferring a completed one when available.

    Args:
        results: Result dicts, one per processed company.
        validation_report: JSON-serialisable report dict.

    Returns:
        The results as a ``pandas.DataFrame``.
    """
    # Convert to DataFrame
    df_results = pd.DataFrame(results)

    # Save to CSV. dirname() is '' for a bare filename, and makedirs('')
    # raises, so only create the directory when there is one.
    output_dir = os.path.dirname(RESULTS_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_results.to_csv(RESULTS_FILE, index=False)
    print(f"\nüíæ Results exported to: {RESULTS_FILE}")

    # Save validation report. Derive the name from the extension-less path so
    # it can never collide with (and overwrite) the CSV itself.
    base_path, _ = os.path.splitext(RESULTS_FILE)
    report_file = base_path + '_validation_report.json'
    with open(report_file, 'w') as f:
        json.dump(validation_report, f, indent=2)
    print(f"üìä Validation report saved to: {report_file}")

    # Generate sample output
    if len(df_results) > 0:
        print("\nüìÑ Sample Enriched Record:")
        print("="*60)

        if 'enrichment_status' in df_results.columns:
            completed_rows = df_results[df_results['enrichment_status'] == 'completed']
        else:
            completed_rows = df_results.iloc[0:0]
        sample = completed_rows.iloc[0] if len(completed_rows) > 0 else df_results.iloc[0]

        print(f"Stock Code: {sample.get('stock_code')}")
        print(f"Status: {sample.get('enrichment_status')}")
        print(f"Tags: {sample.get('tags', [])}")
        print(f"\nEnhanced Summary (first 200 chars):")
        summary = sample.get('enhanced_summary', '')
        if not isinstance(summary, str):
            # Failed rows hold NaN here and GPT may return null; neither
            # supports len() or slicing.
            summary = ''
        print(summary[:200] + '...' if len(summary) > 200 else summary)
        print("\n" + "="*60)

    return df_results

print("‚úì Export functions defined")


‚úì Export functions defined


In [41]:
# Cell 12: Execute Pipeline
# Runs the stages end to end: enrich -> validate -> write to DB -> export.

_rule = "=" * 60

print("\n" + _rule)
print("üöÄ STARTING ENRICHMENT PIPELINE")
print(_rule)

# Enrichment (skips companies already recorded in the checkpoint)
results = process_companies(df_metadata, checkpoint)

# Validation statistics
validation_report = validate_enriched_data(results)
print_validation_report(validation_report)

# Persist to Postgres
update_database(results)

# CSV + JSON export
df_results = export_results(results, validation_report)

print("\n" + _rule)
print("‚úÖ PIPELINE COMPLETE")
print(_rule)
print(f"\nProcessed: {len(results)} companies")
print(f"Success Rate: {validation_report['success_rate']:.1f}%")
print("\nNext steps:")
for _step in (
    "  1. Review the validation report",
    "  2. Check sample records for quality",
    "  3. If satisfied, run with PROCESS_SUBSET=False for full dataset",
    "  4. Monitor API costs and adjust as needed",
):
    print(_step)



üöÄ STARTING ENRICHMENT PIPELINE

üìã Processing subset of 3 companies


  0%|          | 0/3 [00:00<?, ?it/s]


üîç Processing 5GN - 5G NETWORKS LIMITED.


 33%|‚ñà‚ñà‚ñà‚ñé      | 1/3 [00:11<00:23, 11.77s/it]


üîç Processing 88E - 88 ENERGY LIMITED

üíæ Checkpoint saved: 14 companies


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2/3 [00:25<00:12, 12.85s/it]


üîç Processing 8CO - 8COMMON LIMITED


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:40<00:00, 13.37s/it]



üìä ENRICHMENT VALIDATION REPORT

‚úÖ Completed: 3/3 (100.0%)
‚ùå Failed: 0

üìà Field Coverage:
  tags..........................   2 ( 66.7%)
  enhanced_summary..............   2 ( 66.7%)
  company_history...............   0 (  0.0%)
  key_people....................   0 (  0.0%)
  financial_reports.............   1 ( 33.3%)
  competitive_advantages........   0 (  0.0%)
  risk_factors..................   2 ( 66.7%)
  recent_developments...........   0 (  0.0%)
  social_media_links............   0 (  0.0%)

üè∑Ô∏è  Unique Tags: 10

Most Common Tags:
  cloud services..........................   1
  data networks...........................   1
  managed it services.....................   1
  digital transformation..................   1
  australian it services..................   1
  expense management software.............   1
  saas....................................   1
  enterprise software.....................   1
  government contracts....................   1
  performance mana

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00,  6.90it/s]


‚úÖ Database update complete:
  - Updated: 3
  - Failed: 0

üíæ Results exported to: data/enriched_metadata_results.csv
üìä Validation report saved to: data/enriched_metadata_results_validation_report.json

üìÑ Sample Enriched Record:
Stock Code: 5GN
Status: completed
Tags: ['cloud services', 'data networks', 'managed it services', 'digital transformation', 'australian it services']

Enhanced Summary (first 200 chars):
5G Networks Limited (5GN) is an Australian digital services provider specializing in cloud solutions, data networks, and managed IT services. The company is focused on delivering integrated technology...


‚úÖ PIPELINE COMPLETE

Processed: 3 companies
Success Rate: 100.0%

Next steps:
  1. Review the validation report
  2. Check sample records for quality
  3. If satisfied, run with PROCESS_SUBSET=False for full dataset
  4. Monitor API costs and adjust as needed





## 📊 Financial Reports Validation

Validate the quality and coverage of fetched financial reports.


In [43]:
# Financial Reports Statistics
# Prints how many companies have at least one financial report and how the
# per-company report counts are distributed.
print("=" * 60)
print("üìà FINANCIAL REPORTS COVERAGE ANALYSIS")
print("=" * 60)

# Number of reports per company. The field may hold a list or a JSON string.
report_counts = []
for r in results:
    reports = r.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []
    report_counts.append(len(reports) if isinstance(reports, list) else 0)

# Overall statistics, derived from the parsed counts so that an empty JSON
# string such as '[]' is not mistaken for "has reports".
total_companies = len(results)
companies_with_reports = sum(1 for n in report_counts if n > 0)
companies_without_reports = total_companies - companies_with_reports

if total_companies == 0:
    # Avoids ZeroDivisionError in the percentage calculations below.
    print("\nNo results to analyze")
else:
    print(f"\n‚úÖ Companies with reports: {companies_with_reports}/{total_companies} ({companies_with_reports/total_companies*100:.1f}%)")
    print(f"‚ùå Companies without reports: {companies_without_reports}/{total_companies} ({companies_without_reports/total_companies*100:.1f}%)")

    print(f"\nüìä Report Count Statistics:")
    print(f"   Average reports per company: {sum(report_counts)/len(report_counts):.1f}")
    print(f"   Max reports: {max(report_counts)}")
    print(f"   Min reports: {min(report_counts)}")

    # Distribution: one bar per distinct report count, scaled to 50 chars.
    print(f"\nüìâ Distribution:")
    for count in range(0, max(report_counts) + 1):
        num_companies = report_counts.count(count)
        if num_companies > 0:
            bar = "‚ñà" * int(num_companies / total_companies * 50)
            print(f"   {count} reports: {num_companies:3d} companies {bar}")

print("\n" + "=" * 60)


üìà FINANCIAL REPORTS COVERAGE ANALYSIS

‚úÖ Companies with reports: 1/3 (33.3%)
‚ùå Companies without reports: 2/3 (66.7%)

üìä Report Count Statistics:
   Average reports per company: 0.3
   Max reports: 1
   Min reports: 0

üìâ Distribution:
   0 reports:   2 companies ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   1 reports:   1 companies ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà



In [44]:
# Sample Financial Reports Inspection
# Prints the report details for the first 5 companies that have any reports.
print("=" * 60)
print("üîç SAMPLE FINANCIAL REPORTS")
print("=" * 60)


def _clip(value, limit=80):
    """Return value as text; add an ellipsis only when it was actually cut."""
    shown = 'N/A' if value is None or value == '' else str(value)
    return shown[:limit] + '...' if len(shown) > limit else shown


# Show detailed reports for first 5 companies with reports
sample_count = 0
for result in results:
    if sample_count >= 5:
        break

    reports = result.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []
    # Only dict entries can be inspected field by field.
    reports = [rep for rep in reports if isinstance(rep, dict)] if isinstance(reports, list) else []

    if reports:
        stock_code = result.get('stock_code', 'UNKNOWN')
        company_name = result.get('company_name', 'Unknown Company')

        print(f"\nüìÑ {stock_code} - {company_name}")
        print(f"   Found {len(reports)} report(s):")

        for i, report in enumerate(reports, 1):
            print(f"\n   Report #{i}:")
            print(f"      Type:  {report.get('type') or 'N/A'}")
            print(f"      Title: {_clip(report.get('title'))}")
            print(f"      Date:  {report.get('date') or 'N/A'}")
            print(f"      URL:   {_clip(report.get('url'))}")

        sample_count += 1

if sample_count == 0:
    print("\n‚ö†Ô∏è  No companies with financial reports found in results")

print("\n" + "=" * 60)


üîç SAMPLE FINANCIAL REPORTS

üìÑ 8CO - 8COMMON LIMITED
   Found 1 report(s):

   Report #1:
      Type:  annual_report
      Title: Veritas Securities 8CO Research Report...
      Date:  N/A
      URL:   https://www.8common.com/wp-content/uploads/2021/07/8CO-Veritas-Research-July-202...



In [45]:
# Report Type Analysis
# Counts reports by their 'type' field across all results and prints a bar
# per type, most frequent first.
print("=" * 60)
print("üìã REPORT TYPE BREAKDOWN")
print("=" * 60)

report_types = {}
for result in results:
    reports = result.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []
    if not isinstance(reports, list):
        reports = []

    for report in reports:
        if not isinstance(report, dict):
            continue
        # A null type would break the ':25s' format below, so normalise it.
        report_type = str(report.get('type') or 'unknown')
        report_types[report_type] = report_types.get(report_type, 0) + 1

if report_types:
    total_reports = sum(report_types.values())
    print(f"\nüìä Report Types Found:")
    for report_type, count in sorted(report_types.items(), key=lambda x: x[1], reverse=True):
        bar = "‚ñà" * int(count / total_reports * 50)
        print(f"   {report_type:25s}: {count:3d} {bar}")
else:
    print("\n‚ö†Ô∏è  No reports found to analyze")

print("\n" + "=" * 60)


üìã REPORT TYPE BREAKDOWN

üìä Report Types Found:
   annual_report            :   1 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà



In [46]:
# Companies Missing Reports
# Lists every result whose financial_reports field is empty after parsing.
print("=" * 60)
print("‚ö†Ô∏è  COMPANIES WITHOUT FINANCIAL REPORTS")
print("=" * 60)

companies_without_reports = []
for result in results:
    reports = result.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []

    if not reports:
        # 'or' rather than a .get() default: a key that is present but null
        # would otherwise pass None into the slicing/formatting below.
        companies_without_reports.append({
            'stock_code': str(result.get('stock_code') or 'UNKNOWN'),
            'company_name': str(result.get('company_name') or 'Unknown'),
            'website': result.get('website') or 'N/A'
        })

if companies_without_reports:
    print(f"\nFound {len(companies_without_reports)} companies without reports:\n")
    for i, company in enumerate(companies_without_reports, 1):
        print(f"{i:3d}. {company['stock_code']:6s} - {company['company_name'][:50]:50s}")
        print(f"     Website: {company['website']}")

    print(f"\nüí° Note: GPT-5 Deep Research may have found reports not accessible via ASX API")
    print(f"   Check the 'enhanced_summary' and 'company_history' fields for report mentions")
else:
    print("\n‚úÖ All companies have at least one financial report!")

print("\n" + "=" * 60)


‚ö†Ô∏è  COMPANIES WITHOUT FINANCIAL REPORTS

Found 2 companies without reports:

  1. 5GN    - 5G Networks Limited                               
     Website: https://www.5gnetworks.au
  2. 88E    - Unknown                                           
     Website: N/A

üí° Note: GPT-5 Deep Research may have found reports not accessible via ASX API
   Check the 'enhanced_summary' and 'company_history' fields for report mentions



In [None]:
# Report Date Range Analysis
# Parses each report's 'date' field against a few known formats and prints
# the oldest/newest date plus a per-year histogram. `datetime` comes from the
# notebook's import cell.
print("=" * 60)
print("üìÖ REPORT DATE RANGE ANALYSIS")
print("=" * 60)

# Formats are tried in order; the first one that parses wins.
DATE_FORMATS = ('%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d', '%d-%m-%Y')

report_dates = []
invalid_dates = 0

for result in results:
    reports = result.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []
    if not isinstance(reports, list):
        reports = []

    for report in reports:
        if not isinstance(report, dict):
            continue
        date_str = report.get('date', '')
        if not date_str:
            continue
        for fmt in DATE_FORMATS:
            try:
                report_dates.append(datetime.strptime(date_str, fmt))
                break
            except (ValueError, TypeError):
                # ValueError: format mismatch; TypeError: non-string date.
                continue
        else:
            invalid_dates += 1

if report_dates:
    oldest = min(report_dates)
    newest = max(report_dates)

    print(f"\nüìä Date Range:")
    print(f"   Oldest report: {oldest.strftime('%Y-%m-%d')}")
    print(f"   Newest report: {newest.strftime('%Y-%m-%d')}")
    print(f"   Total reports with valid dates: {len(report_dates)}")

    if invalid_dates > 0:
        print(f"   ‚ö†Ô∏è  Reports with invalid dates: {invalid_dates}")

    # Year distribution
    print(f"\nüìÖ Reports by Year:")
    years = {}
    for date_obj in report_dates:
        year = date_obj.year
        years[year] = years.get(year, 0) + 1

    busiest_year_count = max(years.values())
    for year in sorted(years.keys(), reverse=True):
        count = years[year]
        bar = "‚ñà" * int(count / busiest_year_count * 40)
        print(f"   {year}: {count:3d} {bar}")
else:
    print("\n‚ö†Ô∏è  No valid report dates found")
    # Still surface unparseable dates; they explain why nothing was found.
    if invalid_dates > 0:
        print(f"   ‚ö†Ô∏è  Reports with invalid dates: {invalid_dates}")

print("\n" + "=" * 60)


In [None]:
# Report URL Validation
# Classifies each report URL by apparent source, counts missing URLs and
# reports URLs that occur more than once across all results.
print("=" * 60)
print("üîó REPORT URL VALIDATION")
print("=" * 60)

url_sources = {
    'ASX API': 0,
    'Company Website': 0,
    'Other/Invalid': 0
}

missing_urls = 0
duplicate_urls = {}

for result in results:
    reports = result.get('financial_reports') or []
    if isinstance(reports, str):
        try:
            reports = json.loads(reports)
        except ValueError:  # includes json.JSONDecodeError
            reports = []
    if not isinstance(reports, list):
        reports = []

    for report in reports:
        if not isinstance(report, dict):
            continue
        url = report.get('url', '')

        # Non-string values cannot be classified and are treated as missing.
        if not isinstance(url, str) or not url or url == 'N/A':
            missing_urls += 1
            continue

        # Track URL sources. This is a substring heuristic over the whole
        # URL, not a hostname check. ('.com' also covers '.com.au'.)
        lowered = url.lower()
        if 'asx.com.au' in lowered:
            url_sources['ASX API'] += 1
        elif any(domain in lowered for domain in ['.com', '.net', '.org']):
            url_sources['Company Website'] += 1
        else:
            url_sources['Other/Invalid'] += 1

        # Check for duplicates
        duplicate_urls[url] = duplicate_urls.get(url, 0) + 1

total_classified = sum(url_sources.values())

print(f"\nüìä URL Source Breakdown:")
for source, count in sorted(url_sources.items(), key=lambda x: x[1], reverse=True):
    if count > 0:
        bar = "‚ñà" * int(count / total_classified * 40)
        print(f"   {source:20s}: {count:3d} {bar}")

if missing_urls > 0:
    print(f"\n‚ö†Ô∏è  Reports with missing URLs: {missing_urls}")

# Check for duplicates
duplicates = {url: count for url, count in duplicate_urls.items() if count > 1}
if duplicates:
    print(f"\n‚ö†Ô∏è  Duplicate URLs found: {len(duplicates)}")
    for url, count in list(duplicates.items())[:3]:
        shown = url[:70] + '...' if len(url) > 70 else url
        print(f"   {shown} (appears {count}x)")
else:
    print(f"\n‚úÖ No duplicate URLs found")

print("\n" + "=" * 60)


## 🧪 Test Smart Crawler

Test the smart crawler on a specific company to see how it finds reports.


In [None]:
# Test the smart crawler on 5GN (should find 13 years of reports!)
# Crawls the first investor link of one company and prints what was found.
# `time` comes from the notebook's import cell.
TEST_STOCK_CODE = '5GN'
test_matches = df_metadata[df_metadata['stock_code'] == TEST_STOCK_CODE]

print("=" * 80)
if test_matches.empty:
    # .iloc[0] on an empty selection would raise IndexError.
    print(f"No company with stock code {TEST_STOCK_CODE} found in df_metadata")
else:
    test_company = test_matches.iloc[0]

    print(f"üß™ TESTING SMART CRAWLER: {test_company['stock_code']} - {test_company['company_name']}")
    print("=" * 80)

    # The column may be null/NaN for companies without links.
    raw_links = test_company.get('investor_links', [])
    investor_links = list(raw_links) if isinstance(raw_links, (list, tuple)) else []
    print(f"\nüìã Investor links from PayloadCMS: {len(investor_links)}")
    for i, link in enumerate(investor_links, 1):
        print(f"   {i}. {link}")

    if investor_links:
        print(f"\nüîç Crawling first link: {investor_links[0]}")
        print("   (This may take 10-30 seconds...)\n")

        start_time = time.time()

        # Run the smart crawler
        reports = crawl_for_reports(investor_links[0], max_depth=2, max_pages=20) or []

        elapsed = time.time() - start_time

        print(f"‚úÖ Crawl completed in {elapsed:.1f}s")
        print(f"üìä Found {len(reports)} financial reports:\n")

        if reports:
            for i, report in enumerate(reports, 1):
                # Normalise to text first: a missing or null field would
                # otherwise raise on indexing or on the ':20s' / ':12s' specs.
                report_type = str(report.get('type') or 'N/A')
                report_title = str(report.get('title') or 'N/A')
                report_url = str(report.get('url') or 'N/A')
                report_date = str(report.get('date') or 'N/A')
                print(f"   {i:2d}. [{report_type:20s}] {report_title[:60]}")
                print(f"       URL: {report_url[:80]}")
                print(f"       Date: {report_date:12s} | Depth: {report.get('depth', 0)} | Source: {report.get('source', 'N/A')}")
                print()
        else:
            print("   ‚ö†Ô∏è  No reports found - the crawler may need adjustment")

        print(f"\nüí° The full enrichment will use this data plus ASX API and GPT-5 research")
    else:
        print("\n‚ö†Ô∏è  No investor links found for this company")

print("=" * 80)


In [42]:
# Cell 13: Optional - View Results
# Summarises df_results when the pipeline cell has been run in this kernel.

# Display results summary
if 'df_results' in globals():
    print("\nüìä Results Summary:")
    print(f"Total records: {len(df_results)}")
    print(f"\nStatus distribution:")
    print(df_results['enrichment_status'].value_counts())

    # Show companies with most tags
    if 'tags' in df_results.columns:
        # Work on a copy (assign) so re-running the cell never alters
        # df_results itself.
        tag_counts = df_results['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
        df_tag_ranking = df_results.assign(tag_count=tag_counts)
        print("\nüè∑Ô∏è  Companies with most tags:")
        print(df_tag_ranking[['stock_code', 'tag_count', 'tags']].sort_values('tag_count', ascending=False).head(10))

# Display full DataFrame. Jupyter renders only the last top-level expression
# of a cell, so this must sit outside the `if` block to be shown.
df_results.head(10) if 'df_results' in globals() else None



üìä Results Summary:
Total records: 3

Status distribution:
enrichment_status
completed    3
Name: count, dtype: int64

üè∑Ô∏è  Companies with most tags:
  stock_code  tag_count                                               tags
0        5GN          5  [cloud services, data networks, managed it ser...
2        8CO          5  [expense management software, saas, enterprise...
1        88E          0                                                 []
