In [14]:
import json
import logging
import os
import re
from typing import List, Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, HttpUrl, Field


In [15]:
# Configure the root logger at INFO level and create a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [16]:
# Populate the process environment via python-dotenv (presumably from a local
# .env file — confirm), then build the OpenAI client from the OPENAI_API_KEY
# environment variable so the key is not hard-coded in the notebook.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [21]:
import requests
from bs4 import BeautifulSoup
import logging
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_page_content(url, timeout=10):
    """
    Fetch a web page and parse it, retrying a fixed set of HTTP status codes.

    Every failure (HTTP error status, connection problem, timeout, or any other
    exception) is logged and swallowed, so callers must check for ``None``.

    Args:
        url (str): URL to fetch
        timeout (int): Request timeout in seconds

    Returns:
        BeautifulSoup object parsed with 'html.parser', or None if the page
        could not be fetched or parsed.
    """
    # Configure retry strategy
    retry_strategy = Retry(
        total=3,  # Total number of retries
        backoff_factor=0.1,  # Exponential backoff
        status_forcelist=[500, 502, 503, 504, 403, 429],  # Retry on these status codes
        allowed_methods=["GET"]  # Replace method_whitelist with allowed_methods
    )

    # Custom headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',  # Do Not Track Request Header
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    # Error bodies can be entire HTML pages; only echo a preview into the log.
    max_logged_chars = 500

    try:
        # The context manager closes the session (and its pooled connections)
        # on every exit path; previously the session was never closed.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("http://", adapter)
            session.mount("https://", adapter)

            # Send GET request with custom headers and timeout
            response = session.get(
                url,
                headers=headers,
                timeout=timeout,
                verify=True  # SSL certificate verification
            )

            # Raise an exception for bad status codes
            response.raise_for_status()

            # Parse and return BeautifulSoup object
            return BeautifulSoup(response.text, 'html.parser')

    except requests.exceptions.HTTPError as http_err:
        logging.error(f"HTTP error occurred: {http_err}")
        # Use the response attached to the exception rather than a local that
        # is only bound if the request itself succeeded.
        failed_response = http_err.response
        if failed_response is not None:
            logging.error(f"Status code: {failed_response.status_code}")
            logging.error(f"Response content: {failed_response.text[:max_logged_chars]}")
    except requests.exceptions.ConnectionError as conn_err:
        logging.error(f"Error connecting: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        logging.error(f"Timeout error: {timeout_err}")
    except requests.exceptions.RequestException as req_err:
        # Catch-all for remaining requests errors not handled above.
        logging.error(f"Unexpected error: {req_err}")
    except Exception as e:
        logging.error(f"Unexpected error fetching {url}: {str(e)}")

    return None

In [39]:
# get_page_content returns None on failure, and str(None) would produce the
# literal text "None", which the analysis below would silently treat as HTML.
# Stop here with a clear error instead.
portfolio_soup = get_page_content("https://tinyseed.com/portfolio")
if portfolio_soup is None:
    raise RuntimeError(
        "Failed to fetch https://tinyseed.com/portfolio; see the logged error for details."
    )
html = str(portfolio_soup)

In [31]:
# Inspect the type of `html` (the page was converted with str() when it was fetched).
type(html)

str

In [37]:
def suggest_scraping_method(html_content: str) -> dict:
    """
    Analyze the HTML content and suggest the most appropriate scraping method.

    Three candidate methods are scored by counting the elements matched by a
    fixed set of selectors/regexes. An element matched by several indicators
    of the same method is counted once per indicator. The method with the
    highest score wins; on a tie the earliest of 'individual_urls',
    'inline_list', 'text_extraction' is chosen (so an all-zero page reports
    'individual_urls' with confidence 0.0).

    Args:
        html_content (BeautifulSoup or str): HTML content of the portfolio
            page. A string is parsed with 'html.parser'; anything else is used
            as an already-parsed soup.

    Returns:
        dict: ``{'method': str, 'confidence': float, 'details': {...}}`` where
        confidence is ``score / 10`` capped at 1.0 and details holds the
        per-method match counts.

    Raises:
        ValueError: if html_content is None (e.g. a failed page fetch).
    """
    if html_content is None:
        raise ValueError("html_content is None; was the page fetched successfully?")

    # Convert to BeautifulSoup if it's a string
    if isinstance(html_content, str):
        soup = BeautifulSoup(html_content, 'html.parser')
    else:
        soup = html_content

    # Matches collected per method, in priority order (order decides ties).
    indicators = {
        'individual_urls': [
            soup.find_all('a', href=re.compile(r'/company/|/portfolio/|/startup/')),
            soup.select('a[href*="company"]'),
            soup.find_all(class_=re.compile(r'company-link|portfolio-item')),
        ],
        'inline_list': [
            soup.find_all(class_=re.compile(r'company-list|portfolio-companies|company-grid')),
            soup.find_all('table', class_=re.compile(r'companies')),
            soup.find_all('div', class_=re.compile(r'company-info|portfolio-item')),
        ],
        'text_extraction': [
            # `string=` replaces the deprecated `text=` keyword of find_all.
            soup.find_all(['p', 'div'], string=re.compile(r'\b[A-Z][a-z]+ (Inc\.|LLC|Technologies|Company)\b')),
            soup.find_all(class_=re.compile(r'description|company-text')),
        ],
    }

    # One score per method: total number of matches across its indicators.
    scores = {
        method: sum(len(matches) for matches in groups)
        for method, groups in indicators.items()
    }

    # max() returns the first maximal key, preserving the tie-break order above.
    best_method = max(scores, key=scores.get)

    return {
        'method': best_method,
        'confidence': min(scores[best_method] / 10, 1.0),
        'details': {f'{method}_count': score for method, score in scores.items()},
    }

# Example usage
def analyze_portfolio_page(html_content):
    """
    Run suggest_scraping_method on the page and print a readable report.

    Args:
        html_content: Passed through unchanged to suggest_scraping_method.

    Returns:
        dict: The suggestion returned by suggest_scraping_method.
    """
    outcome = suggest_scraping_method(html_content)

    # Headline: chosen method and its confidence as a percentage.
    print("Suggested Scraping Method:")
    chosen_method = outcome['method']
    print(f"Method: {chosen_method}")
    confidence_value = outcome['confidence']
    print(f"Confidence: {confidence_value:.2%}")

    # Per-method counts, with keys turned into title-cased labels.
    print("Detailed Indicators:")
    for detail_key in outcome['details']:
        label = detail_key.replace('_', ' ').title()
        count = outcome['details'][detail_key]
        print(f"  {label}: {count}")

    return outcome

In [40]:
# Score the fetched page; the returned dict is shown as the cell output.
suggest_scraping_method(html)

  soup.find_all(['p', 'div'], text=re.compile(r'\b[A-Z][a-z]+ (Inc\.|LLC|Technologies|Company)\b')),


{'method': 'individual_urls',
 'confidence': 0.2,
 'details': {'individual_urls_count': 2,
  'inline_list_count': 0,
  'text_extraction_count': 1}}

In [None]:
# NOTE(review): same call as the previous executed cell; this cell has not been run.
suggest_scraping_method(html)