In [73]:
import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from pydantic import BaseModel, HttpUrl, Field
import pandas as pd
from urllib.parse import urljoin
import logging
import json
from dotenv import load_dotenv
from openai import OpenAI
import os


In [77]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [74]:
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [30]:
def get_page_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        logger.error(f"Error fetching {url}: {str(e)}")
        return None

In [37]:
html_content = get_page_content("https://www.nvfund.com/portfolio/amphista")

In [88]:
print(get_page_content("https://www.blackbird.vc/portfolio"))

<!DOCTYPE html><!-- Last Published: Fri Jan 24 2025 02:27:46 GMT+0000 (Coordinated Universal Time) --><html data-wf-domain="www.blackbird.vc" data-wf-page="642a5cb525146171310f77fa" data-wf-site="62d67952134ba00982cbd56b" lang="en"><head><meta charset="utf-8"/><title>Portfolio | Blackbird</title><meta content="We believe that giant leaps forward are made by a passionate few, so we invest in founders, not sectors or stages. Our 100+ companies are united only by their ambition to tackle the worldâs greatest problems, led by founders who are doing their lifeâs work." name="description"/><meta content="Portfolio | Blackbird" property="og:title"/><meta content="We believe that giant leaps forward are made by a passionate few, so we invest in founders, not sectors or stages. Our 100+ companies are united only by their ambition to tackle the worldâs greatest problems, led by founders who are doing their lifeâs work." property="og:description"/><meta content="https://cdn.prod.website-f

In [36]:
def get_page_content(url):
    """Fetch and return the page content."""
    try:
        response = requests.get(url, timeout=10)  # Added timeout
        response.raise_for_status()
        
        if not response.text:
            logger.error(f"Empty response from {url}")
            return None
            
        return response.text
        
    except requests.Timeout:
        logger.error(f"Timeout fetching {url}")
        return None
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error fetching {url}: {str(e)}")
        return None

In [43]:
def clean_html(html_content):
    """Clean HTML content by removing scripts, styles, and unnecessary elements."""
    if not html_content:
        logger.error("Received empty HTML content")
        return ""
        
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'meta']):
            element.decompose()
            
        # Get text while preserving some structure
        lines = []
        for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'div']):
            text = element.get_text(strip=True)
            if text:
                tag_name = element.name
                class_names = ' '.join(element.get('class', []))
                lines.append(f"{tag_name} {class_names}: {text}")
        
        return '\n'.join(lines)
    except Exception as e:
        logger.error(f"Error cleaning HTML: {str(e)}")
        return ""

In [45]:
print(clean_html(html_content))

div dialog-off-canvas-main-canvas: It looks like you are using an older version of Internet Explorer which is not supported. We advise that you update your browser to the latest version of Microsoft Edge, or consider using other browsers such as Chrome, Firefox or Safari.BackAmphista TherapeuticsBart Dzikowski- DirectorPlatformCambridge, UKAmphista Therapeutics is developing medicines for hard-to-treat diseases using next-generation targeted protein degradation approaches.Visit Amphista Therapeutics
div line-height-spacing: It looks like you are using an older version of Internet Explorer which is not supported. We advise that you update your browser to the latest version of Microsoft Edge, or consider using other browsers such as Chrome, Firefox or Safari.BackAmphista TherapeuticsBart Dzikowski- DirectorPlatformCambridge, UKAmphista Therapeutics is developing medicines for hard-to-treat diseases using next-generation targeted protein degradation approaches.Visit Amphista Therapeutics


In [84]:
import json
import logging
import re

# Set up logging
logger = logging.getLogger(__name__)

def extract_with_llm(text: str, source_url: str, client) -> dict:
    """Use LLM to extract company information from text."""
    try:
        prompt = f"""
        Extract company information from the following webpage content. 
        Look for:
        1. Company name (usually in headings or title)
        2. Company description (usually in paragraphs explaining what the company does)
        3. Company website URL (look for external links and return ONLY a valid URL starting with http:// or https://)
        4. Location (where the company is located, usually a country or a city)
        5. Domain (the industry or field the company operates in, such as "Finance", "Biotech", etc.)

        Source website: {source_url}

        Webpage content:
        {text}

        Return ONLY a valid JSON object with these fields:
        {{
            "name": "company name",
            "description": "main description of what the company does",
            "url": "company website URL (must start with http:// or https://)",
            "location": "where the company is located, either a country or a city",
            "domain": "the industry or field the company operates in"
        }}

        Do not include any additional text or explanations. Only return the JSON object.
        """

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a data extraction tool. Extract only the requested information and return it in JSON format."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )

        # Get the raw response content
        raw_content = response.choices[0].message.content

        # Debug: Print the raw response content
        print("Raw response content:", raw_content)

        # Remove Markdown code formatting (triple backticks and "json")
        cleaned_content = re.sub(r'```json|\```', '', raw_content).strip()

        # Debug: Print the cleaned content
        print("Cleaned content:", cleaned_content)

        # Parse the cleaned JSON content
        result = json.loads(cleaned_content)

        # Validate the URL
        if not result.get("url", "").startswith(("http://", "https://")):
            logger.warning(f"Invalid URL format: {result.get('url')}")
            result["url"] = ""  # Set URL to empty if invalid

        # Add source website to the result
        result['source'] = source_url
        return result

    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON response: {raw_content}")
        return {}

    except Exception as e:
        logger.error(f"Error in LLM extraction: {str(e)}")
        return {}

In [85]:
extract_with_llm(clean_html(html_content=html_content), "https://www.nvfund.com/portfolio/", client)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw response content: ```json
{
    "name": "Amphista Therapeutics",
    "description": "Amphista Therapeutics is developing medicines for hard-to-treat diseases using next-generation targeted protein degradation approaches.",
    "url": "https://www.amphista.com",
    "location": "Cambridge, UK",
    "domain": "Biotech"
}
```
Cleaned content: {
    "name": "Amphista Therapeutics",
    "description": "Amphista Therapeutics is developing medicines for hard-to-treat diseases using next-generation targeted protein degradation approaches.",
    "url": "https://www.amphista.com",
    "location": "Cambridge, UK",
    "domain": "Biotech"
}


{'name': 'Amphista Therapeutics',
 'description': 'Amphista Therapeutics is developing medicines for hard-to-treat diseases using next-generation targeted protein degradation approaches.',
 'url': 'https://www.amphista.com',
 'location': 'Cambridge, UK',
 'domain': 'Biotech',
 'source': 'https://www.nvfund.com/portfolio/'}