In [None]:
from dotenv import load_dotenv
from openai import AsyncOpenAI
from agents import Agent, Runner, trace, function_tool, OpenAIChatCompletionsModel, input_guardrail, GuardrailFunctionOutput, WebSearchTool
from openai.types.responses import ResponseTextDeltaEvent
from typing import Dict, List, Optional
from pydantic import BaseModel
import sendgrid
import os
from sendgrid.helpers.mail import Mail, Email, To, Content
import asyncio
from scraper import fetch_website_contents, fetch_website_links
from IPython.core.display import Markdown

In [None]:
# Load variables from a local .env file into the process environment.
# override=True asks python-dotenv to replace variables that are already set.
load_dotenv(override=True)

In [None]:
# Configure an OpenAI-compatible async client that targets Google's Gemini endpoint.
google_api_key = os.getenv('GOOGLE_API_KEY')
if not google_api_key:
    # Fail fast: when api_key is None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which must not be sent to Gemini.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your environment or .env file"
    )
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
# Chat-completions model wrapper shared by every agent defined in this notebook.
gemini_model = OpenAIChatCompletionsModel(model="gemini-2.5-flash", openai_client=gemini_client)

In [40]:
# Landing page of the company website that is crawled and summarized below.
CBTW_URL = "https://cbtw.tech/"

In [41]:
# PageSummarizer agent: turns the content of one page into a short markdown summary.
page_summarizer_instruction = """
You are a customer service content analyzer. Your task is to create concise summaries of content to support customer service teams.

INPUT: Website content (HTML or text) from a company website

OUTPUT: A brief, structured markdown summary (max 500 words)
"""

page_summarizer_agent = Agent(
    name="PageSummarizer Agent",
    model=gemini_model,
    instructions=page_summarizer_instruction,
)

In [48]:
async def summarize(url):
    """Fetch a page and return the PageSummarizer agent's summary of it.

    Args:
        url: Address of the page to download and summarize.

    Returns:
        The ``final_output`` of the agent run on the downloaded page content.
    """
    print(f"summarize page {url}")
    # fetch_website_contents is a plain synchronous call. Running it in a worker
    # thread keeps the event loop free, so summaries started together (for
    # example through asyncio.gather) overlap instead of fetching one by one.
    # NOTE(review): assumes the scraper helper is safe to call from threads - confirm.
    web_content = await asyncio.to_thread(fetch_website_contents, url)
    print(f"PageSummarizer Agent is running {url}")
    with trace("PageSummarizer Agent"):
        result = await Runner.run(page_summarizer_agent, web_content)
        return result.final_output

# web_content = await summarize(CBTW_URL)
# print(web_content)

In [43]:
# Get relevant links Agent
# Structured output model for the link filtering agent (used as its output_type).
class RelevantLinks(BaseModel):
    # URLs the agent returns after filtering.
    links: List[str]

# System prompt for the link filtering agent: keep same-site links, drop external
# and non-page links, and de-duplicate the remainder.
get_relevant_links_system_prompt = """
You are a link filtering agent. Your task is to identify and return only the links that are relevant to the base website.

INPUT:
- Base URL: The main website URL 
- List of URLs: All links found on the page

TASK:
Filter and return ONLY links that belong to the same domain/website as the base URL.

RULES:
1. KEEP links that:
   - Share the same domain as base URL (e.g., if base is example.com, keep www.example.com, blog.example.com, example.com/page)
   - Are internal pages, subdomains, or paths of the base domain

2. REMOVE links that:
   - Point to external domains (social media, partners, ads, third-party sites)
   - Are mailto:, tel:, javascript:, or # anchors
   - Point to external CDNs, analytics, or tracking domains

3. NORMALIZATION (to remove duplicates):
- Treat URLs with and without trailing slashes as identical
  Example: https://cbtw.tech/careers and https://cbtw.tech/careers/ are duplicates
- Remove trailing slash before comparing
- Convert all to lowercase for comparison
- Remove fragment identifiers (#section)
- Keep the first occurrence when duplicates are found

4. OUTPUT:
   - Return a clean list of full, absolute URLs
   - No duplicates
   - Preserve the original URL format (don't modify paths or parameters)

EXAMPLE:
Base URL: https://example.com
Links: ['https://example.com/about', 'https://example.com/about/', 'https://facebook.com', '/contact', 'https://blog.example.com/post']
Output: 
- https://example.com/about
- https://example.com/contact
- https://blog.example.com/post

Return only the filtered list of relevant URLs.
"""

def generate_relevant_filter_user_prompt(base_url):
    """Build the user prompt for the link filtering agent.

    Collects the links of ``base_url`` with ``fetch_website_links`` and wraps
    the base URL and those links in ``<BaseUrl>`` / ``<Links>`` tags.

    Args:
        base_url: Address of the page whose links should be filtered.

    Returns:
        The prompt text handed to the link filtering agent.
    """
    page_links = fetch_website_links(base_url)
    return f"""
   <BaseUrl>{base_url}</BaseUrl>
   <Links>{page_links}</Links>
   """

# Link filtering agent: uses the system prompt above and RelevantLinks as its output type.
get_relevant_links_agent = Agent(
    name="Link filtering agent",
    model=gemini_model,
    output_type=RelevantLinks,
    instructions=get_relevant_links_system_prompt,
)

In [44]:
async def get_relevant_links(url):
    """Run the link filtering agent for ``url``.

    Args:
        url: Address of the page whose links should be filtered.

    Returns:
        The ``final_output`` of the link filtering agent run.
    """
    print(f"get relevant links {url}")
    prompt = generate_relevant_filter_user_prompt(url)
    print(f"Link filtering agent is running {url}")
    with trace("Link filtering agent"):
        run_result = await Runner.run(get_relevant_links_agent, prompt)
        filtered = run_result.final_output
    return filtered


# links = await get_relevant_links(CBTW_URL)
# print(links)

In [45]:
def write_markdown(content: str, filename: str = "output.md"):
    """
    Write markdown content to a file inside the local ``profiles`` directory.
    Overrides file if it exists, creates new file if it doesn't.
    The target directory is created when it is missing.

    Args:
        content: Markdown text to write
        filename: Output filename (default: output.md)
    """
    target_path = f"profiles/{filename}"
    # open() does not create missing parent directories, so on a fresh checkout
    # the write would fail with FileNotFoundError without this step.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    with open(target_path, 'w', encoding='utf-8') as f:
        f.write(content)

In [49]:
# Function to process URLs in batches (10 per batch by default)
async def run_summarize_in_batches(urls, batch_size=10, delay=60):
    """Summarize ``urls`` batch by batch, pausing between batches.

    Args:
        urls: Sequence of page addresses to summarize.
        batch_size: Number of URLs summarized together in one batch.
        delay: Seconds to sleep between two consecutive batches.

    Returns:
        List of summaries in the same order as ``urls``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no results,
    # and a zero step fails with an unrelated range() error; reject both here.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []

    for i in range(0, len(urls), batch_size):
        batch = urls[i:i+batch_size]
        print(f"\n🚀 Running batch {i//batch_size + 1} ({len(batch)} URLs)...")

        # Run this batch concurrently; gather returns results in input order
        batch_results = await asyncio.gather(*(summarize(url) for url in batch))
        results.extend(batch_results)

        # If there are more batches left, sleep
        if i + batch_size < len(urls):
            print(f"⏳ Waiting {delay}s before next batch...\n")
            await asyncio.sleep(delay)

    print("\n✅ All batches completed.")
    return results

In [50]:
async def fetch_page_and_summurize(url, company_name):
    """Summarize a landing page plus its relevant links and save the result.

    The landing page is summarized first, then every link returned by
    ``get_relevant_links`` is summarized in batches of 8 with a 60 second pause
    between batches. The combined markdown is written through
    ``write_markdown`` as ``<company_name>.md``.

    Args:
        url: Address of the landing page.
        company_name: Name used for the output markdown file.

    Returns:
        The combined markdown text.
    """
    print(f"start fetch {url}")
    landing_summary = await summarize(url)
    link_result = await get_relevant_links(url)
    page_urls = link_result.links

    parts = [f"## Landing page:\n\n{landing_summary}\n## Relevant Information:"]

    page_summaries = await run_summarize_in_batches(page_urls, batch_size=8, delay=60)
    for page_url, page_summary in zip(page_urls, page_summaries):
        parts.append(f"\n\n### Link: {page_url}\n")
        parts.append(page_summary)

    profile = "".join(parts)
    write_markdown(profile, f"{company_name}.md")
    return profile

print(await fetch_page_and_summurize(CBTW_URL, "CBTW"))

start fetch https://cbtw.tech/
summarize page https://cbtw.tech/
PageSummarizer Agent is running https://cbtw.tech/
get relevant links https://cbtw.tech/
Link filtering agent is running https://cbtw.tech/

🚀 Running batch 1 (8 URLs)...
summarize page https://cbtw.tech/partners/
PageSummarizer Agent is running https://cbtw.tech/partners/
summarize page https://cbtw.tech/about/
PageSummarizer Agent is running https://cbtw.tech/about/
summarize page https://cbtw.tech/contact/
PageSummarizer Agent is running https://cbtw.tech/contact/
summarize page https://cbtw.tech/service-overview/ai-data-platforms
PageSummarizer Agent is running https://cbtw.tech/service-overview/ai-data-platforms
summarize page https://cbtw.tech/service-overview/cloud-and-enterprise-platforms
PageSummarizer Agent is running https://cbtw.tech/service-overview/cloud-and-enterprise-platforms
summarize page https://cbtw.tech/service-overview/cybersecurity
PageSummarizer Agent is running https://cbtw.tech/service-overview/

In [None]:
async def fetch_page_and_summurize(url, company_name=None):
    """Summarize a landing page and each of its relevant links, one at a time.

    This definition reuses the name of the earlier batched, two-argument
    function in this notebook and replaces it once this cell runs. The optional
    ``company_name`` keeps two-argument calls working after that point.

    Args:
        url: Address of the landing page.
        company_name: When given, the combined markdown is also written through
            ``write_markdown`` as ``<company_name>.md``. Nothing is written
            when it is omitted.

    Returns:
        The combined markdown text.
    """
    print(f"start fetch {url}")
    contents = await summarize(url)
    relevant_links = await get_relevant_links(url)
    # Collect the pieces and join once instead of growing a string with +=.
    sections = [f"## Landing page:\n\n{contents}\n## Relevant Information:\n"]

    for relevant_link in relevant_links.links:
        sections.append(f"\n\n### Link: {relevant_link}\n")
        sections.append(await summarize(relevant_link))

    summarized_contents = "".join(sections)
    if company_name is not None:
        write_markdown(summarized_contents, f"{company_name}.md")
    return summarized_contents

print(await fetch_page_and_summurize(CBTW_URL))