In [None]:
import os
import sys
import json
from dotenv import load_dotenv
from openai import OpenAI

# Resolve the project root so that project-local packages become importable:
# when the kernel was started inside a "notebooks" folder the root is its
# parent, otherwise the working directory itself is used.
current_dir = os.getcwd()
project_dir = os.path.abspath(
    os.path.dirname(current_dir)
    if os.path.basename(current_dir) == 'notebooks'
    else current_dir
)

# Put the root at the front of the import path, but only once.
if project_dir not in sys.path:
    sys.path.insert(0, project_dir)

from utils.scraper import fetch_website_contents, fetch_website_links

In [None]:
# Read the environment file (letting it override already-set variables) and
# pick up the Gemini key from the environment.
load_dotenv(override=True)
api_key = os.getenv("GEMINI_API_KEY")

# Report the state of the key; nothing is raised here, the cell only prints.
if api_key and api_key == api_key.strip():
    print("API key found!")
elif api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end")
else:
    print("No API key was found")

In [None]:
# Model name passed to every chat-completion call in this notebook.
MODEL = "gemini-2.5-flash"

# OpenAI client pointed at Google's OpenAI-compatible endpoint.
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=api_key,
)

In [None]:
# Try the scraper helper on a sample site; the bare `links` expression on the
# last line displays the result as the cell output.
links = fetch_website_links("https://www.google.com")
links

In [None]:
# System prompt for the link-selection call (used by select_relevant_links).
# It lists the link categories to keep or drop and the JSON shape the model
# must reply with; fetch_page_and_all_relevant_links later reads the "links"
# list and each entry's "url" and "specific_type".
link_system_prompt = """
You are a company marketing intelligence extractor.

Your job is to analyze a list of website URLs and return every link that is useful for:
- Branding / Brochure design / Company presentation
- Product & solution overview
- Business positioning, strategy, clients & value
- Careers & hiring
- Investor & corporate-level communication
- Press, PR, news, media, events
- Sustainability/ESG/initiatives/vision
- Customer success stories or case studies
- Contact & business onboarding pathways

You must extract **every possibly relevant link**, not just the obvious ones.

Answer STRICTLY in JSON:

{
    "links":[
        {
            "url":"full url here",
            "type":"category label",
            "specific_type":"specific type of link",
            "importance_score": 1-100,
            "why_useful": "short marketing benefit reason"
        }
    ]
}

RELEVANT CATEGORIES YOU SHOULD CONSIDER:
• About / Company / Mission / Vision
• Products / Services / Solutions
• Pricing & plans (if available)
• Enterprise packages or cloud offerings
• Advertising & business solutions
• Developers platform / API ecosystem
• Careers
• Press / Blog / Media / News
• Investors / Corporate governance
• Contact / Support channels
• Sustainability & ESG initiatives
• Partnerships / Affiliations / Research programs
• Case studies / Testimonials / Portfolio

DO NOT include:
• Login/account/dashboard
• TOS / Privacy / Cookies / Legal
• Mailto links
• Useless navigation endpoints

Return everything valuable for marketing, not minimal results.
"""


In [None]:
def get_links_user_prompt(url):
    """Build the user prompt asking the model to pick marketing-relevant links.

    The links found on ``url`` are obtained from ``fetch_website_links`` and
    embedded in the prompt one per line.

    Args:
        url: Address of the site whose links should be classified.

    Returns:
        str: The complete user prompt.
    """
    # Treat a missing result as "no links" so the join below cannot fail on None.
    links = fetch_website_links(url) or []

    # Join outside the f-string: a backslash inside an f-string replacement
    # field is a SyntaxError on Python versions before 3.12.
    links_block = "\n".join(links)

    user_prompt = f"""
Extract **all marketing-valuable links** from the website:

URL: {url}

You are given raw links below — your job is to identify every link that can be useful in marketing,
branding, brochures, business presentations, product communication, investor material, hiring,
or public-facing messaging.

Raw links discovered from site:

{links_block}

Return output **strictly in JSON** only — NO commentary, NO explanation, NO Markdown.
"""

    return user_prompt


In [None]:
# Print the generated prompt for a sample site to inspect it by eye.
print(get_links_user_prompt("https://www.google.com"))

In [None]:
def select_relevant_links(url):
    """Ask the model which links on ``url`` are worth using for a brochure.

    Sends ``link_system_prompt`` plus the prompt built by
    ``get_links_user_prompt`` to the chat-completions endpoint, requesting a
    JSON object, and parses the reply.

    Args:
        url: Address of the site whose links should be classified.

    Returns:
        dict: The parsed reply. It always carries a ``"links"`` key holding a
        list; the list is empty when the model returned no content or did not
        supply a usable list.

    Raises:
        json.JSONDecodeError: If the reply has content that is not valid JSON.
    """
    response = gemini.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)},
        ],
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    # The content can be None or blank; json.loads would fail on either.
    if not content or not content.strip():
        return {"links": []}

    parsed = json.loads(content)

    # Normalize the shape so callers can always index result["links"].
    if isinstance(parsed, list):
        return {"links": parsed}
    if not isinstance(parsed, dict):
        return {"links": []}
    if not isinstance(parsed.get("links"), list):
        parsed["links"] = []
    return parsed

In [None]:
# Run the link selector on a sample site (this calls the chat-completions
# API); the returned value is displayed as the cell output.
select_relevant_links("https://www.google.com")

In [None]:
# Same check against a second site.
select_relevant_links("https://abdoasem.com")

In [None]:
def fetch_page_and_all_relevant_links(url):
    """Collect the landing-page text and the text of every selected link.

    Args:
        url: Address of the landing page.

    Returns:
        str: Markdown-style text starting with a "Landing page contents"
        section, followed by a "Relevant links" section holding one bullet
        per selected link and the text fetched for that link.
    """
    contents = fetch_website_contents(url)

    relevant_links = select_relevant_links(url)

    parts = [f"## Landing page contents\n\n{contents}\n\n## Relevant links\n"]

    # The link list comes from model-generated JSON, so keys may be missing.
    for link in relevant_links.get("links") or []:
        link_url = link.get("url") if isinstance(link, dict) else None
        if not link_url:
            # Nothing to fetch without a URL.
            continue

        label = link.get("specific_type") or link.get("type") or "link"

        try:
            page_text = fetch_website_contents(link_url)
        except Exception as exc:
            # Best effort: one unreachable page should not discard the rest;
            # the failure is recorded in the output instead of being hidden.
            page_text = f"(content unavailable: {exc})"

        parts.append(f"* [{label}]({link_url})\n{page_text}\n\n")

    return "".join(parts)


# Demo call kept in the same cell as the definition; as the cell's last
# expression, the returned text is displayed as the cell output.
fetch_page_and_all_relevant_links("https://abdoasem.com")


In [None]:
# Print the aggregated text for another site so newlines render as line breaks.
print(fetch_page_and_all_relevant_links("https://www.google.com"))

In [None]:
# System prompt for brochure generation; create_brochure sends it as the
# "system" message. It names the sections the brochure should contain and
# asks for Markdown output without code blocks.
brochure_system_prompt = """
You are the ultimate assistant for analyzing company websites and creating high-impact brochures.
Your goal is to summarize the company's identity in a concise, professional, and persuasive manner
for prospective customers, investors, and recruits. Respond in Markdown without code blocks.

Make sure to include:

1. **Company Overview:** Who they are, what they do, and their market presence.
2. **Products & Services:** Core offerings and unique selling points.
3. **Company Culture:** Values, mission, work environment, and team dynamics.
4. **Customers & Partners:** Key clients, target audience, and strategic partners.
5. **Careers & Opportunities:** Available roles, career growth, and employee benefits.
6. **Contact & Online Presence:** Website, social media, and other relevant links.

Write it in a way that is engaging, professional, and ready to be distributed to stakeholders.
Use headings, bullet points, and short paragraphs to improve readability.
Always prioritize clarity, impact, and persuasiveness.

Sectionize the output into sections with headings and lines breaks.
also return as a markdown
"""


In [None]:
def get_brochure_user_prompt(company_name, url, max_chars=30_000):
    """Build the user prompt for brochure generation.

    The prompt is a fixed instruction block naming the company, followed by
    the text returned by ``fetch_page_and_all_relevant_links`` for ``url``.

    Args:
        company_name: Name inserted into the instruction block.
        url: Address of the company's landing page.
        max_chars: Upper bound on the length of the returned prompt, in
            characters. The cut is applied to the whole prompt (instructions
            included). Pass ``None`` to disable truncation. Defaults to the
            previously hard-coded 30,000.

    Returns:
        str: The user prompt, truncated to ``max_chars`` characters.
    """
    base_prompt = f"""
You are creating a professional brochure for the company: {company_name}.
Use the following contents from its landing page and other relevant pages
to summarize the company for prospective customers, investors, and recruits.
Respond in Markdown without code blocks.

Focus on these sections if the information is available:
1. Company Overview: Who they are, what they do, and their market presence.
2. Products & Services: Core offerings and unique selling points.
3. Company Culture: Values, mission, work environment, and team dynamics.
4. Customers & Partners: Key clients, target audience, and strategic partners.
5. Careers & Opportunities: Available roles, career growth, and employee benefits.
6. Contact & Online Presence: Website, social media, and other relevant links.
"""

    website_contents = fetch_page_and_all_relevant_links(url)

    user_prompt = base_prompt + "\n\n" + website_contents

    # Keep the request size bounded; scraped pages can be arbitrarily long.
    if max_chars is not None:
        user_prompt = user_prompt[:max_chars]

    return user_prompt


In [None]:
# Print the assembled brochure prompt for a sample site to inspect it by eye.
print(get_brochure_user_prompt("Abdo Asem", "https://abdoasem.com"))

In [None]:
from IPython.display import display, Markdown, update_display

def create_brochure(company_name, url, stream=False):
    """Generate a Markdown brochure for a company from its website.

    Args:
        company_name: Name of the company, inserted into the user prompt.
        url: Address of the company's landing page.
        stream: When true, the reply is requested as a stream and rendered
            incrementally in the notebook through a display handle; when
            false, the full reply is fetched in one call and nothing is
            displayed.

    Returns:
        str: The brochure text (an empty string if the model returned no
        content).
    """
    # Both modes send exactly the same conversation.
    messages = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]

    if not stream:
        response = gemini.chat.completions.create(model=MODEL, messages=messages)
        return response.choices[0].message.content or ""

    response_text = ""
    # Create an empty output area that is re-rendered as text arrives.
    display_handle = display(Markdown(""), display_id=True)

    stream_obj = gemini.chat.completions.create(
        model=MODEL,
        messages=messages,
        stream=True,
    )

    for chunk in stream_obj:
        # A chunk can arrive with an empty choices list; indexing it would
        # raise IndexError.
        if not chunk.choices:
            continue
        delta_content = chunk.choices[0].delta.content or ""
        if not delta_content:
            # Nothing new to show, so skip the re-render.
            continue
        response_text += delta_content
        update_display(Markdown(response_text), display_id=display_handle.display_id)

    return response_text


In [None]:
# Streaming run: the brochure renders incrementally while it is generated;
# the bare `brochure` expression then shows the final string as the cell output.
brochure = create_brochure("Abdo Asem", "https://abdoasem.com", stream=True)
brochure

In [None]:
# Non-streaming run: fetch the whole reply in one call, then print the raw
# Markdown text.
brochure = create_brochure("Abdo Asem", "https://abdoasem.com")
print(brochure)

In [None]:
# Same non-streaming flow for a second company.
brochure = create_brochure("Hugging Face", "https://huggingface.co")
print(brochure)