COMPANY BROCHURE BY WEBSITE SCRAPING


In [None]:
import nest_asyncio

# NOTE(review): presumably required so the asyncio.run(...) calls made by the
# scraping helpers work inside the notebook's already-running event loop —
# confirm before removing.
nest_asyncio.apply()

In [157]:
import os
import time
import json
import ollama
import asyncio
from typing import List
from openai import OpenAI
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from playwright.async_api import async_playwright

In [None]:
# Load variables from a .env file, then publish a whitespace-free copy of the
# OpenAI key both as the module constant OPENAI_API_KEY and back into
# os.environ so libraries reading the environment see the cleaned value.
load_dotenv()

_raw_openai_key = os.getenv("OPENAI_API_KEY")
if not _raw_openai_key:
    # Missing or empty key: keep going, but tell the user what will break.
    OPENAI_API_KEY = None
    print(
        "OPENAI_API_KEY is not set. The app will start, but calls that require OpenAI will fail until it is configured."
    )
else:
    # str.split() with no argument splits on every whitespace run, so joining
    # the pieces drops spaces, tabs and newlines wherever they occur.
    sanitized_key = "".join(_raw_openai_key.split())
    whitespace_removed = sanitized_key != _raw_openai_key
    if whitespace_removed:
        print("Sanitized OPENAI_API_KEY by removing whitespace.")
    OPENAI_API_KEY = sanitized_key
    os.environ["OPENAI_API_KEY"] = sanitized_key
    print("OPENAI_API_KEY found!")

In [None]:
def time_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(timestamp_format, time.localtime())

In [None]:
class Website:
    """A web page rendered with headless Chromium and reduced to title, text and links.

    Construct with a URL, then await :meth:`scrape` to populate ``title``,
    ``text`` and ``links``. Until ``scrape`` has run, ``title`` and ``text``
    are ``None`` and ``links`` is an empty list.
    """

    url: str
    title: str
    text: str
    links: List[str]

    # Substrings identifying social-media links that are dropped from ``links``.
    _SOCIAL_DOMAINS = (
        "twitter.com",
        "instagram.com",
        "youtube.com",
        "facebook.com",
    )

    def __init__(self, url: str):
        """Remember *url*; no network access happens until :meth:`scrape`."""
        self.url = url
        self.title = None
        self.text = None
        # Initialised here so callers can read ``links`` before scraping.
        self.links = []

    async def scrape(self):
        """Load the page in headless Chromium and fill ``title``, ``text`` and ``links``.

        Any exception raised while loading is reported with ``print`` and
        leaves the instance with title ``"Failed to load"``, empty text and no
        links; it is not re-raised, so one bad page does not abort a batch.
        """
        print(time_now() + " Scraping Started..." + "(" + self.url + ")")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                # Created inside the try so the browser is closed even when
                # opening the page itself fails.
                page = await browser.new_page()
                await page.goto(self.url, wait_until="networkidle", timeout=20000)
                await page.wait_for_load_state("domcontentloaded")
                content = await page.content()
            except Exception as e:
                print(f"Failed to scrape {self.url}: {e}")
                self.title = "Failed to load"
                self.text = ""
                self.links = []
                return
            finally:
                await browser.close()

        soup = BeautifulSoup(content, "html.parser")

        # ``soup.title`` may be absent, and ``.string`` may be None when the
        # tag is empty or has nested markup.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No Title Found"

        body = soup.body
        if body is None:
            # Documents without a <body> have no visible text to extract.
            self.text = ""
        else:
            for irrelevant in body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = body.get_text(separator="\n", strip=True)

        hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
        self.links = [
            href
            for href in hrefs
            if href and not any(domain in href for domain in self._SOCIAL_DOMAINS)
        ]
        print(time_now() + " Scraping Completed")

    def get_contents(self):
        """Return the title and text as one block ready to paste into a prompt."""
        return f"Webpage title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [None]:
# System prompt for the link-selection request: describes the task and shows
# the JSON shape the model must answer with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)


def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str),
            such as a scraped ``Website``.

    Returns:
        The instruction text followed by the links, one per line.
    """
    # Adjacent literals are used instead of a backslash continuation inside the
    # string, which would embed the next source line's indentation in the prompt.
    return (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
        + "\n".join(website.links)
    )

In [None]:
# Shared client used by the link-selection and brochure requests below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment —
# confirm how construction behaves when the key is unset.
openai = OpenAI()


def get_links(url: str):
    """Scrape *url* and ask the model which of its links belong in a brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        A dict with a ``"links"`` list. When the model's reply is empty, is
        not valid JSON, or lacks a ``"links"`` list, ``{"links": []}`` is
        returned so callers can always iterate ``result["links"]``.
    """
    website = Website(url)
    asyncio.run(website.scrape())

    print(time_now() + " Links requested...")
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )
    print(time_now() + " Links received")

    content = response.choices[0].message.content
    try:
        # ``content`` can be None; substitute "" so the failure is a
        # JSONDecodeError rather than a TypeError.
        parsed = json.loads(content or "")
    except json.JSONDecodeError as exc:
        print(f"Could not parse link selection as JSON: {exc}")
        return {"links": []}

    if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
        print("Link selection did not contain a 'links' list; continuing without extra pages.")
        return {"links": []}
    return parsed

In [None]:
def get_all_details(url: str):
    """Collect the text of the landing page and of every page the model selected.

    Args:
        url: Landing-page address of the company website.

    Returns:
        One string: a ``"Landing Page:"`` section followed by one section per
        selected link, each headed by the link's ``type``.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    site = Website(url)
    asyncio.run(site.scrape())

    sections = ["Landing Page:\n", site.get_contents()]

    selection = get_links(url)
    candidates = selection.get("links") if isinstance(selection, dict) else None

    # Seeded with the landing page so it is not fetched again if the model
    # lists it among the relevant links.
    visited = {url.rstrip("/")}
    for link in candidates or []:
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue  # malformed entry: nothing to fetch
        # The model is asked for full URLs but may still echo a relative one;
        # urljoin leaves absolute URLs untouched.
        target = urljoin(url, target)
        key = target.rstrip("/")
        if key in visited:
            continue
        visited.add(key)

        page = Website(target)
        asyncio.run(page.scrape())
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())

    return "".join(sections)

In [161]:
# Alternative, neutral-tone prompt kept for reference:
# system_prompt = (
#     "You are an assistant that analyzes the contents of several relevant pages from a company website "
#     "and creates a short brochure about the company for prospective customers, investors and recruits. "
#     "Respond in markdown. "
#     "Include details of company culture, customers and careers/jobs if you have the information."
# )


# System prompt for the brochure request (humorous tone). Adjacent literals are
# used so every sentence is separated by exactly one space; a backslash
# continuation directly after "markdown." would fuse it with the next sentence.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [None]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: Name shown to the model as the company being described.
        url: Landing-page address; its content and that of the selected
            sub-pages is gathered through ``get_all_details``.
        max_chars: Upper bound on the length of the returned prompt, applied
            by slicing from the start. Pass ``None`` to disable truncation.

    Returns:
        The instruction header followed by the scraped page contents, cut to
        at most ``max_chars`` characters.
    """
    header = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    prompt = header + get_all_details(url)
    # Slicing with None is a no-op, so the "no limit" case needs no branch.
    return prompt[:max_chars]

In [163]:
def create_brochure(company_name, url):
    """Request the whole brochure in one call and render it as Markdown.

    Prints a timestamped line before the request and after the reply, then
    displays the model's answer in the notebook. Returns nothing.
    """
    print(time_now() + " Brochure requested...")
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    brochure_markdown = completion.choices[0].message.content
    print(time_now() + " Brochure Received")
    display(Markdown(brochure_markdown))


def stream_brochure(company_name, url):
    """Request the brochure as a stream and re-render it as each chunk arrives.

    The text received so far is kept unmodified; on every chunk a cleaned copy
    (code-fence markers removed) is rendered into the same display slot.
    Returns nothing.
    """
    stream = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )

    raw = ""
    display_handle = display(Markdown(""), display_id=True)
    print(time_now() + " Brochure Streaming...")
    for chunk in stream:
        if not chunk.choices:
            # A chunk without choices carries no text to append.
            continue
        raw += chunk.choices[0].delta.content or ""
        # Strip only the fence wrapper. Removing the bare word "markdown"
        # would also delete it from ordinary brochure sentences. Cleaning is
        # redone from ``raw`` each time so a fence split across chunks is
        # still recognised once complete.
        rendered = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(rendered), display_id=display_handle.display_id)

In [None]:
# Generate the brochure in a single (non-streaming) request and render it.
create_brochure("Nesl-it", "https://nesl-it.com")

In [162]:
# Generate the brochure again, this time rendering it incrementally as chunks stream in.
stream_brochure("Nesl-it", "https://nesl-it.com")

2025-09-04 01:19:17 Scraping Started...(https://nesl-it.com)
2025-09-04 01:19:33 Scraping Completed
2025-09-04 01:19:33 Scraping Started...(https://nesl-it.com)
2025-09-04 01:19:41 Scraping Completed
2025-09-04 01:19:41 Links requested...
2025-09-04 01:19:44 Links received
2025-09-04 01:19:44 Scraping Started...(https://nesl-it.com/about)
2025-09-04 01:19:49 Scraping Completed
2025-09-04 01:19:49 Scraping Started...(https://nesl-it.com/careers)
2025-09-04 01:19:53 Scraping Completed
2025-09-04 01:19:53 Scraping Started...(https://nesl-it.com/contact-us)
2025-09-04 01:19:58 Scraping Completed
2025-09-04 01:19:58 Scraping Started...(https://nesl-it.com/services)
2025-09-04 01:20:03 Scraping Completed
2025-09-04 01:20:03 Scraping Started...(https://nesl-it.com/case-studies)
2025-09-04 01:20:08 Scraping Completed


# Welcome to NESL-IT: New Era Software Logics

---

## Who Are We? 🤔

We’re NESL-IT, your friendly neighborhood tech wizards! Whether you need a few coding superheroes or a full-fledged team of tech geniuses, we’ve got your back. With our top-shelf vetting process, expect only the crème de la crème of software developers. No capes required!

---

## What Do We Do? 💻

- **Staff Augmentation**: More developers? Yes, please! We’ll have our talent join your existing team faster than you can say “debugging!”
  
- **Dedicated Teams**: We drop our expert teams right into your organization, like a surprise party but with fewer balloons and more code.

- **Software Development Services**: Need a website, an app, or an AI that can ask you how your day was? Look no further!

---

## Our Culture: Where Tech Meets Fun 🎉

At NESL-IT, we believe in more than just deadlines and deliverables; we celebrate victories (and the occasional snack) together! Here’s a taste of our vibrant culture:
  
- **No stress zone**: We know that creativity flows best in a laid-back atmosphere. Meetings are usually accompanied by coffee, and we even have a “sip and chat” policy for brainstorming sessions.

- **Learning & Growth**: Like fine wine or cheese, our developers get better with experience. We invest heavily in training because who doesn’t love knowing more than one programming language?

- **Community Driven**: We’re not just about business; we’re about building relationships that last longer than your last Tinder date!

---

## Our Clients: Who's in the NESL-IT Family? 👨‍👩‍👧‍👦

With over **120 successful projects** and relationships lasting over three years, we cater to clients of all sizes. From app developers to health tech innovators, we love creating connections that fuel progress. 

Here are some cool cats that have partnered with us:
- AppEvolve
- Trally
- Here.News
- And many more!

---

## Careers: Join the Tech Revolution! 🚀

Looking for a job that doesn’t suck? (Yes, we went there.) At NESL-IT, we promise:
  
- Engaging projects that'll challenge your brain 
- A team that feels more like family  
- Opportunities to learn and grow (and not just by eating lots of snacks)

So if you have coding superpowers and a passion for teamwork, drop us a note at [career@nesl-it.com](mailto:career@nesl-it.com). 

---

## Let’s Connect!

Got a big idea? Have questions or want to collaborate? We're just a click away! Let’s make magic happen together.

📞 Call us at: **051 6166699**  
📩 Email us: **[contact@nesl-it.com](mailto:contact@nesl-it.com)**

---

**Join us at NESL-IT** — where new-era software logics meet a touch of fun!  🥳