## MEMBUAT RINGKASAN WEBSITE MENGGUNAKAN OPENAI

In [1]:
#install selenium untuk render website 
%pip install selenium webdriver-manager

Collecting seleniumNote: you may need to restart the kernel to use updated packages.

  Downloading selenium-4.34.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading 

In [2]:
# System & Environment
import os
from dotenv import load_dotenv

# Web Scraping
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# AI environment
from IPython.display import Markdown, display
from openai import OpenAI

In [5]:
# Model and authentication: read the API key from .env and build the OpenAI client.
# override=True lets values from the .env file replace variables already set in the process.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Fail fast with a clear message instead of a confusing auth error on the first request.
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

print("✅ API key loaded successfully!")
# Pass the key that was just validated explicitly, rather than relying on the
# client to look the environment variable up a second time on its own.
openai = OpenAI(api_key=api_key)
model = "gpt-4o-mini"

✅ API key loaded successfully!


## WEB SCRAPING

In [6]:
#webscraping 

class WebsiteCrawler:
    """Render a web page in headless Chrome and keep its title and readable text.

    Scraping runs immediately on construction. Failures never propagate: on any
    exception ``title`` is set to ``"Error occurred"`` and ``text`` to
    ``"Could not scrape website content"`` so callers can test for those values.

    Attributes:
        url: The address that was requested.
        title: The page title reported by the browser, or the error sentinel.
        text: Cleaned page text (at most ``max_lines`` lines), or the error sentinel.
        render_wait: Seconds to sleep after navigation before reading the page.
        max_lines: Maximum number of cleaned text lines to keep.
    """

    def __init__(self, url, render_wait=5, max_lines=200):
        """Store the settings and scrape ``url`` straight away.

        Args:
            url: Page to load.
            render_wait: Fixed pause in seconds after navigation, giving
                client-side scripts time to render. Defaults to 5.
            max_lines: Keep only the first ``max_lines`` cleaned lines of text
                (``None`` keeps everything). Defaults to 200.
        """
        self.url = url
        self.title = ""
        self.text = ""
        self.render_wait = render_wait
        self.max_lines = max_lines
        self.scrape()

    def scrape(self):
        """Load the page, extract its text, and populate ``title`` and ``text``.

        Any exception is reported with ``print`` and converted into the error
        sentinel values instead of being raised.
        """
        try:
            self.title, page_source = self._fetch_rendered_page()
            print(f"✅ Page loaded: {self.title}")

            self.text = self._extract_text(page_source)
            print(f"📄 Extracted {len(self.text)} characters")

        except Exception as e:
            print(f"❌ Error occurred: {e}")
            self.title = "Error occurred"
            self.text = "Could not scrape website content"

    @staticmethod
    def _build_chrome_options():
        """Return headless Chrome options, pointing at a known Chrome binary if one exists."""
        chrome_options = Options()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        # Common Windows install locations; when none exists the driver is left
        # to locate Chrome by itself.
        chrome_paths = [
            r"C:\Program Files\Google\Chrome\Application\chrome.exe",
            r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
        ]
        username = os.getenv('USERNAME')
        if username:
            # Only probe the per-user install when the variable is actually set;
            # otherwise the path would be built from the literal text "None".
            chrome_paths.append(
                r"C:\Users\{}\AppData\Local\Google\Chrome\Application\chrome.exe".format(username)
            )

        chrome_binary = next((path for path in chrome_paths if os.path.exists(path)), None)
        if chrome_binary:
            chrome_options.binary_location = chrome_binary
        return chrome_options

    @staticmethod
    def _wait_for_content(driver):
        """Best-effort wait (up to 10 s each) for a <main>, then a <body>, element."""
        for tag_name in ("main", "body"):
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, tag_name))
                )
                return
            except Exception:
                # Deliberately best-effort: fall back to the next tag, and if
                # neither shows up carry on with whatever the browser has.
                continue

    def _fetch_rendered_page(self):
        """Open the URL in Chrome and return ``(title, page_source)``."""
        driver = webdriver.Chrome(options=self._build_chrome_options())
        try:
            driver.set_page_load_timeout(30)

            print(f"🔍 Loading: {self.url}")
            driver.get(self.url)

            # Fixed pause so client-side rendering can finish before we look.
            time.sleep(self.render_wait)
            self._wait_for_content(driver)

            return driver.title, driver.page_source
        finally:
            # Always shut the browser down, including when loading times out or
            # fails, so no Chrome process is left running.
            driver.quit()

    def _extract_text(self, page_source):
        """Strip non-content tags from the HTML and return the cleaned main text."""
        soup = BeautifulSoup(page_source, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "img", "input", "button", "nav", "footer", "header"]):
            element.decompose()

        # Prefer the most specific content container. ``.content`` is a CSS class
        # selector, so it needs select_one(); find() would look for a tag that is
        # literally named ".content" and never match.
        container = (
            soup.find('main')
            or soup.find('article')
            or soup.select_one('.content')
            or soup.find('body')
            or soup
        )
        raw_text = container.get_text(separator="\n", strip=True)
        return self._clean_text(raw_text, self.max_lines)

    @staticmethod
    def _clean_text(raw_text, max_lines=200):
        """Trim each line, drop lines of two characters or fewer, keep the first ``max_lines``."""
        stripped = (line.strip() for line in raw_text.split('\n'))
        lines = [line for line in stripped if len(line) > 2]
        return '\n'.join(lines[:max_lines])



## INISIASI PROMPT 

In [7]:
# System prompt (Indonesian): tells the model it is an assistant that analyses a
# website and produces a detailed summary, formatted as markdown.
system_prompt = (
    "Anda adalah asisten AI yang mampu menganalisa sebuah website "
    "dan memberikan ringkasan dengan detail dan baik. "
    "Anda juga mampu memberikan uraian ringkasan dalam bentuk markdown."
)

In [8]:
def user_prompt_for(website):
    """Build the user-role prompt asking for a markdown summary of a scraped page.

    Args:
        website: Object exposing ``title`` and ``text`` attributes; ``text``
            must be a string because it is concatenated onto the prompt.

    Returns:
        The prompt: a line naming the page title, the summarisation
        instructions, a blank line, and then the page text.
    """
    title_line = f"You are looking at a website titled {website.title}"
    instructions = "\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n\n"
    return title_line + instructions + website.text

In [9]:
def message_for(website):
    """Return the chat message list (system prompt, then user prompt) for ``website``.

    Args:
        website: Object accepted by ``user_prompt_for``.

    Returns:
        A two-item list of ``{"role", "content"}`` dictionaries: the system
        message first, followed by the user message.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

## FUNGSI SUMMARY

In [10]:
def summarize(url):
    """Scrape ``url`` and display a GPT-generated markdown summary of the page.

    Prints progress messages and renders the summary with ``display``. Nothing
    is returned, so the cell shows only the rendered summary.

    Args:
        url: Address of the page to scrape and summarise.
    """
    site = WebsiteCrawler(url)

    # WebsiteCrawler reports failure by setting the title to "Error occurred";
    # fewer than 50 characters of text is treated as nothing worth summarising.
    if "Error occurred" in site.title or len(site.text) < 50:
        print(f"❌ Failed to scrape meaningful content from {url}")
        return

    print("🤖 Creating summary...")

    # Build the messages with the shared helper rather than duplicating the
    # system/user message structure here.
    response = openai.chat.completions.create(
        model=model,
        messages=message_for(site),
    )

    web_summary = response.choices[0].message.content
    # The API can return no text content; say so instead of rendering nothing useful.
    if not web_summary:
        print(f"❌ The model returned an empty summary for {url}")
        return

    display(Markdown(web_summary))

# Demo run: scrape the Rumah Coding home page in headless Chrome and display
# the model's markdown summary below.
summarize('https://rumahcoding.co.id')

🔍 Loading: https://rumahcoding.co.id
✅ Page loaded: Profesional Training Web, Mobile App dan Data Science untuk Perusahaan - Rumah Coding
📄 Extracted 300 characters
🤖 Creating summary...


# Ringkasan Website: Profesional Training Web, Mobile App dan Data Science untuk Perusahaan - Rumah Coding

Website ini menawarkan program pelatihan profesional di bidang pengembangan web, aplikasi mobile, dan data science khusus untuk perusahaan. Fokus utamanya adalah mengembangkan keterampilan backend yang penting dalam teknologi informasi.

## Konten Utama:
- **Pengembangan Backend**: 
  - Menjelaskan proses pembuatan dan pengelolaan bagian aplikasi yang tidak terlihat oleh pengguna akhir.
  - Menyentuh aspek-aspek seperti server, database, dan komponen lain yang menyokong fungsi dan logika aplikasi.

## Berita dan Pengumuman:
Tidak ada berita atau pengumuman khusus yang disebutkan dalam konten yang disediakan. 

Website ini mengedukasi perusahaan tentang pentingnya pengembangan teknologi dan menyediakan pelatihan yang relevan untuk meningkatkan kapabilitas mereka dalam dunia digital.