# Scraping Data dengan LLM - Berbasis Streamlit

- Penjelasan mengenai project: project ini bertujuan untuk membuat aplikasi web scraping berbasis Streamlit yang memanfaatkan model LLM. Sistem ini nantinya dapat membantu kita dalam melakukan scraping data secara otomatis tanpa menghabiskan banyak waktu.
- Penjelasan model dan alasan memilih model tersebut: model yang digunakan adalah LLM (Mistral melalui Ollama), karena model ini dapat memberikan respons yang baik untuk hasil scraping data yang dilakukan nantinya.

In [None]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Prepare the Ollama-backed LLM, using the "mistral" model.
# This module-level instance is the one parse_with_ollama pipes prompts into.
model = OllamaLLM(model="mistral")

In [3]:
# Prompt template for extracting data from the scraped page text.
# Placeholders filled in at invocation time:
#   {dom_content}       - the chunk of page text to search
#   {parse_description} - the user's description of what to extract
# The instructions ask the model for CSV output only, one product per row,
# with name, price, location and seller columns.
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}.\n"
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}.\n"
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response.\n"
    "3. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text.\n"
    "5. **Numerical Data Priority:** Dates, numbers, and quantitative information are important.\n"
    "6. **Table Design:** Format table with appropriate headers with each row representing a product.\n"
    "7. **CSV Ready:** Output should be in CSV format, with each row representing a product, and the columns for name, price, location, seller."
)

In [None]:
# Fetch the HTML of the website supplied by the user
def scrape_website(website, wait_seconds=10):
    """Open *website* in Chrome and return its page source.

    A Chrome driver is installed/located through ``ChromeDriverManager`` and
    a new browser session is started for every call.

    Args:
        website: URL passed to ``driver.get``.
        wait_seconds: Number of seconds to sleep after navigation before the
            page source is read. Defaults to 10, the value that used to be
            hard-coded, so existing callers behave exactly as before.

    Returns:
        The value of ``driver.page_source`` after the wait.
    """
    print("Meluncurkan browser...")

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

    try:
        driver.get(website)
        # Fixed pause before reading the DOM - presumably to give the page
        # time to finish loading; tune via wait_seconds for faster/slower sites.
        time.sleep(wait_seconds)
        return driver.page_source
    finally:
        # Always shut the browser down, even when navigation raises.
        driver.quit()

In [5]:
# Pull the <body> element out of an HTML document
def extract_body_content(html):
    """Return the ``<body>`` element of *html* as a string.

    The document is parsed with BeautifulSoup's ``html.parser``. When the
    parsed document has no body, an empty string is returned instead.
    """
    body_tag = BeautifulSoup(html, "html.parser").body
    if body_tag:
        return str(body_tag)
    return ""

In [6]:
# Reduce the page markup to plain text
def clean_body_content(body_content):
    """Return the visible text of *body_content*, one non-empty line per row.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each remaining line is stripped of surrounding whitespace and
    lines that end up empty are discarded.
    """
    soup = BeautifulSoup(body_content, "html.parser")
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    text = soup.get_text(separator="\n")

    kept_lines = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)

In [7]:
# Split the cleaned page text into fixed-size chunks
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive pieces of at most *max_length* items.

    Args:
        dom_content: The sequence (normally the cleaned page text) to split.
        max_length: Maximum length of each chunk. Must be positive.

    Returns:
        A list of slices of *dom_content* in their original order. The last
        chunk may be shorter than *max_length*; empty input yields ``[]``.

    Raises:
        ValueError: If *max_length* is zero or negative. A negative value
            would otherwise make ``range`` empty and silently drop all
            content.
    """
    if max_length <= 0:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )
    return [
        dom_content[start:start + max_length]
        for start in range(0, len(dom_content), max_length)
    ]

In [14]:
# Keep the header plus at most max_items real data rows of the combined CSV
def _limit_csv_rows(combined_csv, max_items):
    """Return the header and up to *max_items* data rows of *combined_csv*.

    The first non-blank line is treated as the header. Blank lines and later
    lines identical to the header (ignoring surrounding whitespace) are
    skipped, so they no longer use up the *max_items* quota.
    """
    lines = [line for line in combined_csv.splitlines() if line.strip()]
    if not lines:
        return ""

    header = lines[0]
    header_key = header.strip()
    rows = [line for line in lines[1:] if line.strip() != header_key]
    return "\n".join([header, *rows[:max_items]])


# Parse with the LLM, based on the DOM chunks and the description
def parse_with_ollama(dom_chunks, parse_description, max_items=None):
    """Run every chunk through the LLM and return the combined output.

    A prompt is built from the module-level ``template`` and piped into the
    module-level ``model``. The chain is invoked once per chunk and the
    results are joined with newlines.

    Args:
        dom_chunks: Sequence of text chunks to send, one invocation each.
        parse_description: Description of the data to extract; substituted
            into the prompt's ``{parse_description}`` placeholder.
        max_items: Optional cap on the number of data rows returned. When
            ``None`` or not positive, the joined output is returned unchanged.

    Returns:
        The joined model output. When *max_items* is positive, this is the
        first non-blank line (treated as the header) followed by at most
        *max_items* data rows.
    """
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i, chunk in enumerate(dom_chunks, 1):
        result = chain.invoke({"dom_content": chunk, "parse_description": parse_description})
        parsed_results.append(result)
        print(f"✅ Batch {i}/{len(dom_chunks)} selesai")

    combined_csv = "\n".join(parsed_results)

    # Limit the number of items when max_items is given. Each batch may emit
    # its own header row and empty batches contribute blank lines; those must
    # not be counted as data rows.
    if max_items and max_items > 0:
        return _limit_csv_rows(combined_csv, max_items)

    return combined_csv

In [17]:
# Trial run

# URL to scrape
url = "https://www.olx.co.id/jakarta-selatan_g4000030/mobil-bekas_c198/q-mobil-bekas"

# Description of the data to extract
parse_description = "Extract table with Product Name, Price, Location, Seller."

# Maximum number of results to keep
max_items = 30  # ← set by the user

# Scraping and parsing pipeline: fetch the HTML, take the <body>, strip it to
# plain text, split the text into chunks, then extract CSV with the LLM.
html = scrape_website(url)
body = extract_body_content(html)
clean_text = clean_body_content(body)
chunks = split_dom_content(clean_text)
result_csv = parse_with_ollama(chunks, parse_description, max_items=max_items)

Meluncurkan browser...
✅ Batch 1/3 selesai
✅ Batch 2/3 selesai
✅ Batch 3/3 selesai


In [18]:
# Save the extraction result to a CSV file (overwrites any existing output.csv)
with open("output.csv", "w", encoding="utf-8") as f:
    f.write(result_csv)

# NOTE(review): the message reports max_items (the requested cap), not the
# number of rows actually written - confirm this is the intended wording.
print(f"📁 Hasil ekstraksi ({max_items} item) disimpan ke 'output.csv'")

📁 Hasil ekstraksi (30 item) disimpan ke 'output.csv'
