<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/Detikcom_News_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import re
import time
from datetime import datetime, timedelta
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

In [3]:
def safe_text(soup_el, selectors, default=None):
    """Return the text of the first selector match that has non-empty text.

    Args:
        soup_el: Object exposing ``select_one(css)`` (e.g. a BeautifulSoup tag).
        selectors: Iterable of CSS selectors, tried in the given order.
        default: Value returned when no selector yields non-empty text.

    Returns:
        The whitespace-stripped text of the first matching node whose text is
        not empty, otherwise ``default``.
    """
    # Lazy generator: selectors after the first successful one are never queried.
    hits = (soup_el.select_one(css) for css in selectors)
    for hit in hits:
        if not hit:
            continue
        content = hit.get_text(strip=True)
        if content:
            return content
    return default

def safe_attr(soup_el, selectors, attr, default=None):
    """Return ``attr`` from the first selector match that carries that attribute.

    Args:
        soup_el: Object exposing ``select_one(css)`` (e.g. a BeautifulSoup tag).
        selectors: Iterable of CSS selectors, tried in the given order.
        attr: Name of the attribute to read.
        default: Value returned when no matching node has the attribute.

    Returns:
        The attribute value of the first matching node that has ``attr``
        (returned as-is, even if it is an empty string), otherwise ``default``.
    """
    hits = (soup_el.select_one(css) for css in selectors)
    # Every item that passes the filter is truthy, so None can only mean "nothing found".
    found = next((hit for hit in hits if hit and hit.has_attr(attr)), None)
    if found is None:
        return default
    return found.get(attr)

def extract_image_from_style(span_style):
    """Extract the URL from a CSS ``url(...)`` expression in an inline style.

    Handles unquoted URLs, single/double quoted URLs, and quotes that are
    still HTML-escaped (``&quot;`` / ``&#34;``). Whitespace between the
    parentheses and the (optionally quoted) URL is ignored, and the ``url``
    keyword is matched case-insensitively.

    Args:
        span_style: The raw ``style`` attribute value, or a falsy value.

    Returns:
        The URL text of the first ``url(...)`` found, or ``None`` when the
        style is empty or contains no ``url(...)``.
    """
    if not span_style:
        return None
    m = re.search(
        r'url\(\s*(?:&quot;|&\#34;|\'|")?(.*?)(?:&quot;|&\#34;|\'|")?\s*\)',
        span_style,
        re.IGNORECASE,
    )
    return m.group(1) if m else None

def parse_absolute_time(date_text):
    """Parse an absolute date string with Indonesian or English month names.

    Only the text after the last comma is parsed (so a leading weekday such as
    ``"Sabtu, "`` is dropped), and the timezone tokens ``WIB``/``WITA``/``WIT``
    are removed. Both abbreviated and full Indonesian month names are
    translated to English before parsing.

    Args:
        date_text: Text such as ``"Sabtu, 20 Sep 2025 14:30 WIB"`` or
            ``"18 Agustus 2025"``; may be ``None`` or blank.

    Returns:
        A naive ``datetime`` (no timezone attached; midnight when the text has
        no time component), or ``None`` if the text is empty or unparseable.
    """
    if not date_text or not date_text.strip():
        return None
    part = date_text.split(",")[-1].strip()
    part = re.sub(r'\bWIB\b|\bWITA\b|\bWIT\b', '', part).strip()

    # Indonesian month name (lower-cased) -> English name understood by strptime.
    month_map = {
        # Abbreviations.
        "jan": "Jan", "feb": "Feb", "mar": "Mar", "apr": "Apr", "mei": "May",
        "jun": "Jun", "jul": "Jul", "agu": "Aug", "agt": "Aug", "ags": "Aug",
        "sep": "Sep", "okt": "Oct", "nov": "Nov", "des": "Dec",
        # Full names that differ from their English spelling.
        "januari": "January", "februari": "February", "maret": "March",
        "juni": "June", "juli": "July", "agustus": "August",
        "oktober": "October", "desember": "December",
    }
    # Whole-word, case-insensitive replacement in a single pass.
    month_re = re.compile(
        r'\b(' + '|'.join(sorted(month_map, key=len, reverse=True)) + r')\b',
        re.IGNORECASE,
    )
    part = month_re.sub(lambda hit: month_map[hit.group(1).lower()], part)

    # NOTE(review): %b/%B follow the process locale; this assumes English
    # month names (C / en_* locale) - confirm for the runtime in use.
    for fmt in ("%d %B %Y %H:%M", "%d %b %Y %H:%M", "%d %B %Y", "%d %b %Y"):
        try:
            # Date-only formats already yield 00:00:00, no rebuilding needed.
            return datetime.strptime(part, fmt)
        except ValueError:
            continue

    # Fallback: pull a "<day> <month> <year> <HH:MM>" fragment out of text
    # that carries extra words around the date.
    m = re.search(r'(\d{1,2}\s+\w+\s+\d{4}\s+\d{2}:\d{2})', part)
    if m:
        for fmt in ("%d %b %Y %H:%M", "%d %B %Y %H:%M"):
            try:
                return datetime.strptime(m.group(1), fmt)
            except ValueError:
                continue

    return None


def parse_relative_time(text, now=None):
    """Convert an Indonesian relative time ("5 menit yang lalu") to a datetime.

    The text must start with ``<number> <unit>``. Recognised units (matched as
    substrings of the unit word): ``menit`` (minutes), ``jam`` (hours),
    ``detik`` (seconds), ``hari`` (days) and ``minggu`` (weeks). Units without
    a fixed length, such as months or years, are not converted.

    Args:
        text: Relative time text; ``None`` or empty yields ``None``.
        now: Reference ``datetime``; defaults to ``datetime.now()``.

    Returns:
        ``now`` minus the described interval, or ``None`` when the text is
        empty, does not start with ``<number> <unit>``, or uses an unknown unit.
    """
    if not text:
        return None
    if now is None:
        now = datetime.now()

    match = re.match(r'(\d+)\s+(\w+)', text.lower().strip())
    if not match:
        return None

    value = int(match.group(1))
    unit = match.group(2)

    # (keyword, length of one unit); minutes and hours are checked first.
    unit_steps = (
        ("menit", timedelta(minutes=1)),
        ("jam", timedelta(hours=1)),
        ("detik", timedelta(seconds=1)),
        ("hari", timedelta(days=1)),
        ("minggu", timedelta(weeks=1)),
    )
    for keyword, step in unit_steps:
        if keyword in unit:
            return now - value * step

    return None

In [None]:
# Fetch the first page of search results and display the parsed HTML.
url = "https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=1"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")
soup

In [None]:
# Scrape a single page of search results into a list of dicts (`news`).
url = "https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=1"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")
news = []
articles = soup.find_all("article", class_="list-content__item")

for article in articles:
    # Article URL: try the image link, then the title link, then any link.
    # Kept in its own name so the page `url` above is not overwritten.
    article_url = safe_attr(article, ["a.media__link", "h3.media__title a", "a"], "href")

    # Title: look into the title anchor or the h3 itself.
    title = safe_text(article, ["h3.media__title a", "h3.media__title", ".media__title"])

    # Category / source.
    category = safe_text(article, ["h2.media__subtitle", ".media__subtitle"])

    # Lead / description: try several possible selectors.
    lead = safe_text(article, [".media__desc", ".articleLead", ".media__abstract", "p"], default="")

    # Image: try <img src>, then a background-image in an inline style.
    img_src = safe_attr(article, ["img"], "src")
    if not img_src:
        span = article.select_one(".ratiobox, .media__image span")
        if span:
            style = span.get("style", "")
            img_src = extract_image_from_style(style)
    # Image alt text, falling back to the title attribute.
    img_alt = safe_attr(article, ["img"], "alt") or safe_attr(article, ["img"], "title")

    # Raw date text from the date container, if present.
    date_tag = article.find("div", class_="media__date")
    date_str = date_tag.get_text(strip=True) if date_tag else None

    # Date parsing: text containing "lalu" is treated as relative,
    # anything else as an absolute date.
    if not date_str:
        dt = None
    elif "lalu" in date_str.lower():
        dt = parse_relative_time(date_str)
    else:
        dt = parse_absolute_time(date_str)

    date_formatted = dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None

    news.append({
        "url": article_url,
        "title": title,
        "source": category,
        "image": img_src,
        "image_alt": img_alt,
        "date_raw": date_str,
        "date": date_formatted,
        "lead": lead,
    })


news

In [14]:
# Walk the paginated search results and collect every article into `news`.
news = []
page = 1

while True:
    url = f"https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page={page}"
    print(f"Fetching page {page} ...")
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    res = requests.get(url, timeout=30)
    # An HTTP error page would contain no articles and would otherwise be
    # mistaken for the end of the results, so fail loudly instead.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    articles = soup.find_all("article", class_="list-content__item")
    print(f"Found {len(articles)} articles")

    for article in articles:
        # Article URL (kept separate from the page `url` built above).
        article_url = safe_attr(article, ["a.media__link", "h3.media__title a", "a"], "href")

        # Title
        title = safe_text(article, ["h3.media__title a", "h3.media__title", ".media__title"])

        # Category / source
        category = safe_text(article, ["h2.media__subtitle", ".media__subtitle"])

        # Lead / description
        lead = safe_text(article, [".media__desc", ".articleLead", ".media__abstract", "p"], default="")

        # Image: <img src>, else a background-image in an inline style.
        img_src = safe_attr(article, ["img"], "src")
        if not img_src:
            span = article.select_one(".ratiobox, .media__image span")
            if span:
                style = span.get("style", "")
                img_src = extract_image_from_style(style)

        img_alt = safe_attr(article, ["img"], "alt") or safe_attr(article, ["img"], "title")

        # Date: text containing "lalu" is treated as relative, otherwise absolute.
        date_tag = article.find("div", class_="media__date")
        date_str = date_tag.get_text(strip=True) if date_tag else None

        if not date_str:
            dt = None
        elif "lalu" in date_str.lower():
            dt = parse_relative_time(date_str)
        else:
            dt = parse_absolute_time(date_str)

        date_formatted = dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None

        news.append({
            "url": article_url,
            "title": title,
            "source": category,
            "image": img_src,
            "image_alt": img_alt,
            "date_raw": date_str,
            "date": date_formatted,
            "lead": lead,
        })

    # Stop when a page has fewer than 10 results. The check runs after the
    # page has been collected so the final, shorter page is not thrown away.
    if len(articles) < 10:
        break

    page += 1
    time.sleep(1)  # pause 1 second between requests to go easy on the server

print(f"Total collected: {len(news)} articles")

Fetching page 1 ...
Found 12 articles
Fetching page 2 ...
Found 11 articles
Fetching page 3 ...
Found 11 articles
Fetching page 4 ...
Found 11 articles
Fetching page 5 ...
Found 11 articles
Fetching page 6 ...
Found 11 articles
Fetching page 7 ...
Found 11 articles
Fetching page 8 ...
Found 11 articles
Fetching page 9 ...
Found 11 articles
Fetching page 10 ...
Found 11 articles
Fetching page 11 ...
Found 11 articles
Fetching page 12 ...
Found 11 articles
Fetching page 13 ...
Found 11 articles
Fetching page 14 ...
Found 11 articles
Fetching page 15 ...
Found 11 articles
Fetching page 16 ...
Found 11 articles
Fetching page 17 ...
Found 11 articles
Fetching page 18 ...
Found 11 articles
Fetching page 19 ...
Found 11 articles
Fetching page 20 ...
Found 11 articles
Fetching page 21 ...
Found 11 articles
Fetching page 22 ...
Found 11 articles
Fetching page 23 ...
Found 10 articles
Fetching page 24 ...
Found 10 articles
Fetching page 25 ...
Found 10 articles
Fetching page 26 ...
Found 10 arti

In [4]:
# Collect search results for several queries within a fixed date range.
queries = ["khalid basalamah", "korupsi kuota haji", "yaqut"]
all_news = []

for query in queries:
    print(f"\n=== Collecting for query: {query} ===")
    news = []
    page = 1

    while True:
        # quote() percent-encodes spaces as %20 and also escapes any other
        # reserved characters a query might contain.
        url = f"https://www.detik.com/search/searchall?query={quote(query)}&result_type=latest&page={page}&fromdatex=01/08/2025&todatex=20/09/2025"
        print(f"Fetching page {page} ... {url}")
        # Without a timeout, requests can wait indefinitely on an unresponsive server.
        res = requests.get(url, timeout=30)
        # An HTTP error page would contain no articles and would otherwise be
        # mistaken for the end of the results, so fail loudly instead.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        articles = soup.find_all("article", class_="list-content__item")
        print(f"Found {len(articles)} articles")

        for article in articles:
            # Article URL (kept separate from the page `url` built above).
            article_url = safe_attr(article, ["a.media__link", "h3.media__title a", "a"], "href")
            title = safe_text(article, ["h3.media__title a", "h3.media__title", ".media__title"])
            category = safe_text(article, ["h2.media__subtitle", ".media__subtitle"])
            lead = safe_text(article, [".media__desc", ".articleLead", ".media__abstract", "p"], default="")

            # Image: <img src>, else a background-image in an inline style.
            img_src = safe_attr(article, ["img"], "src")
            if not img_src:
                span = article.select_one(".ratiobox, .media__image span")
                if span:
                    style = span.get("style", "")
                    img_src = extract_image_from_style(style)
            img_alt = safe_attr(article, ["img"], "alt") or safe_attr(article, ["img"], "title")

            # Date: text containing "lalu" is treated as relative, otherwise absolute.
            date_tag = article.find("div", class_="media__date")
            date_str = date_tag.get_text(strip=True) if date_tag else None

            if not date_str:
                dt = None
            elif "lalu" in date_str.lower():
                dt = parse_relative_time(date_str)
            else:
                dt = parse_absolute_time(date_str)

            date_formatted = dt.strftime("%Y-%m-%d %H:%M:%S") if dt else None

            news.append({
                "query": query,
                "url": article_url,
                "title": title,
                "source": category,
                "image": img_src,
                "image_alt": img_alt,
                "date_raw": date_str,
                "date": date_formatted,
                "lead": lead,
            })

        # Stop when a page has fewer than 10 results. The check runs after the
        # page has been collected so the final, shorter page is not thrown away.
        if len(articles) < 10:
            break

        page += 1
        time.sleep(1)  # to be safe, wait 1 second between requests

    print(f"Collected {len(news)} articles for query '{query}'")
    all_news.extend(news)

print(f"\n=== Total collected: {len(all_news)} articles ===")


=== Collecting for query: khalid basalamah ===
Fetching page 1 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=1&fromdatex=01/08/2025&todatex=20/09/2025
Found 11 articles
Fetching page 2 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=2&fromdatex=01/08/2025&todatex=20/09/2025
Found 10 articles
Fetching page 3 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=3&fromdatex=01/08/2025&todatex=20/09/2025
Found 10 articles
Fetching page 4 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=4&fromdatex=01/08/2025&todatex=20/09/2025
Found 10 articles
Fetching page 5 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=5&fromdatex=01/08/2025&todatex=20/09/2025
Found 10 articles
Fetching page 6 ... https://www.detik.com/search/searchall?query=khalid%20basalamah&result_type=latest&page=6&fr