<a href="https://colab.research.google.com/github/afifahnita/T1_WEB_SCRAPING/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Tempo RSS feeds, keyed by the category label stored on every collected article.
rss_feeds = {
    "Nasional": "https://rss.tempo.co/nasional",
    "Bisnis": "https://rss.tempo.co/bisnis",
    "Metro": "https://rss.tempo.co/metro",
    "Dunia": "https://rss.tempo.co/dunia",
    "Olahraga": "https://rss.tempo.co/olahraga",
    "Teknologi": "https://rss.tempo.co/tekno",
    "Seleb": "https://rss.tempo.co/seleb",
    "Otomotif": "https://rss.tempo.co/otomotif",
}

# One dict per RSS <item>, accumulated across all feeds.
articles = []


def _tag_text(tag):
    """Return the whitespace-stripped text of a parsed tag, or None if the tag is missing."""
    return tag.get_text(strip=True) if tag else None


# Walk every category feed and collect one record per <item>.
for category, url in rss_feeds.items():
    try:
        res = requests.get(url, timeout=10)
        # Without this check an HTTP error page would be parsed as a feed
        # with zero items and the failure would go unnoticed.
        res.raise_for_status()
    except requests.RequestException as e:
        # Best-effort: a single unreachable feed must not discard the
        # articles already collected from the other categories.
        print(f"Gagal mengambil feed {category} ({url}): {e}")
        continue

    soup = BeautifulSoup(res.content, "xml")

    for item in soup.find_all("item"):
        # Any of these child elements may be absent; missing ones become None.
        title = _tag_text(item.title)
        link = _tag_text(item.link)
        pub_date = _tag_text(item.pubDate)
        description = _tag_text(item.description)

        articles.append({
            "category": category,
            "title": title,
            "link": link,
            "date": pub_date,
            "summary": description
        })

# Build a tabular view of the collected records
df = pd.DataFrame(articles)

# Export the table as CSV (UTF-8 with BOM) and as an Excel workbook
df.to_csv("tempo_articles.csv", encoding="utf-8-sig", index=False)
df.to_excel("tempo_articles.xlsx", index=False)

# Export the raw records as indented JSON, keeping non-ASCII characters unescaped
with open("tempo_articles.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(articles, indent=2, ensure_ascii=False))

# Report how many rows were collected and where they were written
total_rows = len(df)
print(f"✅ Total artikel terkumpul: {total_rows}")
print("➡️ Data berhasil disimpan ke tempo_articles.csv, tempo_articles.xlsx, tempo_articles.json")

# Preview the first 20 articles
preview = df.head(20)
print(preview)


✅ Total artikel terkumpul: 348
➡️ Data berhasil disimpan ke tempo_articles.csv, tempo_articles.xlsx, tempo_articles.json
    category                                              title  \
0   Nasional  Dua Warga Australia Didakwa Jual Senjata ke TP...   
1   Nasional  Istana Pasikan Kekosongan Dua Kursi Menteri Ta...   
2   Nasional  UMKM Binaan Pertamina Berhasil Berdayakan Peta...   
3   Nasional  Komisi X akan Cek Langsung Distribusi Papan Di...   
4   Nasional          Mengapa Revisi UU Pemilu Perlu Dilakukan?   
5   Nasional             Bagaimana Seharusnya BAIS TNI Bekerja?   
6   Nasional  Respons TNI Atas Pembentukan TGPF Demonstrasi ...   
7   Nasional  Istana Setuju Pembangunan Alun-alun Demokrasi ...   
8   Nasional  Jakarta Mengalami 1.195 Kebakaran hingga Septe...   
9   Nasional  Kementerian Pendidikan Dasar Hanya Dapat Tamba...   
10  Nasional  Anggota DPR: Publik Berhak Melihat Data Capres...   
11  Nasional  Menteri Abdul Mu'ti: Yang Benar Program Intera...   
12  Nasi

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

# Home page of CNN Indonesia; get_article_links() fetches it to discover article URLs.
BASE_URL = "https://www.cnnindonesia.com"


def get_article_links(limit=10):
    """Collect article links from the CNN Indonesia home page.

    Fetches BASE_URL, looks at every anchor nested inside an <article>
    element, and keeps the absolute links that start with BASE_URL,
    de-duplicated in page order.

    Args:
        limit: Maximum number of links to return. Zero or a negative value
            yields an empty list without making any request.

    Returns:
        A list of at most ``limit`` unique article URLs (strings).

    Raises:
        requests.RequestException: If the home page cannot be fetched or
            the server answers with an HTTP error status.
    """
    # Guard first: the in-loop check only runs after a link has been added,
    # so without this a non-positive limit would still return one link.
    if limit <= 0:
        return []

    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    links = []
    seen = set()  # constant-time duplicate check; `links` preserves page order
    for a in soup.select("article a"):
        href = a.get("href")
        # Skip anchors without an href, links outside the site, and repeats.
        if not href or not href.startswith(BASE_URL) or href in seen:
            continue
        seen.add(href)
        links.append(href)
        if len(links) >= limit:
            break
    return links


def scrape_article(url):
    """Download one article page and extract its title, body text and metadata.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``, ``date``,
        ``content`` and ``tags``. ``title``, ``author`` and ``date`` are None
        when the matching element is not found. Returns None if any
        exception occurs while fetching or parsing; the error is printed.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")

        def first_text(tag_name, css_class):
            # Text of the first matching element, or None when there is none.
            node = page.find(tag_name, class_=css_class)
            if node:
                return node.get_text(strip=True)
            return None

        headline = first_text("h1", "title")
        paragraphs = [p.get_text(strip=True) for p in page.select(".detail_text p")]
        body = " ".join(paragraphs)
        byline = first_text("div", "author")
        published = first_text("div", "date")
        tag_names = [anchor.get_text(strip=True) for anchor in page.select(".tag a")]

        record = {
            "url": url,
            "title": headline,
            "author": byline,
            "date": published,
            "content": body,
            "tags": tag_names,
        }
        return record
    except Exception as err:
        # Best-effort scrape: report the failure and let the caller skip this URL.
        print(f"Gagal scrape {url}: {err}")
        return None


if __name__ == "__main__":
    print("Mengambil link artikel dari CNN Indonesia...")
    article_links = get_article_links(limit=20)
    print(f"Ditemukan {len(article_links)} link artikel.")

    # Scrape the links one by one, keeping only pages that produced a record
    data = []
    for link in article_links:
        artikel = scrape_article(link)
        if artikel:
            data += [artikel]
        # Wait one second between requests so the site is not hit in rapid succession
        time.sleep(1)

    # Write the records as indented JSON, keeping non-ASCII characters unescaped
    with open("cnn_articles.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False))

    # Write the same records as CSV (UTF-8 with BOM) and as an Excel workbook
    df = pd.DataFrame(data)
    df.to_csv("cnn_articles.csv", encoding="utf-8-sig", index=False)
    df.to_excel("cnn_articles.xlsx", index=False)

    print("Scraping selesai! Data disimpan ke cnn_articles.json, cnn_articles.csv, cnn_articles.xlsx")


Mengambil link artikel dari CNN Indonesia...
Ditemukan 20 link artikel.
Scraping selesai! Data disimpan ke cnn_articles.json, cnn_articles.csv, cnn_articles.xlsx
