# Requirement

In [None]:
## jalankan ini dahulu sebelum running
%pip install -qq -r requirements.txt

# A. Case: Detik

In [None]:
## global
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #scraping web statis
import time
from tqdm import tqdm #info progress
from urllib.parse import quote_plus, quote #parsing string "spasi" menjadi "%20" dan "+"

In [None]:
## Connectivity check: a 200 status code means the site is reachable and we are good to go.
s = requests.Session()
url = 'https://www.detik.com'
# Browser-like User-Agent. This module-level `headers` dict is read by the
# scraping functions defined further down (e.g. scrape_detik_satu).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}
response = s.get(url, headers=headers, timeout=60)
print(response.status_code)

### A1. Crawling URLs dengan memasukkan keyword/query
```https://www.detik.com/search/searchall?query=makan%20bergizi%20gratis&page=5&result_type=relevansi``` -> spasi di-encode menjadi `%20` (hasil `quote`)

```https://www.detik.com/search/searchall?query=makan+bergizi+gratis&page=5&result_type=relevansi``` -> spasi di-encode menjadi `+` (hasil `quote_plus`)


```https://www.detik.com/search/searchall?query={kata_kunci}&page={halaman}&result_type=relevansi```

In [None]:
# Fetch and parse page 1 of the search results for "makan bergizi" as a manual check
# of the search URL pattern before wrapping it in a function.
url = f"https://www.detik.com/search/searchall?query=makan%20bergizi&page=1&result_type=relevansi"
response = requests.get(url, headers=headers, timeout=60)
soup = BeautifulSoup(response.content, 'html.parser')
time.sleep(1)  # short pause after the request

In [None]:
## Crawl Detik search results for a given keyword.
def scrape_detik_byquery(kata_kunci, halaman) -> pd.DataFrame:
    """Collect the search hits Detik returns for a keyword.

    Args:
        kata_kunci: Search query, e.g. "makan bergizi gratis".
        halaman: Number of result pages to crawl, starting at page 1.

    Returns:
        DataFrame with one row per search hit and the columns ``judul``,
        ``link``, ``desc``, ``tanggal``, ``kategori`` and ``keyword``.
        A field whose element is missing from a hit is stored as NaN. The
        columns are present even when nothing was found, so downstream
        column-based operations keep working on an empty result.

    Pages that cannot be fetched (network error or HTTP error status) are
    reported on stdout and skipped; they never abort the crawl.
    """
    columns = ["judul", "link", "desc", "tanggal", "kategori", "keyword"]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    }

    # The query is the same for every page, so percent-encode it once.
    encoded_query = quote(kata_kunci)
    results = []

    def text_or_nan(tag):
        # Stripped text of a tag, or NaN when the tag was not found.
        return tag.text.strip() if tag is not None else np.nan

    for page in tqdm(range(1, halaman + 1)):
        url = f"https://www.detik.com/search/searchall?query={encoded_query}&page={page}&result_type=relevansi"
        try:
            response = requests.get(url, headers=headers, timeout=60)
            # Treat 4xx/5xx as a skipped page instead of parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for berita in soup.find_all('div', class_="media__text"):
                try:
                    a = berita.find('a')
                    results.append({
                        "judul": text_or_nan(a),
                        "link": a.get('href') if a is not None else np.nan,
                        "desc": text_or_nan(berita.find('div', class_="media__desc")),
                        "tanggal": text_or_nan(berita.find('div', class_="media__date")),
                        "kategori": text_or_nan(berita.find('h2', class_="media__subtitle")),
                        "keyword": kata_kunci
                    })
                except Exception as e:
                    # Best effort: one malformed hit must not lose the page.
                    print("Skip 1 berita:", e)
        except Exception as e:
            print(f"Skip page {page}:", e)
        finally:
            # Pause between requests even after a failure, so a run of
            # errors does not turn into rapid-fire retries against the site.
            time.sleep(1)

    df = pd.DataFrame(results, columns=columns)
    print("Selesai. Total berita:", len(df))
    return df

In [None]:
# Try the crawler on one keyword across 3 result pages and inspect the result.
df = scrape_detik_byquery("makan bergizi gratis", 3)
df.info()

In [None]:
# Show the last four rows of the crawl result.
df.tail(4)

"Dedi Mulyadi kirim siswa nakal"

"barak militer siswa Jawa Barat"

"pelatihan militer siswa nakal"

"pendidikan karakter Dedi Mulyadi"

"kontroversi siswa ke militer"


In [None]:
## Run the search crawler once per keyword and combine the results.

# Keywords to crawl
list_keyword = [
    "Dedi Mulyadi kirim siswa nakal",
    "barak militer siswa Jawa Barat",
    "pelatihan militer siswa nakal",
    "pendidikan karakter Dedi Mulyadi",
    "kontroversi siswa ke militer"
]

# Number of result pages to crawl per keyword
halaman = 5

# One DataFrame per keyword is collected here
dfs = []

for keyword in list_keyword:
    print(f"Scraping untuk keyword: {keyword}")
    df_keyword = scrape_detik_byquery(keyword, halaman)
    dfs.append(df_keyword)

# Concatenate all per-keyword results into a single table
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Summarise the combined crawl result.
df.info()

In [None]:
# Remove duplicate articles: rows sharing both title and link, keeping the first occurrence.
df_final = df.drop_duplicates(subset=['judul', 'link'], keep='first')
df_final.info()

In [None]:
# Save the de-duplicated link table.
# NOTE(review): assumes the ./files directory already exists — confirm.
df_final.to_csv("./files/detik_links.csv", index=False, encoding='utf-8-sig')

### A2. Scraping satu artikel

In [None]:
## Read the saved link table back from disk and preview it.
df_read = pd.read_csv("./files/detik_links.csv", encoding='utf-8-sig')
df_read.head()

In [None]:
# Summarise the table that was read back from the CSV.
df_read.info()

In [None]:
# List the distinct keywords present in the data.
# NOTE(review): this reads the in-memory `df`, not the `df_read` loaded from
# the CSV just above — confirm which table is meant to be inspected here.
df['keyword'].unique()

In [None]:
# Fetch and parse one example article to explore its HTML structure.
response = requests.get("https://sport.detik.com/sepakbola/liga-indonesia/d-7932517/kdm-soal-bobotoh-rusak-rumput-gbla-pidana-atau-barak-militer", headers=headers, timeout=60)
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Locate the <div> whose paragraphs are used as the article text (None if absent).
body_div = soup.find('div', class_='detail__body-text itp_bodycontent')

In [None]:
# 1. All <p> tags inside the body div. Fall back to an empty list (not NaN)
#    so the result stays iterable when the body div was not found.
all_paragraphs = body_div.find_all('p') if body_div is not None else []

In [None]:
# 2. Collect every <p class="para_caption"> found inside div.parallaxindetail.scrollpage
#    so those paragraphs can be left out of the article text.
excluded_paragraphs = set()
if body_div is not None:  # nothing to exclude when the body div is missing
    for para_section in body_div.find_all('div', class_='parallaxindetail scrollpage'):
        excluded_paragraphs.update(para_section.find_all('p', class_='para_caption'))

# 3. Keep the body paragraphs that are not in excluded_paragraphs. Without a
#    body div there are no paragraphs to keep, whatever all_paragraphs holds.
candidate_paragraphs = all_paragraphs if body_div is not None else []
final_paragraphs = [p for p in candidate_paragraphs if p not in excluded_paragraphs]
combined_text = '\n'.join(p.get_text(strip=True) for p in final_paragraphs)
combined_text

In [None]:
## Scrape a single Detik article.
def _split_detik_datetime(waktu: str):
    """Split the text of the date element into ``(hari, tanggal, jam)``.

    The text is cut at the first comma; of the remainder, the last two
    space-separated tokens are joined into ``jam`` and everything before
    them is ``tanggal``. Returns three NaNs when the text does not have
    that shape.
    """
    try:
        hari, sisanya = waktu.split(',', 1)
        tanggal_bersih, jam_awal, jam_akhir = sisanya.strip().rsplit(' ', 2)
    except ValueError:
        # No comma, or fewer than three tokens after it.
        return np.nan, np.nan, np.nan
    return hari, tanggal_bersih, f"{jam_awal} {jam_akhir}"


def scrape_detik_satu(url: str) -> pd.DataFrame:
    """Download one Detik article and return it as a one-row DataFrame.

    Args:
        url: Address of the article page.

    Returns:
        DataFrame with a single row and the columns ``judul``, ``isi``,
        ``hari``, ``tanggal``, ``jam``, ``kategori``, ``sub_kategori`` and
        ``link``. Elements that are not found on the page become NaN.

    Raises:
        requests.RequestException: If the page cannot be downloaded.

    Uses the module-level ``headers`` dict for the request.
    """
    response = requests.get(url, headers=headers, timeout=60)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Publication day / date / time
    tanggal_utuh = soup.find('div', class_='detail__date')
    if tanggal_utuh is not None:
        hari, tanggal_bersih, jam = _split_detik_datetime(tanggal_utuh.get_text(strip=True))
    else:
        hari = tanggal_bersih = jam = np.nan

    # Category: text of the first breadcrumb link; sub-category: the
    # 'dtr-ttl' attribute of the first breadcrumb link that carries one.
    kategori = np.nan
    sub_kategori = np.nan
    kat_div = soup.find('div', class_='page__breadcrumb')
    if kat_div is not None:
        first_link = kat_div.find('a')
        # A breadcrumb without any <a> must not crash the whole scrape.
        if first_link is not None:
            kategori = first_link.get_text(strip=True)
        a_tag = kat_div.find('a', attrs={'dtr-ttl': True})
        if a_tag is not None:
            sub_kategori = a_tag.get('dtr-ttl')

    # Article text
    body_div = soup.find('div', class_='detail__body-text itp_bodycontent')
    if body_div is None:
        # Body container not found: fall back to every <p> on the page.
        final_paragraphs = soup.find_all('p')
    else:
        # Leave out <p class="para_caption"> found inside
        # div.parallaxindetail.scrollpage.
        excluded_paragraphs = set()
        for para_section in body_div.find_all('div', class_='parallaxindetail scrollpage'):
            excluded_paragraphs.update(para_section.find_all('p', class_='para_caption'))
        final_paragraphs = [p for p in body_div.find_all('p') if p not in excluded_paragraphs]

    final_isi = '\n\n'.join(p.get_text(strip=True) for p in final_paragraphs)

    judul_tag = soup.find('h1')  # looked up once instead of twice
    hasil = {
        "judul": judul_tag.get_text(strip=True) if judul_tag is not None else np.nan,
        "isi": final_isi,
        "hari": hari,
        "tanggal": tanggal_bersih,
        "jam": jam,
        "kategori": kategori,
        "sub_kategori": sub_kategori,
        "link": url
    }

    return pd.DataFrame([hasil])

In [None]:
# Show the link stored under index label 1 of the link table.
df_read["link"][1]

In [None]:
# Scrape that single article to check the function's output.
scrape_detik_satu(df_read["link"][1])

### A3. Scraping artikel dari URLs yang telah diperoleh


In [None]:
def scrape_detik_dari_csv(path_csv: str) -> pd.DataFrame:
    """Scrape every article listed in a CSV of Detik links.

    Args:
        path_csv: Path to a CSV with a ``link`` column and, optionally, a
            ``keyword`` column (missing keywords are stored as NaN).

    Returns:
        All scraped articles concatenated into one DataFrame, with the
        originating ``keyword`` added as a column. Empty DataFrame when no
        article could be scraped.

    Raises:
        ValueError: If the CSV has no ``link`` column.

    Side effects:
        Writes the combined result to
        ``./files/detik_semua_artikel_query.csv`` when at least one article
        was scraped.
    """
    df_links = pd.read_csv(path_csv)

    if 'link' not in df_links.columns:
        raise ValueError("CSV tidak mengandung kolom 'link'.")

    hasil_semua = []
    for _, row in tqdm(df_links.iterrows(), total=len(df_links)):
        url = row['link']
        if pd.isna(url):
            # The link crawler stores NaN for hits without an <a>; nothing to fetch.
            continue
        try:
            df_artikel = scrape_detik_satu(url)
        except Exception as e:
            # One unreachable or malformed article must not discard the
            # articles already collected.
            print("Gagal scraping artikel:", url, e)
            continue
        if df_artikel is not None:
            df_artikel['keyword'] = row.get('keyword', np.nan)
            hasil_semua.append(df_artikel)

    if not hasil_semua:
        print("Tidak ada artikel yang berhasil di-scrape.")
        return pd.DataFrame()

    df_final = pd.concat(hasil_semua, ignore_index=True)
    # Output path is fixed; adjust it to your own directory layout.
    df_final.to_csv('./files/detik_semua_artikel_query.csv', index=False, encoding='utf-8-sig')
    print("Selesai menyimpan semua artikel.")
    return df_final

In [None]:
# Scrape all articles listed in the saved link table and summarise the result.
df = scrape_detik_dari_csv('./files/detik_links.csv')
df.info()

In [None]:
# Preview the first three scraped articles.
df.head(3)

In [None]:
# Drop every row that has a missing value in any column.
df_clean = df.dropna()

In [None]:
# Save the cleaned article table.
# NOTE(review): assumes the ./files directory already exists — confirm.
df_clean.to_csv('./files/detik_semua_artikel_query_clean.csv', index=False, encoding='utf-8-sig')

### A4. Detik.com menyediakan sitemap xml

In [None]:
# Fetch the sitemap index (the original author notes it is allowed by detik.com/robots.txt).
# Send the same headers and timeout as the other requests in this notebook, so
# the call cannot hang indefinitely.
url_sitemap = 'https://www.detik.com/sitemap.xml'
response = requests.get(url_sitemap, headers=headers, timeout=60)
soup = BeautifulSoup(response.content, 'xml')  # parse as XML

# Every <loc> entry is the URL of one sitemap; strip surrounding whitespace
# because these values are requested directly in the next cell.
sitemap_urls = [loc.text.strip() for loc in soup.find_all('loc')]
print("Contoh:", sitemap_urls[0] if sitemap_urls else None)
print("Total sitemap url:", len(sitemap_urls))

In [None]:
## Check how many article URLs a single sitemap contains.
sitemap_berita = sitemap_urls[0]
# Same headers and timeout as the other requests, so the call cannot hang indefinitely.
resp = requests.get(sitemap_berita, headers=headers, timeout=60)
soup2 = BeautifulSoup(resp.content, 'xml')
artikel_urls = [loc.text.strip() for loc in soup2.find_all('loc')]
print("Total artikel:", len(artikel_urls))

In [None]:
# Show the first article URL found in that sitemap.
artikel_urls[0]

In [None]:
# Scrape a small sample of the sitemap's articles (first 5 only for now).
berita = []
for url in artikel_urls[:5]:
    try:
        hasil = scrape_detik_satu(url)
        berita.append(hasil)
    except Exception as e:
        print("Gagal:", url, e)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when every URL failed (or the sitemap had no articles).
df = pd.concat(berita, ignore_index=True) if berita else pd.DataFrame()
df

### A5. Scraping seluruh isi sitemap

In [None]:
# Collect the sitemap URLs listed in the master sitemap.
def get_sitemap_urls(master_sitemap_url: str):
    """Return the stripped text of every <loc> element in the master sitemap.

    The document is requested with the module-level ``headers`` and parsed as XML.
    """
    xml_bytes = requests.get(master_sitemap_url, headers=headers, timeout=60).content
    parsed = BeautifulSoup(xml_bytes, 'xml')
    found = []
    for loc_tag in parsed.find_all('loc'):
        found.append(loc_tag.get_text().strip())
    return found

# Collect the article URLs listed in one sitemap.
def get_article_urls(sitemap_url: str):
    """Return the stripped text of every <loc> element in a single sitemap.

    The document is requested with the module-level ``headers`` and parsed as XML.
    """
    xml_bytes = requests.get(sitemap_url, headers=headers, timeout=60).content
    parsed = BeautifulSoup(xml_bytes, 'xml')
    links = []
    for loc_tag in parsed.find_all('loc'):
        links.append(loc_tag.get_text().strip())
    return links

def scrape_detik(url: str):
    """Download one Detik page and return its fields as a plain dict.

    Keys: ``judul`` (text of the first <h1>), ``isi`` (text of every <p> on
    the page joined by blank lines), ``hari`` / ``tanggal`` / ``jam`` (split
    from the ``detail__date`` div), ``kategori`` (full breadcrumb text) and
    ``link``. Anything that cannot be found or parsed is NaN.
    """
    page = BeautifulSoup(
        requests.get(url, headers=headers, timeout=60).content,
        'html.parser',
    )

    # Start from "unknown" and only overwrite when the date text parses fully.
    hari = tanggal_bersih = jam = np.nan
    date_div = page.find('div', class_='detail__date')
    if date_div:
        raw = date_div.get_text(strip=True)
        try:
            day_name, remainder = raw.split(',', 1)
            pieces = remainder.strip().rsplit(' ', 2)
            hari, tanggal_bersih, jam = day_name, pieces[0], f"{pieces[1]} {pieces[2]}"
        except Exception:
            # Unexpected date layout: keep all three values as NaN.
            pass

    heading = page.find('h1')
    judul = heading.get_text(strip=True) if heading else np.nan

    paragraph_texts = [p.get_text(strip=True) for p in page.find_all('p')]

    breadcrumb = page.find('div', class_="page__breadcrumb")
    kategori = breadcrumb.get_text(strip=True) if breadcrumb else np.nan

    return {
        "judul": judul,
        "isi": "\n\n".join(paragraph_texts),
        "hari": hari,
        "tanggal": tanggal_bersih,
        "jam": jam,
        "kategori": kategori,
        "link": url
    }

In [None]:
def scrape_all_from_sitemap(master_sitemap_url: str, max_sitemap=None, max_articles=None, output_csv: str="detik_semua_artikel_sitemap.csv"):
    """Scrape the articles reachable from Detik's master sitemap.

    Args:
        master_sitemap_url (str): URL of the master sitemap.
        max_sitemap (int, optional): Maximum number of sitemaps to visit.
            None means all of them; 0 means none.
        max_articles (int, optional): Maximum number of articles per sitemap.
            None means all of them; 0 means none.
        output_csv (str): Name of the CSV file the result is written to.

    Returns:
        pd.DataFrame: One row per scraped article.

    Sitemaps or articles that fail are reported on stdout and skipped.
    """
    all_articles = []

    print("Mengambil daftar sitemap...")
    sitemap_urls = get_sitemap_urls(master_sitemap_url)

    # Compare against None explicitly: a plain truthiness test would treat a
    # limit of 0 as "no limit" and crawl everything.
    if max_sitemap is not None:
        sitemap_urls = sitemap_urls[:max_sitemap]

    for sitemap in tqdm(sitemap_urls, desc="Sitemap"):
        try:
            article_urls = get_article_urls(sitemap)
            if max_articles is not None:
                article_urls = article_urls[:max_articles]

            for url in tqdm(article_urls, desc="Artikel", leave=False):
                try:
                    article = scrape_detik(url)
                    all_articles.append(article)
                    time.sleep(1)  # throttle requests to avoid getting banned
                except Exception as e:
                    print("Gagal scraping artikel:", url, e)

        except Exception as e:
            print("Gagal akses sitemap:", sitemap, e)

    # Persist the result
    df = pd.DataFrame(all_articles)
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Selesai. Total artikel: {len(df)}. Hasil disimpan di {output_csv}")
    return df

In [None]:
# Small trial run: first 2 sitemaps, at most 10 articles from each.
df = scrape_all_from_sitemap('https://www.detik.com/sitemap.xml', max_sitemap=2, max_articles=10)
df.head()

# B. Case: Tempo

In [None]:
## global
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup #scraping web statis
import time
from tqdm import tqdm #info progress

## selenium, scraping web dinamis
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
## Connectivity check: a 200 status code means the site is reachable and we are good to go.
s = requests.Session()
url = 'https://www.tempo.co/search?q=makan+bergizi+gratis&page=1'
# Browser-like User-Agent. Defined here as well so that this section can be
# run on its own: scrape_tempo below reads the module-level `headers`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
}
# A timeout keeps the check from hanging indefinitely on an unresponsive server.
response = s.get(url, headers=headers, timeout=60)
print(response.status_code)

### B1. Crawling URLs

In [None]:
## Crawl Tempo search-result URLs for a keyword, up to a maximum number of pages.
def scrape_tempo_search_selenium(kata_kunci: str, halaman: int) -> pd.DataFrame:
    """Collect article links from Tempo's search pages with headless Chrome.

    Args:
        kata_kunci (str): Search query, e.g. "makan bergizi gratis".
        halaman (int): Number of result pages to crawl, starting at page 1.

    Returns:
        pd.DataFrame: One row per search hit with the columns ``judul``,
        ``link`` and ``keyword``. The columns are present even when nothing
        was found.

    Pages that fail to load or parse are reported on stdout and skipped.
    The browser is always closed, even if the crawl is interrupted.
    """
    # Imported locally so the function also works when only this section's
    # import cell (which does not import quote_plus) has been run.
    from urllib.parse import quote_plus

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
    opts = Options()
    opts.add_argument(f"user-agent={user_agent}")
    opts.add_argument("--headless")

    # The query is the same for every page, so encode it once.
    encoded_query = quote_plus(kata_kunci)
    results = []

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    try:
        for page in tqdm(range(1, halaman + 1)):
            url = f"https://www.tempo.co/search?q={encoded_query}&page={page}"
            try:
                # Page loading is inside the try so that one failed page is
                # skipped rather than aborting the whole crawl.
                driver.get(url)
                time.sleep(10)  # fixed wait, presumably to let the page finish rendering

                container = driver.find_element("css selector", "div.flex.flex-col.divide-y.divide-neutral-500")
                beritas = container.find_elements("css selector", "figure.flex.flex-row.gap-3.py-4.container.lg\\:mx-0.lg\\:px-0")
                for berita in beritas:
                    try:
                        a = berita.find_element("tag name", "a")
                        p = berita.find_element("tag name", "p")
                        results.append({
                            "judul": p.text,
                            "link": a.get_attribute("href"),
                            "keyword": kata_kunci
                        })
                    except Exception as e:
                        print("Skip 1 berita:", e)
            except Exception as e:
                print("Skip page:", e)
    finally:
        # Always release the browser process, even on KeyboardInterrupt.
        driver.quit()

    df = pd.DataFrame(results, columns=["judul", "link", "keyword"])
    print("Selesai. Total berita:", len(df))
    return df

In [None]:
# Per the original author's manual check, only 100 pages per keyword are publicly accessible.
scrape_tempo_search_selenium("makan bergizi gratis", 2)

In [None]:
## Run the Tempo search crawler once per keyword and combine the results.

# Keywords to crawl
list_keyword = [
    "makan bergizi gratis",
    "efisiensi anggaran",
    "CPNS 2025",
    "kemiskinan world bank"
]

# Number of result pages to crawl per keyword
halaman = 2

# One DataFrame per keyword is collected here
dfs = []

for keyword in list_keyword:
    print(f"Scraping untuk keyword: {keyword}")
    df_keyword = scrape_tempo_search_selenium(keyword, halaman)
    dfs.append(df_keyword)

# Concatenate all per-keyword results into a single table
df_final = pd.concat(dfs, ignore_index=True)

In [None]:
# Save the combined Tempo link table and display it.
# NOTE(review): assumes the ./files directory already exists — confirm.
df_final.to_csv("./files/tempo_links.csv", index=False, encoding='utf-8-sig')
df_final

### B2. Scraping satu artikel

In [None]:
## Scrape a single Tempo article page, given its URL.
def scrape_tempo(url: str) -> pd.DataFrame:
    """Download one Tempo article and return it as a one-row DataFrame.

    Args:
        url: Address of the article page.

    Returns:
        DataFrame with a single row and the columns ``judul``,
        ``sub_judul``, ``isi``, ``tanggal``, ``jam``, ``kategori`` and
        ``link``; elements not found on the page become NaN. Returns
        ``None`` (after printing the error) if anything goes wrong while
        downloading or parsing.

    Uses the module-level ``headers`` dict for the request.
    """
    hasil = {}

    try:
        response = requests.get(url, headers=headers, timeout=60)
        soup = BeautifulSoup(response.text, 'html.parser')

        ## Title
        judul = soup.find('h1', class_='text-[26px] font-bold leading-[122%] text-neutral-1200')
        hasil['judul'] = judul.get_text(strip=True) if judul is not None else np.nan

        ## Sub-title
        sub_judul = soup.find('div', class_='font-roboserif leading-[156%] text-neutral-1100')
        hasil['sub_judul'] = sub_judul.get_text(strip=True) if sub_judul is not None else np.nan

        ## Article text: every non-empty <p> inside the content wrapper(s)
        isi_paragraf = []
        for wrapper in soup.find_all('div', id='content-wrapper', class_='max-lg:container xl'):
            for p in wrapper.find_all('p'):
                teks = p.get_text(strip=True)
                if teks:
                    isi_paragraf.append(teks)
        ringkasan = '\n\n'.join(isi_paragraf)
        hasil['isi'] = ringkasan if ringkasan else np.nan

        ## Publication date and time, separated by '|'
        hasil['tanggal'] = np.nan
        hasil['jam'] = np.nan
        tanggal_publikasi = soup.find('p', class_='text-neutral-900 text-sm')
        if tanggal_publikasi is not None:
            waktu = tanggal_publikasi.get_text(strip=True)
            # Split at the FIRST '|' only: unpacking a plain split('|') raised
            # on text with several separators and discarded the whole article.
            tanggal, pemisah, jam = waktu.partition('|')
            hasil['tanggal'] = tanggal.strip()
            if pemisah:
                hasil['jam'] = jam.strip()

        ## Category
        kategori = soup.find('span', class_='text-sm font-medium text-primary-main')
        hasil['kategori'] = kategori.get_text(strip=True) if kategori is not None else np.nan

        ## Link
        hasil['link'] = url

    except Exception as e:
        # Best effort at the scraping boundary: report and signal failure with None.
        print(f"Terjadi kesalahan saat scraping: {e}")
        return None

    return pd.DataFrame([hasil])

In [None]:
# Scrape one example article to check the function's output.
url = 'https://www.tempo.co/ekonomi/potensi-masalah-dari-rencana-pemerintah-ubah-lapas-jadi-perumahan-1533913'
df_hasil = scrape_tempo(url)
df_hasil

### B3. Scraping artikel dalam URLs

In [None]:
## Scrape every Tempo article listed in the CSV produced by the URL crawl.
def scrape_tempo_dari_csv(path_csv: str) -> pd.DataFrame:
    """Scrape all articles whose links are stored in a CSV file.

    Reads ``path_csv``, calls ``scrape_tempo`` for each ``link`` value and
    tags each result with the row's ``keyword``. Articles for which
    ``scrape_tempo`` returns None are left out. When at least one article
    was scraped, the combined table is written to
    ``./files/tempo_semua_artikel.csv`` and returned; otherwise an empty
    DataFrame is returned.

    Raises:
        ValueError: If the CSV has no ``link`` column.
    """
    link_table = pd.read_csv(path_csv)

    # Without a 'link' column there is nothing to scrape.
    if 'link' not in link_table.columns:
        raise ValueError("CSV tidak mengandung kolom 'link'.")

    frames = []
    for _, record in tqdm(link_table.iterrows(), total=len(link_table)):
        article = scrape_tempo(record['link'])
        kata = record['keyword']
        if article is None:
            continue
        article['keyword'] = kata
        frames.append(article)

    if not frames:
        print("Tidak ada artikel yang berhasil di-scrape.")
        return pd.DataFrame()

    # Merge the one-row frames and persist the result.
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv('./files/tempo_semua_artikel.csv', index=False, encoding='utf-8-sig')
    print("Selesai menyimpan semua artikel.")
    return combined

In [None]:
# Scrape all articles listed in the saved Tempo link table and preview the first five.
df = scrape_tempo_dari_csv("./files/tempo_links.csv")
df.head(5)

# Next: Analisis Sentimen