# YOUTUBE SCRAPPING

### GAP 0 — Imports

In [55]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException, TimeoutException, StaleElementReferenceException,
    ElementClickInterceptedException
)
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time, re, hashlib, random, csv

### GAP 1 — Helper Functions

In [56]:
def parse_numeric_text(s):
    """Konversi teks likes seperti '1.2K' ke integer."""
    if not s:
        return 0
    s = str(s).strip().lower().replace(',', '')
    try:
        if 'k' in s:
            return int(float(s.replace('k','')) * 1_000)
        elif 'm' in s:
            return int(float(s.replace('m','')) * 1_000_000)
        else:
            return int(re.sub(r'[^0-9]', '', s) or 0)
    except:
        return 0

def make_hash_id(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def clean_comment_text(text):
    """Buat 1 baris per komentar: ganti newline/CR dengan spasi, strip."""
    if text is None:
        return ""
    return re.sub(r'\s+', ' ', text).strip()

### GAP 2 — WebDriver creator


In [57]:
def create_driver_visible():
    chrome_options = Options()
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--lang=en-US")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

### GAP 3 — Helper Functions untuk Scroll, Replies, dan Read More


In [58]:
# ==========================================================
# GAP 3 — Helper Functions (Final Optimized Smart Traversal)
# ==========================================================

def initial_scroll_to_comments(driver):
    """Scroll sedikit ke bawah supaya area komentar muncul."""
    for y in (300, 700, 1200):
        driver.execute_script(f"window.scrollTo(0, {y});")
        time.sleep(0.4)
    time.sleep(0.8)


def continuous_scroll_until_stable(driver, check_interval=1.0, max_attempts_nochange=6, max_scrolls=180):
    """
    Scroll terus sampai jumlah thread tidak berubah beberapa kali berturut-turut.
    """
    last_count, stable_count = 0, 0
    for i in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(check_interval + random.uniform(0.2, 0.4))
        threads = driver.find_elements(By.XPATH, "//ytd-comment-thread-renderer")
        cur_count = len(threads)
        if cur_count == last_count:
            stable_count += 1
        else:
            stable_count = 0
            last_count = cur_count
        if stable_count >= max_attempts_nochange:
            break
    print(f"[SCROLL] Komentar stabil setelah {i+1} iterasi, total thread: {last_count}")
    return last_count


def click_all_buttons_safely(driver, xpath, max_loops=10, pause=0.25):
    """
    Klik semua tombol yang cocok dengan xpath di seluruh halaman,
    berhenti jika tidak ada tombol baru yang muncul.
    """
    total_clicks, stagnant_rounds = 0, 0
    last_btns = 0

    for _ in range(max_loops):
        btns = driver.find_elements(By.XPATH, xpath)
        if len(btns) == last_btns:
            stagnant_rounds += 1
        else:
            stagnant_rounds = 0
        last_btns = len(btns)

        if stagnant_rounds >= 3 or not btns:
            break

        clicked = 0
        for b in btns:
            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                driver.execute_script("arguments[0].click();", b)
                clicked += 1
                total_clicks += 1
                time.sleep(pause)
            except Exception:
                continue

        if clicked == 0:
            stagnant_rounds += 1
        else:
            stagnant_rounds = 0
        time.sleep(0.4)

    print(f"[INFO] {total_clicks} tombol '{xpath}' berhasil diklik.")
    return total_clicks


def expand_thread_fully(driver, thread, max_reply_loops=5):
    """
    Expand seluruh isi 1 thread:
      - Read more komentar utama
      - View replies
      - Read more di reply
      - Show more replies (jika banyak balasan)
    """
    total_clicks = 0
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", thread)
        time.sleep(0.2)
    except Exception:
        pass

    # Klik read more utama
    for _ in range(2):
        read_btns = thread.find_elements(By.XPATH, ".//tp-yt-paper-button[@id='more']")
        if not read_btns:
            break
        for b in read_btns:
            try:
                driver.execute_script("arguments[0].click();", b)
                total_clicks += 1
                time.sleep(0.2)
            except Exception:
                continue

    # Expand replies (View replies + Show more replies)
    for _ in range(max_reply_loops):
        reply_btns = thread.find_elements(By.XPATH, ".//ytd-button-renderer[@id='more-replies' or @id='more-replies-sub-thread']")
        if not reply_btns:
            break
        for b in reply_btns:
            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                driver.execute_script("arguments[0].click();", b)
                total_clicks += 1
                time.sleep(0.25)
            except Exception:
                continue
        time.sleep(0.5)

    # Klik read more pada replies yang muncul
    for _ in range(2):
        reply_reads = thread.find_elements(By.XPATH, ".//ytd-comment-view-model//tp-yt-paper-button[@id='more']")
        for b in reply_reads:
            try:
                driver.execute_script("arguments[0].click();", b)
                total_clicks += 1
                time.sleep(0.2)
            except Exception:
                continue

    return total_clicks


### GAP 4 — Main scraping (order-preserving, robust)


In [59]:
# ==========================================================
# GAP 4 — Scraper Hybrid (Shadow-Safe + Efficient)
# ==========================================================

def scrape_all_comments(video_url):
    """
    Scrape semua komentar (utama + balasan) dengan kompatibilitas DOM YouTube 2025.
    Menjaga emoji, URL, dan urutan komentar. Mengakses komentar lewat JS (shadow-safe).
    """
    print(f"\n[START] Scraping video: {video_url}")
    driver = create_driver_visible()
    driver.get(video_url)
    time.sleep(3)

    # Pastikan area komentar muncul
    initial_scroll_to_comments(driver)

    # Ambil jumlah komentar (jika ada)
    total_displayed = None
    try:
        text_val = driver.execute_script("""
            let el = document.querySelector('ytd-comments-header-renderer #count, ytd-comments-header-renderer yt-formatted-string');
            return el ? el.innerText : '';
        """)
        if text_val:
            total_displayed = int(re.sub(r'[^0-9]', '', text_val.replace(',', '')))
            print(f"[INFO] Jumlah komentar di halaman: {total_displayed}")
        else:
            print("[WARN] Tidak bisa membaca jumlah komentar dari header.")
    except Exception:
        print("[WARN] Tidak bisa membaca jumlah komentar via JS.")

    # Scroll semua komentar agar termuat
    total_threads = continuous_scroll_until_stable(driver)
    print(f"[INFO] Total thread komentar termuat: {total_threads}")

    # Expand semua "Read more" dan "Show more replies"
    click_all_buttons_safely(driver, "//tp-yt-paper-button[@id='more']", max_loops=4)
    click_all_buttons_safely(driver, "//ytd-button-renderer[@id='more-replies' or @id='more-replies-sub-thread']", max_loops=8)

    # Scroll tambahan agar replies termuat penuh
    continuous_scroll_until_stable(driver, check_interval=0.8, max_attempts_nochange=4, max_scrolls=40)

    # Ambil semua komentar via JavaScript (karena Selenium tidak bisa langsung baca shadow DOM)
    js_code = """
        let data = [];
        let comments = document.querySelectorAll('ytd-comment-view-model #content-text');
        comments.forEach((el) => {
            let text = el.innerText || '';
            let likesEl = el.closest('ytd-comment-view-model')?.querySelector('#vote-count-middle');
            let likes = likesEl ? likesEl.innerText.trim() : '';
            let isReply = !!el.closest('ytd-comment-replies-renderer');
            data.push({text, likes, isReply});
        });
        return data;
    """
    extracted = driver.execute_script(js_code)

    if not extracted:
        print("[ERROR] Tidak ada komentar ditemukan — kemungkinan DOM belum termuat penuh.")
        driver.quit()
        return pd.DataFrame(columns=["comment", "likes_count", "is_reply"])

    # Proses hasil ke DataFrame
    records = []
    for item in extracted:
        text = re.sub(r'\s+', ' ', item.get("text", "").strip())
        likes = parse_numeric_text(item.get("likes", ""))
        is_reply = bool(item.get("isReply", False))
        if text:
            records.append({"comment": text, "likes_count": likes, "is_reply": is_reply})

    driver.quit()

    df = pd.DataFrame(records, columns=["comment", "likes_count", "is_reply"])
    scraped_count = len(df)

    print(f"[FINISH] {scraped_count} komentar berhasil diambil.")
    if total_displayed:
        diff = total_displayed - scraped_count
        print(f"[CHECK] YouTube: {total_displayed} | Scraped: {scraped_count}")
        if abs(diff) <= 5:
            print("[OK] Jumlah hampir sama (akurasi tinggi).")
        else:
            print(f"[WARN] Selisih {diff} komentar (kemungkinan komentar tersembunyi atau dibatasi).")

    return df


### GAP 5 — Eksekusi Scraping Iteratif


In [60]:
# ==========================================================
# GAP 5 — Eksekusi Scraping Iteratif (Manual 'next' Control)
# ==========================================================

if __name__ == "__main__":
    processed_video_urls = []
    video_count = 0
    max_videos = 6

    print(f"Memulai proses scraping untuk maksimal {max_videos} video.\n")

    while video_count < max_videos:
        video_url = input(f"Masukkan URL video ke-{video_count+1} (atau ketik 'selesai'): ").strip()
        if video_url.lower() == 'selesai':
            break
        if not video_url.startswith("https://www.youtube.com/watch?v=") and not video_url.startswith("https://youtu.be/"):
            print("URL tidak valid, masukkan URL YouTube yang benar.")
            continue
        if video_url in processed_video_urls:
            print("URL sudah pernah diproses.")
            continue

        # ==== Jalankan scraping (Chrome tetap terbuka) ====
        print(f"\n[PROCESS] Memulai scraping untuk video ke-{video_count+1} ...")
        driver = create_driver_visible()
        driver.get(video_url)
        time.sleep(3)

        # Jalankan scraper menggunakan driver aktif
        df_comments = scrape_all_comments(video_url)

        # ==== Tunggu input user sebelum lanjut ====
        print("\n[INFO] Scraping selesai untuk video ini.")
        print("Ketik 'next' untuk menutup browser dan lanjut ke video berikutnya.")
        print("Atau ketik 'stop' untuk mengakhiri program.")
        user_input = input(">>> ").strip().lower()

        if user_input == 'stop':
            try:
                driver.quit()
            except Exception:
                pass
            print("\n[EXIT] Proses scraping dihentikan oleh user.")
            break

        if user_input == 'next':
            try:
                driver.quit()
                print("[INFO] Browser ditutup. Lanjut ke video berikutnya.\n")
            except Exception:
                pass

        # ==== Simpan hasil ====
        if not df_comments.empty:
            filename = f"dataset_video_{video_count+1}.csv"
            df_comments.to_csv(filename, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_MINIMAL)
            print(f"[SAVED] Data komentar disimpan ke: {filename}\n")
        else:
            print("[INFO] Tidak ada komentar disimpan untuk video ini.\n")

        processed_video_urls.append(video_url)
        video_count += 1

    print("\n" + "=" * 60)
    print(f"PROSES SELESAI — Total {video_count} video berhasil disimpan.")
    print("=" * 60)


Memulai proses scraping untuk maksimal 6 video.


[PROCESS] Memulai scraping untuk video ke-1 ...

[START] Scraping video: https://www.youtube.com/watch?v=4OIKKv5j69w&t=23s
[INFO] Jumlah komentar di halaman: 427
[SCROLL] Komentar stabil setelah 20 iterasi, total thread: 293
[INFO] Total thread komentar termuat: 293
[INFO] 1172 tombol '//tp-yt-paper-button[@id='more']' berhasil diklik.
[INFO] 560 tombol '//ytd-button-renderer[@id='more-replies' or @id='more-replies-sub-thread']' berhasil diklik.
[SCROLL] Komentar stabil setelah 5 iterasi, total thread: 293
[FINISH] 392 komentar berhasil diambil.
[CHECK] YouTube: 427 | Scraped: 392
[WARN] Selisih 35 komentar (kemungkinan komentar tersembunyi atau dibatasi).

[INFO] Scraping selesai untuk video ini.
Ketik 'next' untuk menutup browser dan lanjut ke video berikutnya.
Atau ketik 'stop' untuk mengakhiri program.
[SAVED] Data komentar disimpan ke: dataset_video_1.csv


PROSES SELESAI — Total 1 video berhasil disimpan.
