In [41]:
# Facebook Video Scraper (Colab) – Stable Comment Crawler (fixed syntax)
# =====================================================================
# • Robust button selectors: span → ancestor div[@role=button]
# • Avoids stale element refs; scrolls with JS.
# • Stops when no pagination buttons remain **and** pager shows `N of N`,
#   or when comment count stagnates for 15 cycles.
# • Exports CSV, TXT, screenshot, and OCR.

# ─────────────────────────── Cell 1 : Environment ───────────────────────────
!apt-get update -qq
!apt-get install -y wget unzip curl gnupg2 tesseract-ocr -qq
!wget -q -O /tmp/chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i /tmp/chrome.deb || apt-get -fy install -qq
!google-chrome --version
!pip install selenium==4.10.0 chromedriver-autoinstaller pillow pytesseract pandas tqdm -q

# ─────────────────────────── Cell 2 : Scraper ───────────────────────────
import os, time, re, pytesseract, pandas as pd
from PIL import Image
from tqdm import tqdm
import chromedriver_autoinstaller as cda
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, StaleElementReferenceException,
    ElementClickInterceptedException)

VIDEO_URL  = "https://www.facebook.com/AlodiaGosiengfiao/videos/648653461336102/"
OUTPUT_DIR = "outputs"; os.makedirs(OUTPUT_DIR, exist_ok=True)
WAIT = 0.8

# ── Driver ─────────────────────────────────────────────────────────────
path = cda.install()
opts = Options(); opts.add_argument("--headless=new"); opts.add_argument("--no-sandbox"); opts.add_argument("--disable-dev-shm-usage")
service = Service(path)
driver = webdriver.Chrome(service=service, options=opts)
wait   = WebDriverWait(driver, 25)

# ── Load page & close modal ───────────────────────────────────────────
print("Loading video…")
driver.get(VIDEO_URL)
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
try:
    dlg = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@role="dialog"]')))
    dlg.find_element(By.XPATH, './/div[@role="button" and contains(@aria-label,"Close")]').click()
    time.sleep(1)
except TimeoutException:
    pass

# ── Selectors ─────────────────────────────────────────────────────────
PAGE_KEYWORDS = {
    "View more comments":    "//span[normalize-space(text())='View more comments']/ancestor::div[@role='button']",
    "View previous comments": "//span[normalize-space(text())='View previous comments']/ancestor::div[@role='button']",
    "View more replies":     "//span[normalize-space(text())='View more replies']/ancestor::div[@role='button']",
    "See more replies":      "//span[normalize-space(text())='See more replies']/ancestor::div[@role='button']"
}
LONG_BTN_XP  = "//span[normalize-space(text())='See more']/ancestor::div[@role='button']"
ARTICLES_XP  = "//div[@role='article' and starts-with(@aria-label,'Comment by')]"
PAGER_XP     = "//span[contains(text(),' of ')]"

# ── Helpers ───────────────────────────────────────────────────────────

def safe_click(el):
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
        time.sleep(0.1)
        el.click(); return True
    except (StaleElementReferenceException, ElementClickInterceptedException):
        return False


def click_pagination_buttons():
    """Click all pagination buttons across all keyword XPaths until none remain."""
    clicked = False
    for xp in PAGE_KEYWORDS.values():
        while True:
            btns = driver.find_elements(By.XPATH, xp)
            if not btns: break
            for b in btns:
                if safe_click(b): clicked = True
            time.sleep(0.2)
    return clicked


def click_long_comment_buttons():
    for b in driver.find_elements(By.XPATH, LONG_BTN_XP):
        safe_click(b)


def scrape_comments():
    rows = []
    for art in driver.find_elements(By.XPATH, ARTICLES_XP):
        author = ""; ts = ""; comment = ""
        try:
            author = art.find_element(By.XPATH, ".//a[1]//span[@dir='auto']").text
        except NoSuchElementException: pass
        try:
            ts = art.find_element(By.XPATH, ".//ul//a[1]").text
        except NoSuchElementException: pass
        parts = [d.text for d in art.find_elements(By.XPATH, ".//div[@dir='auto']") if d.text.strip()]
        if parts: comment = max(parts, key=len)
        if comment:
            rows.append({"author": author, "timestamp": ts, "comment": comment})
    return rows


def pager_done():
    try:
        txt = driver.find_element(By.XPATH, PAGER_XP).text.strip()
        a, b = re.match(r"(\d+)\s+of\s+(\d+)", txt).groups(); return a == b
    except Exception: return False

# ── Crawl ─────────────────────────────────────────────────────────────
all_rows, stagn, prev_len = [], 0, 0
for _ in tqdm(range(600), desc="Crawling"):
    paged = click_pagination_buttons()
    click_long_comment_buttons()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(WAIT)
    all_rows = scrape_comments()
    if len(all_rows) == prev_len: stagn += 1
    else: stagn = 0
    prev_len = len(all_rows)
    if not paged and pager_done():
        print("Pager indicates last slice – stopping."); break
    if stagn >= 20:
        print("No comment growth in 20 cycles – stopping."); break

print("Captured", len(all_rows), "comments")

# ── Save ──────────────────────────────────────────────────────────────
import pandas as pd
pd.DataFrame(all_rows).to_csv(os.path.join(OUTPUT_DIR, "facebook_comments.csv"), index=False)
with open(os.path.join(OUTPUT_DIR, "facebook_comments.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(r['comment'] for r in all_rows))

shot = os.path.join(OUTPUT_DIR, "screenshot.png"); ocr = os.path.join(OUTPUT_DIR, "ocr.txt")
driver.save_screenshot(shot)
with open(ocr, "w", encoding="utf-8") as f:
    f.write(pytesseract.image_to_string(Image.open(shot)))

print("✅ Outputs in", OUTPUT_DIR)
driver.quit()


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
(Reading database ... 126257 files and directories currently installed.)
Preparing to unpack /tmp/chrome.deb ...
Unpacking google-chrome-stable (136.0.7103.59-1) over (136.0.7103.59-1) ...
Setting up google-chrome-stable (136.0.7103.59-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...
Google Chrome 136.0.7103.59 
Loading video…


Crawling:   3%|▎         | 20/600 [14:56<7:13:28, 44.84s/it]

No comment growth in 20 cycles – stopping.
Captured 330 comments





✅ Outputs in outputs


In [43]:
# Facebook Video Scraper (Colab) – All‑Comments Mode
# ====================================================
# New features
# ------------
# ★ Switches comment filter from **“Most relevant”** (default) → **“All comments”.**
# ★ Then exhaustively clicks every pagination control to reach all 462 comments.
# ★ Outputs CSV, TXT, screenshot, OCR, plus console preview.

# ───────────────────────────── Cell 1 : Environment ─────────────────────────────
!apt-get update -qq
!apt-get install -y wget unzip curl gnupg2 tesseract-ocr -qq
!wget -q -O /tmp/chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i /tmp/chrome.deb || apt-get -fy install -qq
!google-chrome --version
!pip install selenium==4.10.0 chromedriver-autoinstaller pillow pytesseract pandas tqdm -q

# ───────────────────────────── Cell 2 : Scraper ─────────────────────────────
import os, time, re, pytesseract, pandas as pd
from PIL import Image
from tqdm import tqdm
import chromedriver_autoinstaller as cda
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, StaleElementReferenceException,
    ElementClickInterceptedException)

VIDEO_URL  = "https://www.facebook.com/AlodiaGosiengfiao/videos/648653461336102/"
OUTPUT_DIR = "outputs"; os.makedirs(OUTPUT_DIR, exist_ok=True)
WAIT = 0.8

# ── Driver ─────────────────────────────────────────────────────────────
path = cda.install()
opts = Options()
opts.add_argument("--headless=new"); opts.add_argument("--no-sandbox"); opts.add_argument("--disable-dev-shm-usage")
service = Service(path)
driver = webdriver.Chrome(service=service, options=opts)
wait   = WebDriverWait(driver, 25)

# ── Load page & close modal ───────────────────────────────────────────
print("Loading video…")
driver.get(VIDEO_URL)
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
try:
    dlg = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@role="dialog"]')))
    dlg.find_element(By.XPATH, './/div[@role="button" and contains(@aria-label,"Close")]').click(); time.sleep(1)
except TimeoutException:
    pass

# ── Switch filter to “All comments” ───────────────────────────────────
try:
    filt_btn = wait.until(EC.element_to_be_clickable((By.XPATH,
        "//span[normalize-space(text())='Most relevant' or normalize-space(text())='All comments' or normalize-space(text())='Newest']/ancestor::div[@role='button']")))
    driver.execute_script("arguments[0].click();", filt_btn); time.sleep(0.5)
    all_opt = wait.until(EC.element_to_be_clickable((By.XPATH,
        "//span[normalize-space(text())='All comments']/ancestor::div[@role='menuitem']")))
    driver.execute_script("arguments[0].click();", all_opt); time.sleep(1)
    print("🔄 Filter set to All comments")
except TimeoutException:
    print("⚠️  Could not switch filter – proceeding with current setting")

# ── Selectors ─────────────────────────────────────────────────────────
PAGE_KEYWORDS = {
    "View more comments":    "//span[normalize-space(text())='View more comments']/ancestor::div[@role='button']",
    "View previous comments": "//span[normalize-space(text())='View previous comments']/ancestor::div[@role='button']",
    "View more replies":     "//span[normalize-space(text())='View more replies']/ancestor::div[@role='button']",
    "See more replies":      "//span[normalize-space(text())='See more replies']/ancestor::div[@role='button']"
}
LONG_BTN_XP = "//span[normalize-space(text())='See more']/ancestor::div[@role='button']"
ARTICLES_XP = "//div[@role='article' and starts-with(@aria-label,'Comment by')]"
PAGER_XP    = "//span[contains(text(),' of ')]"

# ── Helpers ───────────────────────────────────────────────────────────

def safe_click(el):
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el); time.sleep(0.1)
        el.click(); return True
    except (StaleElementReferenceException, ElementClickInterceptedException):
        return False


def click_pagination_buttons():
    clicked = False
    for xp in PAGE_KEYWORDS.values():
        while True:
            btns = driver.find_elements(By.XPATH, xp)
            if not btns: break
            for b in btns:
                if safe_click(b): clicked = True
            time.sleep(0.2)
    return clicked


def click_long_comment_buttons():
    for b in driver.find_elements(By.XPATH, LONG_BTN_XP): safe_click(b)


def scrape_comments():
    recs = []
    for art in driver.find_elements(By.XPATH, ARTICLES_XP):
        try: author = art.find_element(By.XPATH, ".//a[1]//span[@dir='auto']").text
        except NoSuchElementException: author = ""
        try: ts = art.find_element(By.XPATH, ".//ul//a[1]").text
        except NoSuchElementException: ts = ""
        txts = [d.text for d in art.find_elements(By.XPATH, ".//div[@dir='auto']") if d.text.strip()]
        if txts:
            recs.append({"author": author, "timestamp": ts, "comment": max(txts, key=len)})
    return recs


def pager_done():
    try:
        a,b = re.match(r"(\d+)\s+of\s+(\d+)", driver.find_element(By.XPATH, PAGER_XP).text).groups(); return a==b
    except Exception: return False

# ── Crawl ─────────────────────────────────────────────────────────────
all_rows, stagn, prev = [], 0, 0
for _ in tqdm(range(800), desc="Expanding"):
    paged = click_pagination_buttons(); click_long_comment_buttons()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);"); time.sleep(WAIT)
    all_rows = scrape_comments()
    if len(all_rows)==prev: stagn+=1
    else: stagn=0
    prev = len(all_rows)
    if pager_done() and not paged:
        print("✔️ Pager shows last slice & no buttons left"); break
    if stagn>=25:
        print("⚠️  No growth for 25 loops – aborting"); break

print("Total comments captured:", len(all_rows))

# ── Save & preview ────────────────────────────────────────────────────
csv_path = os.path.join(OUTPUT_DIR, "facebook_comments.csv")
pd.DataFrame(all_rows).to_csv(csv_path, index=False, encoding="utf-8")
with open(os.path.join(OUTPUT_DIR, "facebook_comments.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(r['comment'] for r in all_rows))
print("CSV →", csv_path, "| size", os.path.getsize(csv_path), "bytes")
try:
    print(pd.read_csv(csv_path, nrows=3))
except Exception as e:
    print("Preview failed", e)

shot = os.path.join(OUTPUT_DIR, "screenshot.png"); ocr = os.path.join(OUTPUT_DIR, "ocr.txt")
driver.save_screenshot(shot)
with open(ocr, "w", encoding="utf-8") as f: f.write(pytesseract.image_to_string(Image.open(shot)))
print("✅ Outputs in", OUTPUT_DIR)
driver.quit()


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
(Reading database ... 126257 files and directories currently installed.)
Preparing to unpack /tmp/chrome.deb ...
Unpacking google-chrome-stable (136.0.7103.59-1) over (136.0.7103.59-1) ...
Setting up google-chrome-stable (136.0.7103.59-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...
Google Chrome 136.0.7103.59 
Loading video…
🔄 Filter set to All comments


Expanding:   3%|▎         | 25/800 [23:45<12:16:18, 57.00s/it]

⚠️  No growth for 25 loops – aborting
Total comments captured: 422
CSV → outputs/facebook_comments.csv | size 17327 bytes
               author timestamp  \
0  Alodia Gosiengfiao        1w   
1       The Butch Tan        1w   
2    Franco Pantangco        1w   

                                             comment  
0  Shrimptastic Magic Balls: https://www.nestlego...  
1  Great idea to ah! Push lang, papatok tong gant...  
2       Are you playing Clair Obscur: Expedition 33?  





✅ Outputs in outputs
