In [None]:
# =====================================================
# Pegos Twitter Scraper (Headless Enhanced)
# =====================================================
import os, time, random
from datetime import datetime
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def parse_interaction_count(tag_text):
    """Convert an abbreviated interaction counter into an integer.

    Recognised suffixes and their multipliers:
      * ``B`` and ``K``  -> 1_000
      * ``M`` and ``Mn`` -> 1_000_000
    (``B``/``Mn`` are presumably the Turkish UI abbreviations; ``K`` is
    accepted as well so an English UI yields the same numbers.)

    Args:
        tag_text: Raw counter text such as ``"15"``, ``"1,234"``,
            ``"1.2B"`` or ``"1,5 Mn"``. ``None`` or ``""`` means "no count".

    Returns:
        The count as an ``int``; ``0`` when the text is empty or cannot be
        parsed. This function never raises for string input.
    """
    if not tag_text:
        return 0

    text = tag_text.strip()

    # Two-character suffix listed first so its full length is stripped.
    suffixes = (("Mn", 1_000_000), ("M", 1_000_000), ("B", 1_000), ("K", 1_000))
    for suffix, multiplier in suffixes:
        if not text.endswith(suffix):
            continue
        mantissa = text[:-len(suffix)].strip()
        if "." in mantissa:
            # A dot is already the decimal mark, so commas can only be grouping.
            mantissa = mantissa.replace(",", "")
        else:
            # Abbreviated values have a mantissa below 1000, so a comma here
            # is a decimal mark ("1,2 B" == 1.2 thousand), not a separator.
            mantissa = mantissa.replace(",", ".")
        try:
            # round() instead of truncation guards against float error
            # (a product like 1199.9999999 must not become 1199).
            return int(round(float(mantissa) * multiplier))
        except (ValueError, OverflowError):
            return 0

    # Plain number: commas are thousands separators ("1,234" -> 1234).
    try:
        return int(float(text.replace(",", "")))
    except (ValueError, OverflowError):
        return 0

In [None]:
# --------------------------
# Environment variables (GitHub Secrets)
# --------------------------
AUTH_TOKEN = os.environ.get("AUTH_TOKEN")
CT0 = os.environ.get("CT0")
OUT_PATH = "/tmp/pegos_output.csv"

# Both cookie values are required; stop here if either is missing or empty.
if not (AUTH_TOKEN and CT0):
    raise RuntimeError("AUTH_TOKEN veya CT0 tanımlı değil (GitHub Secrets kısmına ekle).")

# --------------------------
# Start Chrome (tuned for full rendering)
# --------------------------
opts = Options()
for chrome_flag in (
    "--headless=chrome",  # full render
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    "--window-size=1920,1080",
    "--enable-javascript",
):
    opts.add_argument(chrome_flag)

# Resolve the chromedriver binary first, then hand it to the browser session.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=opts)

In [None]:
# --------------------------
# Login with cookies
# --------------------------
# Open the site first, then attach both session cookies to the .x.com domain.
driver.get("https://x.com")
time.sleep(3)
for cookie_name, cookie_value in (("auth_token", AUTH_TOKEN), ("ct0", CT0)):
    driver.add_cookie({"name": cookie_name, "value": cookie_value, "domain": ".x.com"})

# Reload so the page is requested again with the cookies present.
driver.refresh()
time.sleep(5)
print("✅ Cookies set, current URL:", driver.current_url)

In [None]:
# --------------------------
# Tweet scraping
# --------------------------
KEYWORDS = ['blockchain', 'cryptocurrency', 'bitcoin']
SCROLL_ROUNDS = 40  # scroll passes performed per keyword
tweetArr = []
# (tweet text, timestamp) pairs already stored. The page is re-parsed after
# every scroll, so without this each visible tweet would be appended again
# on every pass. Keeping the first sighting matches drop_duplicates' default.
seen_tweets = set()

try:
    for kw in KEYWORDS:
        print(f"🔎 Searching for: {kw}")
        # Percent-encode the keyword so spaces, '#', '&' etc. cannot break the query string.
        encoded_kw = quote(kw, safe="")
        driver.get(f"https://x.com/search?q={encoded_kw}&src=typed_query&f=live")
        time.sleep(5)

        parse_errors = 0
        last_error = None

        for scroll_round in range(SCROLL_ROUNDS):
            driver.execute_script("window.scrollBy(0, 1200);")
            time.sleep(random.uniform(2.5, 3.5))
            driver.execute_script("window.scrollBy(0, -200);")  # nudge back up to trigger lazy loading

            soup = BeautifulSoup(driver.page_source, "html.parser")

            for art in soup.find_all("article"):
                try:
                    tag = art.find(attrs={"data-testid": "tweetText"})
                    if not tag:
                        continue
                    text = tag.get_text(" ", strip=True)
                    time_tag = art.find("time")
                    time_str = time_tag["datetime"] if time_tag else None

                    tweet_key = (text, time_str)
                    if tweet_key in seen_tweets:
                        continue

                    # Capture the interaction counters.
                    # NOTE(review): the mapping below is positional and assumes the
                    # containers appear in reply/retweet/like/view order - confirm
                    # against the live DOM.
                    vals = art.find_all(attrs={"data-testid": "app-text-transition-container"})
                    vals_int = [parse_interaction_count(v.get_text(strip=True)) for v in vals]

                    tweetArr.append({
                        "keyword": kw,
                        "tweet": text,
                        "time": time_str,
                        "comment": vals_int[0] if len(vals_int) > 0 else 0,
                        "retweet": vals_int[1] if len(vals_int) > 1 else 0,
                        "like": vals_int[2] if len(vals_int) > 2 else 0,
                        "see_count": vals_int[3] if len(vals_int) > 3 else 0
                    })
                    seen_tweets.add(tweet_key)
                except Exception as exc:
                    # Best effort: one malformed article must not abort the run,
                    # but keep a tally so failures are visible instead of silent.
                    parse_errors += 1
                    last_error = exc
                    continue

        if parse_errors:
            print(f"⚠️ {kw}: {parse_errors} article parse error(s), last: {last_error!r}")
        print(f"✅ Finished {kw}: total tweets so far {len(tweetArr)}")
finally:
    # Always shut the browser down, even when navigation or scrolling raised,
    # so no Chrome/chromedriver process is left behind.
    driver.quit()

In [None]:
# --------------------------
# Save to CSV
# --------------------------
df = pd.DataFrame(tweetArr)
if not df.empty:
    # Rows sharing both tweet text and timestamp are the same tweet; keep one.
    dedupe_columns = ['tweet', 'time']
    df = df.drop_duplicates(subset=dedupe_columns)
    df.to_csv(OUT_PATH, index=False)
    print(f"💾 Saved to {OUT_PATH}, total {len(df)} rows.")
else:
    print("⚠️ No tweets collected.")