In [None]:
# =====================================================
# Pegos Twitter Scraper (Top Tweets + Full Interaction Fix)
# =====================================================
import os
import random
import time
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def safe_int(val):
    """Convert an abbreviated count string to an integer.

    Commas, middle dots ("·") and surrounding whitespace are stripped before
    parsing. Recognised suffixes:

    * "K" and "B" -> multiply by 1,000 ("B" keeps the thousand meaning the
      original implementation gave it)
    * "M" and "Mn" -> multiply by 1,000,000

    Args:
        val: Text such as "1,234", "12.3K", "3B" or "4.5Mn"; may be empty/None.

    Returns:
        The parsed count, or 0 when the input is empty or cannot be parsed.
    """
    if not val:
        return 0
    val = val.replace(",", "").replace("·", "").strip()

    # Longest suffix first: "Mn" must be removed as a whole, otherwise the
    # leftover "M" makes float() fail and the value collapses to 0.
    for suffix, factor in (("Mn", 1_000_000), ("M", 1_000_000), ("B", 1_000), ("K", 1_000)):
        if val.endswith(suffix):
            number, multiplier = val[: -len(suffix)], factor
            break
    else:
        number, multiplier = val, 1

    try:
        scaled = float(number) * multiplier
        # Scaled values are rounded so float noise cannot drop the count by one;
        # plain values keep the original truncation.
        return round(scaled) if multiplier != 1 else int(scaled)
    except (ValueError, OverflowError):
        # Unparseable text, NaN or infinity: treat as "no count".
        return 0

In [None]:
# Session cookies are read from the environment so they are never stored in the notebook.
AUTH_TOKEN = os.getenv("AUTH_TOKEN")
CT0 = os.getenv("CT0")
OUT_PATH = "/tmp/pegos_output.csv"

# Fail fast, before a browser is started, when a credential is missing.
if not AUTH_TOKEN or not CT0:
    raise RuntimeError("AUTH_TOKEN veya CT0 tanımlı değil (GitHub Secrets kısmına ekle).")

opts = Options()
# Headless mode with full rendering support. "--headless=chrome" was the
# pre-release name of this mode; current Chrome releases call it "--headless=new".
opts.add_argument("--headless=new")
opts.add_argument("--disable-gpu")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--window-size=1920,1080")
opts.add_argument("--disable-blink-features=AutomationControlled")
opts.add_argument("--start-maximized")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

# Cookie login: cookies can only be set for the domain that is currently
# loaded, so the site is opened first, the cookies are injected, and the page
# is refreshed so the session is picked up.
try:
    driver.get("https://x.com")
    time.sleep(3)
    driver.add_cookie({"name": "auth_token", "value": AUTH_TOKEN, "domain": ".x.com"})
    driver.add_cookie({"name": "ct0", "value": CT0, "domain": ".x.com"})
    driver.refresh()
    time.sleep(5)
    # NOTE: the session is not verified here; the printed URL is what lets the
    # operator check where the browser actually ended up.
    print("✅ Login başarılı:", driver.current_url)
except Exception:
    # Do not leave an orphaned Chrome process behind when the login setup fails.
    driver.quit()
    raise

In [None]:
KEYWORDS = ['bitcoin', 'blockchain', 'cryptocurrency']
tweetArr = []
# (tweet text, timestamp) pairs already stored. Every scroll round re-parses
# the whole page, so without this the same tweet would be appended once per round.
seen_tweets = set()

try:
    for kw in KEYWORDS:
        print(f"🔎 Searching Top Tweets for: {kw}")
        kw_count = 0  # tweets newly stored for this keyword only

        # "Top" tab: f=top. The keyword is percent-encoded so that spaces, '#'
        # or '&' inside it cannot break the query string.
        driver.get(f"https://x.com/search?q={quote(kw, safe='')}&src=typed_query&f=top")
        time.sleep(6)

        for scroll_round in range(25):  # fewer scrolls: only the popular tweets are wanted
            # Scroll down, wait a randomised interval, then nudge back up a little.
            driver.execute_script("window.scrollBy(0, 1400);")
            time.sleep(random.uniform(3, 5))
            driver.execute_script("window.scrollBy(0, -200);")

            soup = BeautifulSoup(driver.page_source, "html.parser")

            for art in soup.find_all("article"):
                try:
                    text_tag = art.find(attrs={"data-testid": "tweetText"})
                    if not text_tag:
                        continue
                    text = text_tag.get_text(" ", strip=True)
                    time_tag = art.find("time")
                    time_str = time_tag["datetime"] if time_tag else None

                    tweet_key = (text, time_str)
                    if tweet_key in seen_tweets:
                        continue

                    # Newer test ids: the buttons are sometimes rendered under different names.
                    reply = art.find(attrs={"data-testid": ["reply", "conversation"]})
                    retweet = art.find(attrs={"data-testid": ["retweet", "repost"]})
                    like = art.find(attrs={"data-testid": ["like", "favorite"]})
                    view = art.find(attrs={"data-testid": ["viewCount", "views"]})

                    reply_val = safe_int(reply.get_text(strip=True) if reply else "0")
                    retweet_val = safe_int(retweet.get_text(strip=True) if retweet else "0")
                    like_val = safe_int(like.get_text(strip=True) if like else "0")
                    view_val = safe_int(view.get_text(strip=True) if view else "0")

                    # Skip empty and spam tweets.
                    if len(text) < 15 or (reply_val + retweet_val + like_val + view_val) == 0:
                        continue

                    # Marked as seen only once stored, so a tweet skipped above
                    # can still be picked up in a later scroll round.
                    seen_tweets.add(tweet_key)
                    tweetArr.append({
                        "keyword": kw,
                        "tweet": text,
                        "time": time_str,
                        "comment": reply_val,
                        "retweet": retweet_val,
                        "like": like_val,
                        "see_count": view_val
                    })
                    kw_count += 1
                except Exception:
                    # Best effort: one malformed article must not abort the whole scrape.
                    continue

        print(f"✅ {kw}: {kw_count} tweet kaydedildi (popülerler)")
finally:
    # Always shut the browser down, even when a page load or script call raised.
    driver.quit()

In [None]:
# Build the result table from the scraped records.
df = pd.DataFrame(tweetArr)
if not df.empty:
    # One row per (tweet, time) pair, most-liked first.
    df = (
        df
        .drop_duplicates(subset=["tweet", "time"])
        .sort_values(by="like", ascending=False)
    )
    df.to_csv(OUT_PATH, index=False)
    row_total = len(df)
    print(f"💾 Kaydedildi: {OUT_PATH}, toplam {row_total} satır (popüler tweetler).")
else:
    print("⚠️ Veri toplanamadı.")