In [None]:
# =====================================================
# Pegos Twitter Scraper (Top + Live, robust counts, always-save)
# =====================================================
import os, time, random
from datetime import datetime, timezone
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

def safe_int(val: str) -> int:
    """Convert an abbreviated textual count (e.g. "3.5K", "1M", "2Mn") to an int.

    Commas and the middle-dot separator are stripped before parsing.
    Recognised suffixes (case-insensitive):

    * ``K`` and ``B`` -> thousands
    * ``M`` and ``Mn`` -> millions

    Args:
        val: Raw text of a counter; may be empty or ``None``.

    Returns:
        The parsed count, or ``0`` when the text is empty or not numeric.
    """
    if not val:
        return 0
    # NOTE(review): commas are always treated as thousands separators; a UI
    # locale that uses a decimal comma ("3,5") would be misread - confirm.
    cleaned = val.replace(',', '').replace('·', '').strip()
    upper = cleaned.upper()

    # Longest suffix first so that "Mn" is stripped as a whole.
    # NOTE(review): "B" is kept as thousands to match the previous behaviour
    # (presumably the Turkish abbreviation); an English UI would use "B" for
    # billions - confirm against the browser locale.
    suffixes = (("MN", 1_000_000), ("M", 1_000_000), ("K", 1_000), ("B", 1_000))
    try:
        for suffix, multiplier in suffixes:
            if upper.endswith(suffix):
                number = cleaned[:-len(suffix)]
                # round() rather than int() so binary float error cannot
                # drop the scaled value by one.
                return round(float(number) * multiplier)
        return int(float(cleaned))
    except (ValueError, OverflowError):
        # Non-numeric text, NaN or infinity: treat as "no count".
        return 0

def find_view_node(article):
    """Locate the node carrying a tweet's view count, trying several lookups.

    The lookups run in this order and the first truthy result wins:

    1. any element whose ``data-testid`` is ``viewCount`` or ``views``;
    2. a ``span`` whose ``aria-label`` contains "views" (case-insensitive);
    3. a ``div`` whose ``aria-label`` contains "views" (case-insensitive).

    Args:
        article: Object exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The matching node, or whatever the last lookup returned when nothing
        matched.
    """
    def mentions_views(label):
        # `label` is None when the attribute is absent; keep that falsy.
        return label and "views" in label.lower()

    node = article.find(attrs={"data-testid": ["viewCount", "views"]})
    for tag_name in ("span", "div"):
        if node:
            break
        node = article.find(tag_name, attrs={"aria-label": mentions_views})
    return node

print("✅ Kütüphaneler ve fonksiyonlar yüklendi.")

In [None]:
# ======================= ENV & PATHS =======================
# The two session cookie values are read from the environment so that no
# secret is stored in the notebook; the run aborts early when either is missing.
AUTH_TOKEN = os.getenv("AUTH_TOKEN")
CT0 = os.getenv("CT0")
if not AUTH_TOKEN or not CT0:
    raise RuntimeError("❌ AUTH_TOKEN veya CT0 tanımlı değil (GitHub Secrets).")

# Output is grouped per UTC calendar day. datetime.utcnow() is deprecated since
# Python 3.12; an aware "now" in UTC formats to the same date string.
TODAY = datetime.now(timezone.utc).strftime("%Y-%m-%d")
OUT_DIR = f"/tmp/data/{TODAY}"
OUT_CSV = f"{OUT_DIR}/pegos_output.csv"
LATEST_CSV = f"{OUT_DIR}/latest.csv"

# exist_ok makes the cell safe to re-run on the same day.
os.makedirs(OUT_DIR, exist_ok=True)
print("📁 OUT_DIR:", OUT_DIR)

In [None]:
# ======================= BROWSER =======================
# Headless Chrome configuration; every flag below is passed to Chrome as-is.
opts = Options()
for chrome_flag in (
    "--headless=new",
    "--no-sandbox",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
    "--disable-blink-features=AutomationControlled",
):
    opts.add_argument(chrome_flag)
opts.add_experimental_option("excludeSwitches", ["enable-automation"])
opts.add_experimental_option("useAutomationExtension", False)

chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=opts)

# Cookie-based login: open the site, attach both session cookies, then reload.
# (Presumably the page is opened first because cookies can only be set for the
# domain that is currently loaded - confirm against the Selenium docs.)
driver.get("https://x.com")
time.sleep(3)
for cookie_name, cookie_value in (("auth_token", AUTH_TOKEN), ("ct0", CT0)):
    driver.add_cookie({"name": cookie_name, "value": cookie_value, "domain": ".x.com"})
driver.refresh()
time.sleep(5)

# NOTE(review): this message is printed unconditionally; nothing here checks
# that the session is actually authenticated.
print("✅ Login başarılı:", driver.current_url)

In [None]:
# ======================= SCRAPE =======================
# For every keyword and search mode: open the search page, scroll step by step,
# re-parse the rendered HTML after each step and collect unseen tweets.
KEYWORDS = ["bitcoin", "blockchain", "cryptocurrency"]
MODES = ["top", "live"]  # top results first, then live
SCROLLS_PER_SEARCH = 60  # scroll steps per keyword/mode search
SCROLL_STEP_PX = 1200    # pixels scrolled per step
tweetArr = []

try:
    for kw in KEYWORDS:
        for mode in MODES:
            print(f"\n🔎 {kw} | mode={mode}")
            # Percent-encode the keyword so spaces, '#', '&' etc. cannot
            # break the query string.
            driver.get(f"https://x.com/search?q={quote(kw, safe='')}&src=typed_query&f={mode}")
            time.sleep(6)

            seen = set()                    # (text, timestamp) pairs of this search
            found_before = len(tweetArr)    # baseline for the per-search count
            for _ in range(SCROLLS_PER_SEARCH):
                driver.execute_script(f"window.scrollBy(0, {SCROLL_STEP_PX});")
                # Randomised pause between scroll steps.
                time.sleep(random.uniform(2.0, 3.2))
                html = driver.page_source
                soup = BeautifulSoup(html, "html.parser")

                for art in soup.find_all("article"):
                    try:
                        text_tag = art.find(attrs={"data-testid": "tweetText"})
                        if not text_tag:
                            continue
                        text = text_tag.get_text(" ", strip=True)
                        if len(text) < 8:
                            continue

                        ttag = art.find("time")
                        # .get(): a <time> tag without a datetime attribute
                        # yields None instead of discarding the tweet.
                        tstr = ttag.get("datetime") if ttag else None
                        key = (text, tstr)
                        if key in seen:
                            continue
                        seen.add(key)

                        reply = art.find(attrs={"data-testid": ["reply", "conversation"]})
                        retw = art.find(attrs={"data-testid": ["retweet", "repost"]})
                        like = art.find(attrs={"data-testid": ["like", "favorite"]})
                        view = find_view_node(art)

                        tweetArr.append({
                            "keyword": kw,
                            "tweet": text,
                            "time": tstr,
                            "comment": safe_int(reply.get_text(strip=True) if reply else "0"),
                            "retweet": safe_int(retw.get_text(strip=True) if retw else "0"),
                            "like": safe_int(like.get_text(strip=True) if like else "0"),
                            "see_count": safe_int(view.get_text(strip=True) if view else "0"),
                        })
                    except Exception:
                        # Best effort: one malformed article must not abort
                        # the whole run.
                        continue

            # Report what this search added, not the running total.
            print(f"✅ {kw}/{mode}: {len(tweetArr) - found_before} tweet toplandı.")
finally:
    # Always release the browser, even when a search fails midway.
    driver.quit()

print(f"🟢 Toplam tweet sayısı: {len(tweetArr)}")

In [None]:
# ======================= SAVE =======================
# Build the result table, de-duplicate and rank it, then write it to both CSV paths.
df = pd.DataFrame(tweetArr)

if df.empty:
    # Nothing collected: still emit a file with the expected header row.
    df = pd.DataFrame(columns=["keyword", "tweet", "time", "comment", "retweet", "like", "see_count"])
else:
    # The same tweet can be collected more than once; keep one row per (tweet, time).
    df = df.drop_duplicates(subset=["tweet", "time"])
    sort_cols = [c for c in ["like", "retweet", "comment", "see_count"] if c in df.columns]
    if sort_cols:
        # Highest engagement first.
        df = df.sort_values(by=sort_cols, ascending=False)

for csv_path in (OUT_CSV, LATEST_CSV):
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")

for csv_path in (OUT_CSV, LATEST_CSV):
    print(f"💾 Kaydedildi: {csv_path} ({len(df)} satır)")