In [None]:
# =====================================================
# Pegos Twitter Scraper (Top Tweets + Extended View Count + Daily Folder)
# =====================================================
import os, time, random, traceback
from datetime import datetime
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# =====================================================
# Number conversion helper
# =====================================================
def safe_int(val):
    """Convert an abbreviated count string to an integer.

    Commas and the "·" separator are removed, surrounding whitespace is
    stripped, and a trailing magnitude suffix is expanded:

    * "K" and "B" multiply by 1,000
    * "M" and "Mn" multiply by 1,000,000

    Args:
        val: The text to convert (e.g. "1,234", "1.2K", "3 Mn"). Non-string
            values are converted with ``str()`` first.

    Returns:
        The parsed integer, or 0 when ``val`` is falsy or cannot be parsed.
    """
    if not val:
        return 0

    cleaned = str(val).replace(",", "").replace("·", "").strip()

    # Longer suffixes come first so that "Mn" is removed as a whole instead of
    # leaving a dangling "M" behind for float() to choke on.
    # NOTE(review): "B" is kept as thousands (presumably Turkish "Bin"), so an
    # English "B" for billions is not supported. Commas are always dropped, so
    # a decimal comma such as "1,2 B" is read as 12 thousand — confirm which
    # UI locale the scraped pages use.
    suffixes = (
        ("Mn", 1_000_000),
        ("M", 1_000_000),
        ("K", 1_000),
        ("B", 1_000),
    )

    multiplier = 1
    for suffix, factor in suffixes:
        if cleaned.endswith(suffix):
            cleaned = cleaned[: -len(suffix)]
            multiplier = factor
            break

    try:
        return int(float(cleaned) * multiplier)
    except (ValueError, OverflowError):
        # Unparseable text, "nan" (ValueError) or "inf" (OverflowError).
        return 0

In [None]:
# =====================================================
# Environment variables
# =====================================================
# Both session cookies are required; fail fast when either one is missing.
AUTH_TOKEN = os.getenv("AUTH_TOKEN")
CT0 = os.getenv("CT0")
if not AUTH_TOKEN or not CT0:
    raise RuntimeError("AUTH_TOKEN veya CT0 tanımlı değil (GitHub Secrets kısmına ekle).")

# Create the per-day output folder, named after the current UTC date.
# time.gmtime() is used instead of datetime.utcnow(), which is deprecated
# since Python 3.12; the resulting "YYYY-MM-DD" string is the same.
today = time.strftime("%Y-%m-%d", time.gmtime())
out_dir = f"/tmp/data/{today}"
os.makedirs(out_dir, exist_ok=True)
OUT_PATH = f"{out_dir}/pegos_output.csv"

print(f"📁 Günlük klasör oluşturuldu: {out_dir}")

In [None]:
# =====================================================
# Chrome settings
# =====================================================
# Command-line switches passed to Chrome, applied in the order listed.
CHROME_FLAGS = (
    "--headless=new",
    "--no-sandbox",
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
    "--disable-blink-features=AutomationControlled",
    "--start-maximized",
)

opts = Options()
for chrome_flag in CHROME_FLAGS:
    opts.add_argument(chrome_flag)

# webdriver_manager supplies the chromedriver path used by the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=opts)

In [None]:
# =====================================================
# Login (via cookies)
# =====================================================
# Cookies can only be added for the domain that is currently loaded, so the
# site is opened first, the session cookies are injected, and the page is
# refreshed afterwards.
driver.get("https://x.com")
time.sleep(3)

session_cookies = (
    ("auth_token", AUTH_TOKEN),
    ("ct0", CT0),
)
for cookie_name, cookie_value in session_cookies:
    driver.add_cookie({"name": cookie_name, "value": cookie_value, "domain": ".x.com"})

driver.refresh()
time.sleep(5)
# NOTE(review): the success message is printed unconditionally; nothing here
# checks that the session was actually accepted.
print("✅ Login başarılı:", driver.current_url)


In [None]:
# =====================================================
# Search and tweet collection
# =====================================================
KEYWORDS = ["bitcoin", "blockchain", "cryptocurrency"]
MAX_SCROLL_ROUNDS = 100  # upper bound on scroll iterations per keyword
SCROLL_STEP_PX = 1800    # pixels scrolled on each iteration

tweetArr = []
# (tweet text, timestamp) pairs already stored. Every scroll round re-parses
# the whole page source, so the same article is seen many times.
seen_tweets = set()


def _extract_view_text(art):
    """Return the view-count text of a tweet <article>, or "" if none is found.

    Tries the view-count test ids first, then a <span> whose aria-label
    mentions "views", then any <span> whose text mentions "Views"/"views".
    """
    view = art.find(attrs={"data-testid": ["viewCount", "views"]})
    if view:
        view_text = view.get_text(strip=True)
    else:
        aria_views = art.find("span", attrs={"aria-label": lambda v: v and "views" in v.lower()})
        if aria_views:
            view_text = aria_views.get_text(strip=True)
        else:
            view_text = ""
            for sp in art.find_all("span"):
                if "Views" in sp.text or "views" in sp.text:
                    view_text = sp.text
                    break

    # The fallback spans are matched on the word "Views", so their text can
    # carry that label along with the number. Drop it so that only the
    # numeric part is handed to safe_int().
    for label in ("Views", "views", "View", "view"):
        view_text = view_text.replace(label, "")
    return view_text.strip()


def _parse_article(art, kw):
    """Build one record dict from a tweet <article>; return None to skip it.

    An article is skipped when it has no tweet text, when the text is shorter
    than 15 characters, or when all engagement counters are zero.
    """
    text_tag = art.find(attrs={"data-testid": "tweetText"})
    if not text_tag:
        return None
    text = text_tag.get_text(" ", strip=True)

    time_tag = art.find("time")
    time_str = time_tag["datetime"] if time_tag else None

    # Engagement counters
    reply = art.find(attrs={"data-testid": ["reply", "conversation"]})
    retweet = art.find(attrs={"data-testid": ["retweet", "repost"]})
    like = art.find(attrs={"data-testid": ["like", "favorite"]})

    reply_val = safe_int(reply.get_text(strip=True) if reply else "0")
    retweet_val = safe_int(retweet.get_text(strip=True) if retweet else "0")
    like_val = safe_int(like.get_text(strip=True) if like else "0")
    view_val = safe_int(_extract_view_text(art))

    # Skip empty or spam-like tweets
    if len(text) < 15 or (reply_val + retweet_val + like_val + view_val) == 0:
        return None

    return {
        "keyword": kw,
        "tweet": text,
        "time": time_str,
        "comment": reply_val,
        "retweet": retweet_val,
        "like": like_val,
        "see_count": view_val,
    }


try:
    for kw in KEYWORDS:
        print(f"\n🔎 Searching Top Tweets for: {kw}")
        # Percent-encode the keyword so spaces, "#", "&" etc. cannot break the query string.
        driver.get(f"https://x.com/search?q={quote(kw, safe='')}&src=typed_query&f=top")
        time.sleep(6)

        kw_count = 0  # tweets stored for this keyword only
        last_height = 0

        for _ in range(MAX_SCROLL_ROUNDS):
            driver.execute_script(f"window.scrollBy(0, {SCROLL_STEP_PX});")
            # Randomised pause between scrolls.
            time.sleep(random.uniform(2.0, 3.5))

            soup = BeautifulSoup(driver.page_source, "html.parser")
            for art in soup.find_all("article"):
                try:
                    record = _parse_article(art, kw)
                except Exception as e:
                    # Best effort: one malformed article must not stop the run.
                    print("⚠️ Parse Error:", str(e))
                    continue
                if record is None:
                    continue

                tweet_key = (record["tweet"], record["time"])
                if tweet_key in seen_tweets:
                    continue
                seen_tweets.add(tweet_key)
                tweetArr.append(record)
                kw_count += 1

            # Stop once scrolling no longer grows the document.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                print("⛔ Sayfa sonuna ulaşıldı.")
                break
            last_height = new_height

        print(f"✅ {kw}: {kw_count} tweet kaydedildi (popüler + view fix)")
finally:
    # Always shut the browser down, even when a navigation or script call fails.
    driver.quit()

In [None]:
# =====================================================
# Save CSV
# =====================================================
df = pd.DataFrame(tweetArr)
if not df.empty:
    # Drop repeated tweets (same text and timestamp), then rank by engagement,
    # highest first.
    df = (
        df.drop_duplicates(subset=["tweet", "time"])
          .sort_values(by=["like", "retweet", "comment", "see_count"], ascending=False)
    )
    # "utf-8-sig" writes a BOM at the start of the file.
    df.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
    print(f"💾 Kaydedildi: {OUT_PATH}, toplam {len(df)} satır (popüler tweetler).")
else:
    print("⚠️ Veri toplanamadı.")