In [None]:
# ============================================================
# Pegos Twitter (X) Scraper - Fixed Keywords & HF Upload
# ============================================================

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from huggingface_hub import HfApi, CommitOperationDelete, list_repo_files

import pandas as pd
import datetime, time, io, os, random, traceback

# Log the job start time in UTC. datetime.utcnow() is deprecated since Python 3.12,
# so take an aware UTC timestamp and drop tzinfo to keep the exact same ISO text.
print("▶️ Job started:", datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(), "UTC")

In [None]:
# ------------------------------------------------------------
# ENVIRONMENT VARIABLES (Secrets)
# ------------------------------------------------------------
# Every credential is read from the environment so nothing sensitive is
# hardcoded in the notebook.
TWITTER_USER = os.getenv("TWITTER_USER")
TWITTER_PASS = os.getenv("TWITTER_PASS")
HF_TOKEN = os.getenv("HF_TOKEN")
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO")

# KEEP_LAST = number of timestamped archives the cleanup step keeps in the repo.
# A variable that is set but empty/blank falls back to the default instead of
# crashing on int("").
_keep_last_raw = (os.getenv("KEEP_LAST") or "").strip() or "200"
try:
    KEEP_LAST = int(_keep_last_raw)
except ValueError:
    raise ValueError(f"KEEP_LAST must be an integer, got {_keep_last_raw!r}") from None
if KEEP_LAST < 0:
    # A negative value would make the cleanup slice select every archive.
    raise ValueError(f"KEEP_LAST must be >= 0, got {KEEP_LAST}")

if not HF_TOKEN or not HF_DATASET_REPO:
    raise RuntimeError("❌ HF token veya dataset repo tanımlı değil.")
if not TWITTER_USER or not TWITTER_PASS:
    raise RuntimeError("❌ Twitter kullanıcı adı veya şifre tanımlı değil.")

# The account name is supplied as a secret; show only a short prefix so saved
# notebook output and job logs do not expose it in full.
print("🔐 Twitter user:", TWITTER_USER[:2] + "***")
print("📦 HF Dataset:", HF_DATASET_REPO)
print("🧹 Keep last:", KEEP_LAST)

# Hub client bound to the token; later cells use it for uploads and commits.
api = HfApi(token=HF_TOKEN)


In [None]:
# ------------------------------------------------------------
# SELENIUM STARTUP
# ------------------------------------------------------------
# Chrome command-line switches, applied to the options object in this order.
_CHROME_ARGS = (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/115 Safari/537.36",
)

options = Options()
for _chrome_arg in _CHROME_ARGS:
    options.add_argument(_chrome_arg)

# `driver` is pre-bound to None so later cells can refer to the name even when
# the browser failed to start.
driver = None
try:
    # webdriver_manager resolves the chromedriver binary path for the Service.
    _chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=_chrome_service, options=options)
    print("✅ Chrome driver started.")
except Exception as start_error:
    # Report the failure, then re-raise so the run stops here.
    print("❌ Chrome başlatılamadı:", start_error)
    raise

In [None]:
def login_x(timeout=25, confirm_timeout=15):
    """Log in to X with TWITTER_USER / TWITTER_PASS using the module-level `driver`.

    The flow types the user name into the first <input> on the login page,
    submits it, waits for the password field, submits the password and then
    checks that the browser ended up on a URL containing "home".

    Args:
        timeout: Seconds to wait for the user-name input and for the password
            input to become available.
        confirm_timeout: Seconds to wait for the URL to contain "home" after
            the password was submitted.

    Returns:
        bool: True when the final URL contains "home"; False when it does not
        or when any step raised. Exceptions are logged with a traceback and
        never propagated.
    """
    try:
        driver.get("https://x.com/i/flow/login")
        wait = WebDriverWait(driver, timeout)

        # presence_of_element_located yields the first matching element.
        user_field = wait.until(EC.presence_of_element_located((By.TAG_NAME, "input")))
        user_field.send_keys(TWITTER_USER)
        user_field.send_keys(Keys.ENTER)

        # Wait explicitly for the password step. A fixed sleep followed by an
        # optional lookup silently skipped the password whenever the field had
        # not rendered yet; now a missing field times out and is reported below.
        pw_field = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "input[type='password']"))
        )
        pw_field.send_keys(TWITTER_PASS)
        pw_field.send_keys(Keys.ENTER)

        # 🔎 Login verification: poll the URL instead of sleeping a fixed time.
        deadline = time.monotonic() + confirm_timeout
        while "home" not in driver.current_url.lower() and time.monotonic() < deadline:
            time.sleep(0.5)

        if "home" in driver.current_url.lower():
            print("✅ Login confirmed: now on home timeline.")
            return True

        print("⚠️ Login might not be confirmed, current URL:", driver.current_url)
        return False

    except Exception as e:
        print("❌ Login step failed:", e)
        traceback.print_exc()
        return False


In [None]:
def _parse_count(raw):
    """Convert one engagement counter string to an int, returning 0 when it is empty or unparseable."""
    # Remove every kind of whitespace (including non-breaking spaces).
    text = "".join(raw.split())
    if not text:
        return 0

    # "B" and "Mn"/"M" keep the meaning the original parser gave them
    # (thousand / million); "K" is accepted as thousand as well.
    # NOTE(review): "B" is treated as thousand, never as billion — confirm the
    # UI language the browser actually renders.
    factor = 1
    for suffix, scale in (("Mn", 1_000_000), ("M", 1_000_000), ("B", 1_000), ("K", 1_000)):
        if text.endswith(suffix):
            text = text[: -len(suffix)]
            factor = scale
            break

    # With a suffix a comma is read as a decimal separator ("1,2B"); without
    # one it is read as a thousands separator ("1,234").
    text = text.replace(",", "." if factor > 1 else "")
    try:
        # round() avoids float truncation such as 2.3 * 1000 -> 2299.
        return int(round(float(text) * factor))
    except (ValueError, OverflowError):
        return 0


def scrape_keywords(scrolls_per_keyword=25, keywords=None):
    """Collect tweets from the X live search for each keyword.

    For every keyword the search page is opened with the module-level
    `driver`, scrolled `scrolls_per_keyword` times, and the page source is
    parsed after each scroll. A tweet is stored once per keyword; repeated
    sightings of the same (keyword, text, timestamp) on later scrolls are
    ignored.

    Args:
        scrolls_per_keyword: Number of scroll-and-parse rounds per keyword.
        keywords: Iterable of search terms. Defaults to the fixed crypto
            keyword list when None.

    Returns:
        pandas.DataFrame with the columns keyword, tweet, time, comment,
        retweet, like, see_count. The four numeric columns are the first four
        counters found in the tweet's markup, in document order. The frame has
        these columns even when no tweet was collected.

    Errors while scraping one keyword are logged and the loop moves on to the
    next keyword; errors while parsing one tweet skip only that tweet.
    """
    from urllib.parse import quote  # local import: only needed to build the search URL

    if keywords is None:
        keywords = ["blockchain", "bitcoin", "crypto", "ethereum", "defi", "web3", "nft"]

    columns = ["keyword", "tweet", "time", "comment", "retweet", "like", "see_count"]
    tweetArr = []
    seen = set()  # (keyword, text, timestamp) of tweets already stored

    for keyword in keywords:
        print(f"\n🔍 Searching keyword: {keyword}")
        try:
            # Percent-encode the term so spaces, '#', '&' etc. cannot break the query string.
            search_url = f"https://x.com/search?q={quote(keyword, safe='')}&src=typed_query&f=live"
            driver.get(search_url)
            time.sleep(3)
            for _ in range(scrolls_per_keyword):
                driver.execute_script("window.scrollBy(0, 1200);")
                time.sleep(1.3)
                soup = BeautifulSoup(driver.page_source, "html.parser")

                for t in soup.find_all("article"):
                    try:
                        tweet_text_tag = t.find(attrs={"data-testid": "tweetText"})
                        if not tweet_text_tag:
                            continue
                        tweet_text = tweet_text_tag.get_text(" ", strip=True)
                        time_tag = t.find("time")
                        tweet_time = time_tag["datetime"] if time_tag else None

                        # The full page is re-parsed after every scroll, so an
                        # article still on the page would otherwise be appended again.
                        key = (keyword, tweet_text, tweet_time)
                        if key in seen:
                            continue
                        seen.add(key)

                        counts = t.find_all(attrs={"data-testid": "app-text-transition-container"})
                        vals = [0, 0, 0, 0]
                        for i, c in enumerate(counts[:4]):
                            # A counter that cannot be parsed becomes 0 instead
                            # of discarding the whole tweet.
                            vals[i] = _parse_count(c.text)

                        tweetArr.append({
                            "keyword": keyword,
                            "tweet": tweet_text,
                            "time": tweet_time,
                            "comment": vals[0],
                            "retweet": vals[1],
                            "like": vals[2],
                            "see_count": vals[3]
                        })
                    except Exception:
                        continue  # skip a single broken tweet

            # ✅ Runs once per keyword, after the scroll loop has finished.
            print(f"✅ Finished {keyword}: total tweets so far {len(tweetArr)}")

        except Exception as e:
            print(f"⚠️ Error scraping {keyword}: {e}")
            traceback.print_exc()

    # Explicit columns keep the schema stable when nothing was collected.
    df = pd.DataFrame(tweetArr, columns=columns)
    return df


In [None]:
# ------------------------------------------------------------
# MAIN EXECUTION
# ------------------------------------------------------------
# Log in, scrape, upload the CSV to the Hugging Face dataset repo (a
# timestamped archive plus data/latest.csv), prune old archives, and always
# shut the browser down afterwards.
df = pd.DataFrame()
try:
    login_x()
    df = scrape_keywords(scrolls_per_keyword=20)
    print("✅ Total tweets collected:", len(df))
    if df.empty:
        print("⚠️ No tweets found, skipping upload.")
    else:
        # to_csv() without a target returns the CSV text directly.
        csv_bytes = df.to_csv(index=False).encode("utf-8")

        # Aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
        ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
        remote_archive = f"data/x_tweets_{ts}.csv"
        remote_latest = "data/latest.csv"

        print("📤 Uploading to Hugging Face:", HF_DATASET_REPO)
        api.upload_file(path_or_fileobj=io.BytesIO(csv_bytes),
                        path_in_repo=remote_archive,
                        repo_id=HF_DATASET_REPO,
                        repo_type="dataset")
        print("✅ Uploaded:", remote_archive)

        api.upload_file(path_or_fileobj=io.BytesIO(csv_bytes),
                        path_in_repo=remote_latest,
                        repo_id=HF_DATASET_REPO,
                        repo_type="dataset")
        print("✅ Updated latest.csv")

        # Cleanup old files (best effort: a failure here is only reported).
        try:
            # Use the token-configured client, like every other Hub call in
            # this cell, instead of the module-level helper.
            files = api.list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset")
            # The timestamp in the name makes lexicographic order chronological.
            old = sorted(f for f in files if f.startswith("data/x_tweets_"))
            excess = len(old) - KEEP_LAST
            if excess > 0:
                del_ops = [CommitOperationDelete(p) for p in old[:excess]]
                api.create_commit(repo_id=HF_DATASET_REPO, repo_type="dataset",
                                  operations=del_ops, commit_message=f"Cleanup old ({len(del_ops)}) files")
                print("🧹 Cleaned up old archives.")
        except Exception as e:
            print("⚠️ Cleanup failed:", e)

except Exception as e:
    print("❌ Fatal error:", e)
    traceback.print_exc()

finally:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed and a shutdown failure is at least reported.
    try:
        if driver is not None:
            driver.quit()
            print("🛑 Driver closed.")
    except Exception as e:
        print("⚠️ Driver shutdown failed:", e)

# Same ISO text as before: aware UTC timestamp with tzinfo dropped.
print("🏁 Job finished:", datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat(), "UTC")
