# In [None]:
#!/usr/bin/env python
# coding: utf-8

# ==============================================================
# Pegos - Live Twitter Scraper (Multi Keyword) + Hugging Face Upload
# ==============================================================

import time, os, io, random, traceback
from datetime import datetime, timezone
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, list_repo_files, CommitOperationDelete
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# ==============================================================
# ENVIRONMENT CHECK
# ==============================================================

# Read the job configuration from the environment, echo a masked summary,
# and stop immediately when a required credential is absent.
print("=== ENVIRONMENT VARIABLES CHECK ===")
TWITTER_USER = os.environ.get("TWITTER_USER")
TWITTER_PASS = os.environ.get("TWITTER_PASS")
HF_TOKEN = os.environ.get("HF_TOKEN")
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO")
# An unset or empty KEEP_LAST both fall back to "200"; it stays a string here
# and is converted to int only where the archive cleanup uses it.
KEEP_LAST = os.environ.get("KEEP_LAST") or "200"

# Secrets are never printed: only the token length and a password mask.
token_report = len(HF_TOKEN) if HF_TOKEN else "MISSING"
password_report = "***" if TWITTER_PASS else "MISSING"

print("HF_DATASET_REPO:", HF_DATASET_REPO)
print("HF_TOKEN length:", token_report)
print("TWITTER_USER:", TWITTER_USER)
print("TWITTER_PASS (masked):", password_report)
print("KEEP_LAST:", KEEP_LAST)
print("===================================")

if not (HF_TOKEN and HF_DATASET_REPO):
    raise RuntimeError("❌ Missing HF credentials (HF_TOKEN or HF_DATASET_REPO).")
if not (TWITTER_USER and TWITTER_PASS):
    raise RuntimeError("❌ Missing Twitter credentials.")

# Hub client shared by the upload and cleanup steps.
api = HfApi(token=HF_TOKEN)

# ==============================================================
# SELENIUM SETUP
# ==============================================================

# Configure Chrome through command-line switches and start the shared driver.
options = Options()
for chrome_flag in (
    "--headless=new",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "user-agent=Mozilla/5.0 (X11; Linux x86_64) Chrome/115.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# webdriver_manager supplies the chromedriver binary path for the Service.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
# Page loads are capped at 60 seconds.
driver.set_page_load_timeout(60)
print("✅ Chrome driver initialized.")

def safe_sleep(t):
    """Sleep for ``t`` seconds plus a random extra of up to 0.6 seconds."""
    jitter = random.random() * 0.6
    time.sleep(t + jitter)

# ==============================================================
# LOGIN
# ==============================================================

def login_x():
    """Log the shared Selenium ``driver`` into X using the configured credentials.

    Opens the login flow, types ``TWITTER_USER`` into the first ``<input>`` on
    the page and submits it with ENTER, then does the same with
    ``TWITTER_PASS`` in the password field.

    The password field is located by its attributes rather than by an
    absolute XPath through the whole page, so the lookup does not depend on
    the exact nesting of wrapper ``<div>`` elements.

    Raises:
        Exception: any error raised while driving the browser (for example a
            wait that times out) is printed with its traceback and re-raised.
    """
    # NOTE(review): assumes the password input carries name="password" or
    # type="password" — confirm against the live login page.
    password_locator = (
        By.CSS_SELECTOR,
        'input[name="password"], input[type="password"]',
    )
    try:
        print("🔐 Logging in...")
        driver.get("https://x.com/i/flow/login")

        # ``until`` returns the located element, so no second lookup is needed.
        username_field = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.TAG_NAME, "input"))
        )
        username_field.send_keys(TWITTER_USER)
        safe_sleep(1)
        username_field.send_keys(Keys.ENTER)
        safe_sleep(2)

        # Wait until the field can actually receive input before typing.
        password_field = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(password_locator)
        )
        password_field.send_keys(TWITTER_PASS)
        safe_sleep(1)
        password_field.send_keys(Keys.ENTER)
        safe_sleep(3)
        # Only means both forms were submitted; the resulting page is not
        # inspected to confirm that the session is authenticated.
        print("✅ Login complete.")
    except Exception:
        print("❌ Login failed:", traceback.format_exc())
        raise

# ==============================================================
# SCRAPE TWEETS
# ==============================================================

# Multipliers for abbreviated counters. "Mn" is listed before "M" so the
# two-character suffix is matched (and stripped) in full.
# NOTE(review): "B" is kept as thousands, exactly as the previous parser
# treated it — presumably the UI locale abbreviates "thousand" as "B" rather
# than using it for "billion"; confirm. "K" is added for the English form.
_COUNT_SUFFIXES = (
    ("Mn", 1_000_000),
    ("M", 1_000_000),
    ("B", 1_000),
    ("K", 1_000),
)

# Column names assigned, in order, to the first four counters of a tweet.
_COUNT_FIELDS = ("comment", "retweet", "like", "see_count")


def _parse_count(text):
    """Convert a displayed counter such as ``"12"``, ``"1,5 B"`` or ``"2.3M"`` to an int.

    Returns 0 for empty or unparseable text instead of raising, so one odd
    counter cannot abort the scrape of a whole keyword.
    """
    cleaned = text.strip().replace("\u00a0", "").replace(" ", "")
    if not cleaned:
        return 0

    for suffix, multiplier in _COUNT_SUFFIXES:
        if cleaned.endswith(suffix):
            # Strip exactly the matched suffix and accept a decimal comma.
            number = cleaned[: -len(suffix)].replace(",", ".")
            try:
                return int(round(float(number) * multiplier))
            except (ValueError, OverflowError):
                return 0

    # Without a suffix the value is a whole number, so "," and "." can only
    # be thousands separators.
    try:
        return int(cleaned.replace(",", "").replace(".", ""))
    except ValueError:
        return 0


def _parse_article(art, keyword):
    """Build one tweet record from an ``<article>`` element, or None if it has no tweet text."""
    text_el = art.find(attrs={"data-testid": "tweetText"})
    if text_el is None:
        return None

    time_el = art.find("time")
    record = {
        "tweet": text_el.text.strip(),
        "time": time_el.get("datetime") if time_el is not None else None,
        "comment": 0,
        "retweet": 0,
        "like": 0,
        "see_count": 0,
    }

    # Counters are mapped by position; any beyond the fourth are ignored.
    counters = art.find_all(attrs={"data-testid": "app-text-transition-container"})
    for field, counter in zip(_COUNT_FIELDS, counters):
        record[field] = _parse_count(counter.text)

    record["keyword"] = keyword
    return record


def scrape_keywords(keywords, scrolls_per_keyword=100):
    """Scrape live-search results for each keyword into a DataFrame.

    For every keyword the live search page is opened with the shared
    ``driver``, scrolled ``scrolls_per_keyword`` times, and every
    ``<article>`` present after each scroll is parsed.

    Args:
        keywords: iterable of search terms; each is URL-encoded before being
            placed in the query string.
        scrolls_per_keyword: number of scroll-and-parse passes per keyword.

    Returns:
        A DataFrame with columns ``tweet``, ``time``, ``comment``,
        ``retweet``, ``like``, ``see_count``, ``keyword`` and ``time_parsed``
        (``time`` parsed with ``errors="coerce"``), with exact duplicate rows
        removed. The counters are integers. The DataFrame is empty, with no
        columns, when nothing was collected.

    An error while scraping one keyword is printed with its traceback and the
    loop moves on to the next keyword; tweets gathered before the error are
    kept.
    """
    collected = []

    for keyword in keywords:
        print(f"🔎 Searching tweets for: {keyword}")
        try:
            # Encode the term so spaces, '#', '&' etc. stay inside the q= value.
            encoded = quote(keyword, safe="")
            search_url = f"https://x.com/search?q={encoded}&src=typed_query&f=live"
            driver.get(search_url)
            safe_sleep(4)

            for _ in range(scrolls_per_keyword):
                driver.execute_script("window.scrollBy(0, 1200);")
                safe_sleep(2)

                soup = BeautifulSoup(driver.page_source, "html.parser")
                for art in soup.find_all("article"):
                    record = _parse_article(art, keyword)
                    if record is not None:
                        collected.append(record)

        except Exception:
            print("⚠️ Error while scraping keyword:", keyword)
            print(traceback.format_exc())
            continue

    df = pd.DataFrame(collected)
    if not df.empty:
        # The same article is parsed again on every scroll pass while it stays
        # in the page source, so identical rows are expected here.
        df.drop_duplicates(inplace=True)
        df["time_parsed"] = pd.to_datetime(df["time"], errors="coerce")
    return df

# ==============================================================
# MAIN EXECUTION
# ==============================================================

def _utc_now():
    """Return the current UTC time as a naive ``datetime``.

    Gives the same value ``datetime.utcnow()`` did without calling that
    deprecated API, so ``isoformat()`` and ``strftime()`` output is unchanged.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


# Run the whole job: log in, scrape, upload the CSV twice (timestamped archive
# plus a fixed "latest" path), then trim old archives. Any failure is printed
# and the browser is always shut down.
try:
    print("▶️ Job started:", _utc_now().isoformat(), "UTC")
    login_x()

    keywords = [
        "blockchain",
    ]

    df = scrape_keywords(keywords, scrolls_per_keyword=100)
    print("✅ Total tweets collected:", len(df))

    if df.empty:
        print("⚠️ No tweets found — skipping upload.")
    else:
        csv_bytes = df.to_csv(index=False).encode("utf-8")

        ts = _utc_now().strftime("%Y%m%d_%H%M%S")
        remote_archive = f"data/x_tweets_{ts}.csv"
        remote_latest = "data/latest.csv"

        print("📤 Uploading scraped tweets to Hugging Face:", HF_DATASET_REPO)
        # upload_file takes its arguments by keyword; passing the file and
        # destination positionally raises TypeError before anything is sent.
        for remote_path in (remote_archive, remote_latest):
            api.upload_file(
                path_or_fileobj=csv_bytes,
                path_in_repo=remote_path,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
            )
        print("✅ Uploaded both archive and latest CSV files.")

        # Optional cleanup: keep only the newest KEEP_LAST archives. Failures
        # here are reported but never fail the job.
        try:
            keep_n = int(KEEP_LAST)
            files = api.list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset")
            # The timestamp format sorts lexicographically in time order, so
            # the oldest archives come first.
            archives = sorted(f for f in files if f.startswith("data/x_tweets_"))
            if len(archives) > keep_n:
                to_delete = archives[: len(archives) - keep_n]
                if to_delete:
                    ops = [CommitOperationDelete(path_in_repo=p) for p in to_delete]
                    api.create_commit(
                        repo_id=HF_DATASET_REPO,
                        repo_type="dataset",
                        operations=ops,
                        commit_message=f"Trim archives (keep {keep_n})",
                    )
                    print(f"🧹 Deleted {len(to_delete)} old archives.")
        except Exception as e:
            print("⚠️ Cleanup error:", e)

except Exception:
    print("❌ Fatal error:", traceback.format_exc())

finally:
    # Shutdown stays best-effort, but a failure is reported instead of being
    # silently discarded.
    try:
        driver.quit()
        print("🛑 Driver closed.")
    except Exception as exc:
        print("⚠️ Driver shutdown error:", exc)

print("🏁 Job finished:", _utc_now().isoformat(), "UTC")


# In [None]:
# Report how many rows the scraped DataFrame ``df`` holds.
tweet_total = len(df)
print("✅ Total tweets collected:", tweet_total)


# In [None]:
# Show the leading rows of the scraped DataFrame ``df``.
print("First 5 tweets preview:")
preview_rows = df.head()
print(preview_rows)
