In [None]:
# ============================================================
# Pegos Twitter Scraper → Upload to Hugging Face (no local files)
# ============================================================
import io, os, traceback
import re
import time
from datetime import datetime
from datetime import timezone
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from huggingface_hub import HfApi
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Capture "now" once so the start-up log line and the archive filename are
# derived from the same instant. An aware UTC clock is used because
# datetime.utcnow() is deprecated since Python 3.12.
started_at = datetime.now(timezone.utc)
# tzinfo is dropped for display only, so the printed ISO string keeps its
# original shape (no "+00:00" offset in front of the literal "UTC" label).
print("▶️ Starting scrape process at", started_at.replace(tzinfo=None).isoformat(), "UTC")

# ========== ENV =============
# HF_TOKEN may be unset; it is passed to HfApi as-is.
# NOTE(review): huggingface_hub presumably falls back to a locally saved
# login when token is None — confirm for the deployment environment.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO")
if not HF_DATASET_REPO:
    # Fail before launching Chrome: without a target repo the upload at the
    # end of the run cannot succeed, so the whole scrape would be wasted.
    raise RuntimeError("HF_DATASET_REPO environment variable must be set")
api = HfApi(token=HF_TOKEN)

# Remote paths inside the dataset repo: a rolling "latest" file plus a
# timestamped archive copy (minute resolution, UTC).
ts = started_at.strftime("%Y%m%d_%H%M")
remote_latest = "data/latest.csv"
remote_archive = f"data/blockchain_tweets_{ts}.csv"

# ========== SELENIUM CONFIG ==========
# Headless Chrome. The sandbox and /dev/shm flags are the ones commonly
# needed when Chrome runs inside a container or CI runner.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# webdriver_manager downloads/locates a chromedriver binary for the Service.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
url = "https://x.com/i/flow/login"
try:
    driver.get(url)
except Exception:
    # The browser process already exists at this point; if the first
    # navigation fails nothing else would close it, so quit before
    # propagating the error.
    driver.quit()
    raise

# Absolute XPaths of the login-form inputs. They are tied to the current
# page layout and will need updating whenever the login markup changes.
USERNAME_INPUT_XPATH = (
    '/html/body/div/div/div/div[1]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div/div/div/div[4]/label/div/div[2]/div/input'
)
PASSWORD_INPUT_XPATH = (
    '/html/body/div/div/div/div[1]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[1]/div/div/div[3]/div/label/div/div[2]/div[1]/input'
)

# Column names for the first four counters found in a tweet, in page order.
_COUNT_FIELDS = ("comment", "retweet", "like", "see_count")

# Abbreviation suffixes and their multipliers. "B" and "Mn" are the Turkish
# abbreviations the script originally handled ("B" = thousand); "K" and "M"
# cover the English UI.
# NOTE(review): in an English UI "B" could also mean billion — the original
# thousand semantics are kept here; confirm which locale the browser uses.
_COUNT_SUFFIXES = (("Mn", 1_000_000), ("M", 1_000_000), ("K", 1_000), ("B", 1_000))

# A plain integer written with thousands separators, e.g. "1,234" or "1.234".
_THOUSANDS_GROUPED = re.compile(r"\d{1,3}(?:[.,]\d{3})+")


def _parse_count(text):
    """Convert a displayed counter such as "12", "1,234", "1.2K" or "1,2 B" to an int.

    Returns 0 for empty or unparseable text instead of raising, so one odd
    counter cannot abort the whole scrape.
    """
    cleaned = (text or "").strip()
    multiplier = 1
    for suffix, factor in _COUNT_SUFFIXES:
        if cleaned.endswith(suffix):
            multiplier = factor
            cleaned = cleaned[: -len(suffix)].strip()
            break
    if multiplier == 1 and _THOUSANDS_GROUPED.fullmatch(cleaned):
        # No suffix and 3-digit groups: the separators are thousands marks.
        cleaned = cleaned.replace(",", "").replace(".", "")
    else:
        # Otherwise a comma can only be a decimal mark ("1,2 B").
        cleaned = cleaned.replace(",", ".")
    try:
        # round() rather than int(): 2.3 * 1000 is 2299.999... in floating point.
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return 0


try:
    # Credentials come from the environment (like HF_TOKEN) so that no
    # account secret is stored in the source file.
    x_username = os.getenv("X_USERNAME")
    x_password = os.getenv("X_PASSWORD")
    if not x_username or not x_password:
        raise RuntimeError("X_USERNAME and X_PASSWORD environment variables must be set")

    # Username step. until() returns the located element, so no second
    # find_element lookup is needed.
    username_input = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, USERNAME_INPUT_XPATH))
    )
    username_input.send_keys(x_username)
    time.sleep(1)
    username_input.send_keys(Keys.ENTER)

    # Password step
    password_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, PASSWORD_INPUT_XPATH))
    )
    password_input.send_keys(x_password)
    password_input.send_keys(Keys.ENTER)
    time.sleep(3)

    # Search keywords
    keywords = ["blockchain",]
    # Keyed by (keyword, text, timestamp): every scroll re-parses the whole
    # page, so the same tweet is seen many times. Overwriting keeps one row
    # per tweet carrying the most recently observed counters.
    tweets_by_key = {}

    for stock in keywords:
        # Encode the keyword so spaces, '#', '&' etc. survive in the query string.
        url = "https://x.com/search?q=" + quote_plus(stock)
        driver.get(url)
        time.sleep(4)
        for _ in range(30):  # 30 scrolls instead of 100 to keep the run short
            driver.execute_script("window.scrollBy(0, 1200);")
            time.sleep(1.5)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            for article in soup.find_all("article"):
                tweet = article.find(attrs={"data-testid": "tweetText"})
                if not tweet or not tweet.text:
                    continue
                time_element = article.find("time")
                tweet_obj = {
                    "tweet": tweet.text,
                    "time": time_element["datetime"] if time_element else "-",
                }
                counters = article.find_all(attrs={"data-testid": "app-text-transition-container"})
                # zip stops at the shorter input: only the first four counters
                # are mapped, and missing ones leave their column unset.
                for field, counter in zip(_COUNT_FIELDS, counters):
                    tweet_obj[field] = _parse_count(counter.text)
                tweet_obj["code"] = stock
                tweets_by_key[(stock, tweet_obj["tweet"], tweet_obj["time"])] = tweet_obj

    df = pd.DataFrame(list(tweets_by_key.values()))
    print("✅ Scraped rows:", len(df))
    if df.empty:
        # Most likely a failed login or a layout change; do not replace the
        # previous good latest.csv with an empty file.
        raise RuntimeError("No tweets were scraped; skipping upload")

    # ========== Upload to Hugging Face ==========
    # The CSV is built in memory and uploaded twice: once as a timestamped
    # archive and once as the rolling "latest" file.
    csv_bytes = df.to_csv(index=False).encode("utf-8")

    api.upload_file(
        path_or_fileobj=io.BytesIO(csv_bytes),
        path_in_repo=remote_archive,
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
    )
    print("📤 Uploaded archive:", remote_archive)

    api.upload_file(
        path_or_fileobj=io.BytesIO(csv_bytes),
        path_in_repo=remote_latest,
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
    )
    print("✅ Updated:", remote_latest)

except Exception:
    # Top-level boundary: report the full traceback, then fall through to cleanup.
    print("❌ Error during scraping:", traceback.format_exc())
finally:
    driver.quit()
    print("🛑 Driver closed.")
