# In [None]:
# ==============================================================
# Pegos - Twitter (X) Scraper + Direct HF Upload (env-secrets)
# - Uses TWITTER_USER, TWITTER_PASS, HF_TOKEN, HF_DATASET_REPO from env
# - No local CSV files left behind (uploads from memory)
# - Meant to be executed by GitHub Actions via nbconvert or as a script
# ==============================================================

import io
import os
import random
import time
import traceback
from datetime import datetime, timezone
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from huggingface_hub import CommitOperationDelete, HfApi, list_repo_files
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# The start time comes from an aware UTC clock (datetime.utcnow() is deprecated
# since Python 3.12); tzinfo is dropped so the printed format stays the same.
print("▶️ Job started:", datetime.now(timezone.utc).replace(tzinfo=None).isoformat(), "UTC")

# ---------------------------
# ENV / CONFIG
# ---------------------------
TWITTER_USER = os.getenv("TWITTER_USER")
TWITTER_PASS = os.getenv("TWITTER_PASS")
HF_TOKEN = os.getenv("HF_TOKEN")
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO")  # e.g. "Caner7/pegos-stream"
# Number of archive CSVs to keep in the repo (optional, default 200).
# A variable that is set but empty or blank falls back to the default instead
# of crashing in int(""); a non-numeric value still raises ValueError.
KEEP_LAST = int((os.getenv("KEEP_LAST") or "").strip() or "200")

# Basic validation: all four credentials/targets are mandatory.
if not TWITTER_USER or not TWITTER_PASS:
    raise RuntimeError("TWITTER_USER veya TWITTER_PASS environment değişkenleri tanımlı değil.")
if not HF_TOKEN or not HF_DATASET_REPO:
    raise RuntimeError("HF_TOKEN veya HF_DATASET_REPO environment değişkenleri tanımlı değil.")

# Hugging Face client authenticated with HF_TOKEN; used for all uploads below.
api = HfApi(token=HF_TOKEN)

# ---------------------------
# Selenium start-up (headless)
# ---------------------------
options = Options()
for chrome_flag in (
    "--headless=new",  # the new headless mode
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    # Optional user-agent override (sometimes helps)
    "user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# `driver` is pre-bound to None so the name exists even if start-up fails.
driver = None
try:
    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=options)
    driver.set_page_load_timeout(60)
except Exception as exc:
    # Log the reason, then let the failure propagate: nothing can run without a browser.
    print("❌ Chrome driver başlatılamadı:", exc)
    raise

# ---------------------------
# Helper functions
# ---------------------------
def safe_sleep(t):
    """Sleep for ``t`` seconds plus a random extra of up to 0.7 seconds.

    The small random jitter may be useful against anti-bot detection.
    """
    jitter = 0.7 * random.random()
    time.sleep(t + jitter)

def login_x():
    """Log in to X through the web login flow using the module-level ``driver``.

    Opens the login page, types ``TWITTER_USER`` into the username field and
    ``TWITTER_PASS`` into the password field, submitting each with ENTER.

    NOTE(review): nothing here verifies that the login actually succeeded;
    the function only reports that it was attempted.

    Raises:
        Exception: any error during the flow is printed with its traceback
            and re-raised unchanged.
    """
    try:
        driver.get("https://x.com/i/flow/login")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "input")))

        # The absolute XPaths below depend on the exact page structure, which X
        # changes from time to time -- read the log if a lookup fails.
        username_xpath = '/html/body/div/div/div/div[1]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div/div/div/div[4]/label/div/div[2]/div/input'
        try:
            # until() returns the located element, so no second lookup is
            # needed (a second find_element could race with a re-render).
            username_input = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, username_xpath))
            )
        except TimeoutException:
            # Fallback: use the first <input> element on the page.
            username_input = driver.find_element(By.TAG_NAME, "input")

        print("🔐 Entering username")
        username_input.send_keys(TWITTER_USER)
        safe_sleep(0.8)
        username_input.send_keys(Keys.ENTER)
        safe_sleep(2)

        # Password field
        password_field_locator = (By.XPATH, '/html/body/div/div/div/div[1]/div/div/div/div/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[1]/div/div/div[3]/div/label/div/div[2]/div[1]/input')
        password_input = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(password_field_locator)
        )
        print("🔐 Entering password")
        password_input.send_keys(TWITTER_PASS)
        safe_sleep(0.8)
        password_input.send_keys(Keys.ENTER)
        safe_sleep(3)
        print("✅ Login attempted")
    except Exception:
        print("❌ Login failed:", traceback.format_exc())
        raise

# Suffix -> multiplier for abbreviated counts: "Mn"/"M" are millions,
# "B" (as in the original code: thousands, Turkish "Bin") and "K" are thousands.
_COUNT_SUFFIXES = (("Mn", 1_000_000), ("M", 1_000_000), ("B", 1_000), ("K", 1_000))

# Record keys filled from the interaction counters, by position in the article.
# NOTE(review): assumes the page lists the counters in this order -- confirm
# against the current markup.
_COUNTER_FIELDS = ("comment", "retweet", "like", "see_count")


def _parse_count(text):
    """Convert a displayed interaction count to an ``int``.

    Handles plain numbers ("12"), digit-grouped numbers ("1,234" / "1.234"),
    and abbreviated numbers with a decimal point or decimal comma
    ("2.5M", "1,5 B", "3Mn", "1.2K"). Anything unparsable, including an
    empty string or ``None``, yields 0.
    """
    cleaned = "".join((text or "").split())  # drops every kind of whitespace
    multiplier = 1
    for suffix, factor in _COUNT_SUFFIXES:
        if cleaned.endswith(suffix):
            cleaned = cleaned[: -len(suffix)]
            multiplier = factor
            break

    if multiplier == 1:
        # A plain count is a whole number, so "1,234" / "1.234" can only be
        # digit grouping; strip the separators when the shape matches exactly.
        groups = cleaned.replace(",", ".").split(".")
        is_grouped = (
            len(groups) > 1
            and 1 <= len(groups[0]) <= 3
            and all(g.isdigit() for g in groups)
            and all(len(g) == 3 for g in groups[1:])
        )
        if is_grouped:
            cleaned = "".join(groups)
    else:
        # With a suffix the separator is a decimal mark ("1,5 B" == 1.5 thousand).
        cleaned = cleaned.replace(",", ".")

    try:
        # round() instead of truncation: 2.01 * 1000 is 2009.9999999999998.
        return int(round(float(cleaned) * multiplier))
    except (ValueError, OverflowError):
        return 0


def _extract_tweet(article, keyword):
    """Build one tweet record from an ``<article>`` element.

    Returns ``None`` when the article has no tweet text.
    """
    text_node = article.find(attrs={"data-testid": "tweetText"})
    if text_node is None or not text_node.text:
        return None

    time_node = article.find("time")
    record = {
        "tweet": text_node.text,
        "time": time_node.get("datetime") if time_node is not None else None,
    }
    # Counters default to 0 and are overwritten for as many as the page shows.
    record.update(dict.fromkeys(_COUNTER_FIELDS, 0))
    counters = article.find_all(attrs={"data-testid": "app-text-transition-container"})
    for field, node in zip(_COUNTER_FIELDS, counters):
        record[field] = _parse_count(node.text)
    record["keyword"] = keyword
    return record


def scrape_keywords(keywords, scrolls_per_keyword=30):
    """Scrape live search results on X for each keyword into a DataFrame.

    For every keyword the search page is opened with the module-level
    ``driver`` and scrolled ``scrolls_per_keyword`` times; after each scroll
    the page source is parsed and every ``<article>`` with tweet text becomes
    one row.

    Args:
        keywords: iterable of search terms; each is URL-encoded before being
            put into the query string.
        scrolls_per_keyword: number of scroll-and-parse rounds per keyword.

    Returns:
        A ``pandas.DataFrame`` with the columns ``tweet``, ``time``,
        ``comment``, ``retweet``, ``like``, ``see_count``, ``keyword`` and
        ``time_parsed``, with exact duplicate rows removed. The frame is empty
        (no columns) when nothing was scraped.

    A failure on one keyword or one article is printed and skipped; it does
    not abort the run.
    """
    records = []
    for keyword in keywords:
        try:
            # Encode the keyword so "#", "&", spaces etc. stay inside the q= value.
            query = quote(str(keyword), safe="")
            driver.get(f"https://x.com/search?q={query}&src=typed_query&f=live")
            safe_sleep(3)
            for _ in range(scrolls_per_keyword):
                driver.execute_script("window.scrollBy(0, 1200);")
                safe_sleep(1.2)
                soup = BeautifulSoup(driver.page_source, "html.parser")
                for article in soup.find_all("article"):
                    try:
                        record = _extract_tweet(article, keyword)
                    except Exception as exc:
                        # Best effort: one malformed article must not stop the scrape.
                        print("⚠️ Skipping unparsable tweet:", exc)
                        continue
                    if record is not None:
                        records.append(record)
        except Exception:
            print("⚠️ Error scraping keyword:", keyword, traceback.format_exc())
            continue

    df = pd.DataFrame(records)
    if not df.empty:
        # The same article is re-read after every scroll, so drop exact repeats.
        df.drop_duplicates(inplace=True)
        if "time" in df.columns:
            try:
                df["time_parsed"] = pd.to_datetime(df["time"], errors="coerce")
            except Exception as exc:
                print("⚠️ Could not parse tweet timestamps:", exc)
    return df

# ---------------------------
# RUN: login + scrape
# ---------------------------
# NOTE(review): a fatal error is only printed here, so the process still exits
# with status 0 -- confirm whether the CI job should instead fail.
try:
    login_x()
    keywords = ["blockchain",]
    df = scrape_keywords(keywords, scrolls_per_keyword=30)
    print("Scraped rows:", len(df))
    if df.empty:
        print("⚠️ No rows scraped; aborting upload.")
    else:
        # Serialise to CSV entirely in memory (no local file is written).
        csv_bytes = df.to_csv(index=False).encode("utf-8")
        # Upload targets: a timestamped archive plus a fixed "latest" copy.
        # The zero-padded UTC timestamp makes archive names sort chronologically.
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        remote_archive = f"data/blockchain_tweets_{ts}.csv"
        remote_latest = "data/latest.csv"
        print("📤 Uploading to HF dataset:", HF_DATASET_REPO)
        api.upload_file(path_or_fileobj=io.BytesIO(csv_bytes),
                        path_in_repo=remote_archive,
                        repo_id=HF_DATASET_REPO,
                        repo_type="dataset")
        print("✅ Uploaded archive:", remote_archive)
        api.upload_file(path_or_fileobj=io.BytesIO(csv_bytes),
                        path_in_repo=remote_latest,
                        repo_id=HF_DATASET_REPO,
                        repo_type="dataset")
        print("✅ Updated latest:", remote_latest)

        # -- optional: trim old archives down to KEEP_LAST
        try:
            # Use the token-configured client so the listing is authenticated
            # the same way as the uploads above.
            files = api.list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset")
            archives = sorted(
                f for f in files
                if f.startswith("data/blockchain_tweets_") and f.endswith(".csv")
            )
            if len(archives) > KEEP_LAST:
                # Oldest names come first after sorting; delete the surplus.
                to_delete = archives[:len(archives)-KEEP_LAST]
                if to_delete:
                    ops = [CommitOperationDelete(path_in_repo=p) for p in to_delete]
                    api.create_commit(repo_id=HF_DATASET_REPO, repo_type="dataset",
                                      operations=ops, commit_message=f"Trim archives keep {KEEP_LAST}")
                    print(f"🧹 Deleted {len(to_delete)} old archives")
        except Exception as e:
            print("⚠️ Cleanup error (non-fatal):", e)

except Exception:
    print("❌ Fatal error during scraping/upload:", traceback.format_exc())
finally:
    # Always try to shut the browser down; a failure here is reported, not raised.
    try:
        driver.quit()
        print("🛑 Driver closed.")
    except Exception as exc:
        print("⚠️ Driver close failed (ignored):", exc)

# tzinfo is dropped so the printed timestamp keeps its original format.
print("🏁 Job finished:", datetime.now(timezone.utc).replace(tzinfo=None).isoformat(), "UTC")
