In [None]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Set up Selenium WebDriver options: run Chrome headless and present a regular
# desktop-browser user agent string.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36")

service = Service("chromedriver.exe")  # Ensure the chromedriver path is correct for your OS

# Load the CSV file with Twitter links (no header row; one link per line).
df = pd.read_csv("twitter_links.csv", header=None)
df.columns = ["twitter_link"]

# Pull the handle out of each link. The capture stops at "/", whitespace, "?"
# and "#" so that links such as "twitter.com/user?lang=en" yield "user" rather
# than "user?lang=en". Both twitter.com and x.com hosts are accepted; rows
# whose link does not match get NaN.
df["username"] = df["twitter_link"].str.extract(
    r"\b(?:twitter|x)\.com/@?([^/\s?#]+)", expand=False
)

# List to store profile data (one row per scraped profile).
profile_data = []

def scrape_twitter_profile(username: str, driver) -> list:
    """Scrape the public profile fields for one Twitter account.

    Args:
        username: Account handle, without the leading "@".
        driver: Selenium WebDriver used to load and render the profile page.

    Returns:
        A six-item list ``[username, bio, followers, following, location,
        website]``. Any field that cannot be found is the string ``"N/A"``;
        if the page fails to load or the profile markers never appear, every
        field except the username is ``"N/A"``.
    """
    twitter_url = f"https://twitter.com/{username}"
    try:
        # Navigation is inside the try so that a failed page load produces an
        # N/A row for this profile instead of aborting the whole run.
        driver.get(twitter_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div[data-testid="UserDescription"], span[data-testid="UserLocation"]'))
        )
    except Exception as e:
        # Deliberately broad: this is a best-effort scrape, so any failure for
        # one profile is reported and the profile is recorded as unavailable.
        print(f"Timeout or CAPTCHA issue with {username}: {e}")
        return [username, "N/A", "N/A", "N/A", "N/A", "N/A"]

    soup = BeautifulSoup(driver.page_source, "html.parser")

    def extract_data(selector, attribute="text"):
        """Return the stripped text or the named attribute of the first match, or "N/A"."""
        element = soup.select_one(selector)
        if element is None:
            return "N/A"
        if attribute == "text":
            return element.text.strip()
        # Tag.get avoids a KeyError when the element lacks the attribute.
        return element.get(attribute, "N/A")

    bio = extract_data('div[data-testid="UserDescription"]')
    # The trailing " i" makes the href comparison case-insensitive, since the
    # handle in the input may be cased differently from the page's links.
    followers = extract_data(f'a[href="/{username}/followers" i] span')
    following = extract_data(f'a[href="/{username}/following" i] span')
    location = extract_data('span[data-testid="UserLocation"]')
    website = extract_data('a[data-testid="UserUrl"]', "href")

    return [username, bio, followers, following, location, website]

# Use a single WebDriver instance for efficiency.
# The try/finally writes the rows collected so far even when the run is
# interrupted (exception or Ctrl-C), so earlier results are not lost. The
# original exception still propagates after the file is written.
scrape_finished = False
try:
    with webdriver.Chrome(service=service, options=chrome_options) as driver:
        for username in df["username"].dropna():
            profile_data.append(scrape_twitter_profile(username, driver))
            time.sleep(random.uniform(1, 2))  # Random delay to avoid rate limits
    scrape_finished = True
finally:
    # On failure, only write when at least one row exists, so that a run which
    # dies before scraping anything does not overwrite a previous output file
    # with an empty one.
    if scrape_finished or profile_data:
        # Save scraped data to a CSV file
        output_df = pd.DataFrame(profile_data, columns=["Username", "Bio", "Followers", "Following", "Location", "Website"])
        output_df.to_csv("twitter_profiles.csv", index=False)

print("Scraping complete! Data saved to twitter_profiles.csv.")


✅ Scraping complete! Data saved to twitter_profiles.csv.
