In [None]:
import nest_asyncio
import pandas as pd
import csv
from bs4 import BeautifulSoup
!pip install --upgrade pip
!pip install playwright
!pip install chromium

from playwright.async_api import async_playwright

# Apply nest_asyncio's patch before any coroutine in this notebook is awaited.
# NOTE(review): presumably needed so Playwright's async API can run inside the
# notebook's already-running event loop — confirm it is still required.
nest_asyncio.apply()

async def load_and_extract_all_links(csv_file="cyber_wanted.csv"):
    """Collect profile links from the FBI cyber wanted page and append new ones to a CSV.

    Opens https://www.fbi.gov/wanted/cyber in Chromium, keeps clicking the
    "Show More" button until it disappears or is disabled, then parses the
    final HTML for profile links. Links not already listed in ``csv_file``
    are appended, one per row, under a ``Profile URL`` column.

    Args:
        csv_file: Path of the CSV holding previously saved links. It is
            created (with a header row) if it does not exist or is empty.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        KeyError: If ``csv_file`` exists but has no ``Profile URL`` column.
    """
    # Load existing links. Remember whether the file still needs its header:
    # rows are appended below, and without a "Profile URL" header the next
    # run's read_csv would treat the first link as the column name.
    needs_header = False
    try:
        df_existing = pd.read_csv(csv_file)
        existing = set(df_existing["Profile URL"].dropna().tolist())
    except (FileNotFoundError, pd.errors.EmptyDataError):
        existing = set()
        needs_header = True

    print(f"✅ Currently have {len(existing)} saved links.")

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            await page.goto("https://www.fbi.gov/wanted/cyber", timeout=60000)
            await page.wait_for_timeout(3000)

            # Repeatedly click "Show More" until it's gone or disabled
            while True:
                # Scroll to the bottom so the button (if any) is rendered
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1500)

                button = await page.query_selector("button.loadMoreButton")
                if not button:
                    print("🔚 No more 'Show More' button.")
                    break

                disabled_attr = await button.get_attribute("disabled")
                if disabled_attr is not None:
                    print("🔚 Button exists but is disabled.")
                    break

                await button.scroll_into_view_if_needed()
                # Click through the DOM rather than via a simulated mouse event
                await page.evaluate("(btn) => btn.click()", button)
                print("➡️ Clicked 'Show More' to load more entries.")
                await page.wait_for_timeout(4000)

            html = await page.content()
        finally:
            # Close the browser even when navigation or clicking fails
            await browser.close()

    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("li.portal-type-person a")

    # A set removes duplicates when one profile is linked more than once
    unseen = set()
    for card in cards:
        href = card.get("href")
        if not href or "/wanted/cyber/" not in href:
            continue
        full = href if href.startswith("http") else "https://www.fbi.gov" + href
        if full not in existing:
            unseen.add(full)

    new_links = sorted(unseen)
    print(f"🔍 Found {len(new_links)} new links to add.")

    if new_links:
        with open(csv_file, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["Profile URL"])
            writer.writerows([link] for link in new_links)
        print(f"✅ Appended {len(new_links)} new links.")
    else:
        print("⚠️ No new links to append.")

    print(f"📦 Final total saved: {len(existing) + len(new_links)} links")


In [None]:
# Run the link collector with its default CSV path. Top-level `await` is used
# because this is a notebook cell.
await load_and_extract_all_links()

In [None]:
import requests

In [None]:
import asyncio
import pandas as pd
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import random
import time
import nest_asyncio

# Apply nest_asyncio's patch again in this cell.
# NOTE(review): presumably repeated so this cell can run without the first one — confirm.
nest_asyncio.apply()

# Input CSV of profile links and output CSV of scraped profile details.
CSV_FILE = "cyber_wanted.csv"
OUTPUT_FILE = "cyberwanted_profile.csv"

# Profile fields to capture; also used as the output CSV's columns.
TARGET_FIELDS = [
    "Name",
    "Aliases",
    "Date(s) of Birth Used",
    "Place of Birth",
    "Hair",
    "Eyes",
    "Sex",
    "Race",
    "Nationality",
    "Profile URL",
]

def extract_profile_details(html, url):
    """Parse one profile page's HTML into a dict keyed by ``TARGET_FIELDS``.

    Args:
        html: Full HTML of the profile page.
        url: URL the page was loaded from; stored under ``"Profile URL"``.

    Returns:
        A dict with one entry per name in ``TARGET_FIELDS``. Any field not
        found on the page is left as an empty string.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {key: "" for key in TARGET_FIELDS}
    data["Profile URL"] = url

    # Missing elements are skipped with explicit checks rather than a bare
    # `except`, so unrelated errors are no longer silently swallowed.
    heading = soup.find("h1", class_="documentFirstHeading")
    if heading is not None:
        data["Name"] = heading.get_text().strip()

    aliases = soup.select_one("div.wanted-person-aliases p")
    if aliases is not None:
        data["Aliases"] = aliases.get_text().strip()

    # Match on the one distinguishing class so the lookup does not depend on
    # the exact order of the table's other CSS classes.
    table = soup.select_one("table.wanted-person-description")
    if table is None:
        return data

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            # Not a label/value row (e.g. a header row or a single cell)
            continue
        key = cells[0].get_text().strip()
        if key in TARGET_FIELDS:
            data[key] = cells[1].get_text().strip()
    return data

async def scrape_and_save_profiles():
    """Visit every profile URL in ``CSV_FILE`` and save its details to ``OUTPUT_FILE``.

    URLs already present in ``OUTPUT_FILE`` are skipped, so an interrupted run
    can be resumed. Each profile is opened in a fresh browser context, parsed
    with ``extract_profile_details``, and the output CSV is rewritten right
    after each successful extraction. A failure on one URL is printed and the
    loop moves on to the next.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        FileNotFoundError: If ``CSV_FILE`` does not exist.
        KeyError: If ``CSV_FILE`` has no ``Profile URL`` column.
    """
    df = pd.read_csv(CSV_FILE)
    urls = df["Profile URL"].dropna().unique().tolist()

    # Load existing results if any; start empty when the output file is
    # missing, empty, or lacks the URL column.
    try:
        existing_df = pd.read_csv(OUTPUT_FILE)
        done_urls = set(existing_df["Profile URL"])
    except (FileNotFoundError, pd.errors.EmptyDataError, KeyError):
        existing_df = pd.DataFrame(columns=TARGET_FIELDS)
        done_urls = set()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            for idx, url in enumerate(urls):
                if url in done_urls:
                    continue

                context = await browser.new_context()
                try:
                    page = await context.new_page()

                    print(f"\n[{idx+1}/{len(urls)}] 🔗 Opening: {url}")
                    await page.goto(url, timeout=60000)
                    await page.wait_for_selector("h1.documentFirstHeading", timeout=15000)
                    html = await page.content()
                    data = extract_profile_details(html, url)

                    print(f"✅ Extracted: {data['Name']}")

                    # Save to CSV immediately; the merged frame stays in
                    # memory instead of being re-read from disk each time.
                    existing_df = pd.concat(
                        [existing_df, pd.DataFrame([data])], ignore_index=True
                    ).drop_duplicates(subset=["Profile URL"])
                    existing_df.to_csv(OUTPUT_FILE, index=False)

                except Exception as e:
                    # Best effort: report the failure and move to the next URL
                    print(f"❌ Failed to extract {url}: {e}")
                finally:
                    await context.close()

                # Random wait to mimic human behavior. asyncio.sleep is used
                # because time.sleep would block the event loop.
                wait_time = random.randint(2, 5)
                print(f"⏳ Waiting {wait_time} seconds before next...")
                await asyncio.sleep(wait_time)
        finally:
            # Close the browser even if the loop is interrupted
            await browser.close()

    print(f"\n📁 All data saved to: {OUTPUT_FILE}")

# ✅ Run this in a Jupyter/IPython cell. Top-level `await` is not valid in a
# plain Python script; there, use `asyncio.run(scrape_and_save_profiles())`.
await scrape_and_save_profiles()