In [16]:
import nest_asyncio
import asyncio
import os
import json
import time
import random
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

# Allow nested event loops in Jupyter: the kernel already runs an asyncio
# event loop, and nest_asyncio patches asyncio so that loop can be re-entered.
# NOTE(review): the cells visible in this notebook only use top-level `await`;
# confirm whether this patch is still required.
nest_asyncio.apply()

# Initialization

In [17]:
pw = await async_playwright().start()
browser = await pw.chromium.launch(headless=True)
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
)
page = await context.new_page()

print("Browser initialized and ready.")

Browser initialized and ready.


# Category Links

In [22]:
# Create the cache directory for all scraped JSON files.
# exist_ok=True makes this idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('./json', exist_ok=True)

In [25]:
save_name = './json/category_list.json'

if os.path.isfile(save_name):
    with open(save_name, 'r') as f:
        # Parsing the JSON file into a Python dictionary
        category_list = json.load(f)
else:
    # Initialization
    pw = await async_playwright().start()
    browser = await pw.chromium.launch(headless=True)
    context = await browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    )
    page = await context.new_page()
    # Open URL
    await page.goto("https://www.binance.com/en/support/faq", wait_until="networkidle")
    
    # Extract category links
    categories = await page.query_selector_all('a[href*="/en/support/faq/"]')
    category_list = []
    
    for cat in categories:
        name = await cat.inner_text()
        name = name.strip()
        if '\n\n' in name:
            name, description = name.split('\n\n')
        else:
            name, description = name, ''
        url = await cat.get_attribute('href')
        if name and "/en/support/faq/" in url:
            category_list.append({
                "section": name,
                "description":description,
                "url": f"https://www.binance.com{url}" if url.startswith('/') else url
            })
        # time.sleep(random.uniform(1, 2))
    print(f"Found {len(category_list)} categories.")
    
    # Save to JSON
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(category_list, f, indent=4, ensure_ascii=False)

In [26]:
# Preview the first three entries to check the section/description/url structure.
category_list[:3]

[{'section': 'Getting Started',
  'description': 'Learn how to start your first trade here on Binance.',
  'url': 'https://www.binance.com/en/support/faq/list/94'},
 {'section': 'Account Functions',
  'description': 'Learn how to register, secure, and verify your Binance account.',
  'url': 'https://www.binance.com/en/support/faq/list/1'},
 {'section': 'Buy and Sell',
  'description': 'See how to buy, sell and transfer crypto to and from your account.',
  'url': 'https://www.binance.com/en/support/faq/list/66'}]

# Article Links

In [33]:
# Create directory
save_dir = f"./json/URLs"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# Initialization
pw = await async_playwright().start()
browser = await pw.chromium.launch(headless=True)
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
)
page = await context.new_page()
# Loop category
for i,target_cat in enumerate(category_list):
    save_name = f"{save_dir}/{target_cat['section'].replace('/','-')}.json"
    if os.path.isfile(save_name):
        continue
    # Open URL
    if i == 0:
        print(f"Go to {target_cat['url']}")
    await page.goto(target_cat['url'], wait_until="networkidle")
    # Look for links that have long alphanumeric IDs (actual articles)
    links = await page.query_selector_all('a[href*="/en/support/faq/"]')
    # Get each article's URL
    article_urls = []
    for link in links:
        url = await link.get_attribute('href')
        title = await link.inner_text()
        # Logic: Article URLs are usually much longer than category URLs
        if url and len(url) > 50: 
            article_urls.append({
                "title": title.strip(),
                "url": f"https://www.binance.com{url}" if url.startswith('/') else url
            })
        time.sleep(random.uniform(0.5, 1))
    # Save to JSON
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(article_urls, f, indent=4, ensure_ascii=False)
    print(f"Save URLs from: {target_cat['section']}")

# Article Contents

In [43]:
# Create directory
save_dir = f"./json/content"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
# Initialization
pw = await async_playwright().start()
browser = await pw.chromium.launch(headless=True)
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
)
page = await context.new_page()
# Scrape articel content by each section
for i,target_cat in enumerate(category_list):
    # Skip if file exists
    save_name = f"{save_dir}/{target_cat['section'].replace('/','-')}.json"
    if os.path.isfile(save_name):
        print(f"File exists: {save_name}")
        continue
    # Load URL list
    load_name = f"./json/URLs/{target_cat['section'].replace('/','-')}.json"
    with open(load_name, 'r') as f:
        # Parsing the JSON file into a Python dictionary
        article_urls = json.load(f)
    # Article content
    articles = []
    for j,target_article in enumerate(article_urls):
        if i == 0:
            if j < 5:
                print(f"\tGo to {target_article['title']}: {target_article['url']}")
            elif j == 5:
                print('...')
        # Go to article URL            
        await page.goto(target_article['url'], wait_until="networkidle")
        soup = BeautifulSoup(await page.content(), 'html.parser')
        # Standard Binance FAQ selectors
        title = soup.find('h1').get_text(strip=True) if soup.find('h1') else "No Title"
        content_body = soup.find('article') or soup.find('main')
        # Try a list of potential containers used by Binance's help center
        # These are based on current 2026 patterns for their support docs
        potential_selectors = [
            'div.css-18z6mjt',        # Specific dynamic class often used for FAQ body
            'div[data-testid="article-content"]', # Data test IDs are the most stable
            'div.article-content',     # Common semantic class
            '.css-1pws6n0',           # Another common wrapper
            '#skip-to-content'        # Accessibility ID
        ]
        
        content_body = None
        for selector in potential_selectors:
            content_body = soup.select_one(selector)
            if content_body:
                print(f"Found content using selector: {selector}")
                break
        
        # Fallback: If still None, grab the largest div that contains a lot of text
        if not content_body:
            divs = soup.find_all('div')
            # Pick the div with the most characters that isn't the whole body
            content_body = max(divs, key=lambda d: len(d.get_text()), default=None)
        
        text = content_body.get_text(separator="\n", strip=True) if content_body else "Content Not Found"
        if title in text:
            text = text.split(title)[1]
        if 'Risk warning' in text:
            text = text.split('Risk warning')[0]

        articles.append({'title':title, 'content':text})
        time.sleep(random.uniform(1, 2))
    # Save to JSON
    with open(save_name, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=4, ensure_ascii=False)
    print(f"Save {target_cat['section'].replace('/','-')} articels to JSON")
print('Done!')

File exists: ./json/content/Getting Started.json
File exists: ./json/content/Account Functions.json
File exists: ./json/content/Buy and Sell.json
File exists: ./json/content/Trade.json
File exists: ./json/content/Binance Earn.json
File exists: ./json/content/Convert & Block Trade.json
File exists: ./json/content/Account Functions.json
File exists: ./json/content/Tutorial.json
File exists: ./json/content/Binance Wallet.json
File exists: ./json/content/Binance Junior.json
File exists: ./json/content/Crypto Deposit-Withdrawal.json
File exists: ./json/content/Buy Crypto (Fiat-P2P).json
File exists: ./json/content/Spot & Margin Trading.json
File exists: ./json/content/Crypto Derivatives.json
File exists: ./json/content/Trading Bots.json
File exists: ./json/content/Copy Trading.json
File exists: ./json/content/Finance.json
File exists: ./json/content/Launchpool & Megadrop.json
File exists: ./json/content/Binance Earn.json
File exists: ./json/content/Binance Fan Token.json
File exists: ./json