In [None]:
import requests

url = "https://api.anycrawl.dev/v1/scrape"
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer"
}
data = {
  "url": "https://wiki.themk.org/index.php/1994_Alps_catalogue",
  "engine": "playwright",
  "formats": [
    "markdown"
  ]
}

response = requests.post(url, headers=headers, json=data)
result = response.json()
print(result)



In [2]:
import json 

link = "https://wiki.themk.org/index.php/1994_Alps_catalogue"
fname = link.split("/")[-1] + ".json"
print(fname) 

with open(fname, "w", encoding="utf-8") as f: 
    json.dump(result, f, indent=2, ensure_ascii=False)

1994_Alps_catalogue.json


In [4]:
import os, re, json, time, pathlib, requests
from urllib.parse import urlparse, parse_qs, unquote
from dotenv import load_dotenv
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

load_dotenv()
API_KEY = os.getenv("ANYCRAWL_API_KEY")
if not API_KEY:
    raise SystemExit("❌ ANYCRAWL_API_KEY missing in .env")

SCRAPE_URL = "https://api.anycrawl.dev/v1/scrape"
JOB_URL    = "https://api.anycrawl.dev/v1/jobs/{job_id}"

OUT_DIR   = pathlib.Path("data")
ERR_DIR   = pathlib.Path("errors")
OUT_DIR.mkdir(exist_ok=True)
ERR_DIR.mkdir(exist_ok=True)

# ---- Session with retries for transient errors ----
session = requests.Session()
retry = Retry(
    total=5, backoff_factor=0.8,
    status_forcelist=(429, 500, 502, 503, 504),
    allowed_methods=frozenset(["GET","POST"])
)
session.mount("https://", HTTPAdapter(max_retries=retry))

HEADERS = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}


In [5]:
def debug_once(test_url):
    r = session.post(SCRAPE_URL, headers=HEADERS, json={
        "url": test_url,
        "engine": "playwright",
        "formats": ["markdown","json"],   # ask for both; safer on odd pages
        # "wait_until":"networkidle", "timeout":60000  # optional stabilizers
    }, timeout=60)
    print("STATUS:", r.status_code, "| CT:", r.headers.get("Content-Type"))
    print("HEAD:", r.text[:400].replace("\n"," ") if hasattr(r,"text") else "<no text>")
    try:
        print("JSON OK:", bool(r.json()))
    except Exception as e:
        print("JSON DECODE ERROR:", e)

In [6]:
debug_once("https://wiki.themk.org/index.php/1994_Alps_catalogue")

STATUS: 200 | CT: application/json; charset=utf-8
HEAD: {"success":true,"data":{"url":"https://wiki.themk.org/index.php/1994_Alps_catalogue","type":"scrape","status":"completed","title":"1994 Alps catalogue - Keyboard Wiki","metadata":[{"name":"generator","content":"MediaWiki 1.44.0"},{"name":"robots","content":"max-image-preview:standard"},{"name":"format-detection","content":"telephone=no"},{"name":"viewport","content":"width=device-width, initial-sc
JSON OK: True


In [1]:
import os 

print(file for file in os.listdir("data/"))

<generator object <genexpr> at 0x7fb9d42a5d20>


In [2]:
for file in os.listdir("data/"):
    print(file)

Alps_SKCC_Tall_Cream.json
Alps_Cruciform_Magnetic_Reed.json
ALPS_Plate_Spring.json
Alps_AKB-3420.json
Alps_SKCM_Black.json
Alps_buckling_spring.json
ALPS.json
Alps_SKCL_series.json
Outemu_Alps_Mount_Switch.json
Alps_SKCL_Brown.json
Alps_SKCL_Double_Action.json
Alps_SKCM_series.json
ALPS_Switches.json
Alps_keyboard_codes.json
Alps_Magnetic_Reed.json
Alps_SKCL_Compact.json
Alps_SKEW_Blue.json
Alps_SKCL_lock.json
Alps.tw_Type_T9.json
Alps_CM.json
Cherry_MX_Alps_Clear.json
Alps_SKCM_Salmon.json
YH_Alps_clone.json
Alps_SKFL_Lock.json
Alps_SKCM_Cream.json
Alps_moulded_spring_over_membrane.json
Alps_SKCC_Grey.json
Alps_SKCL_Heavy_Cream.json
Alps_flat_spherical_series.json
Cherry_MX_Alps_Click.json
Alps_SKCM_Amber.json
Simplified_Alps_Type_IV_switch.json
YH-B_Alps_clone.json
Alps_SKCL_Lock.json
Alps_vertical_plate_spring.json
Alps_SKBM_Black.json
Alps_integrated_dome.json
Alps_SKCL_Cream.json
Alps_SKCM_Blue.json
SMK_Alps_mount_switch.json
Alps_SKCM_Yellow.json
Alps_semi-integrated_dome.json
Al