In [7]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import json as pyjson
import time
import uuid
import re

nest_asyncio.apply()

# --- Unvan ve isim ayırıcı ---
def extract_title_and_name(raw_name):
    known_titles = [
        "Dr. Öğr. Üyesi", "Prof. Dr.", "Doç. Dr.", "Yrd. Doç. Dr.",
        "Uzm. Dr.", "Op. Dr.", "Dr. Dt.", "Dt.", "Dr."
    ]
    for t in known_titles:
        if raw_name.startswith(t):
            return t, raw_name[len(t):].strip()
    return "", raw_name.strip()

# --- Scroll ile soru linklerini topla (küçük adımlı kaydırma) ---
async def collect_question_links(category_url, max_scrolls=200, max_links=1200):
    links = set()
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(category_url)

        # Sayfa tam yüklenene kadar kısa bir bekleme
        await asyncio.sleep(2)

        for scroll_round in range(max_scrolls):
            # Aşağıya küçük adımlarla kaydırma
            for _ in range(20):
                await page.evaluate("window.scrollBy(0, 500)")
                await asyncio.sleep(0.2)
            # Yeni içeriklerin yüklenmesi için bekle
            await asyncio.sleep(2)

            # DOM’daki tüm soru linklerini topla
            anchors = await page.query_selector_all("a[href*='/blog/soru/']")
            for a in anchors:
                href = await a.get_attribute("href")
                if href and "/blog/soru/" in href:
                    full = "https://www.doktorsitesi.com" + href
                    links.add(full)
                    if len(links) >= max_links:
                        break
            if len(links) >= max_links:
                print(f"🛑 Maksimum link sayısına ulaşıldı: {len(links)}")
                break

        await browser.close()
    return list(links)

# --- QA sayfası işle ---
def parse_qa_page(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.content, "html.parser")
        script_tag = soup.find("script", type="application/ld+json")
        if not script_tag:
            raise ValueError("JSON-LD bulunamadı.")
        cleaned = re.sub(r"[\x00-\x1F]+", " ", script_tag.string).strip()
        data = pyjson.loads(cleaned)[0]
        return {
            "topic": "Kulak Burun Boğaz Hastalıkları",
            "title": data["mainEntity"]["name"],
            "question": data["mainEntity"]["text"],
            "answer": data["mainEntity"]["acceptedAnswer"]["text"],
            "doctor_link": data["mainEntity"]["acceptedAnswer"]["url"],
            "doctor_name": data["mainEntity"]["acceptedAnswer"]["author"]["name"]
        }
    except Exception as e:
        print(f"[QA ERROR] {url} -> {e}")
        return None

# --- Doktor profili işle ---
def parse_doctor_profile(url):
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.content, "html.parser")

        # Unvan ve isim ayrıştır
        raw_name = soup.find("h1").text.strip()
        title, name = extract_title_and_name(raw_name)

        # JSON-LD’dan clinic name ve rating çek
        script_tag = soup.find("script", type="application/ld+json")
        clinic_name = rating = ""
        if script_tag:
            try:
                cleaned = re.sub(r"[\x00-\x1F]+", " ", script_tag.string).strip()
                jsonld = pyjson.loads(cleaned)[0]
                pos = jsonld.get("hasPOS", [{}])[0]
                full_clinic = pos.get("name", "")
                if "," in full_clinic:
                    clinic_name = full_clinic.split(",", 1)[1].strip()
                else:
                    clinic_name = full_clinic.strip()
                rating = jsonld.get("aggregateRating", {}).get("ratingValue", "")
            except:
                pass

        # Adres (sadece <b> dışındaki ilk metin düğümü)
        street = ""
        street_el = soup.select_one("div.ta-address-explain")
        if street_el:
            for c in street_el.contents:
                if isinstance(c, str):
                    street = c.strip()
                    break

        # City → URL sonundan (ör: '/adana' → 'Adana')
        city = url.rstrip("/").split("/")[-1].capitalize()

        # About
        about_section = soup.select_one("div#tabid-1 p")
        about = about_section.text.strip() if about_section else ""

        # Uzmanlık
        spec_div = soup.select_one("div.expert-branches p")
        specialty = spec_div.text.strip() if spec_div else ""

        return {
            "Name": name,
            "Title": title,
            "Specialty": specialty,
            "ClinicName": clinic_name,
            "ClinicAddress": {
                "Street": street,
                "City": city
            },
            "About": about,
            "AverageReview": rating
        }
    except Exception as e:
        print(f"[DOCTOR ERROR] {url} -> {e}")
        return None

# --- JSON’a kaydet ---
def save_json(data, filename):
    with open(f"/kaggle/working/{filename}", "w", encoding="utf-8") as f:
        pyjson.dump(data, f, ensure_ascii=False, indent=2)

# --- Ana akış ---
async def main():
    category_url = "https://www.doktorsitesi.com/blog/sorular/kulak-burun-bogaz-hastaliklari"
    print("🔍 Soru linkleri toplanıyor...")
    question_links = await collect_question_links(
        category_url,
        max_scrolls=200,    # 200 kez küçük adımlı kaydır
        max_links=1200      # 1200 linke kadar dur
    )
    print(f"⏳ Toplam link bulundu: {len(question_links)}")

    qa_data = []
    doctor_data = {}

    for idx, qurl in enumerate(question_links):
        print(f"[{idx+1}/{len(question_links)}] Soru işleniyor: {qurl}")
        qa = parse_qa_page(qurl)
        if not qa:
            continue

        doc_link = qa["doctor_link"]
        if doc_link not in doctor_data:
            print(f"   👨‍⚕️ Doktor işleniyor: {doc_link}")
            doc_info = parse_doctor_profile(doc_link)
            if not doc_info:
                continue
            doctor_id = str(uuid.uuid4())
            doc_info["doctorID"] = doctor_id
            doctor_data[doc_link] = doc_info
        else:
            doctor_id = doctor_data[doc_link]["doctorID"]

        qa_data.append({
            "topic": qa["topic"],
            "title": qa["title"],
            "question": qa["question"],
            "answer": qa["answer"],
            "doctorID": doctor_id
        })

        # Her 100 işlemde kısa bir mola ver
        if (idx + 1) % 100 == 0:
            print(f"   ↪ {idx+1} soru işlendi, 5 saniye ara veriliyor...")
            time.sleep(5)
        else:
            time.sleep(0.5)

    save_json(qa_data, "qa_data.json")
    save_json(list(doctor_data.values()), "doctor_data.json")
    print(f"\n✅ {len(qa_data)} QA ve {len(doctor_data)} doktor kaydedildi.")

# --- Çalıştır ---
await main()


🔍 Soru linkleri toplanıyor...
🛑 Maksimum link sayısına ulaşıldı: 1200
⏳ Toplam link bulundu: 1200
[1/1200] Soru işleniyor: https://www.doktorsitesi.com/blog/soru/nefes-darligi-yutma-guclugu/2556080
   👨‍⚕️ Doktor işleniyor: https://www.doktorsitesi.com/prof-dr-sevtap-akbulut/kulak-burun-bogaz-hastaliklari/istanbul
[2/1200] Soru işleniyor: https://www.doktorsitesi.com/blog/soru/kulakta-degisik-sesler/2506815
   👨‍⚕️ Doktor işleniyor: https://www.doktorsitesi.com/prof-dr-ali-ozdek/kulak-burun-bogaz-hastaliklari/ankara
[3/1200] Soru işleniyor: https://www.doktorsitesi.com/blog/soru/kulak-arkadasinda-sertlik/2548567
   👨‍⚕️ Doktor işleniyor: https://www.doktorsitesi.com/prof-dr-guven-yildirim/kulak-burun-bogaz-hastaliklari/istanbul
[4/1200] Soru işleniyor: https://www.doktorsitesi.com/blog/soru/kulak-cinlamasi-isitme-kaybi/2546354
   👨‍⚕️ Doktor işleniyor: https://www.doktorsitesi.com/op-dr-zafer-koksal-eren/kulak-burun-bogaz-hastaliklari/kocaeli
[5/1200] Soru işleniyor: https://www.doktor

In [2]:
!pip install playwright nest_asyncio
!playwright install


Collecting playwright
  Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl (45.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.52.0 pyee-13.0.0
Downloading Chromium 136.0.7103.25 (playwright build v1169)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1169/chromium-linux.zip[22m
Chromium 136.0.7103.25 (playwright build v1169) downloaded to /root/.cache/ms-playwright/chromium-1169
Downloading Chromium Headless Shell 136.0.7103.25 (playwright build v1169)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/11