# In [None]:
import requests
from bs4 import BeautifulSoup
import json as pyjson
import time
import uuid
import re
import os

# --- Ünvan ve ismi ayır ---
def extract_title_and_name(raw_name):
    """Split a doctor's display name into a known title and the remaining name.

    Args:
        raw_name: Display name, e.g. ``"Prof. Dr. Ayşe Yılmaz"``. Leading and
            trailing whitespace is ignored; ``None`` is treated as an empty
            string.

    Returns:
        A ``(title, name)`` tuple of strings. ``title`` is ``""`` when the
        name does not start with one of the known titles.
    """
    # Order matters: compound titles must be listed before the shorter titles
    # they begin with ("Dr. Öğr. Üyesi" and "Dr. Dt." before plain "Dr."),
    # because the first matching prefix wins.
    known_titles = (
        "Prof. Dr.", "Doç. Dr.", "Dr. Öğr. Üyesi", "Yrd. Doç. Dr.",
        "Uzm. Dr.", "Op. Dr.", "Dr. Dt.", "Dt.", "Dr.",
    )
    # Strip first so that leading whitespace cannot hide the title prefix.
    cleaned = (raw_name or "").strip()
    for title in known_titles:
        if cleaned.startswith(title):
            return title, cleaned[len(title):].strip()
    return "", cleaned

# --- Sayfa sayfa soru linklerini topla ---
def collect_paginated_links(base_url, max_pages=100, max_links=2500):
    """Collect unique question-page URLs from a paginated listing.

    Pages are requested one at a time with a one-second pause after each
    request. A page that fails to download is logged and skipped.

    Args:
        base_url: Listing URL template containing one ``{}`` placeholder,
            which is filled with the 1-based page number.
        max_pages: Highest page number to request.
        max_links: Paging stops once at least this many unique links have
            been found; the returned list is capped at this length.

    Returns:
        A list of absolute question URLs, de-duplicated, in first-seen order.
    """
    # A dict serves as an insertion-ordered set, so the result order is
    # reproducible between runs (a plain set of strings is not).
    links = {}
    for page_num in range(1, max_pages + 1):
        url = base_url.format(page_num)
        print(f"📄 Sayfa {page_num} işleniyor...")
        try:
            res = requests.get(url, timeout=10)
            # Treat 4xx/5xx responses as failures instead of scanning an
            # error page for links.
            res.raise_for_status()
            # Bytes are passed so BeautifulSoup detects the encoding itself,
            # matching how the other page parsers in this file read pages.
            soup = BeautifulSoup(res.content, "html.parser")
            for anchor in soup.select("a[href^='/blog/soru/']"):
                links["https://www.doktorsitesi.com" + anchor["href"]] = None
            print(f"🔗 Toplam link: {len(links)}")
            if len(links) >= max_links:
                print("🛑 Maksimum link sayısına ulaşıldı.")
                break
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole crawl.
            print(f"[LINK ERROR] {url} -> {e}")
        time.sleep(1)
    # The last page may push the total past the limit; enforce the cap.
    return list(links)[:max(max_links, 0)]

# --- Soru sayfasını işle ---
def parse_qa_page(url):
    """Fetch a question page and extract its Q&A record from JSON-LD metadata.

    Only the first ``<script type="application/ld+json">`` element of the page
    is inspected.

    Args:
        url: Absolute URL of a question page.

    Returns:
        A dict with the keys ``topic``, ``title``, ``question``, ``answer``,
        ``doctor_name`` and ``doctor_link``, or ``None`` when the page cannot
        be downloaded or does not carry the expected metadata (the error is
        printed).
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures instead of parsing an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")
        script_tag = soup.find("script", type="application/ld+json")
        # ``.string`` is None when the tag does not hold a single text node.
        if not script_tag or not script_tag.string:
            raise ValueError("JSON-LD yok.")
        # Raw control characters (such as line breaks inside string values)
        # are rejected by the JSON parser, so they are replaced with spaces.
        cleaned = re.sub(r"[\x00-\x1F]+", " ", script_tag.string).strip()
        data = pyjson.loads(cleaned)
        # The payload may be a single object or an array of objects; in the
        # array form the first element is used.
        if isinstance(data, list):
            data = data[0]

        entity = data["mainEntity"]
        accepted = entity["acceptedAnswer"]
        topic = "Fiziksel Tıp ve Rehabilitasyon"

        return {
            "topic": topic,
            "title": entity.get("name", ""),
            "question": entity.get("text", ""),
            "answer": accepted.get("text", ""),
            "doctor_name": accepted["author"]["name"],
            "doctor_link": accepted["author"]["url"]
        }
    except Exception as e:
        # Deliberately broad: a single malformed page must not stop the run.
        print(f"[QA ERROR] {url} -> {e}")
        return None

# --- Doktor profilini işle ---
def _jsonld_clinic_and_rating(script_tag):
    """Return ``(clinic_name, rating)`` from a JSON-LD script tag, blank where unavailable."""
    clinic_name = rating = ""
    # ``.string`` is None when the tag does not hold a single text node.
    if not script_tag or not script_tag.string:
        return clinic_name, rating
    try:
        # Raw control characters are rejected by the JSON parser, so they are
        # replaced with spaces before decoding.
        cleaned = re.sub(r"[\x00-\x1F]+", " ", script_tag.string).strip()
        jsonld = pyjson.loads(cleaned)
        # The payload may be a single object or an array of objects.
        if isinstance(jsonld, list):
            jsonld = jsonld[0]
        pos = jsonld.get("hasPOS", [{}])[0]
        full_clinic = pos.get("name", "")
        # Keep only the part after the first comma; without a comma the whole
        # value is kept.
        clinic_name = full_clinic.split(",", 1)[-1].strip()
        rating = jsonld.get("aggregateRating", {}).get("ratingValue", "")
    except (ValueError, TypeError, KeyError, IndexError, AttributeError):
        # This metadata is optional: on malformed or unexpected JSON-LD keep
        # whatever was extracted so far and leave the rest blank.
        pass
    return clinic_name, rating


def _first_direct_text(element):
    """Return the first non-blank text node directly under *element*, stripped, or ``""``."""
    for child in element.contents:
        if isinstance(child, str):
            text = child.strip()
            # Skip whitespace-only nodes (e.g. indentation before a child tag)
            # so they do not mask the text that follows.
            if text:
                return text
    return ""


def parse_doctor_profile(url):
    """Fetch a doctor's profile page and extract a structured record.

    Args:
        url: Absolute URL of the doctor's profile page.

    Returns:
        A dict with the keys ``Name``, ``Title``, ``Specialty``,
        ``ClinicName``, ``ClinicAddress`` (a dict with ``Street``, ``City``
        and ``Post Code``), ``About`` and ``AverageReview``; or ``None`` when
        the page cannot be downloaded or parsed (the error is printed).
        Fields that are missing from the page are empty strings, and
        ``Post Code`` is always empty.
    """
    try:
        res = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures so an error page is never
        # scraped as if it were a profile.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, "html.parser")

        # Split the page heading into title and name.
        raw_name = soup.find("h1").text.strip()
        title, name = extract_title_and_name(raw_name)

        # Specialty
        spec_div = soup.select_one("div.expert-branches p")
        specialty = spec_div.text.strip() if spec_div else ""

        # Clinic name and average rating, taken from JSON-LD when present.
        script_tag = soup.find("script", type="application/ld+json")
        clinic_name, rating = _jsonld_clinic_and_rating(script_tag)

        # Address: street line
        street_el = soup.select_one("div.ta-address-explain")
        street = _first_direct_text(street_el) if street_el else ""

        # City: last path segment of the profile URL.
        city = url.rstrip("/").split("/")[-1].capitalize()

        # Postal code: not extracted from the page, left blank.
        post_code = ""

        # About
        about_section = soup.select_one("div#tabid-1 p")
        about = about_section.text.strip() if about_section else ""

        return {
            "Name": name,
            "Title": title,
            "Specialty": specialty,
            "ClinicName": clinic_name,
            "ClinicAddress": {
                "Street": street,
                "City": city,
                "Post Code": post_code
            },
            "About": about,
            "AverageReview": rating
        }
    except Exception as e:
        # Deliberately broad: a single broken profile must not stop the run.
        print(f"[DOCTOR ERROR] {url} -> {e}")
        return None

# --- JSON kaydet ---
def save_json(data, filename, output_dir="output"):
    """Write *data* as indented UTF-8 JSON to ``output_dir/filename``.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to write.
        filename: File name, optionally with sub-directories, relative to
            *output_dir*.
        output_dir: Directory the file is written under. Defaults to
            ``"output"`` relative to the current working directory.
    """
    path = os.path.join(output_dir, filename)
    parent = os.path.dirname(path)
    # ``parent`` is empty when writing straight into the working directory,
    # and os.makedirs("") would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        pyjson.dump(data, f, ensure_ascii=False, indent=2)

# --- Ana akış ---
def _resolve_doctor_id(doc_url, doctor_data):
    """Return the doctorID for *doc_url*, scraping and caching the profile on first use; ``None`` on failure."""
    cached = doctor_data.get(doc_url)
    if cached is not None:
        return cached["doctorID"]

    print(f"   👨‍⚕️ Doktor işleniyor: {doc_url}")
    doc_info = parse_doctor_profile(doc_url)
    if not doc_info:
        # Failures are not cached, so the profile is retried the next time
        # this doctor appears.
        return None
    doc_info["doctorID"] = str(uuid.uuid4())
    doctor_data[doc_url] = doc_info
    return doc_info["doctorID"]


def main():
    """Scrape the question listing, then each question and its doctor, and save both as JSON.

    Question links are collected first. Each question page is then parsed and
    the answering doctor's profile is fetched once per distinct profile URL
    and given a random UUID. Questions whose page or doctor profile cannot be
    parsed are skipped. The results are written to ``qa_data.json`` and
    ``doctor_data.json`` via ``save_json``.
    """
    print("🔍 Soru linkleri toplanıyor...")
    question_links = collect_paginated_links(
        base_url="https://www.doktorsitesi.com/blog/sorular/fiziksel-tip-ve-rehabilitasyon?sayfa={}",
        max_pages=300,
        max_links=2500,
    )

    qa_data = []
    doctor_data = {}  # profile URL -> doctor record (including "doctorID")
    total = len(question_links)

    for idx, qurl in enumerate(question_links, start=1):
        print(f"[{idx}/{total}] Soru işleniyor...")
        qa = parse_qa_page(qurl)
        doctor_id = _resolve_doctor_id(qa["doctor_link"], doctor_data) if qa else None

        if doctor_id is not None:
            qa_data.append({
                "topic": qa["topic"],
                "title": qa["title"],
                "question": qa["question"],
                "answer": qa["answer"],
                "doctorID": doctor_id
            })

        # Pause after every iteration, including failed ones: skipping the
        # pause on errors would send requests back-to-back exactly when the
        # server is already failing.
        if idx % 100 == 0:
            print(f"   ↪ {idx} soru işlendi, 5 saniye mola...")
            time.sleep(5)
        else:
            time.sleep(0.5)

    save_json(qa_data, "qa_data.json")
    save_json(list(doctor_data.values()), "doctor_data.json")
    print(f"\n✅ {len(qa_data)} soru ve {len(doctor_data)} doktor kaydedildi.")

# --- Çalıştır ---
# Start the scrape only when this file is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
