In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import os
from bs4 import Tag, NavigableString
from tqdm import tqdm


def get_links(url, prefix="https://youmed.vn/tin-tuc/", timeout=30):
    """Collect the links found inside the ``<main>`` element of a page.

    Args:
        url: Address of the page to download.
        prefix: Only links whose absolute URL starts with this string are kept.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        A set of absolute URLs. Empty when the page has no ``<main>`` element.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of harvesting links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    links = set()

    main_tag = soup.find("main")
    if not main_tag:
        return links  # No <main> element on the page.

    for a_tag in main_tag.find_all("a", href=True):
        # Skip anchors nested under an element with the desktop-menu class.
        if a_tag.find_parent(class_="primary-desktop-menu"):
            continue
        href = a_tag["href"]
        if href.startswith("/"):
            # Resolve root-relative (and protocol-relative "//host/...") hrefs
            # against the page that was fetched rather than a hard-coded host.
            href = urljoin(url, href)
        if href.startswith(prefix):
            links.add(href)

    return links


def extract_article_content(article_soup):
    """Convert the first ``<article>`` of a parsed page into nested data.

    Returns ``None`` when the page has no ``<article>`` element. Otherwise
    returns a 2-tuple ``(tree, groupts)``:

    * ``tree`` - the article converted recursively: a tag with usable children
      becomes ``{tag_name: [children...]}``; a tag without them becomes
      ``{tag_name: text}`` (the ``src`` attribute for ``img``); text nodes
      become stripped strings; anything empty is dropped (``None``).
    * ``groupts`` - the link texts of the ``text-sm text-gray-600`` element
      when its text contains "Trang chủ", otherwise an empty list.
    """
    article_tag = article_soup.find("article")
    if not article_tag:
        return None

    # Presumably the breadcrumb trail of the page; it is only trusted when it
    # mentions "Trang chủ".
    nav_box = article_soup.find(class_="text-sm text-gray-600")
    crumbs = []
    if nav_box and "Trang chủ" in nav_box.get_text():
        for anchor in nav_box.find_all("a"):
            crumbs.append(anchor.get_text(strip=True))

    def walk(node):
        """Return the dict/str form of ``node``, or ``None`` if it is empty."""
        if not isinstance(node, Tag):
            if isinstance(node, NavigableString):
                return str(node).strip() or None
            return None

        # Every non-None result is already a dict or a non-blank string.
        kept = [item for item in map(walk, node.children) if item is not None]
        if kept:
            return {node.name: kept}

        # Leaf tag: images are represented by their source, others by text.
        leaf = node.get("src") if node.name == "img" else node.get_text(strip=True)
        return {node.name: leaf} if leaf else None

    return (walk(article_tag), crumbs)


def crawl_and_save_articles(list_links, output_dir, timeout=30):
    """Download each article and store its parsed content as a JSON file.

    Every link is fetched, converted with ``extract_article_content`` and
    written to ``<output_dir>/<slug>.json``, where ``slug`` is the last path
    segment of the link. Each file holds
    ``{"content": [...], "groupts": [...]}``. Problems with a single link are
    reported with ``print`` and do not stop the loop.

    Args:
        list_links: Iterable of article URLs.
        output_dir: Directory for the JSON files; created if missing.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    for link in tqdm(list_links):
        try:
            # Bound the wait and treat HTTP error statuses as failures so an
            # error page is reported instead of being parsed as an article.
            resp = requests.get(link, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.content, "html.parser")

            result = extract_article_content(soup)
            parsed, groupts = result if result is not None else (None, [])

            blocks = None
            if parsed and isinstance(parsed, dict) and "article" in parsed:
                body = parsed["article"]
                if isinstance(body, list):
                    # The last top-level child of <article> is discarded -
                    # presumably a trailing non-content block; TODO confirm.
                    blocks = body[:-1]
                elif body:
                    # Text-only article: keep the whole string. Slicing it
                    # would chop its final character and break the schema.
                    blocks = [body]

            if not blocks:
                print(f"Không tìm thấy nội dung article cho: {link}")
                continue

            # File name comes from the last path segment of the link.
            slug = link.rstrip("/").split("/")[-1]
            out_path = os.path.join(output_dir, f"{slug}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(
                    {"content": blocks, "groupts": groupts},
                    f,
                    ensure_ascii=False,
                    indent=2,
                )
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Lỗi với link {link}: {e}")


In [4]:
# Collect the links found on the "trieu-chung-benh" page.
url = "https://youmed.vn/tin-tuc/trieu-chung-benh/"
benh_links = get_links(url)


# Membership check for one specific article URL (its path is under "mang-thai/").
'https://youmed.vn/tin-tuc/mang-thai/dinh-duong-chuan-bi-mang-thai/' in benh_links

False

In [5]:
# Display the collected set of links for inspection.
benh_links

{'https://youmed.vn/tin-tuc/',
 'https://youmed.vn/tin-tuc/10-su-that-ve-hoi-chung-tourette-ban-khong-the-bo-qua/',
 'https://youmed.vn/tin-tuc/am-thoi-tai-tim-binh-thuong-hay-benh-ly/',
 'https://youmed.vn/tin-tuc/ap-xe-ma-nguyen-nhan-dau-hieu-va-cach-dieu-tri/',
 'https://youmed.vn/tin-tuc/ap-xe-phoi-nhung-kien-thuc-co-ban/',
 'https://youmed.vn/tin-tuc/bach-cau-cap-tre-em-cach-nhan-biet-va-dieu-tri-benh-hieu-qua/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-can-benh-basedow/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-dong-kinh-thuy-thai-duong/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-hoi-chung-klinefelter/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-hoi-chung-turner/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-phuc-hoi-chuc-nang-sau-dot-quy/',
 'https://youmed.vn/tin-tuc/ban-biet-gi-ve-roi-loan-khiem-khuyet-co-the/',
 'https://youmed.vn/tin-tuc/ban-co-dang-bi-moi-mat/',
 'https://youmed.vn/tin-tuc/ban-da-biet-cach-phong-ngua-cam-cum-trong-mua-lanh-nay/',
 'https://youmed.vn/

In [None]:
# Sort while converting to a list: set iteration order is arbitrary, so
# positional indexing would otherwise pick different links on every kernel run.
benh_links = sorted(benh_links)

In [None]:
# Crawl only the element at index 2 (the slice 2:3 yields at most one link)
# and write its JSON file under ./data/Tra cứu bệnh/.
crawl_and_save_articles(benh_links[2:3], "./data/Tra cứu bệnh/")