In [None]:
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
import json

class Chapter(BaseModel):
    """One chapter of the scraped page: its heading plus the entries filed under it."""

    # Heading text of the chapter (fetch_and_extract_text takes it from the
    # <strong> tag inside the chapter's <p> element).
    chapter: str
    # Entries grouped under this chapter. fetch_and_extract_text stores
    # Metadata objects here, but the annotation is a plain list, so the
    # element type is not declared.
    metadatas: list

class Metadata(BaseModel):
    """A single title/content pair; fetch_and_extract_text builds one per <dt>/<dd> pair."""

    # Stripped text of a <dt> element.
    title: str
    # Stripped text of the <dd> element at the same position as the <dt>.
    content: str

def fetch_and_extract_text(url, timeout=30, index_ranges=None):
    """Download a page and group its ``<dt>``/``<dd>`` entries by chapter.

    Chapters are the ``<p>`` elements whose ``id`` starts with ``"Chuong"``.
    Every ``<dt>`` and ``<dd>`` on the page is collected in document order,
    and each chapter receives the slice of those pairs that ``index_ranges``
    lists for its 1-based position among the chapters.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the call forever.
        index_ranges: Optional mapping from 1-based chapter position to a
            ``(start, end)`` half-open slice into the page-wide ``<dt>``/
            ``<dd>`` lists. Defaults to the built-in table covering five
            chapters. Positions missing from the mapping get no entries.

    Returns:
        A list of ``Chapter`` objects in page order. The list is empty when
        the request fails (the error is printed) or no chapter is found.
    """
    if index_ranges is None:
        # Slice boundaries of each chapter's entries, hard-coded for the
        # page layout this scraper was written against.
        index_ranges = {
            1: (0, 4),
            2: (4, 20),
            3: (20, 30),
            4: (30, 33),
            5: (33, 37),
        }

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        # Always hand back a list so callers can iterate without a None check.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # The <dt>/<dd> texts are page-wide, so collect them once instead of
    # re-scanning the document for every chapter.
    titles = [dt.get_text(strip=True) for dt in soup.find_all('dt')]
    contents = [dd.get_text(strip=True) for dd in soup.find_all('dd')]

    data = []
    chapter_tags = soup.find_all('p', id=lambda x: x and x.startswith('Chuong'))

    for index, chapter_tag in enumerate(chapter_tags, start=1):
        # Prefer the <strong> heading; fall back to the paragraph's own text
        # when the heading is not wrapped in <strong>.
        strong = chapter_tag.find('strong')
        heading = strong if strong is not None else chapter_tag
        chapter_title = heading.get_text(strip=True)

        start_idx, end_idx = index_ranges.get(index, (0, 0))
        metadatas = [
            Metadata(title=title, content=content)
            for title, content in zip(
                titles[start_idx:end_idx], contents[start_idx:end_idx]
            )
        ]
        data.append(Chapter(chapter=chapter_title, metadatas=metadatas))

    return data


In [None]:
# URL of the webpage to crawl
url = "https://undergrad.tdtu.edu.vn/hoc-vu/quy-che-chuc-va-quan-ly-dao-tao-trinh-do-dai-hoc-khoa-ts2021-tro-ve-sau#ChuongI"

# Fetch and extract text from the URL
extracted_data = fetch_and_extract_text(url)

if not extracted_data:
    # Nothing came back: either the request failed (the fetcher has already
    # printed the error) or the page had no chapters. Skip the write so an
    # earlier output file is not replaced with an empty one.
    print("No data extracted; extracted_data.json was not written.")
else:
    # Convert the models to plain dictionaries so json can serialize them
    extracted_data_dict = [item.dict() for item in extracted_data]
    print(extracted_data)
    # Save to JSON file; ensure_ascii=False keeps non-ASCII text unescaped
    with open('extracted_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(extracted_data_dict, json_file, ensure_ascii=False, indent=4)