In [1]:
import requests
import os
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
from selenium.common.exceptions import TimeoutException

In [2]:
def _class_selector(tag, class_attr):
    """Build a CSS selector such as ``div.a.b`` from a tag name and a class value.

    ``class_attr`` may contain several space-separated class names; each one
    becomes its own ``.name`` component, so an element must carry all of them.
    """
    return tag + "".join(f".{name}" for name in class_attr.split())


def fetch_and_save_stats(url, filename_prefix, div_id=None, div_class=None, section_class=None):
    """Render ``url`` in Chrome, extract text from one container and save it.

    The container is chosen by the first selector that is set, in this order:
    ``div_id`` (a ``<div>`` with that id), ``div_class`` (a ``<div>`` carrying
    every listed class), ``section_class`` (a ``<section>`` carrying every
    listed class). With no selector the whole ``<body>`` is used.

    Args:
        url: Page to load.
        filename_prefix: Output path without extension; passed to ``save_data``.
        div_id: ``id`` attribute of the target ``<div>``.
        div_class: Space-separated class names of the target ``<div>``.
        section_class: Space-separated class names of the target ``<section>``.

    Returns:
        None. Failures after the browser has started (timeouts, missing
        element, write errors) are printed instead of raised.
    """
    driver = webdriver.Chrome()
    try:
        # By.CLASS_NAME does not accept compound class names such as
        # "entry-content single-page", so class-based targets are expressed as
        # one CSS selector shared by the Selenium wait and the soup lookup.
        css_selector = None
        if div_id:
            wait_locator = (By.ID, div_id)
        elif div_class:
            css_selector = _class_selector('div', div_class)
            wait_locator = (By.CSS_SELECTOR, css_selector)
        elif section_class:
            css_selector = _class_selector('section', section_class)
            wait_locator = (By.CSS_SELECTOR, css_selector)
        else:
            wait_locator = None

        driver.get(url)
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        if wait_locator is not None:
            wait.until(EC.presence_of_element_located(wait_locator))

        soup = bs(driver.page_source, 'html.parser')

        if div_id:
            target_element = soup.find('div', {'id': div_id})
        elif css_selector:
            target_element = soup.select_one(css_selector)
        else:
            target_element = soup.body  # No selector given: take the whole page.

        if target_element is None:
            print("Error: Element with specified identifier not found.")
            return

        content = []
        for elem in target_element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a']):
            text = elem.get_text(strip=True)
            # Keep the link target next to the anchor text.
            if elem.name == 'a' and elem.has_attr('href'):
                text += f" ({elem['href']})"
            if text:
                content.append(text)

        save_data(content, filename_prefix)
    except Exception as e:
        # Best effort: report the problem and let the caller continue with the next URL.
        print(f"Error occurred: {e}")
    finally:
        # Always close the browser, including on the early return above.
        driver.quit()

def save_data(content, filename_prefix):
    """Write each item of ``content`` on its own line to ``<filename_prefix>.txt``.

    Args:
        content: Iterable of text fragments; each is written followed by a newline.
        filename_prefix: Output path without the ``.txt`` extension.

    The parent directory is created when it does not exist yet, the file is
    written as UTF-8 (overwriting any previous file), and a confirmation line
    is printed.
    """
    txt_filename = f"{filename_prefix}.txt"

    # Create missing parent directories so the open() below cannot fail on them.
    parent_dir = os.path.dirname(txt_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(txt_filename, 'w', encoding='utf-8') as f:
        f.writelines(f"{text}\n" for text in content)
    print(f"Content saved to {txt_filename}")

In [3]:
# Pages to crawl, keyed by a short name that becomes part of the output file name.
# Each entry holds the page URL and the selectors passed to fetch_and_save_stats;
# that function checks div_id before div_class, so div_id decides the container here.
urls = {
    "tuyen_sinh_ton_duc_thang": dict(
        url="https://tuyensinh.edu.vn/dai-hoc/truong-dh-ton-duc-thang-chi-danh-25-chi-tieu-xet-diem-thi-tot-nghiep-thpt/",
        div_id="content",
        div_class="entry-content single-page",
    ),
    "tuyen_sinh_phenikaa": dict(
        url="https://tuyensinh.edu.vn/tin-tuyen-sinh/truong-dai-hoc-phenikaa-cong-bo-5-phuong-thuc-tuyen-sinh-dai-hoc-nam-2022/",
        div_id="content",
        div_class="entry-content single-page",
    ),
}

In [4]:
# Make sure the output directory exists; exist_ok removes the need for a
# separate existence check.
os.makedirs('crawled', exist_ok=True)

# Fetch data: one output file per configured page. Selectors are optional in
# each entry, so missing keys fall back to None instead of raising KeyError.
for stat_type, info in urls.items():
    fetch_and_save_stats(
        info['url'],
        f"./crawled/data_{stat_type}",
        div_id=info.get('div_id'),
        div_class=info.get('div_class'),
        section_class=info.get('section_class'),
    )

Content saved to ./crawled/data_tuyen_sinh_ton_duc_thang.txt
Content saved to ./crawled/data_tuyen_sinh_phenikaa.txt
