In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import json
import time
from tqdm import tqdm
import numpy as np

BASE_URL = "https://www.hufs.ac.kr"
LIST_URL = "https://www.hufs.ac.kr/hufs/11282/subview.do"

target_ids = ['176988','140225','133460', '126859','228507','228277','226933','226908', '224782','223977','222312','214963']



def get_driver():
    options = Options()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(options=options)
    return driver


def parse_detail(driver):
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.view-con'))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_div = soup.select_one('div.view-con')

    for tag in content_div.select('script, style'):
        tag.decompose()

    text_parts = [p.get_text(" ", strip=True) for p in content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'li'])]
    page_content = "\n\n".join(text_parts)

    tables = []
    for table in content_div.find_all('table'):
        df = pd.read_html(str(table))[0]
        df = df.replace({np.nan: '정보없음'})
        tables.append(df.values.tolist())
        page_content += "\n\n" + df.to_markdown(index=False)

    return page_content, tables


def crawl_hufs_notice():
    driver = get_driver()
    driver.get(LIST_URL)
    result = []
    page_num = 1
    collected_notice_ids = set()

    # 중요공지 먼저 수집
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'table.board-table tbody tr'))
    )
    rows = driver.find_elements(By.CSS_SELECTOR, 'table.board-table tbody tr')

    for row in rows:
        is_notice = 'notice' in row.get_attribute('class')
        if not is_notice:
            continue

        link = row.find_element(By.CSS_SELECTOR, 'td.td-subject a').get_attribute('href')
        ntt_id = link.split('/')[-2]

        if ntt_id not in target_ids or ntt_id in collected_notice_ids:
            continue

        title = row.find_element(By.CSS_SELECTOR, 'td.td-subject').text.strip()
        author = row.find_element(By.CSS_SELECTOR, 'td.td-write').text.strip()
        date = row.find_element(By.CSS_SELECTOR, 'td.td-date').text.strip()

        row.find_element(By.CSS_SELECTOR, 'td.td-subject a').click()
        time.sleep(1)

        page_content, tables = parse_detail(driver)

        result.append({
            'title': title or '정보없음',
            'author': author or '정보없음',
            'date': date or '정보없음',
            'url': link or '정보없음',
            'ntt_id': ntt_id or '정보없음',
            'page_content': page_content or '정보없음',
            'tables': tables or []
        })

        collected_notice_ids.add(ntt_id)
        driver.back()
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'table.board-table tbody tr'))
        )

    # 일반공지 수집 시작
    while True:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'table.board-table tbody tr'))
        )

        rows = driver.find_elements(By.CSS_SELECTOR, 'table.board-table tbody tr')

        general_rows = [r for r in rows if 'notice' not in r.get_attribute('class')]

        for row in tqdm(general_rows):
            title = row.find_element(By.CSS_SELECTOR, 'td.td-subject').text.strip()
            author = row.find_element(By.CSS_SELECTOR, 'td.td-write').text.strip()
            date = row.find_element(By.CSS_SELECTOR, 'td.td-date').text.strip()
            link = row.find_element(By.CSS_SELECTOR, 'td.td-subject a').get_attribute('href')
            ntt_id = link.split('/')[-2]

            if ntt_id in target_ids:
                continue

            row.find_element(By.CSS_SELECTOR, 'td.td-subject a').click()
            time.sleep(1)

            page_content, tables = parse_detail(driver)

            result.append({
                'title': title or '정보없음',
                'author': author or '정보없음',
                'date': date or '정보없음',
                'url': link or '정보없음',
                'ntt_id': ntt_id or '정보없음',
                'page_content': page_content or '정보없음',
                'tables': tables or []
            })

            driver.back()
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'table.board-table tbody tr'))
            )

        if page_num >= 6:
            break

        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, '#menu11282_obj152 > div._fnctWrap > form:nth-child(3) > div > div > a._listNext')
            if 'disabled' in next_btn.get_attribute('class'):
                break
            next_btn.click()
            time.sleep(1)
            page_num += 1
        except:
            break

    driver.quit()

    with open('hufs_notice.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)



In [12]:
if __name__ == '__main__':
    crawl_hufs_notice()

  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
100%|███████████████