# **이전 버전**

In [54]:
import re
import json
from datetime import datetime, timedelta
from collections import defaultdict
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin

# Date-range parsing
def parse_date_range(raw_date: str, year: str):
    """Expand a ``MM.DD`` or ``MM.DD~MM.DD`` label into one datetime per day.

    Args:
        raw_date: A single day (``"03.01"``) or two days separated by ``~``
            (``"03.04~03.10"``). Whitespace around ``~`` and around each
            part is ignored.
        year: Year the first (or only) date belongs to.

    Returns:
        A list of midnight ``datetime`` objects covering every calendar day
        from the start to the end, both inclusive.

    Raises:
        ValueError: If a part does not parse as ``%m.%d`` for the given year.
    """
    parts = re.split(r"\s*~\s*", raw_date)
    start = datetime.strptime(f"{year}.{parts[0].strip()}", "%Y.%m.%d")
    if len(parts) == 1:
        return [start]

    end_label = parts[1].strip()
    end = datetime.strptime(f"{year}.{end_label}", "%Y.%m.%d")
    if end < start:
        # The label carries no year, so an end that precedes the start means
        # the range runs past New Year (e.g. "12.30~01.02"). Without this the
        # range would be empty and the entry would silently disappear.
        end = datetime.strptime(f"{int(year) + 1}.{end_label}", "%Y.%m.%d")

    day_count = (end - start).days + 1
    return [start + timedelta(days=offset) for offset in range(day_count)]

# Weekday lookup
def get_korean_weekday(date_obj):
    """Return the Korean weekday name for *date_obj* ("월요일" through "일요일")."""
    weekday_names = (
        "월요일",
        "화요일",
        "수요일",
        "목요일",
        "금요일",
        "토요일",
        "일요일",
    )
    # weekday() numbers Monday as 0, which matches the tuple order above.
    position = date_obj.weekday()
    return weekday_names[position]

# Semester classification rules
def determine_semester_and_academic_year(date_obj):
    """Classify a date into a semester/vacation label and an academic year.

    Only the calendar month and day are compared, so a ``datetime`` with a
    non-midnight time (or a plain ``date``) lands in the same period as
    midnight of that day instead of falling between two boundaries.

    Periods (both ends inclusive):
        01.01 - 03.01  "겨울방학", academic year = calendar year
        03.02 - 06.22  "1학기"
        06.23 - 08.31  "여름방학"
        09.01 - 12.22  "2학기"
        12.23 - 12.31  "겨울방학", academic year = calendar year + 1

    Args:
        date_obj: Any object exposing ``year``, ``month`` and ``day``.

    Returns:
        A ``(label, academic_year)`` tuple.
    """
    y = date_obj.year
    month_day = (date_obj.month, date_obj.day)

    if month_day <= (3, 1):
        return "겨울방학", y
    if month_day <= (6, 22):
        return "1학기", y
    if month_day <= (8, 31):
        return "여름방학", y
    if month_day <= (12, 22):
        return "2학기", y
    # 12.23 onward: the winter break is counted toward the next academic year.
    return "겨울방학", y + 1

# Crawler
def crawl_hufs_schedule(url: str):
    """Scrape the academic calendar at *url* into a nested per-day mapping.

    Opens *url* in headless Chrome, navigates to the first ``<iframe>`` on
    that page, and parses the ``#timeTableList > ul`` list found there.
    Each schedule entry is expanded to one record per calendar day.

    Args:
        url: Page that embeds the schedule iframe.

    Returns:
        Nested ``defaultdict`` shaped as
        ``{academic_year: {semester: {month_label: {"YYYY-MM-DD":
        {"text": ..., "weekday": ...}}}}}``.
        Only one record is kept per date: when several entries cover the
        same day, the one parsed last overwrites the earlier ones.

    Raises:
        RuntimeError: If the schedule list element is not present.
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        iframe = driver.find_element("tag name", "iframe")
        iframe_url = urljoin(url, iframe.get_attribute("src"))
        driver.get(iframe_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down, even when navigation or the iframe
        # lookup raises; otherwise the Chrome process is left running.
        driver.quit()

    table = soup.select_one("#timeTableList > ul")
    if table is None:
        raise RuntimeError("schedule list '#timeTableList > ul' not found in page")

    year_pattern = re.compile(r"\d{4}")
    items = []
    current_year = None

    for el in table.children:
        # Text nodes between tags have no tag name; skip them.
        if getattr(el, "name", None) is None:
            continue
        if el.name == "div":
            # A <div> carrying a <strong> announces the year for the
            # <li> month blocks that follow it.
            strong = el.find("strong")
            found = year_pattern.search(strong.text) if strong else None
            if found:
                current_year = found.group()
        elif el.name == "li":
            if not current_year:
                # No year marker seen yet: fall back to the page header.
                header = soup.select_one(".month-search-header > p")
                found = year_pattern.search(header.text) if header else None
                if found:
                    current_year = found.group()
            month = el.select_one("p.box-month")
            if not month:
                continue
            month_kr = month.text.strip()

            for e in el.select("div.list-inner > div.list-box"):
                d_tag = e.select_one("p.list-date")
                c_tag = e.select_one("p.list-content")
                if not d_tag or not c_tag:
                    continue
                raw_date = d_tag.text.strip().replace(" ", "").replace("\n", "")
                content = c_tag.get_text(strip=True)

                try:
                    days = parse_date_range(raw_date, current_year)
                except ValueError:
                    # Best-effort crawl: an entry whose date label cannot be
                    # parsed (including the case where no year is known) is
                    # skipped rather than aborting the whole run.
                    continue

                for d in days:
                    items.append({
                        "date": d.strftime("%Y-%m-%d"),
                        "date_obj": d,
                        "text": f"{d.year}년 {d.month}월 {d.day}일은 {content} 기간입니다.",
                        "month_kr": month_kr,
                        "weekday": get_korean_weekday(d),
                    })

    # academic year -> semester -> month label -> date -> record
    structured = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
    for i in items:
        d = i["date_obj"]
        semester, academic_year = determine_semester_and_academic_year(d)
        mon = i["month_kr"]
        structured[str(academic_year)][semester][mon][i["date"]] = {
            "text": i["text"],
            "weekday": i["weekday"],
        }

    return structured

In [55]:
# Run
if __name__ == "__main__":
    url = "https://www.hufs.ac.kr/hufs/11360/subview.do"
    data = crawl_hufs_schedule(url)

    # ensure_ascii=False keeps the Korean text readable in the output file.
    with open("hufs_schedule.json", mode="w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=2))

    print("✅ 학년도 기준으로 hufs_schedule.json 저장 완료!")

✅ 학년도 기준으로 hufs_schedule.json 저장 완료!


In [56]:
data

defaultdict(<function __main__.crawl_hufs_schedule.<locals>.<lambda>()>,
            {'2025': defaultdict(<function __main__.crawl_hufs_schedule.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'겨울방학': defaultdict(dict,
                                      {'3월': {'2025-03-01': {'text': '2025년 3월 1일은 삼일절 기간입니다.',
                                         'weekday': '토요일'}}}),
                          '1학기': defaultdict(dict,
                                      {'3월': {'2025-03-03': {'text': '2025년 3월 3일은 제1학기 개강 ·대체휴업일 기간입니다.',
                                         'weekday': '월요일'},
                                        '2025-03-04': {'text': '2025년 3월 4일은 수강신청 변경 기간입니다.',
                                         'weekday': '화요일'},
                                        '2025-03-05': {'text': '2025년 3월 5일은 수강신청 변경 기간입니다.',
                                         'weekday': '수요일'},
                                        '2025-03-06': {'text': '2025년 3월 6일은 수

# **수정 버전(25.05.19일자 수정)**

In [1]:
import re
import json
from datetime import datetime, timedelta
from collections import defaultdict
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin

# Date-range parsing
def parse_date_range(raw_date: str, year: str):
    """Expand a ``MM.DD`` or ``MM.DD~MM.DD`` label into one datetime per day.

    Args:
        raw_date: A single day (``"03.01"``) or two days separated by ``~``
            (``"03.04~03.10"``). Whitespace around ``~`` and around each
            part is ignored.
        year: Year the first (or only) date belongs to.

    Returns:
        A list of midnight ``datetime`` objects covering every calendar day
        from the start to the end, both inclusive.

    Raises:
        ValueError: If a part does not parse as ``%m.%d`` for the given year.
    """
    parts = re.split(r"\s*~\s*", raw_date)
    start = datetime.strptime(f"{year}.{parts[0].strip()}", "%Y.%m.%d")
    if len(parts) == 1:
        return [start]

    end_label = parts[1].strip()
    end = datetime.strptime(f"{year}.{end_label}", "%Y.%m.%d")
    if end < start:
        # The label carries no year, so an end that precedes the start means
        # the range runs past New Year (e.g. "12.30~01.02"). Without this the
        # range would be empty and the entry would silently disappear.
        end = datetime.strptime(f"{int(year) + 1}.{end_label}", "%Y.%m.%d")

    day_count = (end - start).days + 1
    return [start + timedelta(days=offset) for offset in range(day_count)]

# Weekday lookup
def get_korean_weekday(date_obj):
    """Return the Korean weekday name for *date_obj* ("월요일" through "일요일")."""
    weekday_names = (
        "월요일",
        "화요일",
        "수요일",
        "목요일",
        "금요일",
        "토요일",
        "일요일",
    )
    # weekday() numbers Monday as 0, which matches the tuple order above.
    position = date_obj.weekday()
    return weekday_names[position]

# Semester classification rules
def determine_semester_and_academic_year(date_obj):
    """Classify a date into a semester/vacation label and an academic year.

    Only the calendar month and day are compared, so a ``datetime`` with a
    non-midnight time (or a plain ``date``) lands in the same period as
    midnight of that day instead of falling between two boundaries.

    Periods (both ends inclusive):
        01.01 - 03.01  "겨울방학", academic year = calendar year
        03.02 - 06.22  "1학기"
        06.23 - 08.31  "여름방학"
        09.01 - 12.22  "2학기"
        12.23 - 12.31  "겨울방학", academic year = calendar year + 1

    Args:
        date_obj: Any object exposing ``year``, ``month`` and ``day``.

    Returns:
        A ``(label, academic_year)`` tuple.
    """
    y = date_obj.year
    month_day = (date_obj.month, date_obj.day)

    if month_day <= (3, 1):
        return "겨울방학", y
    if month_day <= (6, 22):
        return "1학기", y
    if month_day <= (8, 31):
        return "여름방학", y
    if month_day <= (12, 22):
        return "2학기", y
    # 12.23 onward: the winter break is counted toward the next academic year.
    return "겨울방학", y + 1

# Crawler
def crawl_hufs_schedule(url: str):
    """Scrape the academic calendar at *url* into a nested per-day mapping.

    Opens *url* in headless Chrome, navigates to the first ``<iframe>`` on
    that page, and parses the ``#timeTableList > ul`` list found there.
    Each schedule entry is expanded to one record per calendar day, and all
    records for the same day are accumulated in a list.

    Args:
        url: Page that embeds the schedule iframe.

    Returns:
        Nested ``defaultdict`` shaped as
        ``{academic_year: {semester: {month_label: {"YYYY-MM-DD":
        {"events": [{"text": ..., "weekday": ...}, ...]}}}}}``.

    Raises:
        RuntimeError: If the schedule list element is not present.
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        iframe = driver.find_element("tag name", "iframe")
        iframe_url = urljoin(url, iframe.get_attribute("src"))
        driver.get(iframe_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down, even when navigation or the iframe
        # lookup raises; otherwise the Chrome process is left running.
        driver.quit()

    table = soup.select_one("#timeTableList > ul")
    if table is None:
        raise RuntimeError("schedule list '#timeTableList > ul' not found in page")

    year_pattern = re.compile(r"\d{4}")
    items = []
    current_year = None

    for el in table.children:
        # Text nodes between tags have no tag name; skip them.
        if getattr(el, "name", None) is None:
            continue
        if el.name == "div":
            # A <div> carrying a <strong> announces the year for the
            # <li> month blocks that follow it.
            strong = el.find("strong")
            found = year_pattern.search(strong.text) if strong else None
            if found:
                current_year = found.group()
        elif el.name == "li":
            if not current_year:
                # No year marker seen yet: fall back to the page header.
                header = soup.select_one(".month-search-header > p")
                found = year_pattern.search(header.text) if header else None
                if found:
                    current_year = found.group()
            month = el.select_one("p.box-month")
            if not month:
                continue
            month_kr = month.text.strip()

            for e in el.select("div.list-inner > div.list-box"):
                d_tag = e.select_one("p.list-date")
                c_tag = e.select_one("p.list-content")
                if not d_tag or not c_tag:
                    continue
                raw_date = d_tag.text.strip().replace(" ", "").replace("\n", "")
                content = c_tag.get_text(strip=True)

                try:
                    days = parse_date_range(raw_date, current_year)
                except ValueError:
                    # Best-effort crawl: an entry whose date label cannot be
                    # parsed (including the case where no year is known) is
                    # skipped rather than aborting the whole run.
                    continue

                for d in days:
                    items.append({
                        "date": d.strftime("%Y-%m-%d"),
                        "date_obj": d,
                        "text": f"{d.year}년 {d.month}월 {d.day}일은 {content} 기간입니다.",
                        "month_kr": month_kr,
                        "weekday": get_korean_weekday(d),
                    })

    # academic year -> semester -> month label -> date -> {"events": [...]}
    structured = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict))))

    # Accumulate instead of assigning so that several entries falling on the
    # same day are all kept.
    for i in items:
        d = i["date_obj"]
        semester, academic_year = determine_semester_and_academic_year(d)
        mon = i["month_kr"]
        date_key = i["date"]

        structured[str(academic_year)][semester][mon][date_key].setdefault("events", []).append({
            "text": i["text"],
            "weekday": i["weekday"],
        })

    return structured

In [2]:
# Run
if __name__ == "__main__":
    url = "https://www.hufs.ac.kr/hufs/11360/subview.do"
    data = crawl_hufs_schedule(url)

    # ensure_ascii=False keeps the Korean text readable in the output file.
    with open("hufs_schedule.json", mode="w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=2))

    print("✅ 학년도 기준으로 hufs_schedule.json 저장 완료!")

✅ 학년도 기준으로 hufs_schedule.json 저장 완료!
