In [1]:
# %% [설치 - 필요시만 실행]
# %pip install requests beautifulsoup4 lxml pandas urllib3


In [2]:
# %% [임포트 & 전역 설정]
from __future__ import annotations
import os, re, time
from dataclasses import dataclass, asdict
from typing import Optional, List, Dict, Tuple
from urllib.parse import urljoin, urlparse, parse_qs, quote

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
import pandas as pd

LIST_URL = "https://maruartcenter.co.kr/default/exhibit/exhibit01.php?sub=01"
PARSER = "lxml"

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8",
}
TIMEOUT = 15
SLEEP_BETWEEN = 0.4
MAX_PAGES = 1
IMG_DIR = "maru_images"
DOWNLOAD_IMAGES = False
MAX_IMGS_PER_POST = None


In [3]:
# %% [세션/유틸 함수]
def get_session() -> requests.Session:
    s = requests.Session()
    s.headers.update(HEADERS)
    retry = Retry(
        total=3,
        backoff_factor=0.4,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"]
    )
    s.mount("http://", HTTPAdapter(max_retries=retry, pool_connections=8, pool_maxsize=16))
    s.mount("https://", HTTPAdapter(max_retries=retry, pool_connections=8, pool_maxsize=16))
    return s

def _make_soup(html: str) -> BeautifulSoup:
    try:
        return BeautifulSoup(html, PARSER)
    except Exception:
        return BeautifulSoup(html, "html.parser")

def _get_soup(url: str, s: requests.Session, *, referrer: Optional[str] = None) -> BeautifulSoup:
    headers = dict(HEADERS)
    if referrer:
        headers["Referer"] = referrer
    r = s.get(url, headers=headers, timeout=TIMEOUT)
    if not r.encoding or r.encoding.lower() in ("iso-8859-1", "ascii"):
        r.encoding = r.apparent_encoding or r.encoding
    return _make_soup(r.text)

def _clean_text(txt: str) -> str:
    if not txt:
        return ""
    txt = txt.replace("\xa0", " ")
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

def _abs_url(base: str, u: str) -> str:
    return urljoin(base, u)


In [4]:
# %% [목록 → 상세 링크 수집]
def list_current_detail_urls(list_url: str = LIST_URL, *, max_pages: int = MAX_PAGES) -> List[str]:
    out: List[str] = []
    seen = set()
    with get_session() as s:
        for page in range(1, max_pages + 1):
            url = list_url
            if page > 1:
                sep = '&' if '?' in url else '?'
                url = f"{url}{sep}com_board_page={page}"
            soup = _get_soup(url, s)
            for a in soup.select('a[href*="exhibit01.php"][href*="read_form"], a[href*="com_board_idx="]'):
                href = a.get("href")
                if not href:
                    continue
                u = urljoin(list_url, href)
                if "read_form" in u and "com_board_idx=" in u:
                    if u not in seen:
                        seen.add(u)
                        out.append(u)
            time.sleep(SLEEP_BETWEEN)
    return out


In [5]:
# %% [패턴 정의 & 테이블 필드 추출]
_PERIOD_PATTERNS = [
    re.compile(r"(?P<s>\d{4}\.\s*\d{1,2}\.\s*\d{1,2})\s*[-–~]\s*(?P<e>\d{1,2}\.\s*\d{1,2})"),
    re.compile(r"(?P<s>\d{4}\.\s*\d{1,2}\.\s*\d{1,2})\s*[-–~]\s*(?P<e>\d{4}\.\s*\d{1,2}\.\s*\d{1,2})"),
    re.compile(r"기간\s*[:：]?\s*(?P<s>\d{4}\.\s*\d{1,2}\.\s*\d{1,2})\s*[-–~]\s*(?P<e>\d{1,2}\.\s*\d{1,2}(?:\.\s*\d{1,2})?)"),
]

_SECTION_LABELS = [
    ("전시설명", re.compile(r"^\s*\[?전시\s*설명\]?\s*$")),
    ("전시서문", re.compile(r"^\s*\[?전시\s*서문\]?\s*$")),
    ("작가노트", re.compile(r"^\s*\[?작가\s*노트\]?\s*$")),
    ("작가의 글", re.compile(r"^\s*\[?작가의\s*글\]?\s*$")),
]

def _extract_table_field(soup: BeautifulSoup, label_keywords: List[str]) -> Optional[str]:
    for tr in soup.select("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            continue
        left = _clean_text(tds[0].get_text(" "))
        right = _clean_text(tds[1].get_text(" "))
        if not left:
            left, right = right, left
        if any(kw in left for kw in label_keywords):
            if right:
                return right
    return None

def _extract_title(soup: BeautifulSoup) -> Optional[str]:
    t = _extract_table_field(soup, ["제목"])
    if t:
        return t
    m = soup.find("meta", attrs={"property": "og:title"})
    if m and m.get("content"):
        t = _clean_text(m["content"])
        if t:
            return t
    candidates = []
    for sel in ["h1", "h2", ".tit", ".title", ".subject", ".board_tit", ".view_tit"]:
        for el in soup.select(sel):
            txt = _clean_text(el.get_text(" "))
            if txt:
                candidates.append((len(txt), txt))
    if candidates:
        candidates.sort(reverse=True)
        return candidates[0][1]
    if soup.title and soup.title.string:
        return _clean_text(soup.title.string)
    return None

def _extract_period_from_table_or_text(soup: BeautifulSoup) -> Optional[str]:
    p = _extract_table_field(soup, ["기간", "전시기간", "전시 일정", "DATE"])
    if p:
        return p
    text = _clean_text(soup.get_text("\n"))
    for pat in _PERIOD_PATTERNS:
        m = pat.search(text)
        if m:
            s = _clean_text(m.group("s")); e = _clean_text(m.group("e"))
            return f"{s} - {e}"
    for line in text.splitlines():
        if "기간" in line:
            line = _clean_text(line)
            if len(line) > 3:
                return line
    return None


In [6]:
# %% [본문 스코프 & 푸터/노이즈 필터 헬퍼]
_CONTENT_SELECTORS = "#post_area, .board_view, .view, .view_area, .view_con, .board, #board, .content, .editor, .viewDetail"
_FOOTER_SELECTORS = "footer, #footer, .footer, .foot, .ft, .bottom, .site-info, address"

_FOOTER_PAT = re.compile(
    r"("
    r"개인정보(처리|취급)방침|이메일무단수집거부|오시는길|고객센터|회사\s*:|대표자|사업자|사업자등록|"
    r"주소\s*:|Tel\s*:|Fax\s*:|EMAIL\s*:|E-?mail\s*:|Copyright|COPYRIGHT|All\s+Right[s]?\s+Reserved"
    r")",
    re.IGNORECASE
)

_UI_NOISE_SELECTORS = """
script, style, noscript,
#footer, footer, .footer, .foot, .ft, .bottom, .site-info, address,
.board_buttons, .board_btn, .post_btn, .post_buttons, .view_btns, .btn_area,
.comment, .comments, #comments, #comment, .reply, #reply,
#vote, .vote, .rating, .evaluate, .post_evaluate,
.pagination, .pager, .navi, .nav,
.sns_share, .share, .share_box,
.prev, .next, .list, .btn, .bx-wrapper, .slider, .slide
""".replace("\n", " ")

def get_content_scope(soup: BeautifulSoup) -> Tag:
    scope = soup.select_one(_CONTENT_SELECTORS) or soup
    for f in scope.select(_FOOTER_SELECTORS):
        f.decompose()
    return scope

def is_footer_text(text: str) -> bool:
    t = _clean_text(text)
    if not t:
        return False
    return bool(_FOOTER_PAT.search(t))

def trim_footer_tail(block_text: str) -> str:
    if not block_text:
        return block_text
    lines = [l.rstrip() for l in block_text.splitlines()]
    cut = len(lines)
    for i, line in enumerate(lines):
        if is_footer_text(line):
            cut = i
            break
    cleaned = "\n".join(lines[:cut]).rstrip()
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned

def prune_noise_nodes(root: Tag) -> None:
    for n in root.select(_UI_NOISE_SELECTORS):
        n.decompose()
    for c in root.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()
    for el in root.find_all(True):
        style = (el.get("style") or "").lower()
        if "display:none" in style or "visibility:hidden" in style:
            el.decompose()


In [7]:
# %% [섹션 추출 헬퍼]
_LABEL_RE = re.compile(r"^\s*\[?(전시\s*설명|전시\s*서문|작가\s*노트|작가의\s*글)\]?\s*$")
_LABEL_ANY_RE = re.compile(r"\[?\s*(전시\s*설명|전시\s*서문|작가\s*노트|작가의\s*글)\s*\]?")

_HEADER_NOISE_PAT = re.compile(
    r"^\s*("
    r"현재전시|제목\s*:?\s*|기간\s*:?\s*|장소\s*:?\s*|관람시간|관람료|문의|"
    r"게시물\s*평가|댓글\s*쓰기|댓글\s*목록|목록|이전\s*다음|이전|다음|추천하기|수정하기|삭제하기|답글쓰기|글쓰기|"
    r"슬라이드|게시판\s*끝|컨텐츠\s*끝|FOOTER|▼|▲|[-–—]"
    r")\s*$",
    re.IGNORECASE
)

def _is_label_text(t: str) -> bool:
    return bool(_LABEL_RE.match(_clean_text(t or "")))

def _strip_leading_until_label(text: str) -> str:
    if not text:
        return text
    m = _LABEL_ANY_RE.search(text)
    return text[m.end():].lstrip() if m else text

_JS_TRASH_PAT = re.compile(r"(function\s+\w+\s*\(|window\.onload|document\.getElementById|var\s+\w+\s*=)", re.I)
def sanitize_text_block(text: str) -> str:
    if not text:
        return text
    lines = []
    for ln in text.splitlines():
        t = ln.strip()
        if t and (_JS_TRASH_PAT.search(t) or _HEADER_NOISE_PAT.match(t)):
            continue
        lines.append(ln)
    out = "\n".join(lines)
    return re.sub(r"\n{3,}", "\n\n", out).strip()

def _nearest_block(tag: Tag) -> Tag:
    cur = tag
    while cur and isinstance(cur, Tag) and cur.name.lower() not in {"p", "div", "li", "section", "article"}:
        cur = cur.parent
    return cur if isinstance(cur, Tag) else tag

def _collect_following_text(start: Tag) -> str:
    buf: List[str] = []
    it = start.next_elements
    first = True
    for el in it:
        if first:
            first = False
            continue

        if isinstance(el, Tag):
            name = el.name.lower()
            if name in {"footer", "address", "hr", "h1", "h2", "h3", "h4"}:
                break
            if el.select_one(_FOOTER_SELECTORS):
                break

        if isinstance(el, NavigableString):
            txt = _clean_text(str(el))
        elif isinstance(el, Tag):
            txt = _clean_text(el.get_text("\n"))
        else:
            txt = ""

        if not txt:
            continue
        if _is_label_text(txt):
            break
        if _HEADER_NOISE_PAT.match(txt.strip()):
            continue

        buf.append(txt)
        if sum(len(x) for x in buf) > 12000:
            break

    out = "\n".join(x for x in buf if x).strip()
    out = trim_footer_tail(out)
    out = _strip_leading_until_label(out)
    out = sanitize_text_block(out)
    return out

def _extract_sections(soup: BeautifulSoup) -> Dict[str, str]:
    root = get_content_scope(soup)
    prune_noise_nodes(root)
    textmap: Dict[str, str] = {}

    for node in root.find_all(string=True):
        s = _clean_text(str(node))
        if not s:
            continue
        if _is_label_text(s):
            block = _nearest_block(node if isinstance(node, Tag) else node.parent)
            content = _collect_following_text(block)
            if content:
                label = _clean_text(s).strip("[]")
                textmap[label] = content

    if not textmap:
        for lab, pat in _SECTION_LABELS:
            for el in root.find_all(["h1", "h2", "h3", "strong", "b", "p", "div"]):
                t = _clean_text(el.get_text(" "))
                if pat.match(t):
                    content = _collect_following_text(el)
                    if content:
                        textmap[lab] = content

    if not textmap:
        paragraphs = [
            _clean_text(p.get_text("\n"))
            for p in root.find_all(["p", "div"])
        ]

        cleaned_paras = []
        for para in paragraphs:
            lines = [ln for ln in para.splitlines() if ln.strip()]
            if lines and all(_HEADER_NOISE_PAT.match(ln) for ln in lines):
                continue
            cleaned_paras.append(para)

        cleaned_paras = [
            sanitize_text_block(trim_footer_tail(_strip_leading_until_label(x)))
            for x in cleaned_paras if x and len(x) >= 40
        ]
        cleaned_paras.sort(key=len, reverse=True)
        if cleaned_paras:
            textmap["본문"] = cleaned_paras[0]

    return textmap


In [8]:
# %% [이미지 수집 & 다운로드]
_IMG_EXT = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif")

_EXCLUDE_PATH_SUBSTR = [
    "/default/img/common/",
    "/img/common/",
    "/component/board/board_10/list.gif",
    "/component/board/board_10/write.gif",
]
_EXCLUDE_NAME_EXACT = {
    "icon-phone.png", "icon-insta.png", "icon-blog.png", "icon-map.png",
    "icon-top.png", "logo.png", "logo-m.png", "logo-f.png",
}
_EXCLUDE_NAME_PREFIX = ("icon", "logo")

def _norm_name_from_url(u: str) -> str:
    name = os.path.basename(urlparse(u).path)
    name = re.sub(r'^thumb-', '', name, flags=re.IGNORECASE)
    name = re.sub(r'_(\d+)x(\d+)(?=\.[A-Za-z0-9]+$)', '', name)
    return name.lower()

def dedupe_img_urls_by_key(img_urls: List[str]) -> List[str]:
    uniq, seen = [], set()
    for u in img_urls:
        key = _norm_name_from_url(u)
        if key and key not in seen:
            seen.add(key)
            uniq.append(u)
    return uniq

def _should_keep_image(u: str) -> bool:
    p = urlparse(u).path.lower()
    name = os.path.basename(p)
    for sub in _EXCLUDE_PATH_SUBSTR:
        if sub in p:
            return False
    if name in _EXCLUDE_NAME_EXACT:
        return False
    if name.startswith(_EXCLUDE_NAME_PREFIX):
        return False
    return True

def collect_image_urls(detail_url: str, soup: BeautifulSoup) -> List[str]:
    urls: List[str] = []
    scope = get_content_scope(soup)
    for img in scope.find_all("img"):
        cand = None
        for attr in ("src", "data-src", "data-original", "data-lazy", "data-echo"):
            v = img.get(attr)
            if v and isinstance(v, str):
                cand = v
                break
        if not cand:
            continue
        u = _abs_url(detail_url, cand)
        path = urlparse(u).path.lower()
        if ((not os.path.splitext(path)[1]) or path.endswith(_IMG_EXT)) and _should_keep_image(u):
            urls.append(u)
    urls = dedupe_img_urls_by_key(urls)
    return urls

def _filename_from_url_or_headers(url: str, resp) -> str:
    base = os.path.basename(urlparse(url).path)
    if base:
        return base
    cd = resp.headers.get("Content-Disposition", "")
    m = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', cd)
    if m:
        return m.group(1)
    ctype = (resp.headers.get("Content-Type") or "").lower()
    if "png" in ctype: return "image.png"
    if "webp" in ctype: return "image.webp"
    return "image.jpg"

def download_images_from_urls(detail_url: str, img_urls: List[str], img_dir: str = IMG_DIR, max_imgs: Optional[int] = MAX_IMGS_PER_POST) -> List[str]:
    if not img_urls:
        return []
    img_urls = dedupe_img_urls_by_key(img_urls)
    qs = parse_qs(urlparse(detail_url).query)
    post_id = qs.get("com_board_idx", ["unknown"])[0]
    subdir = os.path.join(img_dir, re.sub(r"[^0-9A-Za-z_-]", "_", post_id))
    os.makedirs(subdir, exist_ok=True)

    saved: List[str] = []
    tried = 0
    with get_session() as s:
        _ = _get_soup(detail_url, s)
        for u in img_urls:
            if max_imgs is not None and tried >= max_imgs:
                break
            tried += 1
            try:
                r = s.get(
                    u,
                    headers={**HEADERS, "Referer": detail_url, "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8"},
                    timeout=TIMEOUT,
                    allow_redirects=True,
                )
                ctype = (r.headers.get("Content-Type") or "").lower()
                if r.status_code == 200 and r.content and "image" in ctype:
                    name = _filename_from_url_or_headers(u, r)
                    if "." not in os.path.basename(name):
                        if "png" in ctype: name += ".png"
                        elif "webp" in ctype: name += ".webp"
                        else: name += ".jpg"
                    base, ext = os.path.splitext(name)
                    final = os.path.join(subdir, name)
                    k = 1
                    while os.path.exists(final):
                        final = os.path.join(subdir, f"{base}_{k}{ext}"); k += 1
                    with open(final, "wb") as f:
                        f.write(r.content)
                    saved.append(final)
                else:
                    print(f"[이미지 응답 이상] {r.status_code} {u} (ctype={ctype})")
            except Exception as e:
                print(f"[이미지 실패] {u} -> {e}")
    return saved


In [9]:
# %% [데이터 모델 & 상세 파서]
@dataclass
class ExhibitRecord:
    url: str
    title: str
    period: str
    section_type: str
    section_text: str
    image_urls: List[str]
    saved_images: List[str]

def parse_detail(url: str, s: Optional[requests.Session] = None, *, download_images: bool = DOWNLOAD_IMAGES) -> ExhibitRecord:
    own = False
    if s is None:
        s = get_session(); own = True
    try:
        soup = _get_soup(url, s, referrer=LIST_URL)
        title = _extract_title(soup) or ""
        period = _extract_period_from_table_or_text(soup) or ""
        sections = _extract_sections(soup)
        order = ["전시설명", "전시서문", "작가노트", "작가의 글", "본문"]
        section_type, section_text = "", ""
        for k in order:
            if k in sections and sections[k]:
                section_type, section_text = k, sections[k]
                break
        image_urls = collect_image_urls(url, soup)
        saved = download_images_from_urls(url, image_urls) if download_images else []
        return ExhibitRecord(url=url, title=title, period=period, section_type=section_type, section_text=section_text, image_urls=image_urls, saved_images=saved)
    finally:
        if own:
            s.close()


In [10]:
# %% [엔드투엔드 크롤러]
def crawl_maru_current(list_url: str = LIST_URL, *, max_pages: int = MAX_PAGES, limit: Optional[int] = None, download_images: bool = DOWNLOAD_IMAGES) -> List[ExhibitRecord]:
    detail_urls = list_current_detail_urls(list_url, max_pages=max_pages)
    if limit is not None:
        detail_urls = detail_urls[:limit]
    results: List[ExhibitRecord] = []
    with get_session() as s:
        _ = _get_soup(list_url, s)
        for du in detail_urls:
            try:
                rec = parse_detail(du, s, download_images=download_images)
                results.append(rec)
            except Exception as e:
                results.append(ExhibitRecord(url=du, title="", period="", section_type="", section_text=f"[ERROR] {e}", image_urls=[], saved_images=[]))
            time.sleep(SLEEP_BETWEEN)
    return results


In [11]:
# %% [실행 예시]
DOWNLOAD_IMAGES = False
MAX_PAGES = 1

records = crawl_maru_current(LIST_URL, max_pages=MAX_PAGES, limit=10, download_images=DOWNLOAD_IMAGES)
df = pd.DataFrame([{**asdict(r),
                    "images_count": len(r.image_urls),
                    "first_image": r.image_urls[0] if r.image_urls else "",
                    "saved_count": len(r.saved_images)} for r in records])

cols = ["title", "period", "section_type", "images_count", "first_image", "url"]
display(df[cols])

df.to_csv("maru_current_exhibits.csv", index=False, encoding="utf-8-sig")
df.to_json("maru_current_exhibits.json", orient="records", force_ascii=False)
print("Saved maru_current_exhibits.(csv|json)")


Unnamed: 0,title,period,section_type,images_count,first_image,url
0,제목,제목,,5,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...
1,제목,제목,전시설명,4,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...
2,제목,제목,작가노트,3,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...
3,제목,제목,전시서문,4,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...
4,제목,제목,,4,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...
5,제목,제목,,3,https://maruartcenter.co.kr/bizdemo133414/comp...,https://maruartcenter.co.kr/default/exhibit/ex...


Saved maru_current_exhibits.(csv|json)
