In [1]:
!pip install requests beautifulsoup4 pandas

Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting beautifulsoup4
  Using cached beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.1-cp312-cp312-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Using cached typing_extensions-4.13.1-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.26.0 

In [6]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_blog_full_text(url):
    """Download a blog post and return the text of its body.

    When the URL is on blog.naver.com and the response did not end up on a
    ``PostView.naver`` address, the first ``<iframe>`` on the page is followed
    and its document is parsed instead.

    The body is looked up as ``div.se-main-container`` first and
    ``div#postViewArea`` second.

    Args:
        url: Address of the blog post.

    Returns:
        The body text with text nodes joined by newlines and surrounding
        whitespace stripped, or ``""`` when no body container is found or
        the page could not be fetched/parsed.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(res.text, 'html.parser')

        # Handle the iframe wrapper used by Naver blog pages.
        if "blog.naver.com" in url and "PostView.naver" not in res.url:
            iframe = soup.find("iframe")
            # .get() instead of ["src"]: an iframe without src must not raise.
            iframe_src = iframe.get("src") if iframe else None
            if iframe_src:
                # urljoin keeps an absolute src as-is and resolves a relative
                # one against the blog host; plain concatenation broke both
                # absolute URLs and relative paths lacking a leading slash.
                iframe_url = urljoin("https://blog.naver.com", iframe_src)
                res = requests.get(iframe_url, headers=headers, timeout=5)
                soup = BeautifulSoup(res.text, 'html.parser')

        # Try the newer container first, then fall back to the older one.
        main_content = (
            soup.select_one("div.se-main-container")
            or soup.select_one("div#postViewArea")
        )
        if main_content is None:
            return ""
        return main_content.get_text(separator="\n").strip()
    except Exception as exc:
        # Best effort: one unreadable post must not abort the whole crawl,
        # but report it so an empty body can be told apart from a failure.
        print(f"[warn] could not fetch body for {url}: {exc!r}")
        return ""

def crawl_naver_blog(keyword, max_pages=3, timeout=5):
    """Collect blog search results for a keyword, including each post's body.

    For every result page, each ``li.bx`` element that contains an
    ``a.api_txt_lines`` anchor contributes one record; the anchor text is the
    title and its ``href`` is passed to ``get_blog_full_text``.

    Args:
        keyword: Search phrase. It is sent as a query parameter, so spaces and
            reserved characters are percent-encoded by ``requests``.
        max_pages: Number of result pages to request (10 results per page
            offset, starting at 1).
        timeout: Seconds to wait for each search request before giving up.

    Returns:
        A list of dicts with the keys ``keyword``, ``title``, ``link`` and
        ``body``. Pages whose request fails are skipped.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    search_url = "https://search.naver.com/search.naver"
    results = []

    for page in range(1, max_pages + 1):
        start = (page - 1) * 10 + 1
        query = {"query": keyword, "where": "post", "sm": "tab_pge", "start": start}
        try:
            # A timeout is required: without it a stalled connection blocks
            # the notebook until the kernel is interrupted.
            res = requests.get(search_url, params=query, headers=headers, timeout=timeout)
            res.raise_for_status()
        except requests.RequestException as exc:
            # Keep what has been collected so far and move on to the next page.
            print(f"[{keyword}] Page {page}: request failed ({exc!r})")
            time.sleep(1)
            continue

        soup = BeautifulSoup(res.text, 'html.parser')
        items = soup.select("li.bx")
        print(f"[{keyword}] Page {page}: Found {len(items)} blog items")

        for item in items:
            title_tag = item.select_one("a.api_txt_lines")
            if not title_tag:
                continue
            link = title_tag.get('href')
            if not link:
                # Anchor without a target: nothing to fetch.
                continue
            results.append({
                'keyword': keyword,
                'title': title_tag.text.strip(),
                'link': link,
                'body': get_blog_full_text(link),
            })

        # Pause between result pages to avoid hammering the search endpoint.
        time.sleep(1)

    return results

# Run the crawl once for every target keyword.
keywords = ["HTML부터 배우기 비전공자", "컴공 아닌데 개발자 되기"]
all_results = []

for kw in keywords:
    all_results.extend(crawl_naver_blog(kw, max_pages=3))

# Save. Both file names used previously are still written, but from a single
# crawl: repeating the identical crawl only doubled the network traffic and
# the run time.
df = pd.DataFrame(all_results)
df.to_csv("naver_blog_full.csv", index=False, encoding='utf-8-sig')
print("✅ 본문 포함 크롤링 완료! 'naver_blog_full.csv' 저장됨.")

df.to_csv("naver_blog_results.csv", index=False, encoding='utf-8-sig')
print("✅ 크롤링 완료! naver_blog_results.csv 저장됨")


[HTML부터 배우기 비전공자] Page 1: Found 35 blog items
[HTML부터 배우기 비전공자] Page 2: Found 35 blog items


KeyboardInterrupt: 