In [None]:
!pip install selenium
!pip install webdriver-manager
!pip install google
!pip install beautifulsoup4
!pip install pandas
!pip install openpyxl


Collecting google
  Downloading google-3.0.0-py2.py3-none-any.whl.metadata (627 bytes)
Downloading google-3.0.0-py2.py3-none-any.whl (45 kB)
Installing collected packages: google
Successfully installed google-3.0.0
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [17]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from googlesearch import search
import time
import re
import pandas as pd

In [23]:
# Excel-safe 본문 정제 함수
def sanitize_for_excel(text):
    text = re.sub(r'[\[\]\*\?\/\\:]', '', text)
    text = ''.join(ch for ch in text if ch.isprintable())
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    return text.strip()

# 본문 필터 조건
def match_advanced_filter(text, include_keywords, exclude_keywords):
    text = text.lower()
    return all(kw.lower() in text for kw in include_keywords) and not any(bad_kw.lower() in text for bad_kw in exclude_keywords)

# ✅ 네이버 블로그 본문 수집 (Selenium + iframe 대응)
def get_blog_full_text_selenium(driver, url):
    try:
        driver.get(url)
        time.sleep(2)

        # iframe 내부로 이동 (구형 블로그 대응)
        if "blog.naver.com" in url:
            try:
                iframe = driver.find_element(By.TAG_NAME, "iframe")
                driver.switch_to.frame(iframe)
                time.sleep(1)
            except:
                pass

        # 최신 에디터
        try:
            main = driver.find_element(By.CSS_SELECTOR, "div.se-main-container")
        except:
            try:
                main = driver.find_element(By.CSS_SELECTOR, "div#postViewArea")
            except:
                return ""

        return sanitize_for_excel(main.text)
    except:
        return ""

# ✅ 네이버 블로그 크롤링
def crawl_naver_blog(keyword, include_keywords, exclude_keywords, max_pages=2):
    options = Options()
    options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    results = []
    for page in range(1, max_pages + 1):
        start = (page - 1) * 10 + 1
        url = f"https://search.naver.com/search.naver?query={keyword}&where=post&sm=tab_pge&start={start}"
        driver.get(url)
        time.sleep(2)

        links = driver.find_elements(By.CSS_SELECTOR, "a.api_txt_lines")
        for link in links:
            try:
                title = link.text
                href = link.get_attribute("href")
                body = get_blog_full_text_selenium(driver, href)
                if match_advanced_filter(body, include_keywords, exclude_keywords):
                    results.append(("네이버 블로그", title, href, body))
            except:
                continue

    driver.quit()
    return results

# ✅ 벨로그 크롤링
def crawl_velog(keyword, include_keywords, exclude_keywords, max_scroll=3):
    options = Options()
    options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    driver.get(f"https://velog.io/search?q={keyword}")
    time.sleep(2)

    for _ in range(max_scroll):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    articles = driver.find_elements(By.CSS_SELECTOR, "div.Feed__ArticleCard-sc-1t9x9k6-0")
    results = []

    for article in articles:
        try:
            link = article.find_element(By.TAG_NAME, "a").get_attribute("href")
            title = article.find_element(By.CSS_SELECTOR, "h4").text
            res = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
            soup = BeautifulSoup(res.text, "html.parser")
            content = soup.select_one("div.content")
            body = sanitize_for_excel(content.get_text(separator="\n")) if content else ""
            if match_advanced_filter(body, include_keywords, exclude_keywords):
                results.append(("벨로그", title, link, body))
        except:
            continue

    driver.quit()
    return results

# ✅ 티스토리 크롤링
def crawl_tistory_via_google(keyword, include_keywords, exclude_keywords, max_results=10):
    query = f"site:tistory.com {keyword}"
    results = []
    for url in search(query, stop=max_results):
        try:
            res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
            soup = BeautifulSoup(res.text, 'html.parser')
            title_tag = soup.find("title")
            title = title_tag.text if title_tag else url
            candidates = [
                "div.entry-content",
                "div.article",
                "div.tt_article_useless_p_margin",
                "div.contents_style",
                "div#content",
                "div.blogview_content"
            ]
            body = ""
            for sel in candidates:
                el = soup.select_one(sel)
                if el:
                    body = sanitize_for_excel(el.get_text(separator="\n"))
                    break
            if match_advanced_filter(body, include_keywords, exclude_keywords):
                results.append(("티스토리", title, url, body))
        except:
            continue
    return results

# ✅ CSV 저장
def save_to_file(data, base_filename="blog_results"):
    df = pd.DataFrame(data, columns=["플랫폼", "제목", "링크", "본문"])
    df.to_csv(f"{base_filename}.csv", index=False, encoding='utf-8-sig')
    print(f"\n📁 저장 완료: {base_filename}.csv")

# ✅ 실행 함수
def main():
    search_keyword = input("검색할 키워드를 입력하세요: ").strip()
    include_keywords = input("본문에 반드시 포함할 키워드 (쉼표로 구분): ").strip().split(',')
    exclude_keywords = input("본문에 제외할 키워드 (쉼표로 구분, 없으면 Enter): ").strip().split(',')
    if exclude_keywords == ['']: exclude_keywords = []

    print("\n[🔍 크롤링 중... 잠시만 기다려주세요]\n")

    naver = crawl_naver_blog(search_keyword, include_keywords, exclude_keywords)
    velog = crawl_velog(search_keyword, include_keywords, exclude_keywords)
    tistory = crawl_tistory_via_google(search_keyword, include_keywords, exclude_keywords)

    total = naver + velog + tistory
    print(f"\n✅ 총 {len(total)}개 결과 수집됨!\n")

    for platform, title, link, _ in total:
        print(f"[{platform}] {title} → {link}")

    save_to_file(total)

if __name__ == "__main__":
    main()


[🔍 크롤링 중... 잠시만 기다려주세요]



ReadTimeoutError: HTTPConnectionPool(host='localhost', port=50617): Read timed out. (read timeout=120)

In [22]:
df = pd.read_csv("blog_results.csv")

df

Unnamed: 0,플랫폼,제목,링크,본문
0,네이버 블로그,Boot camp 영어로 신병 훈련소를 뜻한다. (주로 군인이 등장하는) 게임에서 ...,https://namu.wiki/w/%EB%B6%80%ED%8A%B8%20%EC%B...,
1,네이버 블로그,"무료, 유료, 온라인·오프라인, 채용연계 코딩 부트캠프를 한 눈에 비교하고 나한테 ...",https://boottent.com/camps,
2,네이버 블로그,간편한 설치 ; Boot Camp 지원은 사용자가 제공하는 Windows ISO를 ...,https://support.apple.com/ko-kr/guide/bootcamp...,
3,네이버 블로그,2024.08.14. [현직자와 함께 5주 직무경험_직무부트캠프] 참여자 모집_모집...,https://www.syu.ac.kr/blog/%ED%98%84%EC%A7%81%...,
4,네이버 블로그,경남대 반도체부트캠프사업단은 2일 라온솔루션을 초청해 반도체 산업 핵심 기술 특강을...,http://www.veritas-a.com/news/articleView.html...,
5,네이버 블로그,▲ /경남대 경남대학교 반도체부트캠프사업단(단장 배성환)은 지난 1일 제4공학관에서...,http://www.gndomin.com/news/articleView.html?i...,
6,네이버 블로그,대전테크노파크(이하 대전TP)는 대전 지역기업의 지속적인 성장지원을 위해 ‘대전 지...,http://www.edaily.co.kr/news/newspath.asp?news...,
7,네이버 블로그,보안솔루션 미국 진출 컨설팅·판매대행 전문기업 위브릿지코리아는 지난 3일 판교에서 ...,https://www.newsis.com/view/NISX20250404_00031...,
8,네이버 블로그,Boot camp 영어로 신병 훈련소를 뜻한다. (주로 군인이 등장하는) 게임에서 ...,https://namu.wiki/w/%EB%B6%80%ED%8A%B8%20%EC%B...,
9,네이버 블로그,"무료, 유료, 온라인·오프라인, 채용연계 코딩 부트캠프를 한 눈에 비교하고 나한테 ...",https://boottent.com/camps,
