# STEP 1-1. 단일 키워드 
- 현재 검색어 상위노출 중인 블로거를 추출합니다. (단일 키워드)

In [1]:
#단일검색
import PySimpleGUI as sg
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils.dataframe import dataframe_to_rows

while True:
    layout = [
        [sg.Text('검색어를 입력하세요:', size=(18, 1)), sg.InputText(size=(35, 1), key='SEARCH_QUERY')],
        [sg.Text('가져올 글의 수:', size=(18, 1)), sg.Radio('15개', "RADIO1", default=True, key='15', enable_events=True),
         sg.Radio('30개', "RADIO1", default=False, key='30', enable_events=True),
         sg.Radio('50개', "RADIO1", default=False, key='50', enable_events=True), 
         sg.Radio('100개', "RADIO1", default=False, key='100', enable_events=True)],
        [sg.Text(' '*36), sg.Text('또는 직접 입력:', size=(11, 1)), sg.InputText(size=(20, 1), key='CUSTOM_NUM', enable_events=True)],
        [sg.Column([
            [sg.Submit(size=(10, 1), key='SUBMIT'), sg.Cancel(size=(10, 1))]
        ], justification='right', element_justification='right')]
    ]

    window = sg.Window('네이버 블로그 검색', layout)
    errors = []  # 에러 메시지를 담을 리스트

    while True:
        event, values = window.read()
        if event in (sg.WIN_CLOSED, 'Cancel'):
            window.close()
            raise SystemExit

        # 라디오 버튼을 선택할 때 직접 입력 필드 클리어
        if event in ['15', '30', '50', '100']:
            window['CUSTOM_NUM'].update('')

        # 직접 입력 필드에 입력할 때 라디오 버튼 해제
        if event == 'CUSTOM_NUM' and values['CUSTOM_NUM']:
            for key in ['15', '30', '50', '100']:
                window[key](value=False)

        if event == 'SUBMIT':
            errors.clear()
            search_query = values['SEARCH_QUERY'].strip()
            custom_num = values['CUSTOM_NUM'].strip()
            selected_num = [key for key in ['15', '30', '50', '100'] if values[key]]

            if not search_query:
                errors.append('검색어를 입력해야 합니다.')

            if custom_num and selected_num:
                errors.append('직접 입력과 선택 옵션 중 하나만 사용해주세요.')

            if custom_num:
                try:
                    num_of_posts = int(custom_num)
                except ValueError:
                    errors.append('직접 입력한 수는 유효한 숫자여야 합니다.')

            elif selected_num:
                num_of_posts = int(selected_num[0])
            else:
                errors.append('글 수를 입력하거나 선택해주세요.')

            if errors:
                sg.popup_error("\n".join(errors))  # 모든 에러를 하나의 팝업으로 표시
                continue
            break

    window.close()

    break  # 입력 검증 완료, 웹 스크래핑 및 기타 처리 시작

options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)

driver.get("https://www.naver.com")
time.sleep(1)

driver.find_element(By.ID, "query").send_keys(search_query)
driver.find_element(By.ID, "search-btn").click()
time.sleep(1)

driver.find_element(By.LINK_TEXT, "블로그").click()
time.sleep(2)

try:
    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(2)
        html = BeautifulSoup(driver.page_source, 'html.parser')
        posts = html.find_all('a', class_='title_link')
        if (num_of_posts and len(posts) >= num_of_posts) or html.find('span', class_='bl_tit'):
            break

    if num_of_posts is None or len(posts) < num_of_posts:
        num_of_posts = len(posts)

except Exception as e:
    sg.popup_error('브라우저 창이 닫혔습니다:', str(e))
    driver.quit()
    raise SystemExit

name_links = html.find_all('a', class_='name')[:num_of_posts]
title_links = posts[:num_of_posts]
date_links = html.find_all('span', class_='sub')[:num_of_posts]

data = []
for name_link, title_link, date_link in zip(name_links, title_links, date_links):
    url = title_link['href']
    email = ''
    if "https://blog.naver.com/" in url:
        username = url.split('/')[3]
        email = f"{username}@naver.com"
    elif "https://adcr.naver.com/" in url:
        email = "파워콘텐츠"
    elif "https://post.naver.com/" in url:
        email = "포스트"

    data.append((name_link.text.strip(), "", title_link.text.strip(), date_link.text.strip(), url, email))

df = pd.DataFrame(data, columns=['NAME', '2차키워드', '제목', '게시일', 'URL', '이메일'])

wb = Workbook()
ws = wb.active

for r in dataframe_to_rows(df, index=False, header=True):
    ws.append(r)

ws.column_dimensions['A'].width = 16
ws.column_dimensions['B'].width = 16
ws.column_dimensions['C'].width = 70
ws.column_dimensions['D'].width = 16
ws.column_dimensions['E'].width = 8
ws.column_dimensions['F'].width = 24

center_alignment = Alignment(horizontal='center')
for cell in ws["D"] + ws["F"]:
    cell.alignment = center_alignment

header_font = Font(bold=True)
header_fill = PatternFill(start_color="B7DEE8", end_color="B7DEE8", fill_type="solid")
thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))

for cell in ws[1]:
    cell.font = header_font
    cell.alignment = center_alignment
    cell.fill = header_fill
    cell.border = thin_border

gray_fill = PatternFill(start_color="ECECEC", end_color="ECECEC", fill_type="solid")
for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
    if row[5].value in ['파워콘텐츠', '포스트']:
        for cell in row:
            cell.fill = gray_fill

ws.auto_filter.ref = ws.dimensions

save_filename = sg.popup_get_file(
    '파일을 저장할 경로를 선택하세요',
    save_as=True,
    no_window=True,
    default_extension='xlsx',
    file_types=(('Excel Files', '*.xlsx'),)
)

if save_filename:
    wb.save(save_filename)
    print(f'{save_filename} 파일이 저장되었습니다.')

driver.quit()


C:/Users/USER/Desktop/github/naver/naver_blog_bot/1-1_캐리어바퀴_15.xlsx 파일이 저장되었습니다.


# STEP1-2 복수 키워드 검색(a2)
- 현재 검색어 상위노출 중인 블로거를 추출합니다. (복수 키워드)

- [키워드] 엑셀파일 **B3**부터 키워드 입력 // GUI에서 엑셀파일 불러오기
- 크롤링 갯수 선택
- 파일 저장 → 헤더x [ A키워드 / B블로그명 / C블로그제목 / D날짜 / E포스팅링크 / F메일주소 ]
- 출력 결과 -> 헤더 없음

In [3]:

import PySimpleGUI as sg
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils.dataframe import dataframe_to_rows

# 엑셀 파일 선택 및 불러오기
filename = sg.popup_get_file('검색어가 포함된 엑셀 파일을 선택하세요', file_types=(('Excel Files', '*.xlsx'),))
if not filename:
    sg.popup_error('파일을 선택하지 않았습니다.')
    raise SystemExit

# 검색어 목록 불러오기
df_keywords = pd.read_excel(filename, usecols='B')  # B열만 불러온다고 가정
keywords = df_keywords.iloc[1:].squeeze().tolist()  # 첫 번째 행(헤더)를 제외하고 리스트로 변환

# GUI 설정
layout = [
    [sg.Text('가져올 글의 수를 선택하세요:', size=(25, 1)), sg.Radio('10개', "NUM_POSTS", default=True, key='10'),
     sg.Radio('20개', "NUM_POSTS", key='20'), sg.Radio('30개', "NUM_POSTS", key='30'),
     sg.Radio('50개', "NUM_POSTS", key='50')],
    [sg.Submit('Start'), sg.Cancel()]
]

window = sg.Window('Post Count Selection', layout)
event, values = window.read()

if event in (None, 'Cancel'):
    window.close()
    raise SystemExit

# 선택된 글의 수 파악
num_posts = next((int(k) for k, v in values.items() if v and k.isdigit()), 10)
window.close()

options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)

wb = Workbook()
ws = wb.active

for search_query in keywords:
    driver.get("https://www.naver.com")
    time.sleep(1)

    driver.find_element(By.ID, "query").send_keys(search_query)
    driver.find_element(By.ID, "search-btn").click()
    time.sleep(1)

    driver.find_element(By.LINK_TEXT, "블로그").click()
    time.sleep(2)

    html = BeautifulSoup(driver.page_source, 'html.parser')
    title_links = html.find_all('a', class_='title_link', limit=num_posts)
    name_links = html.find_all('a', class_='name', limit=num_posts)
    date_links = html.find_all('span', class_='sub', limit=num_posts)

    data = []
    for name_link, title_link, date_link in zip(name_links, title_links, date_links):
        url = title_link['href']
        email = ''
        if "https://blog.naver.com/" in url:
            username = url.split('/')[3]
            email = f"{username}@naver.com"
        elif "https://adcr.naver.com/" in url:
            email = "파워콘텐츠"
        elif "https://post.naver.com/" in url:
            email = "포스트"

        data.append((search_query, name_link.text.strip(), title_link.text.strip(), date_link.text.strip(), url, email))

    for row in data:
        ws.append(row)

# 엑셀 파일 저장
save_filename = sg.popup_get_file(
    '저장할 엑셀 파일 경로를 선택하세요',
    save_as=True,
    no_window=True,
    default_extension='xlsx',
    file_types=(('Excel Files', '*.xlsx'),)
)

if save_filename:
    wb.save(save_filename)
    sg.popup('저장 완료', f'{save_filename} 파일이 저장되었습니다.')

driver.quit()


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# 쿠팡 상품 리뷰 크롤링 (리뵤비교 및 블라인드 여부 판별) 

import os
import time
import difflib
import pytesseract
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# 사용자 입력 받기
input_url = input("Enter the URL: ")
input_pages = int(input("Enter the number of pages to crawl: "))
screenshot_folder = input("Enter the path of the screenshot review folder: ")

# Selenium 설정
options = Options()
options.add_experimental_option("detach", True)
options.add_argument("--disable-blink-features=AutomationControlled")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# 경고창 닫기 함수
def close_alert_if_present(driver):
    try:
        alert = driver.switch_to.alert
        alert.dismiss()
        print("Unexpected alert closed.")
    except:
        pass

# 리뷰 크롤링 함수
def crawl_reviews(driver, pages):
    reviews = []
    current_page = 1
    while current_page <= pages:
        reviews.extend(extract_reviews(driver))
        if (current_page) % 10 == 0:  # 10, 20, 30 페이지 등에서 '>' 버튼 클릭
            try:
                next_group_button = driver.find_element(By.CSS_SELECTOR, next_group_button_selector)
                driver.execute_script("arguments[0].click();", next_group_button)
                print(f"Clicked '>' button to go to the next group of pages.")
            except Exception as e:
                print(f"Failed to click '>' button: {e}")
                break
        else:
            try:
                next_page_button_selector = page_button_selector_template.format((current_page % 10) + 2)
                next_page_button = driver.find_element(By.CSS_SELECTOR, next_page_button_selector)
                driver.execute_script("arguments[0].click();", next_page_button)
                print(f"Page {current_page + 1} button clicked.")
            except Exception as e:
                print(f"Failed to click page {current_page + 1} button: {e}")
                break
        
        current_page += 1
        time.sleep(2)  # 페이지 로드 시간을 더 길게 설정
    return reviews

# 리뷰 추출 함수
def extract_reviews(driver):
    reviews = []
    review_elements = driver.find_elements(By.CSS_SELECTOR, review_selector)
    for review in review_elements:
        reviewer_name = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__info .sdp-review__article__list__info__user').text
        review_date = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__info .sdp-review__article__list__info__product-info__reg-date').text
        seller_name = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__info .sdp-review__article__list__info__product-info__seller_name').text
        product_name = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__info .sdp-review__article__list__info__product-info__name').text

        try:
            headline = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__headline').text
        except:
            headline = None
        try:
            content = review.find_element(By.CSS_SELECTOR, '.sdp-review__article__list__review.js_reviewArticleContentContainer').text
        except:
            content = None

        reviews.append({
            "reviewer_name": reviewer_name,
            "review_date": review_date,
            "seller_name": seller_name,
            "product_name": product_name,
            "headline": headline,
            "content": content
        })
    return reviews

# 이미지 전처리 함수
def preprocess_image(image_path):
    try:
        with Image.open(image_path) as img:
            img = img.convert('L')
            img = img.point(lambda x: 0 if x < 140 else 255, '1')
            img = img.filter(ImageFilter.SHARPEN)
            return img
    except Exception as e:
        print("이미지 전처리 오류:", e)
        return None

# 이미지에서 텍스트를 추출하는 함수
def extract_text(image_path):
    try:
        img = preprocess_image(image_path)
        if img is not None:
            custom_config = r'--oem 3 --psm 6'
            text = pytesseract.image_to_string(img, lang='kor', config=custom_config)
            text = ''.join(text.split())
            return text
        else:
            return None
    except Exception as e:
        print("텍스트 추출 오류:", e)
        return None

# 유사도 계산 함수
def calculate_similarity(text1, text2):
    return difflib.SequenceMatcher(None, text1, text2).ratio()

# 리뷰 분류 함수
def classify_reviews(crawled_reviews, screenshot_texts):
    results = []
    for screenshot_text in screenshot_texts:
        max_similarity = 0
        best_match = None
        for crawled_review in crawled_reviews:
            similarity = calculate_similarity(screenshot_text, crawled_review["content"] or "")
            if similarity > max_similarity:
                max_similarity = similarity
                best_match = crawled_review
        if max_similarity >= 0.3:
            classification = "확인"
        elif max_similarity >= 0.1:
            classification = "확인 요망"
        else:
            classification = "블라인드"
        results.append({
            "screenshot_text": screenshot_text,
            "matched_review": best_match,
            "similarity": max_similarity,
            "classification": classification
        })
    return results

# 리뷰 스크린샷 찍기 함수
def take_screenshot(driver, review, save_directory):
    try:
        reviewer_name = review["reviewer_name"]
        review_date = review["review_date"]
        print(f"Taking screenshot for review by {reviewer_name} on {review_date}")

        # 리뷰를 찾아서 스크롤
        review_element = driver.find_element(By.XPATH, f"//*[contains(text(), '{review['content']}')]")
        driver.execute_script("arguments[0].scrollIntoView();", review_element)
        time.sleep(1)  # 잠시 대기하여 요소가 완전히 로드되도록 함

        screenshot_path = os.path.join(save_directory, f"{review_date}_{reviewer_name}.png")
        review_element.screenshot(screenshot_path)
        print(f"Screenshot saved to {screenshot_path}")
    except Exception as e:
        print(f"Failed to take screenshot: {e}")

# 결과 엑셀 파일로 저장
def save_results_to_excel(results, file_path):
    df = pd.DataFrame(results)

    # 텍스트 길이 제한 및 자동 줄바꿈
    df["screenshot_text"] = df["screenshot_text"].apply(lambda x: '\n'.join([x[i:i+60] for i in range(0, len(x), 60)]))
    df["matched_review"] = df["matched_review"].apply(lambda x: '\n'.join([str(x)[i:i+60] for i in range(0, len(str(x)), 60)]))

    writer = pd.ExcelWriter(file_path, engine='xlsxwriter')
    df.to_excel(writer, index=False)

    worksheet = writer.sheets['Sheet1']
    for idx, col in enumerate(df):
        worksheet.set_column(idx, idx, 60, None, {'text_wrap': True})  # 칼럼 너비를 60으로 설정하고 자동 줄바꿈

    writer.close()  # 여기서 close() 메서드를 사용해야 합니다.
    print(f"Results saved to {file_path}")

# 변수 정의
url = input_url
last_page_to_crawl = input_pages
sleep_time = 2
product_review_button_selector = "#btfTab > ul.tab-titles > li:nth-child(2)"
new_review_button_selector = '#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.sdp-review__article__order.js_reviewArticleOrderContainer.sdp-review__article__order--active > div.sdp-review__article__order__sort > button.sdp-review__article__order__sort__newest-btn.js_reviewArticleNewListBtn.js_reviewArticleSortBtn'
review_selector = "#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.js_reviewArticleListContainer > article"
page_button_selector_template = '#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.js_reviewArticleListContainer > div.sdp-review__article__page.js_reviewArticlePagingContainer > button:nth-child({})'
next_group_button_selector = '#btfTab > ul.tab-contents > li.product-review.tab-contents__content > div > div.sdp-review__article.js_reviewArticleContainer > section.js_reviewArticleListContainer > div.sdp-review__article__page.js_reviewArticlePagingContainer > button:nth-child(12)'

# 웹 페이지 열기
driver.get(url)
time.sleep(sleep_time)

# 페이지 끝까지 스크롤
try:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print("Scrolled to the bottom of the page.")
except Exception as e:
    print(f"Failed to scroll: {e}")
    close_alert_if_present(driver)

time.sleep(sleep_time)

# 상품평 버튼 클릭
try:
    product_review_button = driver.find_element(By.CSS_SELECTOR, product_review_button_selector)
    product_review_button.click()
    print("Product review button clicked.")
except Exception as e:
    print(f"Failed to click product review button: {e}")
    close_alert_if_present(driver)

time.sleep(3)

# 최신순 버튼 클릭 
try:
    new_review_button = driver.find_element(By.CSS_SELECTOR, new_review_button_selector)
    new_review_button.click()
    print("New review button clicked.")
except Exception as e:
    print(f"Failed to click new review button: {e}")

time.sleep(3)

# 리뷰 크롤링
print("Crawling reviews...")
crawled_reviews = crawl_reviews(driver, last_page_to_crawl)
print(f"Crawled {len(crawled_reviews)} reviews.")

# 스크린샷 리뷰 텍스트 추출
print("Extracting text from screenshot reviews...")
screenshot_texts = []
for filename in os.listdir(screenshot_folder):
    if filename.endswith(".png") or filename.endswith(".jpg"):
        file_path = os.path.join(screenshot_folder, filename)
        text = extract_text(file_path)
        if text:
            screenshot_texts.append(text)
print(f"Extracted text from {len(screenshot_texts)} screenshot reviews.")

# 리뷰 분류
print("Classifying reviews...")
classified_reviews = classify_reviews(crawled_reviews, screenshot_texts)
print(f"Classified {len(classified_reviews)} reviews.")

# 스크린샷 저장 경로
save_directory = os.path.join(os.getcwd(), "matched_screenshots")
os.makedirs(save_directory, exist_ok=True)

# 새로 스크린샷 찍기
print("Taking screenshots of matched reviews...")
for review in classified_reviews:
    if review["classification"] in ["확인", "확인 요망"]:
        take_screenshot(driver, review["matched_review"], save_directory)

# 결과 엑셀 파일로 저장
output_excel_path = os.path.join(os.getcwd(), "review_results.xlsx")
save_results_to_excel(classified_reviews, output_excel_path)

# 드라이버 종료+
driver.quit()


# STEP2.블로거 최근글 크롤링 ver.1.2
- step 1단계에서 추출한 블로거들의 최근 글을 크롤링합니다.
### 방법
- step 1단계에서 추출한 엑셀파일을 그대로 넣고 실행하면 됨
- (a2)는 연결하지 말것. 점령할 키워드는 한번에 하나씩진행할 것 (셀이 안맞아서 C레벨에서 이상하게 나옴)
- 처음 로딩시간이 제법 걸리는 점 참고. 3분 이내로 되긴 함

In [1]:
import tkinter as tk
from tkinter import filedialog
import re
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def select_excel_file():
    root = tk.Tk()
    root.withdraw()  # 루트 창 숨기기
    root.update()  # 상태 갱신

def select_excel_file():
    root = tk.Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(title="엑셀 파일 선택", filetypes=[("Excel files", "*.xlsx")])
    return file_path

def setup_webdriver():
    options = Options()
    options.headless = False
    driver = webdriver.Chrome(options=options)
    return driver


def extract_blog_ids(file_path):
    wb = load_workbook(file_path)
    ws = wb.active
    urls, names = [], []
    for row in range(1, ws.max_row + 1):
        url = ws['E' + str(row)].value
        name = ws['A' + str(row)].value
        if url and url.startswith("https://blog.naver.com/"):
            blog_id = url.split('/')[3]
            urls.append(f"https://blog.naver.com/PostList.naver?blogId={blog_id}&skinType=&skinId=&from=menu")
            names.append(name)
        time.sleep(1)
    return urls, names



def set_15_line_view(driver):
    select_box = driver.find_elements(By.CSS_SELECTOR, 'a.btn_select.pcol2._ListCountToggle._returnFalse')
    #목록열려있을 때 --> 만약, 셀렉박스가 보이면 클릭해서 15열이 보이게 만들어라. -->
    if select_box:
        select_box[0].click()
        time.sleep(1)
        fifteen_lines_option = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@data-value='15']"))
        )
        fifteen_lines_option.click()
        time.sleep(2)

    #목록 닫혀있다면 --> 목록열기 먼저 누를 것.
    else:
        open_list_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'btn_openlist.pcol2._toggleTopList._returnFalse'))
        )
        open_list_button.click()
        time.sleep(1)
        
        # 그다음 셀렉박스 열어서 15개로 맞출 것
        select_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.btn_select.pcol2._ListCountToggle._returnFalse'))
        )
        select_button.click()
        time.sleep(1)
        fifteen_lines_option = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@data-value='15']"))
        )
        fifteen_lines_option.click()
        time.sleep(2)


def scrape_blog_data(driver, url):
    driver.get(url)
    time.sleep(5)
    set_15_line_view(driver)
    try:
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'table.blog2_list'))
        )
        posts = driver.find_elements(By.CSS_SELECTOR, 'table.blog2_list tbody tr')[:15]
        data = []
        for post in posts:
            title_elements = post.find_elements(By.CSS_SELECTOR, 'span.ell2.pcol2')
            date_elements = post.find_elements(By.CSS_SELECTOR, 'div.wrap_td span.date.pcol2')
            if title_elements and date_elements:
                title = title_elements[0].text
                date = date_elements[0].text
                data.append((title, date))
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        data = []
    return data



def apply_excel_styles(ws):
    header_font = Font(bold=True)
    header_fill = PatternFill(start_color="FCD5B4", end_color="FCD5B4", fill_type="solid")
    center_alignment = Alignment(horizontal='center', vertical='center')
    thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))
    grey_fill = PatternFill(start_color="F0F0F0", end_color="F0F0F0", fill_type="solid")
    
    ws.column_dimensions['A'].width = 16
    ws.column_dimensions['B'].width = 16
    ws.column_dimensions['C'].width = 70
    ws.column_dimensions['D'].width = 16

    for cell in ws[1]:
        cell.font = header_font
        cell.alignment = center_alignment
        cell.fill = header_fill
        cell.border = thin_border
    
    return grey_fill

def save_data_to_excel(urls, names, data_list, ws, grey_fill):
    for index, (url, name, data) in enumerate(zip(urls, names, data_list)):
        for title, date in data:
            row = [name, "", title, date]
            ws.append(row)
            if index % 2 == 1:
                for cell in ws[ws.max_row]:
                    cell.fill = grey_fill

def main():
    file_path = select_excel_file()
    if not file_path:
        return

    driver = setup_webdriver()
    urls, names = extract_blog_ids(file_path)
    data_list = [scrape_blog_data(driver, url) for url in urls]

    wb = Workbook()
    ws = wb.active
    ws.append(["NAME", "2차 키워드", "글 제목", "작성일"])
    grey_fill = apply_excel_styles(ws)
    save_data_to_excel(urls, names, data_list, ws, grey_fill)

    save_path = filedialog.asksaveasfilename(defaultextension=".xlsx", title="저장 경로 선택", filetypes=[("Excel files", "*.xlsx")])
    wb.save(save_path)

    driver.quit()

if __name__ == "__main__":
    main()

    ## 이게 거의 최종일듯


# STEP3. 블로거 인플크롤링
- 1행은 헤더
- 2행부터 키워드 입력되어있어야함


In [5]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from tkinter import Tk, filedialog
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side

def get_excel_file_path():
    root = Tk()
    root.withdraw()
    file_path = filedialog.askopenfilename(title='DB 엑셀 파일 선택', filetypes=[("Excel files", "*.xlsx *.xls")])
    return file_path

def load_keywords_from_excel(file_path):
    df = pd.read_excel(file_path, header=0)
    print(df.columns)
    if df.columns[0] not in df or df.columns[1] not in df:
        print("필요한 열이 엑셀 파일에 없습니다.")
        return pd.DataFrame()
    return df.dropna(subset=[df.columns[1]])

def save_results_to_excel(data_dict, keyword_index):
    output_path = filedialog.asksaveasfilename(defaultextension='.xlsx', title='저장할 파일 경로를 지정하세요')
    if not output_path:
        print("저장 경로가 지정되지 않았습니다.")
        return
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for sheet_name, records in data_dict.items():
            df = pd.DataFrame(records, columns=['2차키워드', '검색량', 'NAME', '제목', '게시일', 'URL', '이메일주소', '상위노출여부'])
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            worksheet = writer.sheets[sheet_name]

            # 열 너비 설정
            column_widths = {'A': 16, 'B': 16, 'C': 16, 'D': 70, 'E': 16, 'F': 8, 'G': 24, 'H': 16}
            for col, width in column_widths.items():
                worksheet.column_dimensions[col].width = width

            # 스타일 설정
            red_bold_font = Font(bold=True, color="FF0000")
            grey_fill = PatternFill(start_color='EAEAEA', end_color='EAEAEA', fill_type="solid")
            standard_border = Border(left=Side(style='thin'), right=Side(style='thin'),
                                     top=Side(style='thin'), bottom=Side(style='thin'))

            for row in range(2, worksheet.max_row + 1):
                for col in range(1, 9):  # A to H
                    cell = worksheet.cell(row=row, column=col)
                    cell.border = standard_border
                    
                    # 짝수번째 키워드 색상 적용
                    if keyword_index[df.iloc[row-2, 0]] % 2 == 1:
                        cell.fill = grey_fill
                    
                    # C열에 시트 제목과 동일한 내용이 있다면
                    if worksheet.cell(row=row, column=3).value == sheet_name:
                        cell.font = red_bold_font
                        if col == 8:  # H열에 "criteria" 입력
                            worksheet.cell(row=row, column=8).value = "criteria"
                
            # 헤더 스타일
            for col in range(1, 9):
                cell = worksheet.cell(row=1, column=col)
                cell.font = Font(bold=True, color='FFFFFF')
                cell.fill = PatternFill(start_color='CCC0DA', end_color='CCC0DA', fill_type="solid")
                cell.alignment = Alignment(horizontal='center', vertical='center')
                cell.border = Border(left=Side(style='thin'), right=Side(style='thin'),
                                     top=Side(style='thin'), bottom=Side(style='thin'))

            # 필터 추가
            worksheet.auto_filter.ref = "A1:H" + str(worksheet.max_row)

    print("데이터가 성공적으로 저장되었습니다.")


options = Options()
options.headless = False

driver = webdriver.Chrome(options=options)

file_path = get_excel_file_path()
if not file_path:
    driver.quit()
    exit()

df_keywords = load_keywords_from_excel(file_path)
if df_keywords.empty:
    driver.quit()
    exit()

data_dict = {}
keyword_index = {}

try:
    for index, row in df_keywords.iterrows():
        group_name, keyword = row[df_keywords.columns[0]], row[df_keywords.columns[1]]
        if keyword not in keyword_index:
            keyword_index[keyword] = len(keyword_index)
        driver.get('https://www.naver.com')
        time.sleep(1)

        driver.find_element(By.ID, "query").send_keys(keyword)
        driver.find_element(By.ID, "search-btn").click()
        time.sleep(1)

        driver.find_element(By.LINK_TEXT, '블로그').click()
        time.sleep(2)

        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(2)

        html = BeautifulSoup(driver.page_source, 'html.parser')
        name_links = html.find_all('a', class_='name')[:15]
        title_links = html.find_all('a', class_='title_link')[:15]
        date_links = html.find_all('span', class_='sub')[:15]

        data = []
        for name_link, title_link, date_link in zip(name_links, title_links, date_links):
            url = title_link['href']
            email = ""
            if "https://blog.naver.com/" in url:
                username = url.split('/')[3]
                email = f"{username}@naver.com"
            elif "https://adcr.naver.com/" in url:
                email = "파워콘텐츠"
            elif "https://post.naver.com/" in url:
                email = "포스트"

            data.append([keyword, '', name_link.text.strip(), title_link.text.strip(), date_link.text.strip(), url, email, ''])

        if group_name in data_dict:
            data_dict[group_name].extend(data)
        else:
            data_dict[group_name] = data
finally:
    driver.quit()

if data_dict:
    save_results_to_excel(data_dict, keyword_index)

Index(['NAME', '2차 키워드', '글 제목', '작성일'], dtype='object')
데이터가 성공적으로 저장되었습니다.


# STEP4 블덱스 지수 검색기 (E열에 블로그 주소)
- b열 삭제하면 무난
- 블로그 주소명을 수정해야됌
- =LEFT(H2, FIND("/", H2, FIND("/", H2, 9) + 1) - 1)

In [6]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
from openpyxl.worksheet.filters import FilterColumn

# GUI 초기화
root = tk.Tk()
root.withdraw()


# 입력 엑셀 파일 선택
excel_path = filedialog.askopenfilename(title="블로그 URL이 포함된 엑셀 파일 선택", filetypes=[("Excel files", "*.xlsx *.xls")])

# 저장 경로 미리 선택
save_path = filedialog.asksaveasfilename(title="결과를 저장할 엑셀 파일 선택", defaultextension=".xlsx", filetypes=[("Excel files", "*.xlsx *.xls")])

# 엑셀 파일 불러오기
df = pd.read_excel(excel_path, usecols=[4], header=0)  # E열에서 데이터 읽기, 첫 행은 헤더

# Selenium 웹드라이버 설정
options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)


# Blogdex 로그인 페이지로 이동
url = "https://blogdex.space/login?from=/blog-index"
driver.get(url)
time.sleep(3)  # 로그인 이후 타임슬립 3초

# 카카오톡 로그인 버튼 클릭
driver.find_element(By.CLASS_NAME, "border-primary").click()
time.sleep(0.3)
driver.find_element(By.XPATH, "//button[contains(., '카카오톡')]").click()
time.sleep(3)  # 로그인 이후 타임슬립 3초

# 로그인 정보 입력
driver.find_element(By.CLASS_NAME, "tf_g").send_keys("01048460380")
time.sleep(1)
password_field = driver.find_elements(By.CLASS_NAME, "tf_g")[1]
password_field.send_keys('@gusqls2')
time.sleep(0.3)
driver.find_element(By.CSS_SELECTOR, ".btn_g.highlight.submit").click()

# 로그인 승인 대기 및 클릭
try:
    login_button = WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.btn_agree[name='user_oauth_approval'][value='true']")))
    login_button.click()
except Exception as e:
    print("로그인 버튼을 찾을 수 없거나 클릭할 수 없습니다:", e)

time.sleep(3)  # 로그인 이후 타임슬립 3초

data_list = []
save_count = 0

for index, row in df.iterrows():
    try:
        blog_url = row[0]
        blog_id = blog_url.split("/")[-1]  # URL에서 블로그 ID 추출
        url_2 = f"https://blogdex.space/blog-index/{blog_id}"
        
        driver.get(url_2)
        time.sleep(6)  # 아이디마다 크롤링할 때 타임슬립 4초

        # 필요한 항목들 스크래핑
        data = {
            "블로그명": driver.find_element(By.CSS_SELECTOR, "div.flex.space-x-1.pr-0.md\\:pr-24 p.text-sm.font-medium.leading-none").text,
            "블로그주소": f"https://blog.naver.com/{blog_id}",
            "블로그지수": driver.find_element(By.CSS_SELECTOR, "svg > text[font-family='Pretendard'][font-size='22px'][font-weight='700'][y='-60']").text,
            "블로그지수(확인용)": driver.find_element(By.CSS_SELECTOR, "div.flex.flex-1.justify-center.px-5 text:nth-child(2)").text,
            "블로그생성일": driver.find_element(By.CSS_SELECTOR, "div.w-full.space-x-1.pr-0.pt-4.md\\:pr-24 div.flex.items-center.justify-center.space-x-2.md\\:justify-start p.text-sm.font-medium.leading-none").text,
            "총방문자": driver.find_element(By.CSS_SELECTOR, "div.w-full.space-x-1.pb-8.pr-0.pt-4.md\\:pb-0.md\\:pr-24 div.flex.items-center.justify-center.space-x-2.md\\:justify-start p.text-sm.font-medium.leading-none").text,
            "총포스팅": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div:nth-child(3) > div:nth-child(1) > div > div").text,
            "총구독자": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div:nth-child(5) > div:nth-child(1) > div > div").text,
            "주제지수": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div.flex.flex-col.justify-center.space-y-12.py-5.md\\:flex-row.md\\:justify-between.md\\:space-x-0.md\\:space-y-0.md\\:py-0 > div.divide.md\\:auto.flex.w-full.flex-1.flex-col.items-center.space-y-4.divide-y.px-5.text-center.md\\:items-end.md\\:text-right > div.pl-0.pt-8.md\\:pl-24.md\\:pt-0 > div > div > p").text,
            "종합지수": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div.flex.flex-col.justify-center.space-y-12.py-5.md\\:flex-row.md\\:justify-between.md\\:space-x-0.md\\:space-y-0.md\\:py-0 > div.divide.md\\:auto.flex.w-full.flex-1.flex-col.items-center.space-y-4.divide-y.px-5.text-center.md\\:items-end.md\\:text-right > div:nth-child(2) > div > div > p").text,
            "최고지수": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div.flex.flex-col.justify-center.space-y-12.py-5.md\\:flex-row.md\\:justify-between.md\\:space-x-0.md\\:space-y-0.md\\:py-0 > div.divide.md\\:auto.flex.w-full.flex-1.flex-col.items-center.space-y-4.divide-y.px-5.text-center.md\\:items-end.md\\:text-right > div:nth-child(3) > div > div > p").text,
            "블로그주제": driver.find_element(By.CSS_SELECTOR, "div.w-full.pt-4.md\\:w-auto.md\\:pt-0 div.flex.items-center.justify-center.space-x-2.md\\:justify-end p.text-sm.font-medium.leading-none").text,
            "블덱스전체랭킹": driver.find_element(By.CSS_SELECTOR, "#__next > div.flex.min-h-screen.flex-col > main > div > div.flex.flex-col.gap-4 > div:nth-child(1) > div.p-6.pt-0 > div:nth-child(5) > div:nth-child(3) > div > div > p").text,
            "블덱스주제랭킹": driver.find_element(By.CSS_SELECTOR, "div.ml-0.w-full.pt-4.md\\:ml-16.md\\:w-auto.md\\:pt-0 div.flex.items-center.space-x-2.justify-center.md\\:justify-center p.text-sm.font-medium.leading-none").text,
            "최적화수치": driver.find_element(By.CSS_SELECTOR, "div.relative.flex.rounded-md.w-9\\/10 div.bg-primary.h-6.rounded-l-md p.absolute.left-1\\/2.top-1\\/2").text,
            "메일주소": f"{blog_id}@naver.com"
        }
        data_list.append(data)
        save_count += 1

        # 실시간으로 출력
        print(f"아이디: {blog_id}")
        for key, value in data.items():
            print(f"{key}: {value}")
        print("\n")


        # 50개 단위로 중간 저장
        if save_count % 50 == 0:
            temp_df = pd.DataFrame(data_list)
            temp_save_path = f"{save_path.rsplit('.', 1)[0]}_temp_{save_count}.xlsx"
            temp_df.to_excel(temp_save_path, index=False)
            print(f"{save_count}개의 데이터가 {temp_save_path}에 중간 저장되었습니다.")

    except Exception as e:
        print(f"일부 요소를 찾을 수 없습니다: {e}")
        continue

# DataFrame으로 변환
result_df = pd.DataFrame(data_list)

# 최종 엑셀 파일로 저장
result_df.to_excel(save_path, index=False)

# 엑셀 파일 열기 및 서식 지정
wb = load_workbook(save_path)
ws = wb.active

# 열 너비 설정
column_widths = [18, 35, 12, 12, 14, 10, 9, 9, 9, 9, 9, 12, 20, 20, 12, 24]
for i, width in enumerate(column_widths, start=1):
    ws.column_dimensions[chr(64 + i)].width = width

# '최적' 포함된 셀 서식 변경
red_fill = PatternFill(start_color='FD1E19', end_color='FD1E19', fill_type='solid')
bold_font = Font(bold=True, color='FFFFFF')

for cell in ws['C']:
    if '최적' in str(cell.value):
        cell.font = bold_font
        cell.fill = red_fill

ws.auto_filter.ref = ws.dimensions

# 변경사항 저장
wb.save(save_path)

# 드라이버 종료
driver.quit()

print(f"모든 데이터가 {save_path}에 저장되었습니다.")


  blog_url = row[0]


아이디: sunlight232
블로그명: 예경(sunlight232)
블로그주소: https://blog.naver.com/sunlight232
블로그지수: 최적2+
블로그지수(확인용): 최적2+
블로그생성일: 2006-01-08
총방문자: 15,112,296
총포스팅: 2,571
총구독자: 6,616
주제지수: 최적1+
종합지수: 최적2+
최고지수: 최적2+
블로그주제: 상품리뷰
블덱스전체랭킹: 46,326등(상위 5.0%)
블덱스주제랭킹: 971등(상위 4.5%)
최적화수치: 90.1%
메일주소: sunlight232@naver.com


아이디: al_satang
블로그명: 알사탕(al_satang)
블로그주소: https://blog.naver.com/al_satang
블로그지수: 최적2+
블로그지수(확인용): 최적2+
블로그생성일: 2015-08-26
총방문자: 16,983,228
총포스팅: 3,057
총구독자: 22,474
주제지수: 최적2+
종합지수: 최적2+
최고지수: 최적3+
블로그주제: 인테리어·DIY
블덱스전체랭킹: 11,684등(상위 1.3%)
블덱스주제랭킹: 444등(상위 1.3%)
최적화수치: 90.4%
메일주소: al_satang@naver.com


아이디: nkh9475
블로그명: 끝도없는 인기(nkh9475)
블로그주소: https://blog.naver.com/nkh9475
블로그지수: 최적2+
블로그지수(확인용): 최적2+
블로그생성일: 2015-03-04
총방문자: 75,646,485
총포스팅: 5,380
총구독자: 30,103
주제지수: 최적2+
종합지수: 최적2+
최고지수: 최적4+
블로그주제: 요리·레시피
블덱스전체랭킹: 203등(상위 0.0%)
블덱스주제랭킹: 44등(상위 0.6%)
최적화수치: 91.8%
메일주소: nkh9475@naver.com


아이디: k_saja
블로그명: 세렝게티의김사자(k_saja)
블로그주소: https://blog.naver.com/k_saja
블로그지수: 최적2+
블로그지수(확인용

In [5]:
import PySimpleGUI as sg
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils.dataframe import dataframe_to_rows

# 엑셀 파일 선택 및 불러오기
filename = sg.popup_get_file('검색어가 포함된 엑셀 파일을 선택하세요', file_types=(('Excel Files', '*.xlsx'),))
if not filename:
    sg.popup_error('파일을 선택하지 않았습니다.')
    raise SystemExit

# 검색어 목록 불러오기
df_keywords = pd.read_excel(filename, usecols='B')  # B열만 불러온다고 가정
keywords = df_keywords.iloc[1:].squeeze().tolist()  # 첫 번째 행(헤더)를 제외하고 리스트로 변환

# GUI 설정
layout = [
    [sg.Text('가져올 글의 수를 선택하세요:', size=(25, 1)), sg.Radio('10개', "NUM_POSTS", default=True, key='10'),
     sg.Radio('20개', "NUM_POSTS", key='20'), sg.Radio('30개', "NUM_POSTS", key='30'),
     sg.Radio('50개', "NUM_POSTS", key='50')],
    [sg.Submit('Start'), sg.Cancel()]
]

window = sg.Window('Post Count Selection', layout)
event, values = window.read()

if event in (None, 'Cancel'):
    window.close()
    raise SystemExit

# 선택된 글의 수 파악
num_posts = next((int(k) for k, v in values.items() if v and k.isdigit()), 10)
window.close()

options = Options()
options.headless = False
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

wb = Workbook()
ws = wb.active

for search_query in keywords:
    driver.get("https://www.naver.com")
    time.sleep(1)

    driver.find_element(By.ID, "query").send_keys(search_query)
    driver.find_element(By.ID, "search-btn").click()
    time.sleep(1)

    driver.find_element(By.LINK_TEXT, "블로그").click()
    time.sleep(2)

    html = BeautifulSoup(driver.page_source, 'html.parser')
    title_links = html.find_all('a', class_='title_link', limit=num_posts)
    name_links = html.find_all('a', class_='name', limit=num_posts)
    date_links = html.find_all('span', class_='sub', limit=num_posts)

    data = []
    for name_link, title_link, date_link in zip(name_links, title_links, date_links):
        url = title_link['href']
        email = ''
        if "https://blog.naver.com/" in url:
            username = url.split('/')[3]
            email = f"{username}@naver.com"
        elif "https://adcr.naver.com/" in url:
            email = "파워콘텐츠"
        elif "https://post.naver.com/" in url:
            email = "포스트"

        data.append((search_query, name_link.text.strip(), title_link.text.strip(), date_link.text.strip(), url, email))

    for row in data:
        ws.append(row)

# 엑셀 파일 저장
save_filename = sg.popup_get_file(
    '저장할 엑셀 파일 경로를 선택하세요',
    save_as=True,
    no_window=True,
    default_extension='xlsx',
    file_types=(('Excel Files', '*.xlsx'),)
)

if save_filename:
    wb.save(save_filename)
    sg.popup('저장 완료', f'{save_filename} 파일이 저장되었습니다.')

driver.quit()


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
