# 각 선제품의 링크별 댓글들을 크롤링하는 코드입니다

# In [1]:
import time
import csv
import os
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from itertools import zip_longest
from selenium import webdriver
from selenium.webdriver.common.by import By

# In [1]:
def load_data(path="./crawling_data/suncream_link.csv"):
    """Load the product-link rows from a CSV file.

    Args:
        path: CSV file to read. Defaults to the crawler's link list, so
            existing ``load_data()`` calls behave as before.

    Returns:
        A list with one dict per data row, keyed by the CSV header names.
        An empty file yields an empty list.
    """
    # newline="" hands line-ending handling to the csv module, so a quoted
    # field that contains a line break is read back unchanged.
    # NOTE(review): no encoding is passed, so the platform default is still
    # used -- confirm how the link file was written before pinning one.
    with open(path, newline="") as fr:
        return list(csv.DictReader(fr))

# In [5]:
def write_data(data, file_path="./data/suncream_review.csv"):
    """Append review rows to the review CSV, writing the header only once.

    Args:
        data: Iterable of dicts whose keys are a subset of the columns
            ``page``, ``product_name``, ``title``, ``review``,
            ``skin_type`` and ``score``.
        file_path: CSV file to append to. Defaults to the crawler's review
            file, so existing ``write_data(rows)`` calls behave as before.

    Raises:
        ValueError: If a row contains a key that is not one of the columns
            (raised by ``csv.DictWriter``).
    """
    fieldnames = ["page", "product_name", "title", "review", "skin_type", "score"]

    # Create the output directory up front so the first write cannot fail
    # just because the folder has not been made yet.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The header decision must be based on the file that is actually being
    # appended to; a missing or still-empty file needs the header row.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

    with open(file_path, "a", newline="", encoding="utf-8") as fw:
        writer = csv.DictWriter(fw, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

# In [6]:
def _extract_page_reviews(html, page_num):
    """Build one review dict per review found in a single page of HTML.

    Args:
        html: Page source of the product page with the review tab open.
        page_num: Review page number stored in each row's ``page`` field.

    Returns:
        A list of dicts with the keys ``page``, ``product_name``, ``title``,
        ``review``, ``skin_type`` and ``score``.
    """
    soup = BeautifulSoup(html, 'lxml')

    product_name_tag = soup.find("p", class_="prd_name")
    # None (an empty CSV cell) marks a missing product name.
    product_name = product_name_tag.text if product_name_tag else None

    title_tags = soup.find_all("div", class_="poll_sample")  # review title
    score_tags = soup.find_all("span", class_="point")  # rating
    review_tags = soup.find_all("div", class_="txt_inner")  # review body text
    user_tags = soup.find_all("div", class_="user clrfix")

    page_reviews = []
    # zip_longest pads the shorter tag lists with None, so a review that is
    # missing one element still produces a row.
    # NOTE(review): the four lists are paired purely by position; if a review
    # lacks one of these elements, later rows may be misaligned -- confirm
    # against the page markup.
    for title_tag, score_tag, review_tag, user_tag in zip_longest(
        title_tags, score_tags, review_tags, user_tags, fillvalue=None
    ):
        review_text = review_tag.text if review_tag else None
        # Every second <span> starting from the second one is kept.
        title_text = (
            [tag.text.strip() for tag in title_tag.find_all("span")[1::2]]
            if title_tag else None
        )
        # Every <span> after the first one is kept.
        span_texts = (
            [span.text.strip() for span in user_tag.find_all("span")[1:]]
            if user_tag else None
        )
        # Kept as a one-element list so the stored CSV format is unchanged.
        score_texts = [score_tag.text.strip() if score_tag else None]

        page_reviews.append({
            "page": page_num,
            "product_name": product_name,
            "title": title_text,
            "review": review_text,
            "skin_type": span_texts,
            "score": score_texts,
        })
    return page_reviews


def parse_data(url):
    """Crawl every review page of one product and save the rows as it goes.

    Opens the product page in Chrome, clicks the element with id
    ``reviewInfo``, then walks the review pages (at most 199). Each page's
    rows are passed to ``write_data`` immediately, so pages already crawled
    are kept even if a later page fails.

    Args:
        url: Product page URL.

    Returns:
        A list of the review dicts from all crawled pages.

    Raises:
        selenium.common.exceptions.TimeoutException: If the ``reviewInfo``
            element does not become clickable within 10 seconds.
    """
    driver = webdriver.Chrome()
    all_reviews = []

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="reviewInfo"]'))).click()
        time.sleep(5)  # fixed pause after opening the review tab

        for page_num in range(1, 200):
            page_reviews = _extract_page_reviews(driver.page_source, page_num)
            write_data(page_reviews)
            all_reviews.extend(page_reviews)

            # Move to the next review page; no link for it means this was
            # the last page.
            try:
                next_button = driver.find_element(By.XPATH, f"//a[@data-page-no='{page_num + 1}']")
            except NoSuchElementException:
                print(f"Page {page_num} is the last page. \n ")
                break
            next_button.click()
            time.sleep(5)  # fixed pause after paging

    finally:
        # Always release the browser, even when crawling fails midway.
        driver.quit()

    return all_reviews

# In [None]:
if __name__ == '__main__':
    # Final crawl run: visit every product link and crawl its reviews.
    data = load_data()

    for i, review in enumerate(data):
        url = review["product_link"]
        try:
            parse_data(url)
        except Exception as exc:
            # Skip a failing product but say why. Catching Exception rather
            # than using a bare except lets Ctrl-C (KeyboardInterrupt) stop
            # the crawl.
            print(f"Product {i} failed ({url}): {exc!r}")
            continue
        print(f"{i}번 제품 끝")