# Data Crawling

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyautogui as p
from openpyxl import Workbook

In [7]:
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from openpyxl import Workbook
from openpyxl import load_workbook

from bs4 import BeautifulSoup
import time

In [3]:
def set_chrome_driver():
    options = ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

## 솔드아웃 상품 목록, 상세 정보 Crawling

In [38]:
# 솔드아웃 상품 목록, 상세 정보 crawling
# 상위 몇개까지만 할지는 미정
# browser.back() 적용 아직 안함
browser = set_chrome_driver()
result = []

base_url = "https://www.soldout.co.kr"
url_soldout = base_url + "/search/product/list"
browser.get(url=url_soldout)

# 스크롤 내리기
interval = 3

prev_height = browser.execute_script("return document.body.scrollHeight")

while True:
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(interval)
    cur_height = browser.execute_script("return document.body.scrollHeight")

    if cur_height == prev_height:
        break

    prev_height = cur_height
  

time.sleep(3)

soup = BeautifulSoup(browser.page_source, "lxml")

products = soup.find_all("div", class_="product-item")


products = products[:1000]
# 상품 목록 페이지에서 데이터 crawling
for info in products:
    # 상품 한글명
    name_kor = info.find("p", class_="product-name").text
    # 브랜드명
    brand = info.find("span", class_="brand-logo__text").text
    # 이미지 주소
    img_tag = info.find("img")
    img_url = img_tag["src"]
    # 상세페이지 주소
    product_detail_tag = info.find("a",class_="link-for-seo")
    # https://www.soldout.co.kr/trade/detail/5534466
    product_detail_url = base_url + product_detail_tag["href"]

    # 상품 상세 페이지에서 데이터 crawling
    product_detail_url_list = []
    product_detail_url_list.append(product_detail_url)

    for url in product_detail_url_list:
        browser.get(url=url)

        # 스크롤 내리기
        interval = 3
        prev_height = browser.execute_script("return document.body.scrollHeight")

        while True:
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(interval)
            cur_height = browser.execute_script("return document.body.scrollHeight")

            if cur_height == prev_height:
                break

            prev_height = cur_height

        time.sleep(2)

        soup = BeautifulSoup(browser.page_source, "lxml")
       
        # 상품 영어명
        name_eng = soup.select_one("#__layout > div > div.layout-container > div > div.item-container__in > div.container-right > div.item_info__wrap > p")
        # 모델번호
        model_no = soup.select_one("#__layout > div > div.layout-container > div > div.item-container__in > div.container-right > div:nth-child(8) > dl:nth-child(3) > dd")
        # 출시일(DataFrame에서 날짜형태로 바꾸기)
        release_date = soup.select_one("#__layout > div > div.layout-container > div > div.item-container__in > div.container-right > div:nth-child(8) > dl:nth-child(4) > dd")
        # 색상
        color = soup.select_one("#__layout > div > div.layout-container > div > div.item-container__in > div.container-right > div:nth-child(8) > dl.product-info__dl.color > dd")
        # 원래 가격("원" 제거 후 DataFrame에서 int형으로 바꾸기)
        original_price = soup.select_one("#__layout > div > div.layout-container > div > div.item-container__in > div.container-right > div:nth-child(8) > dl:nth-child(6) > dd")

        if name_eng and model_no and release_date and color and original_price:
            name_eng = name_eng.text
            model_no = model_no.text
            release_date = release_date.text
            color = color.text
            original_price = original_price.text
        else:
            name_eng = np.nan
            model_no = np.nan
            release_date = np.nan
            color = np.nan
            original_price = np.nan

        # print(brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url)

    result.append([brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url])

columns = ["Brand", "Name_Kor", "Name_Eng", "Model_No", "Release_Date", "Color", "Original_Price", "Image_URL", "Product_Detail_URL"]
df = pd.DataFrame(result, columns=columns)
df.to_excel("./data/soldout_products2.xlsx", index=False)

wb = load_workbook("./data/soldout_products2.xlsx")
ws = wb.active

# 열의 너비 설정
ws.column_dimensions["A"].width = 20
ws.column_dimensions["B"].width = 100
ws.column_dimensions["C"].width = 100
ws.column_dimensions["D"].width = 50
ws.column_dimensions["E"].width = 50
ws.column_dimensions["F"].width = 50
ws.column_dimensions["G"].width = 50
ws.column_dimensions["H"].width = 100
ws.column_dimensions["I"].width = 60

# 최종 엑셀 파일로 저장
wb.save("./data/soldout_products2.xlsx")

print(len(products))
print("엑셀 파일로 저장 완료!")

1000
엑셀 파일로 저장 완료!


## 솔드아웃 거래내역 Crawling

In [55]:
# 솔드아웃 로그인
browser = set_chrome_driver()
browser.get(url_soldout)
result2 = []

time.sleep(2)

login_button = browser.find_element(By.XPATH, '//*[@id="__layout"]/div/div[1]/header/div/ul/li[1]/a')
login_button.click()

time.sleep(2)

id_input = browser.find_element(By.CSS_SELECTOR, "#__layout > div > div.layout-container > div > form > div:nth-child(1) > div > input")
id_input.send_keys("uj05273")

pwd_input = browser.find_element(By.CSS_SELECTOR, "#__layout > div > div.layout-container > div > form > div:nth-child(2) > div > input")
pwd_input.send_keys("brian981103")

signin_button = browser.find_element(By.CLASS_NAME, "btn-primary")
signin_button.click()

time.sleep(2)

# 거래내역 crawling
for info in products[:100]:
    # 상세페이지 주소
    product_detail_tag = info.find("a",class_="link-for-seo")
    # https://www.soldout.co.kr/trade/detail/5534466
    product_detail_url = base_url + product_detail_tag["href"]

    # 상품 상세 페이지에서 데이터 crawling
    product_detail_url_list = []
    product_detail_url_list.append(product_detail_url)

    for url in product_detail_url_list:
        #print(url)
        browser.get(url=url)

        interval = 3

        prev_height = browser.execute_script("return document.body.scrollHeight")

        while True:
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(interval)
            cur_height = browser.execute_script("return document.body.scrollHeight")

            if cur_height == prev_height:
                break

            prev_height = cur_height
        

        time.sleep(3)

        # 거래내역 전체보기 버튼 클릭
        view_all_btn = browser.find_element(By.CLASS_NAME, "btn-show-all")
        view_all_btn.click()

        time.sleep(3)

        # 상품 목록 데이터 중 한글이름과 merge 하기 위함
        name_kor = soup.find("p",class_="name_kor")
        if name_kor:
            name_kor = name_kor.text
        
        # 스크롤을 위해 마우스 중앙으로 옮기기
        p.moveTo(1270,815,0.5)
        p.click()

        modal_content = browser.find_element(By.CSS_SELECTOR, "body > div.trade_modal.BaseModal > div > div > div.base-table.trade_modal__table.modal-table > table > tbody")
        prev_height = browser.execute_script("return arguments[0].scrollHeight", modal_content)

        # 모달 창 스크롤 내리기
        while True:
            browser.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", modal_content)
            time.sleep(1)
            cur_height = browser.execute_script("return arguments[0].scrollHeight", modal_content)

            if cur_height == prev_height:
                break

            prev_height = cur_height
        
        soup = BeautifulSoup(browser.page_source, "lxml")
        trades = soup.select("body > div.trade_modal.BaseModal > div > div > div.base-table.trade_modal__table.modal-table > table > tbody > tr")
        
        for trade in trades:
            trade_dates = trade.select_one("tbody > tr > td:nth-child(1)")
            trade_sizes = trade.select_one("tbody > tr > td:nth-child(2)")
            trade_prices = trade.select_one("tbody > tr > td:nth-child(3) > span")

            if trade_dates and trade_sizes and trade_prices:
                trade_dates = trade_dates.text
                trade_sizes = trade_sizes.text
                trade_prices = trade_prices.text
        
            result2.append([name_kor,trade_dates,trade_sizes,trade_prices])


columns = ["Name_Kor", "Trade_Dates", "Trade_Sizes", "Trade_Prices"]
df = pd.DataFrame(result2, columns=columns)
df.to_excel("./data/soldout_trades.xlsx", index=False)

wb = load_workbook("./data/soldout_trades.xlsx")
ws = wb.active

# 열의 너비 설정
ws.column_dimensions["A"].width = 50
ws.column_dimensions["B"].width = 35
ws.column_dimensions["C"].width = 30
ws.column_dimensions["D"].width = 30

wb.save("./data/soldout_trades.xlsx")

print("엑셀 파일로 저장 완료!")
            
    # X버튼으로 나가기 -> 뒤로가기 버튼
    # x_btn = browser.find_element(By.XPATH,'/html/body/div[5]/div/header/div[3]/button')
    # x_btn.click()
    # browser.back()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".btn-show-all"}
  (Session info: chrome=126.0.6478.127); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00D6C1C3+27395]
	(No symbol) [0x00D03DC4]
	(No symbol) [0x00C01B7F]
	(No symbol) [0x00C42C65]
	(No symbol) [0x00C42D3B]
	(No symbol) [0x00C7EC82]
	(No symbol) [0x00C639E4]
	(No symbol) [0x00C7CB24]
	(No symbol) [0x00C63736]
	(No symbol) [0x00C37541]
	(No symbol) [0x00C380BD]
	GetHandleVerifier [0x01023A93+2876371]
	GetHandleVerifier [0x01077F5D+3221661]
	GetHandleVerifier [0x00DED634+556916]
	GetHandleVerifier [0x00DF474C+585868]
	(No symbol) [0x00D0CE04]
	(No symbol) [0x00D09818]
	(No symbol) [0x00D099B7]
	(No symbol) [0x00CFBF0E]
	BaseThreadInitThunk [0x75D7FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x774180CE+286]
	RtlGetAppContainerNamedObjectPath [0x7741809E+238]


## Kream에서 상품 상세페이지, 상품코드 crawling

In [45]:
browser = set_chrome_driver()
base_url = "https://kream.co.kr"
url = base_url + "/search"
browser.get(url=url)

# 스크롤 내리기
# interval = 3
# prev_height = browser.execute_script("return document.body.scrollHeight")

# while True:
#     browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
#     time.sleep(interval)
#     cur_height = browser.execute_script("return document.body.scrollHeight")

#     if cur_height == prev_height:
#         break

#     prev_height = cur_height

# time.sleep(4)

# 상품마다 각 상세 페이지 가져오기
soup = BeautifulSoup(browser.page_source, "lxml")
products = soup.find_all("div",class_="search_result_item product")
for product in products[:1]:
    product_detail_url = product.find("a",class_="item_inner")["href"]
    product_detail_urls = base_url + product_detail_url
    
    # 상품 상세 페이지에서 데이터 crawling
    product_detail_url_list = []
    product_detail_url_list.append(product_detail_urls)

    for url in product_detail_url_list:
        browser.get(url=url)

        soup = BeautifulSoup(browser.page_source, "lxml")

        model_no = soup.select_one("div.product_info_wrap > div > dl > div:nth-child(3) > div.product_info")
        if model_no:
            model_no = model_no.text
            # print(model_no)
        
        # https://kream.co.kr/social/products/12831
        social_url = base_url + "/social" + product_detail_url
        
        browser.get(url=social_url)
        soup = BeautifulSoup(browser.page_source, "lxml")

        # 리뷰 전체를 보기위하여 스크롤 내리기
        interval = 3
        prev_height = browser.execute_script("return document.body.scrollHeight")

        while True:
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(interval)
            cur_height = browser.execute_script("return document.body.scrollHeight")

            if cur_height == prev_height:
                break

            prev_height = cur_height

        time.sleep(4)

        socials = soup.find_all("div",class_="feed_card")
        social_text_list = []
        for social in socials:
            social_text = social.find("p",class_="text_box").text
        

 휴일은 딸과 함께~ 


In [36]:
browser = set_chrome_driver()
result = []

# base_url = "https://www.soldout.co.kr"
# url_soldout = base_url + "/search/product/list"
browser.get(url="https://kream.co.kr/search")

# 스크롤 내리기
interval = 3
total_items = 0
target_items = 3
prev_height = browser.execute_script("return document.body.scrollHeight")

while total_items < target_items:
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    time.sleep(interval)
    cur_height = browser.execute_script("return document.body.scrollHeight")

    soup = BeautifulSoup(browser.page_source, "lxml")

    products = soup.find_all("div", class_="product")

    if cur_height == prev_height:
        break

    prev_height = cur_height

    total_items += len(products)

time.sleep(3)

# soup = BeautifulSoup(browser.page_source, "lxml")

# products = soup.find_all("div", class_="product")
products = products[:target_items]
# 상품 목록 페이지에서 데이터 crawling

for info in products:
    # 상품 한글명
    name_kor = info.find("p", class_="translated_name")
    # 상품 영어명
    name_eng = info.find("p", class_="name")
    # 브랜드명
    brand = info.find("p", class_="product_info_brand")
    
    # 이미지 주소
    img_tag = info.find("img")
    img_url = img_tag["src"]
    # 상세페이지 주소
    product_detail_tag = info.find("a",class_="item_inner")
    
    product_detail_url = "https://kream.co.kr" + product_detail_tag["href"]

    # 상품 상세 페이지에서 데이터 crawling
    # https://kream.co.kr/products/21935
    product_detail_url_list = []
    product_detail_url_list.append(product_detail_url)

    if name_kor and name_eng and brand:
        name_kor = name_kor.text
        name_eng = name_eng.text
        brand = brand.text

    for url in product_detail_url_list:
        # browser.get(url=url)
        

#         # 스크롤 내리기
#         interval = 3
#         prev_height = browser.execute_script("return document.body.scrollHeight")

#         while True:
#             browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
#             time.sleep(interval)
#             cur_height = browser.execute_script("return document.body.scrollHeight")

#             if cur_height == prev_height:
#                 break

#             prev_height = cur_height

#         time.sleep(2)

#         soup = BeautifulSoup(browser.page_source, "lxml")
       
       
#         # 모델번호
#         model_no = soup.select_one("div.column_top > div.product_info_wrap > div > dl > div:nth-child(3) > div.product_info")
#         # 출시일(DataFrame에서 날짜형태로 바꾸기)
#         release_date = soup.select_one("div.column_top > div.product_info_wrap > div > dl > div:nth-child(4) > div.product_info")
#         # 색상
#         color = soup.find("div",class_="color-target")
#         # 원래 가격("원" 제거 후 DataFrame에서 int형으로 바꾸기)
#         original_price = soup.select_one("div.column_top > div.product_info_wrap > div > dl > div:nth-child(2) > div.product_info")

#         if name_eng and model_no and release_date and color and original_price:
#             model_no = model_no.text
#             release_date = release_date.text
#             color = color.text
#             original_price = original_price.text
#         else:
#             name_eng = np.nan
#             model_no = np.nan
#             release_date = np.nan
#             color = np.nan
#             original_price = np.nan

#         # print(brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url)

#     result.append([brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url])
# print(brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url)
# columns = ["Brand", "Name_Kor", "Name_Eng", "Model_No", "Release_Date", "Color", "Original_Price", "Image_URL", "Product_Detail_URL"]
# df = pd.DataFrame(result, columns=columns)
# df.to_excel("./data/soldout_products.xlsx", index=False)

# wb = load_workbook("./data/soldout_products.xlsx")
# ws = wb.active

# # 열의 너비 설정
# ws.column_dimensions["A"].width = 20
# ws.column_dimensions["B"].width = 100
# ws.column_dimensions["C"].width = 100
# ws.column_dimensions["D"].width = 50
# ws.column_dimensions["E"].width = 50
# ws.column_dimensions["F"].width = 50
# ws.column_dimensions["G"].width = 50
# ws.column_dimensions["H"].width = 100
# ws.column_dimensions["I"].width = 60

# # 최종 엑셀 파일로 저장
# wb.save("./data/soldout_products.xlsx")

# print(len(products))
# print("엑셀 파일로 저장 완료!")

https://kream.co.kr/products/21935


TypeError: 'NoneType' object is not subscriptable