# Data Crawling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from bs4 import BeautifulSoup
import time

In [4]:
def set_chrome_driver():
    """Create and return a Chrome WebDriver whose window starts maximized.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``
    and handed to Selenium via a ``Service`` object.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    chrome_options = ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

## 솔드아웃 상품 목록, 상세 정보 Crawling

In [96]:
# Crawl the Soldout product list page and each product's detail page.
#
# Outputs left at module level for later cells:
#   result                  - one row per product:
#                             [brand, name_kor, name_eng, model_no, release_date,
#                              color, original_price, img_url, product_detail_url]
#   product_detail_url_list - detail-page URL of every crawled product
#   base_url, url_soldout   - site root and product-list URL


def _scroll_to_bottom(driver, interval):
    """Scroll to the page bottom repeatedly until the page height stops growing.

    Args:
        driver: WebDriver whose current page is scrolled.
        interval: Seconds to sleep after each scroll before re-measuring height.
    """
    prev_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(interval)
        cur_height = driver.execute_script("return document.body.scrollHeight")
        if cur_height == prev_height:
            break
        prev_height = cur_height


def _text_or_nan(parsed_page, selector):
    """Return the text of the first element matching ``selector``, or NaN if none."""
    node = parsed_page.select_one(selector)
    return node.text if node is not None else np.nan


base_url = "https://www.soldout.co.kr"
url_soldout = base_url + "/search/product/list"

# CSS path shared by every field scraped from the detail page's right-hand column.
detail_root = (
    "#__layout > div > div.layout-container > div > "
    "div.item-container__in > div.container-right > "
)

result = []
# Built once, outside the loop, so it accumulates every product's URL
# (the trade-history cell iterates over this list).
product_detail_url_list = []

browser = set_chrome_driver()
try:
    browser.get(url=url_soldout)

    # Scroll down until the list page stops growing
    _scroll_to_bottom(browser, interval=4)
    time.sleep(4)

    soup = BeautifulSoup(browser.page_source, "lxml")
    products = soup.find_all("div", class_="product-item")

    # Crawl data from the product list page (only the first 3 items for now)
    for info in products[:3]:
        # Product name (Korean)
        name_kor = info.find("p", class_="product-name").text
        # Brand name
        brand = info.find("span", class_="brand-logo__text").text
        # Image URL
        img_url = info.find("img")["src"]
        # Detail page URL, e.g. https://www.soldout.co.kr/trade/detail/5534466
        product_detail_tag = info.find("a", class_="link-for-seo")
        product_detail_url = base_url + product_detail_tag["href"]
        product_detail_url_list.append(product_detail_url)

        # Crawl data from the product detail page
        browser.get(url=product_detail_url)
        _scroll_to_bottom(browser, interval=3)
        time.sleep(2)

        detail_soup = BeautifulSoup(browser.page_source, "lxml")

        # Each field falls back to NaN independently, so one missing element
        # does not discard the fields that were found.
        # Product name (English)
        name_eng = _text_or_nan(detail_soup, detail_root + "div.item_info__wrap > p")
        # Model number
        model_no = _text_or_nan(
            detail_soup, detail_root + "div:nth-child(8) > dl:nth-child(3) > dd"
        )
        # Release date (convert to a date type in the DataFrame later)
        release_date = _text_or_nan(
            detail_soup, detail_root + "div:nth-child(8) > dl:nth-child(4) > dd"
        )
        # Color
        color = _text_or_nan(
            detail_soup,
            detail_root + "div:nth-child(8) > dl.product-info__dl.color > dd",
        )
        # Original price (strip "원" and convert to int in the DataFrame later)
        original_price = _text_or_nan(
            detail_soup, detail_root + "div:nth-child(8) > dl:nth-child(6) > dd"
        )

        print(product_detail_url, name_eng, model_no, release_date, color, original_price)

        result.append([brand, name_kor, name_eng, model_no, release_date, color, original_price, img_url, product_detail_url])
finally:
    # Always release the Chrome process; the next cell opens its own driver.
    browser.quit()

# First test run: 2736 items took 7 min 43 s (leave the site alone while it is being crawled)
# print(len(result))

https://www.soldout.co.kr/trade/detail/5534466 Carhartt Loose Fit Heavyweight Short-Sleeve Pocket Regular T-Shirt Black K87-BLK - 블랙 $19
https://www.soldout.co.kr/trade/detail/5540084 Stussy Basic Stussy T-Shirt Black (2024) 1905000 (black) - 블랙 68,000원
https://www.soldout.co.kr/trade/detail/3333 Nike Air Force 1 Low '07 White 315122-111/CW2288-111 2018.01.02 화이트 139,000원


### 솔드아웃 거래내역 Crawling

In [100]:
# Log in to Soldout, then open each product's trade-history dialog.
#
# Credentials are read from the SOLDOUT_ID / SOLDOUT_PWD environment variables
# (falling back to an interactive prompt) so that no account secrets live in
# the source.
import os
from getpass import getpass

soldout_id = os.environ.get("SOLDOUT_ID") or input("Soldout ID: ")
soldout_pwd = os.environ.get("SOLDOUT_PWD") or getpass("Soldout password: ")

browser = set_chrome_driver()
browser.get(url_soldout)

# Explicit waits: the header link and the login form are looked up right after
# page loads/navigation, so wait (up to 10 s) for them instead of assuming
# they are already in the DOM.
wait = WebDriverWait(browser, 10)

login_button = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="__layout"]/div/div[1]/header/div/ul/li[1]/a')
    )
)
login_button.click()

id_input = wait.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#__layout > div > div.layout-container > div > form > div:nth-child(1) > div > input")
    )
)
id_input.send_keys(soldout_id)

pwd_input = wait.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#__layout > div > div.layout-container > div > form > div:nth-child(2) > div > input")
    )
)
pwd_input.send_keys(soldout_pwd)

signin_button = browser.find_element(By.CLASS_NAME, "btn-primary")
signin_button.click()

time.sleep(2)

# Crawl trade history (product_detail_url_list comes from the product crawl cell;
# only the first 2 URLs are visited for now)
for url in product_detail_url_list[:2]:
    browser.get(url=url)

    # Scroll down until the page height stops growing
    interval = 3
    prev_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(interval)
        cur_height = browser.execute_script("return document.body.scrollHeight")

        if cur_height == prev_height:
            break

        prev_height = cur_height

    time.sleep(2)

    # Click the "view all trade history" button
    view_all_btn = browser.find_element(By.CLASS_NAME, "btn-show-all")
    view_all_btn.click()

    time.sleep(3)

    # TODO: move the mouse to the center -> scroll to the bottom -> crawl the trade-history data -> leave via the X button -> press the back button
    