In [1]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Root URL of the shop being crawled; relative links scraped from its pages are
# completed with this prefix.
baseUrl = 'https://xn--2e0bs4kirwfni.kr'

### 1. Crawl category data

In [3]:
def get_category_data(url, timeout=10):
    """Scrape the header navigation of a shop page into a list of categories.

    Args:
        url: Address of the page whose header menu
            (``#JD-Header > .gnb > ul > li.xans-record-``) is read.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of single-item dicts ``{menu item text: absolute URL}``, in the
        order the menu items appear in the page. Menu items without a link
        are skipped.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(resp.text, 'html.parser')
    category_el = soup.select('#JD-Header > .gnb > ul > li.xans-record-')

    categories = []
    for item in category_el:
        link = item.find('a', href=True)
        if link is None:
            # A menu entry without an anchor has nothing to crawl.
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative or
        # protocol-relative ones against the shop root.
        categories.append({item.text: urljoin(baseUrl, link['href'])})
    return categories

In [4]:
 # Scrape the menu categories from the shop's landing page (the root URL plus a trailing slash).
 categoriesMap = get_category_data(baseUrl + '/')

### 2. Crawl product names per category

In [5]:
def get_product_name(url, timeout=10):
    """Scrape the product names and detail-page links listed on a category page.

    Args:
        url: Address of the category page to read.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``[product name, absolute product URL]`` pairs, one per
        product list item that has both a name and a link. Items missing
        either are skipped. An empty list is returned when the page contains
        no matching product list.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    from urllib.parse import urljoin

    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(resp.text, 'html.parser')

    product_el = soup.select('.xans-element-.xans-product.xans-product-normalpackage.list_normal ul.list_common > li')

    products = []
    for item in product_el:
        # Remove the `b.hdd_txt` elements of THIS item only. Searching the
        # whole document would strip whichever one comes first on the page
        # and fail once none are left.
        for hidden_label in item.find_all('b', {'class': 'hdd_txt'}):
            hidden_label.decompose()

        option_box = item.find('div', {'class': 'opt_set'})
        name_span = option_box.find('span') if option_box is not None else None
        title = item.find('p', {'class': 'tit'})
        link = title.find('a', href=True) if title is not None else None
        if name_span is None or link is None:
            continue

        name_parts = name_span.find_all(string=True)
        if not name_parts:
            continue

        # str() detaches the name from the parse tree; urljoin keeps absolute
        # hrefs as they are and resolves relative ones against the shop root.
        products.append([str(name_parts[0]), urljoin(baseUrl, link['href'])])
    return products

In [6]:
# Map every category name to the product list scraped from its page.
productMap = {}
for category_entry in categoriesMap:
    # Each entry is a one-item {category name: category URL} dict.
    category_name, category_url = next(iter(category_entry.items()))
    productMap[category_name] = get_product_name(category_url)

### 3. Crawl review data for each product in a category

In [7]:
def _review_cell_text(cell, index):
    """Return the index-th text node of a table cell as str, or '' if absent."""
    texts = cell.find_all(string=True)
    return str(texts[index]) if len(texts) > index else ''


def _parse_review_rows(rows, page_url):
    """Turn the review table rows into review records.

    The rows are consumed in pairs: the first row of a pair carries the
    summary cells (post number, title, reviewer, date, score) and the second
    carries the review body and its images. Pairs whose summary row has fewer
    than six cells are skipped.
    """
    from urllib.parse import urljoin

    reviews = []
    for summary_row, body_row in zip(rows[0::2], rows[1::2]):
        cells = summary_row.find_all('td')
        if len(cells) < 6:
            continue

        score_img = cells[5].find('img')
        # The score is whatever precedes '점' in the image's alt text.
        score = score_img.get('alt', '').split('점')[0] if score_img is not None else ''

        body = body_row.find('p', {'class': 'word'})
        if body is None:
            body = body_row.find('p')
        text = body.get_text() if body is not None else ''

        gallery = body_row.find('div', {'class': 'view'})
        images = gallery.find_all('img') if gallery is not None else []
        # urljoin completes protocol-relative sources with the page's scheme
        # and leaves sources that are already absolute untouched.
        image_urls = [urljoin(page_url, img['src']) for img in images if img.get('src')]

        reviews.append([
            _review_cell_text(cells[0], 0),  # post number
            _review_cell_text(cells[1], 1),  # title: second text node of the cell
            _review_cell_text(cells[2], 0),  # reviewer
            _review_cell_text(cells[3], 0),  # post date
            score,
            text,
            image_urls,
        ])
    return reviews


def extract_review_data(url):
    """Load a product page in headless Chrome and scrape its review table.

    Args:
        url: Address of the product detail page.

    Returns:
        A list with one entry per review, each of the form
        ``[post number, title, reviewer, post date, score, text, image URLs]``
        where ``image URLs`` is a (possibly empty) list of strings. Returns an
        empty list when the page has no review rows.

    Raises:
        selenium.common.exceptions.WebDriverException: if Chrome cannot be
            started or the page cannot be loaded.
    """
    from selenium.webdriver.common.by import By

    row_selector = '.xans-product-review .ec-base-table table tbody tr'

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)
    try:
        driver.implicitly_wait(3)
        driver.get(url)
        # The implicit wait only applies to element lookups, so do one lookup:
        # this gives the review rows up to 3 seconds to appear before the page
        # source is captured (and returns an empty list if none show up).
        driver.find_elements(By.CSS_SELECTOR, row_selector)
        html = driver.page_source
    finally:
        # quit() ends the whole browser session, even when loading failed.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    return _parse_review_rows(soup.select(row_selector), url)
    
    # Method 1 (disabled): plain `requests` without Selenium. It is fast but inaccurate, e.g. the review text content is missing.
#     resp = requests.get(url, headers=headers)
#     soup = BeautifulSoup(resp.text)
#     category_el = soup.select('.xans-product-review .ec-base-table table tbody tr')
#     print('length of category_el: ', len(category_el))

#     categories = []
#     for i in category_el:
#         row = i.findAll('td')[0](text=True)
#         print('1. ', row[0])      

In [8]:
# List every crawled product, grouped under its category name.
for category, category_products in productMap.items():
    print(category)

    # Every entry is a [product name, product URL] pair.
    for productName, productUrl in category_products:
        print(productName, productUrl)
        # extract_review_data(productUrl)  # disabled: would extract all review data, but the function is not stable
    print()

쿨리어런스
벤티 스트랩 샌들 https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들-se-sdacs2c085/2285/category/137/display/1/
기대이상 투웨이 플랫 샌들 https://xn--2e0bs4kirwfni.kr/product/기대이상-투웨이-플랫-샌들-se-sdsdr2b5300/1748/category/137/display/1/
오슬로 투웨이 라탄 샌들 https://xn--2e0bs4kirwfni.kr/product/오슬로-투웨이-라탄-샌들-se-sdltr2c129/2317/category/137/display/1/
메이프 핸드메이드 라탄 에스파드류 https://xn--2e0bs4kirwfni.kr/product/메이프-핸드메이드-라탄-에스파드류-se-soacr2c094/2269/category/137/display/1/
커스티 핸드메이드 라탄 뮬 슬리퍼 https://xn--2e0bs4kirwfni.kr/product/커스티-핸드메이드-라탄-뮬-슬리퍼-se-spact2c093/2268/category/137/display/1/
솔레이 플랫 뮬 샌들 https://xn--2e0bs4kirwfni.kr/product/솔레이-플랫-뮬-샌들-ne-spacr2b3091/1852/category/137/display/1/
스테파니 스트랩 샌들힐 https://xn--2e0bs4kirwfni.kr/product/스테파니-스트랩-샌들힐/1763/category/137/display/1/
베네토 짜임 슬리퍼 https://xn--2e0bs4kirwfni.kr/product/베네토-짜임-슬리퍼-se-splts2c101/2282/category/137/display/1/
라끄 투웨이 스트랩 샌들 https://xn--2e0bs4kirwfni.kr/product/라끄-투웨이-스트랩-샌들-se-sdlts2b3011c1/1843/category/137/display/1/
마일렌 절개 플랫 https://xn--2e0bs4k

켈타 페니로퍼 https://xn--2e0bs4kirwfni.kr/product/켈타-페니로퍼-lflts0a919c0/40/category/140/display/1/
해브 스퀘어 플랫 https://xn--2e0bs4kirwfni.kr/product/해브-스퀘어-플랫-pllts1c047/2243/category/140/display/1/
라이크 메리제인 뮬 https://xn--2e0bs4kirwfni.kr/product/라이크-메리제인-뮬-mlltr1b3110c0/1358/category/140/display/1/
리브 스퀘어 버클 플랫 https://xn--2e0bs4kirwfni.kr/product/리브-스퀘어-버클-플랫-plltr1b2040/2030/category/140/display/1/
디어 플랫 뮬 샌들 https://xn--2e0bs4kirwfni.kr/product/디어-플랫-뮬-샌들-mllts1a663-1/178/category/140/display/1/
버킨 라운드 블로퍼 https://xn--2e0bs4kirwfni.kr/product/버킨-라운드-블로퍼-bfltr1c042/2216/category/140/display/1/
엘르 베이직 플랫슈즈 https://xn--2e0bs4kirwfni.kr/product/엘르-베이직-플랫슈즈-plsdr1a1363/91/category/140/display/1/
테라 홀스빗 로퍼 https://xn--2e0bs4kirwfni.kr/product/테라-홀스빗-로퍼-se-lfltr1c029/2176/category/140/display/1/
레베토 스퀘어 페니 로퍼 https://xn--2e0bs4kirwfni.kr/product/레베토-스퀘어-페니-로퍼-lfsds3b1310/2075/category/140/display/1/
케이트 페니 로퍼 https://xn--2e0bs4kirwfni.kr/product/케이트-페니-로퍼-lflts0c013/2182/category/140/display/1/
어반

Heel Sandals c135 https://xn--2e0bs4kirwfni.kr/product/heel-sandals-c135-se-sdems2c135/2329/category/26/display/1/
휘트니 격자 슬링백 https://xn--2e0bs4kirwfni.kr/product/휘트니-격자-슬링백-se-sbltt2c120/2304/category/26/display/1/
모코코 사선 스트랩 샌들힐 https://xn--2e0bs4kirwfni.kr/product/모코코-사선-스트랩-샌들힐-se-sdlts2c106/2294/category/26/display/1/
허쉬 미들힐 샌들 https://xn--2e0bs4kirwfni.kr/product/허쉬-미들힐-샌들/2266/category/26/display/1/
밀랜드 웨지힐 샌들 https://xn--2e0bs4kirwfni.kr/product/밀랜드-웨지힐-샌들-se-sdacr2b5510/1850/category/26/display/1/
레이나 스틸레토 슬링백 https://xn--2e0bs4kirwfni.kr/product/레이나-스틸레토-슬링백-sbltt1c023/2248/category/26/display/1/
안젤라 미들힐 스퀘어 펌프스 https://xn--2e0bs4kirwfni.kr/product/안젤라-미들힐-스퀘어-펌프스-pplts3b4040/1872/category/26/display/1/
티파니 스틸레토 슬링백 https://xn--2e0bs4kirwfni.kr/product/티파니-스틸레토-슬링백-sbltt1c007/2204/category/26/display/1/
폴리 스틸레토 뮬 샌들 https://xn--2e0bs4kirwfni.kr/product/폴리-스틸레토-뮬-샌들-mlltt1b3030c2/1813/category/26/display/1/
나오미 스틸레토 뮬 https://xn--2e0bs4kirwfni.kr/product/나오미-스틸레토-뮬-ne-mlltt1c0

엘르 베이직 플랫슈즈 https://xn--2e0bs4kirwfni.kr/product/엘르-베이직-플랫슈즈-plsdr1a1363/91/category/130/display/1/
컨트리 스퀘어 블로퍼 https://xn--2e0bs4kirwfni.kr/product/컨트리-스퀘어-블로퍼-ne-bflts1c055/2228/category/130/display/1/
클레르 스틸레토 플랫 https://xn--2e0bs4kirwfni.kr/product/클레르-스틸레토-플랫-ne-plsdt1b2310/2073/category/130/display/1/
넬디아 통굽 스니커즈 https://xn--2e0bs4kirwfni.kr/product/넬디아-통굽-스니커즈-ne-snacr3b8700/2035/category/130/display/1/
어반 베이직 블로퍼 https://xn--2e0bs4kirwfni.kr/product/어반-베이직-블로퍼-bflts1c056/2227/category/130/display/1/
제이엘 스틸레토 플랫 https://xn--2e0bs4kirwfni.kr/product/제이엘-스틸레토-플랫-se-plltt1c025/2246/category/130/display/1/

액세서리
AC-01 앞부분 젤 쿠션패드 https://xn--2e0bs4kirwfni.kr/product/ac-01-앞부분-젤-쿠션패드/1318/category/61/display/1/
AC-02 뒤꿈치 패드 https://xn--2e0bs4kirwfni.kr/product/ac-02-뒤꿈치-패드/1320/category/61/display/1/
AC-03 실리콘 뒤꿈치 패드 https://xn--2e0bs4kirwfni.kr/product/ac-03-실리콘-뒤꿈치-패드/1322/category/61/display/1/
AC-04 T자형 뒤꿈치 패드 https://xn--2e0bs4kirwfni.kr/product/ac-04-t자형-뒤꿈치-패드/1323/category/61/

In [9]:
# Sample run for a single product: the first product of the '쿨리어런스' category.
category = '쿨리어런스'
product, product_detail_url = productMap[category][0]

# Crawl the very URL that is stored next to the reviews, so the product name,
# the product URL and the review rows always describe the same product even if
# the shop's listing order changes.
review_data = extract_review_data(product_detail_url)

# One output row per review: the product columns followed by the seven review
# fields (post number, title, reviewer, date, score, text, image URLs).
file_data = [
    [category, product, product_detail_url, *rd[:7]]
    for rd in review_data
]
    

### 4. Convert list to DataFrame

In [10]:
# Build the review table: three product columns followed by the seven review fields.
df = pd.DataFrame(
    data=file_data,
    columns=[
        'Category',
        'Product',
        'Product_detail_url',
        'PostNum',
        'Title',
        'Reviewer',
        'Date',
        'Score',
        'Text',
        'ImageUrls',
    ],
)
df.head(5)

Unnamed: 0,Category,Product,Product_detail_url,PostNum,Title,Reviewer,Date,Score,Text,ImageUrls
0,쿨리어런스,벤티 스트랩 샌들,https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들...,53,그냥 그래용 뒤에 끈 헐렁하다는건 알고 삿지만 진짜 헐렁하네요 밟고 다니다가 걍 짤...,네****,2020-08-15,4,그냥 그래용 뒤에 끈 헐렁하다는건 알고 삿지만 진짜 헐렁하네요 밟고 다니다가 걍 짤...,[https://xn--2e0bs4kirwfni.kr/file_data/chakan...
1,쿨리어런스,벤티 스트랩 샌들,https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들...,52,새 신발 신고 휴가가요~ㅎㅎㅎ,네****,2020-08-14,5,새 신발 신고 휴가가요~ㅎㅎㅎ,[https://xn--2e0bs4kirwfni.kr/file_data/chakan...
2,쿨리어런스,벤티 스트랩 샌들,https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들...,51,끈길이 짝짝ㅇㅣ,조****,2020-08-13,5,한쪽은 꽉 죄고 한쪽은 헐렁.. 디자인은 예뻐요 디자인만 보고 검정으로 한개 더 시...,[]
3,쿨리어런스,벤티 스트랩 샌들,https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들...,50,디자인예쁨 내구성약함,조****,2020-08-12,5,끈 길이가 좌우가 좀 다르고 내구성이 약해요,[]
4,쿨리어런스,벤티 스트랩 샌들,https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들...,49,보통,네****,2020-08-12,3,가격대비 굿 완전 좋아용(2020-08-11 13:09:38 에 등록된 네이버 페이...,[]


### 5. Save the review data to CSV

In [11]:
# Write the review table to a CSV file in the current working directory.
# NOTE(review): no `index` argument is given, so pandas' default applies and the
# row index is written as an unnamed first column — confirm that is intended.
df.to_csv('ChackHan_product.csv')

In [12]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들-se-sdacs2c085/2285/category/137/display/1/')

In [13]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/%EC%86%94%EB%A0%88%EC%9D%B4-%ED%94%8C%EB%9E%AB-%EB%AE%AC-%EC%83%8C%EB%93%A4-ne-spacr2b3091/1852/category/137/display/1/')

In [14]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/%EC%98%A4%EC%8A%AC%EB%A1%9C-%ED%88%AC%EC%9B%A8%EC%9D%B4-%EB%9D%BC%ED%83%84-%EC%83%8C%EB%93%A4-se-sdltr2c129/2317/category/137/display/1/')