In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
# Root URL of the shop being crawled; the `xn--` host is the punycode (ASCII)
# form of an internationalized domain name. The scraping helpers use it as the
# base for turning site-relative hrefs into absolute URLs.
baseUrl = 'https://xn--2e0bs4kirwfni.kr'

### 1. Crawl category data

In [3]:
def get_category_data(url, timeout=10):
    """Scrape the header navigation of a page and return the shop categories.

    Args:
        url: Address of the page whose header navigation bar is parsed.
        timeout: Seconds to wait for the server before `requests` raises
            instead of blocking indefinitely.

    Returns:
        A list of single-entry dicts ``{label: absolute_url}``, one per
        navigation item that contains a link. ``label`` is the raw text
        content of the ``<li>`` element (not stripped).
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree can
    # differ between machines.
    soup = BeautifulSoup(resp.text, 'html.parser')
    nav_items = soup.select('#JD-Header > .gnb > ul > li.xans-record-')

    categories = []
    for item in nav_items:
        link = item.find('a', href=True)
        if link is None:
            # Navigation entry without a link: there is nothing to crawl.
            continue
        # urljoin leaves absolute links untouched and resolves every kind of
        # relative link against the site root; a substring test for 'http'
        # would misclassify relative paths that merely contain that text.
        categories.append({item.text: urljoin(baseUrl, link['href'])})
    return categories

In [4]:
 categoriesMap = get_category_data('https://xn--2e0bs4kirwfni.kr/')

### 2. Crawl product names per category

In [5]:
def get_product_name(url, timeout=10):
    """Scrape a category listing page and return its products.

    Args:
        url: Address of the category page to parse.
        timeout: Seconds to wait for the server before `requests` raises
            instead of blocking indefinitely.

    Returns:
        A list of ``[product_name, absolute_product_url]`` pairs, one per
        list item that has both a name and a link. The name is the first text
        node of the first ``<span>`` inside the item's ``div.opt_set``, taken
        after the item's ``b.hdd_txt`` elements have been removed.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Explicit parser keeps the parse tree identical across environments.
    soup = BeautifulSoup(resp.text, 'html.parser')

    product_el = soup.select('.xans-element-.xans-product.xans-product-normalpackage.list_normal ul.list_common > li')

    products = []
    for item in product_el:
        # Remove the b.hdd_txt nodes of THIS item only. Searching the whole
        # document here would delete the n-th such node in the page regardless
        # of which product owns it, and fail once none are left.
        for hidden in item.find_all('b', {'class': 'hdd_txt'}):
            hidden.decompose()

        option_box = item.find('div', {'class': 'opt_set'})
        name_span = option_box.find('span') if option_box is not None else None
        title = item.find('p', {'class': 'tit'})
        link = title.find('a', href=True) if title is not None else None
        if name_span is None or link is None:
            # Not a regular product entry; skip it rather than abort the crawl.
            continue

        name_nodes = name_span.find_all(string=True)
        if not name_nodes:
            continue

        # str() detaches the name from the parse tree so the returned list
        # does not keep the whole document alive.
        products.append([str(name_nodes[0]), urljoin(baseUrl, link['href'])])
    return products

In [None]:
# Map every category name to the products scraped from its listing page.
productMap = {}
for category_entry in categoriesMap:
    # Each entry is a one-item {category name: category URL} dict.
    category_name, category_url = next(iter(category_entry.items()))
    productMap[category_name] = get_product_name(category_url)

### 3. Crawl review data for each product in a category

In [None]:
def extract_review_data(url):
    """Load a product page in headless Chrome and parse its review table.

    The review table is read as consecutive row pairs: a summary row followed
    by a content row. An unpaired trailing row is ignored.

    Args:
        url: Address of the product detail page.

    Returns:
        A list with one entry per review, each of the form
        ``[postNum, title, reviewer, postDate, score, text, imgUrlList]``:
        the first text node of summary columns 0, 2 and 3, the second text
        node of column 1, the part of the score image's ``alt`` text before
        '점' (``None`` when the row has no such image), the review body text,
        and the absolute URLs of the images found in the content row.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)
    try:
        # NOTE(review): an implicit wait only applies to element lookups, and
        # none happen here, so page_source is read as soon as get() returns.
        # If the reviews are filled in later by JavaScript they can be missing;
        # consider an explicit wait on the review table.
        driver.implicitly_wait(3)
        driver.get(url)
        html = driver.page_source
    finally:
        # quit() shuts down the browser and the driver process even when the
        # page load fails; close() would only close the current window.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.select('.xans-product-review .ec-base-table table tbody tr')

    reviews = []
    # Rows alternate summary/content, so walk them two at a time.
    for summary_row, content_row in zip(rows[0::2], rows[1::2]):
        cells = summary_row.find_all('td')
        postNum = str(cells[0].find_all(string=True)[0])
        title = str(cells[1].find_all(string=True)[1])
        reviewer = str(cells[2].find_all(string=True)[0])
        postDate = str(cells[3].find_all(string=True)[0])
        score_img = cells[5].find('img')
        score = score_img['alt'].split('점')[0] if score_img is not None else None

        # Prefer the dedicated p.word paragraph, fall back to the first <p>.
        body = content_row.find('p', {'class': 'word'})
        if body is None:
            body = content_row.find('p')
        text = body.get_text() if body is not None else ''

        view = content_row.find('div', {'class': 'view'})
        images = view.find_all('img', src=True) if view is not None else []
        # urljoin completes protocol-relative sources ('//host/x.jpg') like the
        # plain 'https:' prefix did on an https page, and also keeps absolute
        # or path-relative sources valid.
        imgUrlList = [urljoin(url, img['src']) for img in images]

        reviews.append([
            postNum,
            title,
            reviewer,
            postDate,
            score,
            text,
            imgUrlList,
        ])
    return reviews
    
    # Method 1 (unused): a plain request without Selenium. It's fast but not accurate, e.g. the review text content is missing.
#     resp = requests.get(url, headers=headers)
#     soup = BeautifulSoup(resp.text)
#     category_el = soup.select('.xans-product-review .ec-base-table table tbody tr')
#     print('length of category_el: ', len(category_el))

#     categories = []
#     for i in category_el:
#         row = i.findAll('td')[0](text=True)
#         print('1. ', row[0])      

In [None]:
# Print each crawled category followed by its products (name and URL, one
# product per line) and a blank line between categories.
for category, product_rows in productMap.items():
    print(category)
    for productName, productUrl in product_rows:
        print(productName, productUrl)
        # Crawling the reviews of every product stays disabled here:
        # extract_review_data(productUrl) was marked as not stable.
    print()

In [None]:
# Build the export rows for a single sample case only: the first product
# listed under the '쿨리어런스' category.
category = '쿨리어런스'
product, product_detail_url = productMap[category][0]

# Fetch the reviews from the same URL that is written into the rows, so the
# product label and its reviews can never refer to different pages.
review_data = extract_review_data(product_detail_url)

# Each review is [postNum, title, reviewer, date, score, text, image URLs];
# prefix it with the category / product columns.
file_data = [
    [category, product, product_detail_url, *rd]
    for rd in review_data
]
    

### 4. Convert list to DataFrame

In [None]:
# Column labels, in the same order as the values of each file_data row.
review_columns = [
    'Category',
    'Product',
    'Product_detail_url',
    'PostNum',
    'Title',
    'Reviewer',
    'Date',
    'Score',
    'Text',
    'ImageUrls',
]
df = pd.DataFrame(file_data, columns=review_columns)
# Preview the first rows as the cell output.
df.head()

### 5. Save filtered data

In [None]:
# Write the review table to a CSV file in the working directory, using the
# pandas defaults for index, separator and encoding.
output_csv = 'ChackHan_product.csv'
df.to_csv(output_csv)

In [None]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/벤티-스트랩-샌들-se-sdacs2c085/2285/category/137/display/1/')

In [None]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/%EC%86%94%EB%A0%88%EC%9D%B4-%ED%94%8C%EB%9E%AB-%EB%AE%AC-%EC%83%8C%EB%93%A4-ne-spacr2b3091/1852/category/137/display/1/')

In [None]:
#extract_review_data('https://xn--2e0bs4kirwfni.kr/product/%EC%98%A4%EC%8A%AC%EB%A1%9C-%ED%88%AC%EC%9B%A8%EC%9D%B4-%EB%9D%BC%ED%83%84-%EC%83%8C%EB%93%A4-se-sdltr2c129/2317/category/137/display/1/')