In [1]:
import os
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import boto3
import pyrootutils

pyrootutils.setup_root(os.curdir, indicator=".project-root", pythonpath=True)
from extras import constants, paths
from aws import rds, s3
from dailyjou.utils import search, utils

## 티셔츠 크롤링

In [2]:
# Category listing page used for the exploration below (category id 59, page 3 per the URL).
page_url = "https://lookple.com/category/%EB%B0%98%ED%8C%94%ED%8B%B0%EB%AF%BC%EC%86%8C%EB%A7%A4%ED%8B%B0/59/?page=3"

In [3]:
# Download the listing page; a browser-like User-Agent header is sent with the request.
response = requests.get(page_url, headers={"User-Agent": "Mozilla/5.0"})
html = response.text
# Parse the markup with the "html.parser" backend so it can be queried with CSS selectors.
soup = BeautifulSoup(html, "html.parser")

In [4]:
# Product entries: direct children of ul.prdList.grid3 whose id starts with "anchorBoxId_".
products_li = soup.select('ul.prdList.grid3 > [id^="anchorBoxId_"]')
# Display how many product entries were found on this page.
len(products_li)

30

In [5]:
# Use the first product entry as the sample for the cells below.
product_li = products_li[0]

In [6]:
# The thumbnail anchor's href is prefixed with the site root to form the product detail URL.
product_url = "https://lookple.com" + product_li.select_one("div.prdBox > div.thumbnail > a")["href"]
product_url

'https://lookple.com/product/룩-어반-뉴욕-자수-오버-반팔티/6061/category/59/display/1/'

In [7]:
# Only the scheme is prepended to the thumbnail <img> src
# (the displayed result shows the src already carries the host).
thumbnail_img_url = "https:" + product_li.select_one("div.prdBox > div.thumbnail > a > img")["src"]
thumbnail_img_url

'https://lookple.com/web/product/big/202304/535edbd88622315f2e57f916ccec1bf7.gif'

### 상품 상세 정보

In [30]:
# Product detail page used for the field-by-field extraction below (percent-encoded URL).
product_url = "https://lookple.com/product/%EB%A3%A9-%EC%96%B4%EB%B0%98-%EB%89%B4%EC%9A%95-%EC%9E%90%EC%88%98-%EC%98%A4%EB%B2%84-%EB%B0%98%ED%8C%94%ED%8B%B0/6061/category/59/display/1/"

In [31]:
# Download and parse the product detail page (this rebinds response/html/soup from the listing page).
response = requests.get(product_url, headers={"User-Agent": "Mozilla/5.0"})
html = response.text
soup = BeautifulSoup(html, "html.parser")
# Accumulates the product fields filled in by the following cells.
product_dict = {}

In [32]:
def table2df(table):
    """Turn an HTML table element into a DataFrame indexed by its first cell.

    Each ``tr`` found in ``table`` becomes one row whose values are the
    ``get_text()`` of its ``th``/``td`` cells, in document order. Column 0
    (the first cell of every row) is then promoted to the index, so values
    are looked up as ``df.loc[<label>, 1]``.

    Args:
        table: Element exposing ``find_all`` (e.g. a BeautifulSoup ``Tag``).

    Returns:
        pandas.DataFrame indexed by the text of each row's first cell.
    """
    cell_texts = [
        [cell.get_text() for cell in tr.find_all(["th", "td"])]
        for tr in table.find_all("tr")
    ]
    return pd.DataFrame(cell_texts).set_index(0)

In [33]:
# Parse the first <table> matched on the product page; rows are indexed by their first cell's text.
info_df = table2df(soup.select_one("table"))

#### product disabled

In [34]:
# The disabled flag is hard-coded to the string "FALSE" in this cell (no sold-out check here).
product_dict['disabled'] = "FALSE"

#### product price

In [35]:
# Prefer the "할인판매가" row when the table has one, otherwise fall back to "판매가".
# The regex removes parenthesised fragments, commas and the "원" suffix before int().
product_dict['price'] = int(
    re.sub(
        r'\([^)]*\)|,|원',
        '',
        info_df.loc["할인판매가" if "할인판매가" in info_df.index else "판매가", 1],
    ).strip()
)

#### product text

In [36]:
# Raw text of the "상품설명" row; the size-parsing cell below searches this string.
text = info_df.loc["상품설명", 1]

#### product size

In [37]:
def df2size_dict_lst(size_df, category_id, col2key=None):
    """Convert a size table into ``[size_dict, cat_size_dict]`` pairs, one per row.

    ``size_dict`` holds the size name (taken from the "사이즈" column) plus
    id fields pre-filled with the string ``"NULL"``. ``cat_size_dict`` maps
    each measurement key to a float, or to ``"NULL"`` when the measurement
    is unavailable for that row.

    Args:
        size_df: DataFrame with a "사이즈" column and one column per measurement.
        category_id: Selects the column-to-key mapping from ``constants``:
            1 -> LOOKPLE_OUTER_SIZE_COL2KEY, 2 -> LOOKPLE_TOP_SIZE_COL2KEY,
            3 -> LOOKPLE_DRESS_SIZE_COL2KEY, 4 -> LOOKPLE_BOTTOM_SIZE_COL2KEY.
            Ignored when ``col2key`` is given.
        col2key: Optional explicit ``{column name: output key}`` mapping that
            overrides the ``constants`` lookup.

    Returns:
        List of ``[size_dict, cat_size_dict]`` pairs. Empty when ``col2key``
        is not given and ``category_id`` is not one of 1-4.
    """
    if col2key is None:
        # Resolve the constant by name so only the mapping that is actually
        # needed has to exist on ``constants``.
        constant_name = {
            1: "LOOKPLE_OUTER_SIZE_COL2KEY",
            2: "LOOKPLE_TOP_SIZE_COL2KEY",
            3: "LOOKPLE_DRESS_SIZE_COL2KEY",
            4: "LOOKPLE_BOTTOM_SIZE_COL2KEY",
        }.get(category_id)
        if constant_name is None:
            return []
        col2key = getattr(constants, constant_name)

    size_dict_lst = []
    # iterrows() yields exactly one Series per row, even if index labels repeat.
    for _, row in size_df.iterrows():
        size_dict = {
            "name": row["사이즈"],
            "product_id": "NULL",
            "top_id": "NULL",
            "outer_id": "NULL",
            "bottom_id": "NULL",
            "dress_id": "NULL",
        }

        cat_size_dict = {}
        for col, key in col2key.items():
            # A measurement is missing either when the column is absent or
            # when this row has no value (NaN, because rows were built from
            # dicts with different keys). Both cases become "NULL".
            if col not in row.index or pd.isna(row[col]):
                cat_size_dict[key] = "NULL"
            else:
                cat_size_dict[key] = float(row[col])

        size_dict_lst.append([size_dict, cat_size_dict])
    return size_dict_lst

In [38]:
def size_(x):
    """Parse one size line of the form ``"<size name> : <label> <int> <label> <int> ..."``.

    The line is stripped and split on ``" : "``. The first part is the size
    name; every ``<word> <digits>`` pair found in the second part becomes an
    integer entry keyed by the word. The size name is stored last under the
    key "사이즈".

    Raises:
        IndexError: if the line does not contain the ``" : "`` separator.
    """
    parts = x.strip().split(" : ")
    label, measurements = parts[0], parts[1]

    # Turn the extracted (name, digits) pairs into a dictionary.
    parsed = {}
    for name, digits in re.findall(r'(\w+) (\d+)', measurements):
        parsed[name] = int(digits)
    parsed['사이즈'] = label
    return parsed

In [39]:
# Capture what follows "SIZE" up to (not including) the first "\n\r\n" sequence;
# re.DOTALL lets the lazy match span several lines.
# NOTE(review): re.search returns None when the description has no such section,
# in which case .group(1) raises AttributeError.
size_text = re.search(r'SIZE\s*(.*?)(?=\n\r\n)', text, re.DOTALL).group(1).strip()
# One size entry per line; each line is parsed by size_ into a dict.
size = size_text.split("\n")
size = [size_(s) for s in size]
# One row per size; measurement labels become columns.
size_df = pd.DataFrame(size)
size_df

Unnamed: 0,어깨,가슴,소매,암홀,총장,사이즈
0,56,60,27,27,74,FREE


In [59]:
# category_id 2 makes df2size_dict_lst use constants.LOOKPLE_TOP_SIZE_COL2KEY.
size_dict_lst = df2size_dict_lst(size_df, 2)

#### product name

In [43]:
# Product name comes from the "상품명" row, with surrounding whitespace removed.
product_dict['name'] = info_df.loc["상품명", 1].strip()

#### product gender

In [44]:
# Gender is hard-coded to "M" here; it is not read from the page.
product_dict['gender'] = "M"

#### product subcategory_id

In [45]:
# Sub-category id is hard-coded for this sample product.
# NOTE(review): what id 9 stands for is not visible in this notebook.
product_dict["sub_category_id"] = 9

#### product category_id

In [46]:
# Derive the parent category id from the sub-category via the constants.SUB2CAT lookup.
product_dict['category_id'] = constants.SUB2CAT[product_dict['sub_category_id']]

#### product url

In [47]:
# Store the product detail URL exactly as it was requested.
product_dict['url'] = product_url

#### product mall_id

In [48]:
# Mall id for this site, taken from the project's constants module.
product_dict['mall_id'] = constants.LOOKPLE_ID

#### img_url_lst

In [58]:
# Build image URLs from the 'ec-data-src' attribute of every <img> inside div#prdDetail,
# each prefixed with constants.LOOKPLE_ROOT_URL.
# NOTE(review): an <img> without that attribute raises KeyError, and a page without
# div#prdDetail makes select_one return None (AttributeError on .select).
img_url_lst = [constants.LOOKPLE_ROOT_URL + img_tag['ec-data-src'] for img_tag in soup.select_one("div#prdDetail").select("img")]

### 최종 결과 product_dict, size_dict_lst, img_url_lst, text

In [60]:
product_dict

{'disabled': 'FALSE',
 'price': 29900,
 'name': '룩 어반 뉴욕 자수 오버 반팔티',
 'gender': 'M',
 'sub_category_id': 9,
 'category_id': 2,
 'url': 'https://lookple.com/product/%EB%A3%A9-%EC%96%B4%EB%B0%98-%EB%89%B4%EC%9A%95-%EC%9E%90%EC%88%98-%EC%98%A4%EB%B2%84-%EB%B0%98%ED%8C%94%ED%8B%B0/6061/category/59/display/1/',
 'mall_id': 3}

In [62]:
size_dict_lst

[[{'name': 'FREE',
   'product_id': 'NULL',
   'top_id': 'NULL',
   'outer_id': 'NULL',
   'bottom_id': 'NULL',
   'dress_id': 'NULL'},
  {'full': 74.0, 'shoulder': 56.0, 'chest': 60.0, 'sleeve': 27.0}]]

In [63]:
img_url_lst

['https://lookple.com/web/upload/NNEditor/20230421/0.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/0.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-1.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-2.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-3.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-4.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-5.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-6.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-7.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-8.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-9.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-10.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy-1682086293-11.jpg',
 'https://lookple.com/web/upload/NNEditor/20230421/copy

In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pyrootutils

pyrootutils.setup_root(os.curdir, indicator=".project-root", pythonpath=True)
from extras import constants
from aws import rds, s3, s3_rds
from lookple.utils import utils, search


In [2]:
def crawling_product(subcategory_id, product_url):
    """Fetch one product page and extract its fields.

    Args:
        subcategory_id: Stored as ``sub_category_id`` and used to look up
            ``category_id`` in ``constants.SUB2CAT``.
        product_url: URL of the product detail page to download.

    Returns:
        Tuple ``(product_dict, size_dict_lst, img_url_lst, text)`` where
        ``text`` is the raw "상품설명" cell that the size list was parsed from.
    """
    page = requests.get(product_url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(page.text, "html.parser")

    info_df = utils.table2df(soup.select_one("table"))

    # The product is treated as available when a "sold" span carrying the
    # "displaynone" class is present in the button area.
    hidden_sold_tags = soup.select("div.btnArea > span.displaynone.sold")

    product_dict = {
        "disabled": "FALSE" if hidden_sold_tags else "TRUE",
        "price": search.get_price(info_df),
        "name": info_df.loc["상품명", 1].strip(),
        "gender": "M",
        "sub_category_id": subcategory_id,
        "category_id": constants.SUB2CAT[subcategory_id],
        "url": product_url,
        "mall_id": constants.LOOKPLE_ID,
    }

    text = info_df.loc["상품설명", 1]
    img_url_lst = search.get_img_url_lst(soup)
    size_dict_lst = search.get_size_dict_lst(text, product_dict["category_id"])

    return product_dict, size_dict_lst, img_url_lst, text

In [3]:
# Smoke-test crawling_product on a single product page, passing 8 as the sub-category id.
product_dict, size_dict_lst, img_url_lst, text = crawling_product(8, "https://lookple.com/product/%EB%A3%A9-%EC%84%B8%EC%9D%B8%ED%8A%B8-%ED%95%98%EC%B0%8C-%EB%8B%A8%EA%B0%80%EB%9D%BC-%EB%B0%98%ED%8F%B4%EB%9D%BC%EB%8B%88%ED%8A%B8/5911/category/128/display/1/")

  사이즈
0   M
1   L


In [6]:
size_dict_lst[0][1]['full']

'NULL'

In [6]:
import pandas as pd
def get_product_df(cursor, mall_id=1):
    """Load every row of the PRODUCT table into a DataFrame.

    Args:
        cursor: DB cursor exposing ``execute`` and ``fetchall``.
        mall_id: NOTE(review): accepted but never used. The query has no
            WHERE clause, so rows are returned regardless of this value.
            Confirm whether filtering by mall was intended.

    Returns:
        pandas.DataFrame built from ``cursor.fetchall()``.
    """
    select_all_sql = """
        SELECT * FROM PRODUCT;
    """
    cursor.execute(select_all_sql)
    return pd.DataFrame(cursor.fetchall())

In [7]:
# Open a database connection through the project's rds helper and load the PRODUCT table.
# NOTE(review): get_product_df never reads mall_id, so mall_id=3 does not filter the result.
conn, cursor = rds.connect()
products_df = get_product_df(cursor, mall_id=3)

In [8]:
products_df.shape

(1176, 11)

In [13]:
# Peek at a single field of the first row only (the loop breaks after one iteration).
# NOTE(review): `url` is a bare name here, not the string "url", and it is not defined in
# any visible cell. On a fresh kernel this raises NameError; confirm the intended column.
for row in products_df.iterrows():
    print(row[1][url])
    break

룩 유엔 오버핏 텐타 덤블워싱 뉴욕 쭈리 맨투맨


In [41]:
def crawling_page(subcategory_id, page_url):
    """Crawl every product linked from one category listing page.

    Args:
        subcategory_id: Passed through to ``crawling_product`` for each product.
        page_url: URL of the listing page to scan.

    Returns:
        ``False`` when ``search.get_product_li`` yields no products for the
        page, ``True`` once all listed products have been attempted.

    ``IndexError`` and ``ValueError`` raised while handling a product are
    reported and skipped. Any other exception (including network errors from
    ``requests``) propagates to the caller.
    """
    product_li_lst = search.get_product_li(page_url)

    if not product_li_lst:
        return False

    print(f"crawling page {page_url}")
    for product_li in tqdm(product_li_lst, total=len(product_li_lst)):
        # Reset for every item: if search.get_url itself fails, the handler
        # must not hit an unbound local (first item) or report the previous
        # item's URL (later items).
        product_url = None
        try:
            product_url, _ = search.get_url(product_li)
            crawling_product(subcategory_id, product_url)
        except (IndexError, ValueError) as e:
            print(f"error: {e} | product_url: {product_url}")
    return True

In [42]:
# Crawl page 6 of category 59, passing 2 as the sub-category id for every product on it.
page_url = "https://lookple.com/category/%EB%B0%98%ED%8C%94%ED%8B%B0%EB%AF%BC%EC%86%8C%EB%A7%A4%ED%8B%B0/59/?page=6"
crawling_page(2, page_url)

crawling page https://lookple.com/category/%EB%B0%98%ED%8C%94%ED%8B%B0%EB%AF%BC%EC%86%8C%EB%A7%A4%ED%8B%B0/59/?page=6


 90%|█████████ | 27/30 [00:13<00:01,  1.87it/s]

룩 펠리트 피그먼트 박스 반팔티 is sold out


 93%|█████████▎| 28/30 [00:14<00:01,  1.92it/s]

룩 트윈사 카피 박스 트레이닝 is sold out


 97%|█████████▋| 29/30 [00:14<00:00,  2.02it/s]

룩 빌리 고텐션 카라티 is sold out


100%|██████████| 30/30 [00:15<00:00,  1.98it/s]

룩 코마 모달40% 베이직 반팔티 is sold out





True

In [34]:
# product_url = "https://lookple.com/product/%EB%A3%A9-%EC%9C%A0%EC%97%94-%EC%98%A4%EB%B2%84%ED%95%8F-%ED%85%90%ED%83%80-%EB%8D%A4%EB%B8%94%EC%9B%8C%EC%8B%B1-%EB%89%B4%EC%9A%95-%EC%AD%88%EB%A6%AC-%EB%A7%A8%ED%88%AC%EB%A7%A8/6269/category/59/display/1/"
# Re-run crawling_product on a single product; only product_dict is kept, the other three results are discarded.
product_url = "https://lookple.com/product/%EB%A3%A9-%EB%93%9C%EB%9E%8D-%EC%BB%A4%EB%B2%84-%EC%98%A4%EB%B2%84%EB%B0%98%ED%8C%94%ED%8B%B0/2391/category/59/display/1/"
product_dict, _, _, _ = crawling_product(2, product_url)

[]


In [35]:
product_dict

{'disabled': 'TRUE',
 'price': 17000,
 'name': '룩 드랍 커버 오버반팔티',
 'gender': 'M',
 'sub_category_id': 2,
 'category_id': 1,
 'url': 'https://lookple.com/product/%EB%A3%A9-%EB%93%9C%EB%9E%8D-%EC%BB%A4%EB%B2%84-%EC%98%A4%EB%B2%84%EB%B0%98%ED%8C%94%ED%8B%B0/2391/category/59/display/1/',
 'mall_id': 3}