In [1]:
import os
import re
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import boto3
import pyrootutils

pyrootutils.setup_root(os.curdir, indicator=".project-root", pythonpath=True)
from extras import constants, paths
from aws import rds, s3
from dailyjou.utils import search, utils

## 티셔츠 크롤링

In [2]:
# First page of the "tee" category listing (category id 140 in the URL path).
# NOTE(review): this URL uses http:// while the product URLs below use https:// — confirm which is intended.
page_url = "http://merryaround.co.kr/category/tee/140/?page=1"

In [3]:
# Fetch the category listing page and parse it with the built-in HTML parser.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(page_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

In [4]:
# Collect every element whose id starts with "anchorBoxId_" directly under the
# ul.prdList.grid3 list of the normal product listing, then show how many were found.
products_li = soup.select('div.xans-element-.xans-product.xans-product-listnormal.ec-base-product > ul.prdList.grid3 > [id^="anchorBoxId_"]')
len(products_li)

30

In [5]:
# Use the first listed product as the sample for the selector experiments below.
product_li = products_li[0]

In [6]:
# Build the absolute product page URL from the href of the thumbnail link.
# NOTE(review): the site root is hard-coded here, while the image-URL cell further down
# uses constants.MERRYAROUND_ROOT_URL — confirm whether both should share the constant.
product_url = "https://merryaround.co.kr" + product_li.select_one("div.box > div.thumbnail > div.prdImg > a")["href"]
product_url

'https://merryaround.co.kr/product/mlabel-슬림-골지-스퀘어-cd/25271/category/140/display/1/'

In [7]:
# Build the thumbnail URL by prepending only the scheme to the <img> src
# (the displayed result shows the src already carries the host).
thumbnail_img_url = "https:" + product_li.select_one("div.box > div.thumbnail > div.prdImg > a > img")["src"]
thumbnail_img_url

'https://merryaround.co.kr/web/product/medium/202310/4309549c297689b45fb63930dcf907df.gif'

### 상품 상세 정보

In [32]:
# Overwrite product_url with a fixed, percent-encoded product page; the detail-scraping
# cells below fetch this URL rather than the one taken from the listing.
product_url = "https://merryaround.co.kr/product/%EB%A7%88%EC%9D%BC-%EB%B3%B4%ED%8A%B8%EB%84%A5-tee/22303/category/140/display/1/"

In [33]:
# Fetch the product detail page, parse it, and start an empty record for its fields.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(product_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")
product_dict = {}

#### product disabled

In [34]:
# The flag is stored as the string "FALSE", not as a Python bool.
product_dict['disabled'] = "FALSE"

#### product price

In [35]:
# Locate the price row in the product detail table.
# Rows are matched on their `rel` attribute: the discounted price ("할인판매가") is
# preferred, falling back to the regular price ("판매가") when no discounted row exists.
detail_rows = soup.select_one("div.infowrap").select('div.xans-element-.xans-product.xans-product-detaildesign > table > tbody > tr')
# Tag.get() yields None for rows without a `rel` attribute, whereas row['rel']
# would raise KeyError on the first such row and abort the whole lookup.
price_lst = [row for row in detail_rows if row.get('rel') == "할인판매가"]
if not price_lst:
    price_lst = [row for row in detail_rows if row.get('rel') == "판매가"]

In [36]:
# Concatenate every digit run found in the first matching row's <td> text
# (dropping all non-digit characters) and store the result as an int.
price = int(''.join(re.findall(r'\d+', price_lst[0].select_one("td").text)))
product_dict['price'] = price

#### product text

In [39]:
# Inspect the <p> tags inside ul.sect.deco2 (the recorded output for this page is an empty list).
soup.select_one("ul.sect.deco2").select("p")

[]

In [41]:
# Take the text nodes of the first <p> under ul.sect.deco2 that contains any text.
# `text` stays None when the container is missing or no paragraph has text.
# (Previously a missing container raised AttributeError, and a trailing empty
# paragraph left `text` as [] instead of None.)
text = None
text_container = soup.select_one("ul.sect.deco2")
for p in (text_container.select("p") if text_container is not None else []):
    strings = p.find_all(string=True)
    if strings:
        text = strings
        break

# `text` is a list of strings (or None); join it with '\n' downstream if a single string is needed.
print(text)

None


#### product size

In [14]:
# Parse the size section: every text node after the literal "Size" marker, up to the
# first blank one, is either "<name>/<measurements>" or just "<measurements>"
# (which is stored under the name 'FREE').
string_lst = soup.select_one("ul.sect.deco3 > div > div").find_all(string=True)
size_text_dict = {}
in_size_section = False
for string in string_lst:
    string = string.strip()
    if in_size_section:
        if string == '':
            break
        # Split on the first '/' only, so an extra '/' inside the line cannot
        # break the unpacking with a ValueError.
        size_name, separator, size_value_string = string.partition('/')
        if not separator:
            size_name, size_value_string = 'FREE', string
        # Strip the name too, so "S / ..." is keyed as "S" rather than "S ".
        size_text_dict[size_name.strip()] = size_value_string.strip()
    elif string == 'Size':
        in_size_section = True

size_text_dict

{'FREE': '어깨31.5 가슴37.5 암홀20 소매단면13 소매길이60.5 총장53.5 밑단35.5'}

In [16]:
def df2size_dict_lst(size_text_dict, category_id):
    """Convert parsed size text into ``[size_dict, cat_size_dict]`` pairs.

    Args:
        size_text_dict: Mapping of size name (e.g. ``'FREE'``) to a measurement
            string made of alternating labels and numbers, such as
            ``'어깨31.5 가슴37.5 총장53.5'``.
        category_id: Selects which column-to-key mapping from ``constants`` is
            applied: 2 (top), 4 (bottom), 3 (dress) or 1 (outer).

    Returns:
        A list with one ``[size_dict, cat_size_dict]`` pair per size name.
        ``size_dict`` holds the size name plus ``"NULL"`` placeholders for the
        id columns; ``cat_size_dict`` maps every key of the selected mapping to
        the parsed float, or to ``"NULL"`` when that measurement is absent.
        An unrecognised ``category_id`` yields an empty list.
    """
    # NOTE(review): only the top mapping is MERRYAROUND-specific; the LOOKPLE_*
    # mappings look carried over from another mall — confirm they fit this site.
    if category_id == 2:
        col2key = constants.MERRYAROUND_TOP_SIZE_COL2KEY
    elif category_id == 4:
        col2key = constants.LOOKPLE_BOTTOM_SIZE_COL2KEY
    elif category_id == 3:
        col2key = constants.LOOKPLE_DRESS_SIZE_COL2KEY
    elif category_id == 1:
        col2key = constants.LOOKPLE_OUTER_SIZE_COL2KEY
    else:
        return []

    size_dict_lst = []
    for name, size_text in size_text_dict.items():
        # Each match is (label, number); labels keep leading whitespace, hence the strip.
        matches = re.findall(r'(\D+)(\d+\.\d+|\d+)', size_text)
        measurements = {label.strip(): float(value) for label, value in matches}

        size_dict = {
            "name": name,
            "product_id": "NULL",
            "top_id": "NULL",
            "outer_id": "NULL",
            "bottom_id": "NULL",
            "dress_id": "NULL",
        }

        # Every category reads from the parsed measurements; the non-top branches
        # used to reference an undefined `row` and raised NameError.
        cat_size_dict = {
            key: measurements.get(col, "NULL") for col, key in col2key.items()
        }
        size_dict_lst.append([size_dict, cat_size_dict])
    return size_dict_lst

In [17]:
# Category id 2 selects constants.MERRYAROUND_TOP_SIZE_COL2KEY inside df2size_dict_lst.
df2size_dict_lst(size_text_dict, 2)

[[{'name': 'FREE',
   'product_id': 'NULL',
   'top_id': 'NULL',
   'outer_id': 'NULL',
   'bottom_id': 'NULL',
   'dress_id': 'NULL'},
  {'full': 53.5, 'shoulder': 31.5, 'chest': 37.5, 'sleeve': 60.5}]]

#### product name

In [18]:
# The product name is the text of the <h2> inside div.headingArea.
product_dict['name'] = soup.select_one("div.headingArea > h2").text

#### product gender

In [19]:
# Gender is hard-coded to "F" rather than scraped from the page.
product_dict['gender'] = "F"

#### product subcategory_id

In [20]:
# Sub-category id is hard-coded.
# NOTE(review): presumably 9 is the tee sub-category — confirm against the constants module.
product_dict['sub_category_id'] = 9

#### product category_id

In [21]:
# Derive the parent category id from the sub-category id via the constants.SUB2CAT lookup.
product_dict['category_id'] = constants.SUB2CAT[product_dict['sub_category_id']]

#### product url

In [22]:
# Record the product page URL that was fetched for this record.
product_dict['url'] = product_url

#### product mall_id

In [23]:
# The mall id comes from constants.MERRYAROUND_ID.
product_dict['mall_id'] = constants.MERRYAROUND_ID

### img_url_lst

In [24]:
# Build one URL per <img> inside div#prdDetail by prefixing constants.MERRYAROUND_ROOT_URL
# to the src, with spaces in the src replaced by %20.
# NOTE(review): assumes every <img> has a site-relative src — an absolute src would get a doubled root; confirm.
img_url_lst = [constants.MERRYAROUND_ROOT_URL + img_tag['src'].replace(' ', '%20') for img_tag in soup.select_one("div#prdDetail").select("img")]

In [25]:
# Display the full list of collected detail-image URLs.
img_url_lst

['https://merryaround.co.kr/web/2022bong2/20230918_dufl_up2.jpg',
 'https://merryaround.co.kr/web/2022bong2/231012_tmffla_st.jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(5).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(6).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(7).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(1).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(2).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(3).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_3_01_%20(4).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_4_01_%20(5).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_4_01_%20(6).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_4_01_%20(7).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_4_01_%20(1).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_1_01_%20(4).jpg',
 'https://merryaround.co.kr/web/2022jiwon/J231012_1_0