# 오뚜기 사이트 접근

In [1]:
import os
import re
import time
import random
import requests
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
import pymysql

In [2]:
# Listing page (formatted with a category id) and the AJAX endpoint that
# returns the product list fragment.
url_base = 'https://www.otokimall.com/front/product/category/{category}?ptype=0&psort=pop&pageSize=80&listBtn=0'
ajax_url = 'https://www.otokimall.com/front/product/product_list.ajax'

# Category ids to crawl, kept as strings: '1' through '12'.
index = [str(category_no) for category_no in range(1, 13)]

# Single requests session reused by every HTTP call in this notebook.
session = requests.Session()

# Headers sent with the AJAX POST and the product-page GETs; init_session()
# adds a 'referer' entry at runtime.
HEADERS = {
    'accept': '*/*',
    'accept-language': 'ko,en;q=0.9',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'x-requested-with': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0',
}

In [3]:
def init_session(category):
    """Load a category listing page and record it as the referer.

    The page is fetched through the shared ``session`` and its URL is stored
    in ``HEADERS['referer']`` so later requests that pass ``HEADERS`` send it.

    Args:
        category: Category id substituted into ``url_base``.

    Raises:
        requests.HTTPError: If the listing page answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = url_base.format(category=category)
    # Use the same 10 s bound as the other requests in this file; without a
    # timeout a stalled connection would block the crawl indefinitely.
    response = session.get(url, timeout=10)
    response.raise_for_status()
    HEADERS['referer'] = url

In [4]:
def fetch_raw_html(category, page=1):
    """Return the product-list HTML fragment for one page of a category.

    Calls ``init_session`` first (which refreshes the referer header), then
    POSTs the search form to ``ajax_url``.

    Args:
        category: Category id, sent as the ``scate`` form field.
        page: Page number to request (defaults to 1).

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If either request answers with an error status.
    """
    init_session(category)
    # Field order is kept as in the site's form payload.
    form_fields = dict(
        ptype=0,
        psort='pop',
        pageSize=80,
        listBtn=0,
        type='search',
        scate=category,
        sbrand=0,
        keyword='',
        keywordRe='',
        storeNo=0,
        depthCheck=1,
        page=page,
    )
    resp = session.post(ajax_url, data=form_fields, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return resp.text

In [5]:
def extract_product_info(ajax_html):
    """Extract product links and names from a product-list HTML fragment.

    Every ``<p class="name">`` element yields one entry. The link is the
    ``href`` of the enclosing ``<a>``; when there is no enclosing anchor or it
    has no ``href``, ``None`` is stored so both lists stay aligned.

    Args:
        ajax_html: HTML text returned by the product-list endpoint.

    Returns:
        A ``(url_list, product_name_list)`` tuple of equal-length lists.
    """
    soup = BeautifulSoup(ajax_html, 'html.parser')

    product_name_list = []
    url_list = []
    for name_tag in soup.find_all('p', 'name'):
        anchor = name_tag.find_parent('a')
        link = anchor['href'] if anchor and anchor.has_attr('href') else None

        product_name_list.append(name_tag.get_text(strip=True))
        url_list.append(link)

    return url_list, product_name_list

In [6]:
def clean_product_name(raw_name: str) -> str:
    """Normalize a scraped product name.

    Removes, in order: ``[...]`` segments, ``(...)`` segments, and a quantity
    suffix of digits followed by "개" at the very end of the string. Runs of
    whitespace are then collapsed to one space and the result is trimmed.

    Args:
        raw_name: Product name as scraped from the listing.

    Returns:
        The cleaned product name.
    """
    removal_patterns = (r'\[.*?\]', r'\(.*?\)', r'\d+개$')
    cleaned = raw_name
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()

In [7]:
def get_last_page(ajax_html):
    """Return the highest page number linked from a product-list fragment.

    Page numbers are read from the ``page`` attribute of ``<a>`` elements.

    Args:
        ajax_html: HTML text returned by the product-list endpoint.

    Returns:
        The largest ``page`` value found, or 1 when no such link exists.
    """
    soup = BeautifulSoup(ajax_html, 'html.parser')
    page_links = soup.find_all('a', attrs={'page': True})
    return max((int(link['page']) for link in page_links), default=1)

In [8]:
# Crawl every category page by page, collecting product URLs and cleaned
# names into two parallel lists.
all_urls = []
all_names = []
for idx in index:
    # Page 1 is needed anyway to learn how many pages the category has, so
    # keep its HTML and parse it instead of requesting the same page twice.
    html = fetch_raw_html(idx, page=1)
    last_page = get_last_page(html)

    for page in range(1, last_page + 1):
        if page > 1:
            html = fetch_raw_html(idx, page=page)
        urls, names = extract_product_info(html)

        all_urls.extend(urls)
        all_names.extend(clean_product_name(name) for name in names)


print(len(all_urls))
print(len(all_names))


1193
1193


In [9]:
# One company code (10001) per scraped product, aligned with all_urls.
company_name = [10001] * len(all_urls)
company_name

[10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,
 10001,


In [10]:
import pandas as pd

# Combine the three parallel lists into one table: company code, cleaned
# product name and product URL.
data = dict(company=company_name, product_name=all_names, url=all_urls)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,company,product_name,url
0,10001,진라면 매운맛,https://www.otokimall.com/front/product/503
1,10001,참깨라면 용기 110G,https://www.otokimall.com/front/product/543
2,10001,열라면,https://www.otokimall.com/front/product/501
3,10001,진라면 순한맛,https://www.otokimall.com/front/product/502
4,10001,진라면 매운맛 용기 110G,https://www.otokimall.com/front/product/546


In [14]:
# Export the product table as comma-separated text with a UTF-8 BOM.
df.to_csv('오뚜기_제품_정보.csv', encoding='UTF-8-sig', sep=',')

In [11]:
# Allergen id -> Korean allergen name, numbered consecutively from 1.
# NOTE(review): ids presumably match allergy_id in the food_allergy table;
# confirm against the schema.
allergy_dict = dict(enumerate(
    (
        "난류", "가금류", "계란", "소고기", "돼지고기", "닭고기",
        "새우", "게", "오징어", "고등어", "조개류", "굴",
        "전복", "홍합", "우유", "땅콩", "호두", "잣",
        "대두", "복숭아", "토마토", "밀", "메밀", "이황산류",
    ),
    start=1,
))

In [12]:
def ocr_with_size_check(img_path, lang='kor'):
    """Run Tesseract OCR on an image file, cropping oversized images first.

    When either side exceeds ``MAX_SIDE`` pixels, only the bottom-left region
    of at most ``MAX_SIDE`` x ``MAX_SIDE`` pixels is recognized. Images within
    the limit are passed through unchanged.

    Args:
        img_path: Path of the image file to read.
        lang: Tesseract language code (defaults to Korean).

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # NOTE(review): presumably the largest side length the OCR engine
    # accepts; confirm against the Tesseract/Leptonica limits.
    MAX_SIDE = 32767

    # The context manager closes the underlying file deterministically, so
    # the caller can delete the image right after this function returns.
    with Image.open(img_path) as img:
        w, h = img.size
        target = img

        if w > MAX_SIDE or h > MAX_SIDE:
            # Clamp both sides: previously only the height was reduced, so an
            # over-wide image still exceeded the limit after cropping.
            crop_width = min(w, MAX_SIDE)
            crop_height = min(h, MAX_SIDE)
            target = img.crop((0, h - crop_height, crop_width, h))

        return pytesseract.image_to_string(target, lang=lang)

In [13]:
def _find_last_detail_image(soup):
    """Return the ``src`` of the highest-numbered detail-page image, or None.

    An ``<img>`` qualifies when its ``src`` contains ``detail-page``, an
    optional number and a jpg/png/gif extension; an image without a number
    counts as page 1.
    """
    candidates = []
    for img in soup.find_all('img', src=True):
        m = re.search(r'detail-page(?:-?(\d+))?\.(jpg|png|gif)', img['src'])
        if m:
            page_no = int(m.group(1)) if m.group(1) else 1
            candidates.append((page_no, img['src']))

    if not candidates:
        return None
    return max(candidates, key=lambda item: item[0])[1]


def all_code(idx_start, idx_end):
    """OCR product detail images and store detected allergens in MySQL.

    For each product in ``all_urls[idx_start:idx_end]`` the function loads the
    product page, downloads the last ``detail-page`` image, runs OCR on it and
    inserts one ``food_allergy`` row per allergen name found in the text.
    Products that fail to load, have no detail image, show no allergen or are
    missing from the ``Food`` table are reported and skipped.

    Inserts are committed per product, so an error later in the batch does
    not discard rows already written; rows that already exist are reported
    and skipped, which makes re-running a range safe.

    Args:
        idx_start: First index into ``all_urls`` / ``all_names`` (inclusive).
        idx_end: End index (exclusive); clamped to ``len(all_urls)``.

    Raises:
        pymysql.MySQLError: On database errors other than duplicate rows.
        Exception: OCR or file errors propagate after the temporary image and
            the database connection have been cleaned up.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    # NOTE(review): credentials are hard-coded; consider loading them from the
    # environment or a config file before sharing this notebook.
    db_host = 'localhost'
    db_user = 'root'
    db_password = '1234'
    db_port = 3306

    get_food_id_sql = "SELECT food_id FROM Food WHERE food_name = %s;"
    insert_fa_sql   = "INSERT INTO food_allergy (food_id, allergy_id) VALUES (%s, %s);"

    tmp_dir = 'tmp_detail_images'
    os.makedirs(tmp_dir, exist_ok=True)

    db = pymysql.connect(host=db_host, user=db_user, password=db_password, port=db_port, database='Food_Allergy_DB')
    cursor = db.cursor()

    try:
        for idx in range(idx_start, min(idx_end, len(all_urls))):
            product_url  = all_urls[idx]
            product_name = all_names[idx]

            # extract_product_info() stores None when a product has no link.
            if not product_url:
                print(f"[{idx}] {product_name}: 요청 실패 → URL 없음")
                continue

            safe_name = re.sub(r'[\\/:"*?<>|]+', '', product_name).replace(' ', '_')

            # Set the referer, then GET the product page. Both requests are
            # guarded so one failing product does not abort the batch.
            try:
                m_cat = re.search(r'/category/(\d+)', product_url)
                init_session(m_cat.group(1) if m_cat else index[0])
                resp = session.get(product_url, headers=HEADERS, timeout=10)
                resp.raise_for_status()
            except requests.RequestException as e:
                print(f"[{idx}] {product_name}: 요청 실패 → {e}")
                time.sleep(random.uniform(1.0, 2.5))
                continue

            soup = BeautifulSoup(resp.text, 'html.parser')

            img_src = _find_last_detail_image(soup)
            if img_src is None:
                print(f"[{idx}] {product_name}: detail-page 이미지 없음")
                continue

            # Resolve relative or protocol-relative sources against the page.
            img_url = urljoin(resp.url, img_src)
            ext = os.path.splitext(img_url)[1].split('?')[0] or '.jpg'
            img_path = os.path.join(tmp_dir, f"{safe_name}{ext}")

            # Download the last detail image; an error page must not be
            # written to disk and handed to the OCR step.
            try:
                img_resp = session.get(img_url, headers=HEADERS, timeout=10)
                img_resp.raise_for_status()
            except requests.RequestException as e:
                print(f"[{idx}] {product_name}: 이미지 다운로드 실패 → {e}")
                time.sleep(random.uniform(1.0, 2.5))
                continue

            # OCR (oversized images are cropped by ocr_with_size_check).
            # The temporary file is removed even when OCR raises.
            try:
                with open(img_path, 'wb') as f:
                    f.write(img_resp.content)
                text = ocr_with_size_check(img_path)
            finally:
                if os.path.exists(img_path):
                    os.remove(img_path)

            # Allergen detection via the module-level allergy_dict.
            # NOTE(review): this is plain substring matching, so a
            # one-syllable name such as "게" or "밀" also matches inside
            # unrelated words; results may contain false positives.
            found_allergens = [
                aid for aid, name in allergy_dict.items()
                if name in text
            ]
            if not found_allergens:
                print(f"[{idx}] {product_name}: 알러지 없음")
                time.sleep(random.uniform(1.0, 2.5))
                continue

            # Look up food_id by the cleaned product name.
            cursor.execute(get_food_id_sql, (product_name,))
            row = cursor.fetchone()
            if not row:
                print(f"[{idx}] {product_name}: Food 테이블에 없음")
                time.sleep(random.uniform(1.0, 2.5))
                continue
            food_id = row[0]

            # Insert one food_allergy row per detected allergen.
            for aid in found_allergens:
                try:
                    cursor.execute(insert_fa_sql, (food_id, aid))
                    print(f"  → inserted (food_id={food_id}, allergy_id={aid})")
                except pymysql.err.IntegrityError:
                    print(f"  ☑ 이미 존재 (food_id={food_id}, allergy_id={aid})")

            # Persist this product's rows right away so a later failure does
            # not discard them.
            db.commit()

            # Random delay before the next request.
            delay = random.uniform(1.0, 10.0)
            print(f"[{idx}] {product_name}: 다음 요청까지 {delay:.2f}초 대기")
            time.sleep(delay)
    finally:
        # Always release the cursor and connection, even on errors.
        cursor.close()
        db.close()

    print("완료: food_allergy 테이블 업데이트")

In [14]:
# Batch 1: products with list index 0 up to (not including) 200.
all_code(idx_start=0, idx_end=200)

  → inserted (food_id=100001, allergy_id=3)
  → inserted (food_id=100001, allergy_id=5)
  → inserted (food_id=100001, allergy_id=6)
  → inserted (food_id=100001, allergy_id=7)
  → inserted (food_id=100001, allergy_id=8)
  → inserted (food_id=100001, allergy_id=9)
  → inserted (food_id=100001, allergy_id=10)
  → inserted (food_id=100001, allergy_id=11)
  → inserted (food_id=100001, allergy_id=12)
  → inserted (food_id=100001, allergy_id=14)
  → inserted (food_id=100001, allergy_id=15)
  → inserted (food_id=100001, allergy_id=16)
  → inserted (food_id=100001, allergy_id=19)
  → inserted (food_id=100001, allergy_id=21)
  → inserted (food_id=100001, allergy_id=22)
[0] 진라면 매운맛: 다음 요청까지 3.29초 대기
  → inserted (food_id=100002, allergy_id=3)
  → inserted (food_id=100002, allergy_id=5)
  → inserted (food_id=100002, allergy_id=6)
  → inserted (food_id=100002, allergy_id=7)
  → inserted (food_id=100002, allergy_id=8)
  → inserted (food_id=100002, allergy_id=9)
  → inserted (food_id=100002, allergy

In [15]:
# Batch 2: products with list index 200 up to (not including) 400.
all_code(idx_start=200, idx_end=400)

[200] 컵밥 진짬뽕밥 217.5G: 알러지 없음
  → inserted (food_id=100203, allergy_id=8)
[201] 옛날 구수한누룽지 30G: 다음 요청까지 3.48초 대기
  → inserted (food_id=100204, allergy_id=3)
  → inserted (food_id=100204, allergy_id=5)
  → inserted (food_id=100204, allergy_id=6)
  → inserted (food_id=100204, allergy_id=7)
  → inserted (food_id=100204, allergy_id=8)
  → inserted (food_id=100204, allergy_id=9)
  → inserted (food_id=100204, allergy_id=11)
  → inserted (food_id=100204, allergy_id=12)
  → inserted (food_id=100204, allergy_id=13)
  → inserted (food_id=100204, allergy_id=15)
  → inserted (food_id=100204, allergy_id=19)
  → inserted (food_id=100204, allergy_id=21)
  → inserted (food_id=100204, allergy_id=22)
[202] 컵밥 햄버그덮밥 315G: 다음 요청까지 1.88초 대기
[203] 컵밥 황태콩나물해장국밥 301.5G: 알러지 없음
  → inserted (food_id=100206, allergy_id=3)
  → inserted (food_id=100206, allergy_id=5)
  → inserted (food_id=100206, allergy_id=6)
  → inserted (food_id=100206, allergy_id=7)
  → inserted (food_id=100206, allergy_id=8)
  → inserted (food

In [16]:
# Batch 3: products with list index 400 up to (not including) 600.
all_code(idx_start=400, idx_end=600)

  → inserted (food_id=100402, allergy_id=3)
  → inserted (food_id=100402, allergy_id=5)
  → inserted (food_id=100402, allergy_id=7)
  → inserted (food_id=100402, allergy_id=9)
  → inserted (food_id=100402, allergy_id=11)
  → inserted (food_id=100402, allergy_id=15)
  → inserted (food_id=100402, allergy_id=19)
  → inserted (food_id=100402, allergy_id=21)
  → inserted (food_id=100402, allergy_id=22)
[400] 하이라이스소스 100G: 다음 요청까지 5.21초 대기
  → inserted (food_id=100403, allergy_id=3)
  → inserted (food_id=100403, allergy_id=5)
  → inserted (food_id=100403, allergy_id=6)
  → inserted (food_id=100403, allergy_id=7)
  → inserted (food_id=100403, allergy_id=9)
  → inserted (food_id=100403, allergy_id=11)
  → inserted (food_id=100403, allergy_id=12)
  → inserted (food_id=100403, allergy_id=13)
  → inserted (food_id=100403, allergy_id=14)
  → inserted (food_id=100403, allergy_id=15)
  → inserted (food_id=100403, allergy_id=19)
  → inserted (food_id=100403, allergy_id=21)
[401] 헬로베지 채소가득카레 200G: 다음 

In [17]:
# Batch 4: products with list index 600 up to (not including) 800.
all_code(idx_start=600, idx_end=800)

  → inserted (food_id=100602, allergy_id=22)
[600] 스테이크소스 470G: 다음 요청까지 2.58초 대기
[601] 우스타소스 415G: 알러지 없음
  → inserted (food_id=100604, allergy_id=15)
  → inserted (food_id=100604, allergy_id=22)
[602] 닭볶음탕양념 470G: 다음 요청까지 1.84초 대기
  → inserted (food_id=100605, allergy_id=3)
  → inserted (food_id=100605, allergy_id=6)
  → inserted (food_id=100605, allergy_id=8)
  → inserted (food_id=100605, allergy_id=11)
  → inserted (food_id=100605, allergy_id=12)
  → inserted (food_id=100605, allergy_id=15)
  → inserted (food_id=100605, allergy_id=19)
  → inserted (food_id=100605, allergy_id=22)
[603] 페퍼팝 마라맛 20G: 다음 요청까지 5.10초 대기
  → inserted (food_id=100606, allergy_id=22)
[604] 1000아일랜드드레싱 250G: 다음 요청까지 1.94초 대기
[605] 소갈비양념 480G: detail-page 이미지 없음
  → inserted (food_id=100608, allergy_id=3)
  → inserted (food_id=100608, allergy_id=6)
  → inserted (food_id=100608, allergy_id=8)
  → inserted (food_id=100608, allergy_id=15)
  → inserted (food_id=100608, allergy_id=19)
  → inserted (food_id=100608, 

In [18]:
# Batch 5: products with list index 800 up to (not including) 1000.
all_code(idx_start=800, idx_end=1000)

  ☑ 이미 존재 (food_id=100780, allergy_id=3)
  ☑ 이미 존재 (food_id=100780, allergy_id=5)
  ☑ 이미 존재 (food_id=100780, allergy_id=6)
  ☑ 이미 존재 (food_id=100780, allergy_id=7)
  ☑ 이미 존재 (food_id=100780, allergy_id=8)
  ☑ 이미 존재 (food_id=100780, allergy_id=9)
  ☑ 이미 존재 (food_id=100780, allergy_id=10)
  ☑ 이미 존재 (food_id=100780, allergy_id=11)
  ☑ 이미 존재 (food_id=100780, allergy_id=12)
  ☑ 이미 존재 (food_id=100780, allergy_id=13)
  ☑ 이미 존재 (food_id=100780, allergy_id=14)
  ☑ 이미 존재 (food_id=100780, allergy_id=15)
  ☑ 이미 존재 (food_id=100780, allergy_id=16)
  ☑ 이미 존재 (food_id=100780, allergy_id=17)
  ☑ 이미 존재 (food_id=100780, allergy_id=19)
  ☑ 이미 존재 (food_id=100780, allergy_id=21)
  ☑ 이미 존재 (food_id=100780, allergy_id=22)
  ☑ 이미 존재 (food_id=100780, allergy_id=23)
[800] 제주담음 제주청귤담은 고등어구이 222G: 다음 요청까지 3.90초 대기
  ☑ 이미 존재 (food_id=100729, allergy_id=5)
  ☑ 이미 존재 (food_id=100729, allergy_id=6)
  ☑ 이미 존재 (food_id=100729, allergy_id=7)
  ☑ 이미 존재 (food_id=100729, allergy_id=8)
  ☑ 이미 존재 (food_id=100729, allergy_id=9

In [19]:
# Final batch: every remaining product from list index 1000 onward.
all_code(idx_start=1000, idx_end=len(all_urls))

[1000] 요즘미역/가벼운참치 라이트스탠다드/참기름: 알러지 없음
  → inserted (food_id=101003, allergy_id=6)
  → inserted (food_id=101003, allergy_id=8)
  → inserted (food_id=101003, allergy_id=10)
  → inserted (food_id=101003, allergy_id=11)
  → inserted (food_id=101003, allergy_id=12)
  → inserted (food_id=101003, allergy_id=15)
  → inserted (food_id=101003, allergy_id=19)
  → inserted (food_id=101003, allergy_id=20)
  → inserted (food_id=101003, allergy_id=22)
[1001] 제주담음 제주한라봉마말레이드 300G: 다음 요청까지 4.51초 대기
  → inserted (food_id=101004, allergy_id=6)
  → inserted (food_id=101004, allergy_id=8)
  → inserted (food_id=101004, allergy_id=11)
  → inserted (food_id=101004, allergy_id=12)
  → inserted (food_id=101004, allergy_id=15)
  → inserted (food_id=101004, allergy_id=19)
  → inserted (food_id=101004, allergy_id=20)
  → inserted (food_id=101004, allergy_id=22)
[1002] LIGHT&JOY 당을줄인 논산딸기쨈 290G: 다음 요청까지 2.58초 대기
  → inserted (food_id=101005, allergy_id=6)
  → inserted (food_id=101005, allergy_id=15)
  → inserted (f