In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Browser options: start Chrome with a maximized window.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')

# Launch the Chrome session that the scraping cells below drive.
driver = webdriver.Chrome(options=options)



In [None]:
# URL of the listing page to crawl.
url = "https://app.catchtable.co.kr/ct/curation/culinaryclasswars2?curationQuickFilterKey=53507&hasShopRefsFromClient=1&isUseExhibitionFilter=1&metaContractedType=1&currentExhibitionKey=classwars2-all&serviceType=INTEGRATION&sortMethod=price_level_desc&zoomLevel=10&centerBoundsLat=37.5404502815133&centerBoundsLng=126.97067134216111&isNewSearchInMap=1&isSearchedInMap=0&isShowMapSearchButton=0&subFilterGroupItem=%255B%255D&showTargetShopTooltip=0&isSettingTargetShopBound=0&prevSearchType=FILTER_SEARCH&location=CAT011_CAT011001_CAT011002_CAT011003_CAT011004_CAT011005_CAT011006_CAT011007_CAT011008_CAT011009_CAT011010&uniqueListId=1768350426690&shopClickCount=9&viewMode=LIST&renderMapComponent=1&showShopSlide=0"
driver.get(url)
time.sleep(3) # fixed wait for the initial page load

In [None]:
# CSS selectors used to read each restaurant card on the listing page.
# NOTE(review): the class names look auto-generated and may change when the
# site is rebuilt — confirm them against the live page before a new run.
CSS_CARD_CONTAINER = "div.l0xety0.l0xety2.l0xety4"  # one restaurant card
CSS_NAME = ".l0xety9"  # restaurant name; the scraping loop uses it as the de-duplication key
CSS_CHIEF = ".vrh4k92 span"  # stored as "chief_info" (presumably the chef label — confirm)
CSS_RATING = ".oron0y1"
CSS_REVIEW_COUNT = ".oron0y2.oron0y4"
CSS_LOCATION_CATEGORY = "span._1nmepil1._1nmepil3"  # several spans per card; joined with " · "
CSS_STATUS_TIME = ".ruh8m12.ruh8m14.ruh8m16"
CSS_PRICE = ".ruh8m12.ruh8m13.ruh8m16"
CSS_MESHLIN = ".vrh4k93 span"  # stored as "badge_info" (presumably Michelin badges — confirm)

print("Selector 설정 완료")

In [None]:
# [Logic fix] Keep the list from jumping back to the top while scrolling.
# Cause: repeatedly moving the mouse to a fixed XPath in the upper area makes
#        the browser scroll back up.
# Fix:   hover the last card currently found, then press PageDown so the list
#        keeps moving downward.
from selenium.common.exceptions import WebDriverException

MAX_IDLE_ROUNDS = 10  # stop after this many consecutive rounds without a new card


def _safe_text(card, selector):
    """Return the text of the first element matching *selector* inside *card*.

    Returns "" when the element is missing or the card has gone stale. Only
    Selenium errors are swallowed, so Ctrl-C still interrupts the crawl.
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text
    except WebDriverException:
        return ""


def _safe_joined_text(card, selector, separator, skip=()):
    """Join the non-blank texts of all elements matching *selector* in *card*.

    Texts listed in *skip* are dropped. Returns "" on any Selenium error.
    """
    try:
        # Read each element's text once; every .text access is a driver round-trip.
        texts = [el.text for el in card.find_elements(By.CSS_SELECTOR, selector)]
    except WebDriverException:
        return ""
    return separator.join(t for t in texts if t.strip() and t not in skip)


collected_data = {}  # restaurant name -> record dict
no_new_data_count = 0

try:
    print("수집 시작...")

    while True:
        # 1. Collect data from every card currently found in the DOM.
        visible_cards = driver.find_elements(By.CSS_SELECTOR, CSS_CARD_CONTAINER)
        new_data_found = False

        for card in visible_cards:
            name = _safe_text(card, CSS_NAME)
            # Skip cards without a readable name (a blank key would pollute the
            # result and the later merge on name) and cards already collected.
            if not name.strip() or name in collected_data:
                continue

            new_data_found = True
            collected_data[name] = {
                "name": name,
                "chief_info": _safe_text(card, CSS_CHIEF),
                "rating": _safe_text(card, CSS_RATING),
                "review_count": _safe_text(card, CSS_REVIEW_COUNT),
                "location_category": _safe_joined_text(
                    card, CSS_LOCATION_CATEGORY, " · ", skip=("·",)
                ),
                "status_time": _safe_text(card, CSS_STATUS_TIME),
                "price": _safe_text(card, CSS_PRICE),
                "badge_info": _safe_joined_text(card, CSS_MESHLIN, ", "),
            }

        # 2. Stop once several consecutive rounds produced nothing new.
        if not new_data_found:
            no_new_data_count += 1
            if no_new_data_count >= MAX_IDLE_ROUNDS:
                print("수집 종료 (더 이상 새로운 데이터 없음)")
                break
        else:
            no_new_data_count = 0
            print(f"현재 수집된 데이터 개수: {len(collected_data)}개")

        # 3. Scroll: hover the last card, then press PageDown twice.
        if not visible_cards:
            # Nothing rendered yet: wait instead of burning the idle rounds instantly.
            time.sleep(1.5)
            continue

        last_card = visible_cards[-1]
        try:
            ActionChains(driver).move_to_element(last_card).perform()
            time.sleep(0.5)

            # Use a fresh chain so the hover above is never replayed.
            (
                ActionChains(driver)
                .send_keys(Keys.PAGE_DOWN)
                .pause(0.5)
                .send_keys(Keys.PAGE_DOWN)
                .perform()
            )

            time.sleep(1.5)
        except Exception as e:
            print(f"스크롤 중 에러: {e}")
            # Fall back to sending PageDown to <body>.
            body = driver.find_element(By.TAG_NAME, "body")
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(.5)

except Exception as e:
    print(f"에러 관련 메시지: {e}")

print(f"총 {len(collected_data)}개 데이터 수집 완료")

In [None]:
# Save the results: build a DataFrame from the collected card records,
# preview the first rows, and write everything to CSV.
df = pd.DataFrame(list(collected_data.values()))
display(df.head())
# utf-8-sig writes a BOM — presumably so Excel opens the Korean text correctly.
df.to_csv("catchtable_crawling_final.csv", index=False, encoding="utf-8-sig")

----
# 주소 가져오기

In [None]:
from curl_cffi import requests
import pandas as pd
import time
import json

# ==========================================
# 1. Settings
# ==========================================
import os

TARGET_CURATION_KEY = "classwars2-all"
url = "https://ct-api.catchtable.co.kr/api/v6/search/curation/list"

# Environment variable that carries the browser cookie string. The cookie is a
# short-lived, personal session value, so it is kept out of the source: copy a
# fresh one from the browser developer tools whenever the API answers 403 and
# set it before running this cell, e.g.
#     os.environ["CATCHTABLE_COOKIE"] = "<cookie string>"
COOKIE_ENV_VAR = "CATCHTABLE_COOKIE"


def build_headers(cookie=""):
    """Return the HTTP request headers, adding ``Cookie`` only when one is given.

    Line breaks inside *cookie* are removed, because a cookie pasted from the
    developer tools is sometimes wrapped and a header value must be one line.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Content-Type": "application/json",
        "Referer": "https://app.catchtable.co.kr/",
        "Origin": "https://app.catchtable.co.kr",
    }
    cleaned = "".join((cookie or "").splitlines()).strip()
    if cleaned:
        request_headers["Cookie"] = cleaned
    return request_headers


headers = build_headers(os.environ.get(COOKIE_ENV_VAR, ""))

# ==========================================
# 2. Run the collection (hard-coded offset pattern)
# ==========================================
PAGE_SIZE = 20  # shops requested per call; the offset index advances by this much


def _build_offset(count):
    """Return the paging offset string for the shop index *count*.

    The first page uses the plain string "0"; later pages repeat the index in
    the "N:99:91-N:37:27-N:0:0" pattern found by the author.
    NOTE(review): the meaning of the 99:91 / 37:27 constants is not known here.
    """
    if count == 0:
        return "0"
    return f"{count}:99:91-{count}:37:27-{count}:0:0"


def _parse_shop(item):
    """Flatten one shop entry of the API response into a CSV row dict.

    Nested objects that are missing *or* JSON null are treated as empty, so a
    single incomplete shop yields None fields instead of aborting the crawl.
    """
    meta = item.get('shopMeta') or {}
    coords = meta.get('shopCoord') or {}
    prices = meta.get('prices') or {}
    return {
        '식당명': meta.get('shopName'),
        '카테고리': meta.get('foodKind'),
        '평점': meta.get('avgScore'),
        '리뷰수': meta.get('reviewCount'),
        '지역': meta.get('landName'),
        '전화번호': meta.get('shopPhone'),
        '저녁가격': prices.get('dinnerPriceText'),
        '위도': coords.get('lat'),
        '경도': coords.get('lon'),
        '고유ID': meta.get('shopRef'),
    }


all_data = []
seen_refs = set()  # shopRef values already stored; detects a repeated page
current_count = 0  # 0, 20, 40, ... index of the first shop of the next page

print("수집 시작...")

while True:
    offset_str = _build_offset(current_count)
    print(f"\n[데이터 {current_count}번부터 요청 중...] Offset: {offset_str}")

    payload = {
        "paging": {"offset": offset_str, "size": PAGE_SIZE},
        "divideType": "NON_DIVIDE",
        "curation": {"curationKey": TARGET_CURATION_KEY},
        "sort": {"sortType": "recommended"},
        "userInfo": {"clientGeoPoint": {"lat": 37.563398, "lon": 126.9863309}}
    }

    try:
        response = requests.post(url, headers=headers, json=payload, impersonate="chrome110")

        if response.status_code != 200:
            print(f"!!! 에러: {response.status_code} (Offset 문제일 수 있음)")
            break

        json_resp = response.json()
        inner_data = json_resp.get('data') or {}
        shop_results = inner_data.get('shopResults') or {}
        items = shop_results.get('shops') or []

        if not items:
            print(">>> 데이터가 비어있습니다. 수집을 종료합니다.")
            break

        # Parse the page, dropping shops that were already stored.
        new_rows = []
        for item in items:
            row = _parse_shop(item)
            ref = row['고유ID']
            if ref is not None:
                if ref in seen_refs:
                    continue
                seen_refs.add(ref)
            new_rows.append(row)

        if not new_rows:
            # The server repeated shops we already have: the offset no longer
            # advances, so stop instead of looping forever.
            print(">>> 새로운 데이터가 없습니다. 수집을 종료합니다.")
            break

        print(f">>> {len(new_rows)}개 확보 완료! (누적 {len(all_data) + len(new_rows)}개)")
        all_data.extend(new_rows)

        # Advance to the next page.
        current_count += PAGE_SIZE
        time.sleep(1.5)  # polite delay between requests

    except Exception as e:
        print(f"에러 발생: {e}")
        break

# Save
if all_data:
    df = pd.DataFrame(all_data)
    df.to_csv("catchtable_full_data.csv", index=False, encoding="utf-8-sig")
    print(f"\n[완료] 총 {len(df)}개의 식당 정보를 저장했습니다!")
    print(df.tail())  # check the most recently collected rows
else:
    print("\n[실패] 데이터가 없습니다.")

In [None]:
# df1: the card data written by the Selenium scraping cell.
df1 = pd.read_csv('catchtable_crawling_final.csv')
# NOTE(review): the API cell writes "catchtable_full_data.csv"; this "_with_url"
# file is not produced by any visible cell — confirm where it comes from.
df2 = pd.read_csv('catchtable_full_data_with_url.csv')


In [None]:
# Display df1 (loaded from 'catchtable_crawling_final.csv').
df1

In [None]:
# Compare the restaurant names held by the two tables.
names_df1 = set(df1['name'])      # original list (e.g. Culinary Class Wars info)
names_df2 = set(df2['식당명'])     # crawled list

# Names present in df1 but absent from df2 (restaurants that were missed).
not_crawled = names_df1.difference(names_df2)

# Names present in df2 but absent from df1 (e.g. slightly different spellings).
unexpected_crawled = names_df2.difference(names_df1)

# Report both differences as plain lists for readability.
print(f"=== [주의] df1엔 있지만 df2엔 없는 것 ({len(not_crawled)}개) ===")
print(list(not_crawled))

print(f"\n=== [참고] df2엔 있지만 df1엔 없는 것 ({len(unexpected_crawled)}개) ===")
print(list(unexpected_crawled))

df2 = 주소 정보가 들어 있음
df1 = 가게 정보가 들어 있음

df1엔 있지만 df2엔 없는 것 => 여기 있는 것들은 주소가 없다는 뜻
df2엔 있지만 df1엔 없는 것 => 여기에는 서울이 아닌 지역의 데이터가 있을 것으로 보임

In [None]:
# Preview the first rows of df2.
df2.head()

In [None]:
# Outer-join df2 with df1 on the restaurant name so that rows found in only
# one of the two tables are kept as well.
mergedata = df2.merge(
    df1,
    how='outer',
    left_on='식당명',
    right_on='name',
)

In [None]:
# Load the extra location CSV from ./data and preview its first rows.
추가위치 = pd.read_csv('./data/추가위치데이터.csv')
추가위치.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the extra location table.
추가위치.info()

In [None]:
# Stack df2 (without its '고유ID' column) on top of the extra location rows
# and renumber the index from zero.
concat_data = pd.concat(
    [df2.drop(columns='고유ID'), 추가위치],
    ignore_index=True,
)

In [None]:
# Display the combined location table.
concat_data

In [None]:
# Inner-join the combined location table with df1 on the restaurant name;
# only names present in both tables survive.
df3 = concat_data.merge(
    df1,
    how='inner',
    left_on='식당명',
    right_on='name',
)

In [None]:
# Compare the restaurant names of df1 with those of the merged table df3.
# NOTE: names_df2 holds df3's names here, although the printed labels still say "df2".
names_df1 = set(df1['name'])      # original list (e.g. Culinary Class Wars info)
names_df2 = set(df3['식당명'])     # crawled list

# Names present in df1 but absent from df3 (restaurants that were missed).
not_crawled = names_df1.difference(names_df2)

# Names present in df3 but absent from df1 (e.g. slightly different spellings).
unexpected_crawled = names_df2.difference(names_df1)

# Report both differences as plain lists for readability.
print(f"=== [주의] df1엔 있지만 df2엔 없는 것 ({len(not_crawled)}개) ===")
print(list(not_crawled))

print(f"\n=== [참고] df2엔 있지만 df1엔 없는 것 ({len(unexpected_crawled)}개) ===")
print(list(unexpected_crawled))

In [None]:
# Display the inner-merged table.
df3

In [None]:
# Load the shop-information CSV.
가게정보 = pd.read_csv('캐치테이블_가게정보2.csv')

In [None]:
# Preview the first rows of the shop-information table.
가게정보.head()

In [None]:
import pandas as pd

# Read the shop-information CSV again and preview it without binding a name.
pd.read_csv('캐치테이블_가게정보2.csv').head()