In [45]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm 
import datetime as dt
import pytz
import glob
import os
from custom_modules.hugdd_preprocess.crawling import parse_oneline, parse_onepage, get_img_link
from custom_modules.hugdd_preprocess.kakao import kakaomap_rest_api

In [None]:
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Read the Kakao REST API key from the environment.
# os.getenv returns None (no error) when KAKAO_REST_API_KEY is not set.
kakao_api_key = os.getenv('KAKAO_REST_API_KEY')

In [None]:
# Build the project's Kakao Map REST client from the API key; later cells call
# convert_address_to_coordinates, search_by_category and calculate_transit_time on it.
kakaomap = kakaomap_rest_api(kakao_api_key)

In [46]:
def get_seoul_time():
    """Return the current moment as a timezone-aware datetime in Asia/Seoul."""
    return dt.datetime.now(pytz.timezone('Asia/Seoul'))
# Snapshot the current Seoul time once; its YYMMDD form is interpolated into the
# output file names written by the cells below.
current_date = get_seoul_time()
current_str = current_date.strftime('%y%m%d')
current_str

'241107'

## 든든전세 페이지 크롤링

In [49]:
# Parse the first listing page and take the announcement date (공고일자) of its
# first row; service_date is interpolated into the output file names below.
sample = parse_onepage(1) 
service_date = sample.loc[0, '공고일자']

In [50]:
# Crawl listing pages 1..list_num, one parse_onepage call per page.
# NOTE(review): the saved progress bar shows 13 iterations, so this cell was last
# executed with a different list_num — confirm the intended page count.
list_num = 50
datas = [parse_onepage(n) for n in tqdm(range(1,list_num+1))]

  0%|          | 0/13 [00:00<?, ?it/s]

In [51]:
# Stack the per-page frames and derive `address` (later sent to the geocoder):
# the part of 주소 before the first double space, cut at the first comma.
final_data = (
    pd.concat(datas)
    .reset_index(drop=True)
    .assign(address=lambda df: df['주소'].apply(lambda raw: raw.split('  ')[0].split(',')[0]))
)
final_data.to_csv(
    f'data/crawling{service_date}_{current_str}.csv',
    index=False,
    encoding='utf-8-sig',
)

In [52]:
# Reload the crawl snapshot from the CSV written in the previous cell.
final_data=pd.read_csv(f'data/crawling{service_date}_{current_str}.csv')

## 카카오 API 설정
### 주소 좌표 변환(위경도)

In [54]:
# Geocode every cleaned address through the Kakao client, one API call per row.
coordinates = [kakaomap.convert_address_to_coordinates(i) for i in tqdm(final_data.address.values)]

  0%|          | 0/122 [00:00<?, ?it/s]

In [55]:
# Attach the geocoding results as columns x and y, aligned by row position.
# NOTE(review): in the saved sample output x looks like a latitude (~37.5) and
# y like a longitude (~127.1) — confirm the convention used by the Kakao wrapper.
final_data0 = pd.concat([final_data, pd.DataFrame(coordinates, columns = ['x','y'])], axis=1)

In [None]:
# Persist the geocoded frame (no index column, utf-8-sig encoding).
final_data0.to_csv(f'data/crawling{service_date}_{current_str}_addcoord.csv', index = False, encoding = 'utf-8-sig')

In [None]:
# Reload the geocoded snapshot saved by the previous cell. The path is built from
# service_date/current_str like every other read in this notebook, rather than a
# hard-coded file from a different run, so the rows match the announcement that
# the following cells (which pass service_date along) are processing.
final_data0 = pd.read_csv(f'data/crawling{service_date}_{current_str}_addcoord.csv')

## 해당 매물 구조도 이미지 가져오기

In [57]:
# Collect image links in batches
def get_all_img_links_batch(href_ids, dt='20241007', batch_size=5):
    """Fetch one image link per listing id, pausing between batches.

    Args:
        href_ids: Sliceable sequence of listing ids passed to ``get_img_link``.
        dt: Date string forwarded unchanged to ``get_img_link``.
        batch_size: Number of ids requested before each pause.

    Returns:
        list: ``get_img_link`` results in the same order as ``href_ids``.
    """
    collected = []
    batch_starts = range(0, len(href_ids), batch_size)
    for start in tqdm(batch_starts):
        chunk = href_ids[start:start + batch_size]
        collected.extend(get_img_link(item, dt) for item in chunk)
        # Wait 1.5 s after each batch before requesting the next one
        time.sleep(1.5)
    return collected



In [58]:
# Run: collect one image link per listing (href_id) for the crawled announcement
# date, five ids per batch.
imgs = get_all_img_links_batch(final_data0.href_id.values, dt=service_date, batch_size=5)

  0%|          | 0/122 [00:00<?, ?it/s]

In [59]:
# Add the collected image links as the `img` column.
# Note: this rebinds final_data, which until now held the frame without coordinates.
final_data = final_data0.assign(img = imgs)

In [60]:
# Persist the frame with coordinates and image links (no index column, utf-8-sig).
final_data.to_csv(f'data/data{service_date}_{current_str}.csv', index = False, encoding = 'utf-8-sig')

In [61]:
# Display the first row as a quick check of the assembled columns.
final_data.head(1)

Unnamed: 0,번호,공고일자,청약 접수기간,시도,시군구,주소,주택유형,전용면적(m2),임대보증금액,신청자수,href_id,address,x,y,img
0,122,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강동구,"서울 강동구 천호동 562, 562-1 스카이캐슬라 8층 802호",다세대주택,15.21,152100000,668,no=2023040345,서울 강동구 천호동 562,37.539597,127.130622,http://www.khug.or.kr/updata/khgc/khgccms/cms/...


In [62]:
# Reload the saved data{service_date}_{current_str}.csv as the input of the
# feature-engineering cells below.
processing_data = pd.read_csv(f'data/data{service_date}_{current_str}.csv')

## 지하철 역 거리 계산

In [63]:
import geopandas as gpd
from shapely.geometry import Point
import shapely

In [64]:
# Derive numeric helper columns:
#   deposit    - 임대보증금액 divided by 10,000
#   m2         - copy of 전용면적(m2)
#   deposit_m2 - deposit per m2
# The deposit is normalised through str first, so the cell works both when the
# CSV round-trip yields plain integers (as in the saved sample row) and when the
# value is still text with thousands separators. Calling str.replace directly on
# an integer, as before, raises AttributeError.
processing_final = processing_data.assign(
    deposit = lambda df: (
        df['임대보증금액']
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype('int64')
        / 10000
    ),
    m2 = lambda df: df['전용면적(m2)'],
    deposit_m2 = lambda df: df.deposit/df.m2)


In [65]:
# Display the column index to confirm the derived columns were added.
processing_final.columns


In [66]:
# Unpack the first row's x and y values.
# NOTE(review): sample_x / sample_y are not referenced anywhere else in the visible notebook.
sample_x, sample_y = processing_final.loc[0,['x', 'y']]

In [67]:
def find_near_subway_station(x, y, max_distance = 3000):
    """Look up a subway station around a point via the Kakao category search.

    Args:
        x, y: Coordinates of the point. They are forwarded to
            ``kakaomap.search_by_category`` in swapped order (``y, x``).
        max_distance: Search radius forwarded to the API. It was previously
            ignored in favour of a hard-coded 3000.

    Returns:
        tuple: ``(distance, place_name)`` of the first document in the response,
        or ``(None, None)`` when the response has no documents. The caller builds
        a two-column DataFrame from these results, so every result is a 2-tuple.
        NOTE(review): the first document is assumed to be the nearest station;
        that depends on the sort order used by ``search_by_category`` — confirm.
    """
    time.sleep(0.2)  # short pause between consecutive API calls
    # 'SW8' is the category code passed for the station search; the coordinates
    # are passed in swapped order (y first, then x).
    results = kakaomap.search_by_category('SW8', y, x, max_distance)
    documents = results.get('documents') or []
    if not documents:
        print('no result')
        return None, None
    near_result = documents[0]
    return near_result.get('distance'), near_result.get('place_name')
    

In [68]:
# Query the station lookup once per listing, passing the row's x and y columns.
near_stations = [find_near_subway_station(row.x, row.y) for row in tqdm(processing_final.itertuples(), total = processing_final.shape[0])]

In [69]:
# Attach the lookup results as distanceM_near_station / near_station, aligned by row position.
final = pd.concat([processing_final, pd.DataFrame(near_stations, columns = ["distanceM_near_station", "near_station"])], axis=1)

In [None]:
# Persist the frame with station columns (no index column, utf-8-sig).
final.to_csv(f'data/data{service_date}_{current_str}_addstation.csv', index = False, encoding = 'utf-8-sig')

In [70]:
# Reload the station-enriched snapshot from the CSV written in the previous cell.
final = pd.read_csv(f'data/data{service_date}_{current_str}_addstation.csv')

## 통근시간 계산

In [86]:
from dateutil import relativedelta
# Seoul time exactly one day after current_date.
# NOTE(review): `tomorrow` is not referenced anywhere else in the visible notebook.
tomorrow = current_date+relativedelta.relativedelta(days=1)

In [118]:
# Fixed reference point passed as the last two arguments of the transit-time
# queries below.
# NOTE(review): comp_x looks like a latitude and comp_y like a longitude — confirm.
comp_x, comp_y =  37.5058315272521, 127.040806473603


In [139]:
# One transit-time query per listing, from the listing's coordinates to the
# reference point; both pairs are passed in (y, x) order.
expected_times = [kakaomap.calculate_transit_time(i.y, i.x, comp_y, comp_x) for i in tqdm(final.itertuples(), total = final.shape[0])]

  0%|          | 0/122 [00:00<?, ?it/s]

In [144]:
# Attach the transit results as expected_time / distance_comp (aligned by row
# position) and write the final data set for this run.
final_csv = pd.concat([final, pd.DataFrame(expected_times, columns = ['expected_time', 'distance_comp'])], axis=1)
final_csv.to_csv(f'data/final{service_date}_{current_str}.csv', index = False, encoding='utf-8-sig')

## 신청자수 업데이트

In [None]:
# Re-crawl listing pages 1..list_num to pick up the current applicant counts.
# This repeats the crawl cell near the top of the notebook and rebinds `datas`.
list_num = 50
datas = [parse_onepage(n) for n in tqdm(range(1,list_num+1))]

In [None]:
# Rebuild the crawl frame (same `address` derivation as the first crawl: text
# before the first double space, cut at the first comma) and save it.
final_data = (
    pd.concat(datas)
    .reset_index(drop=True)
    .assign(address=lambda df: df['주소'].apply(lambda raw: raw.split('  ')[0].split(',')[0]))
)
final_data.to_csv(
    f'data/crawling{service_date}_{current_str}.csv',
    index=False,
    encoding='utf-8-sig',
)

# Load the final CSV whose file name sorts last among those for this service_date.
csv_list = sorted(glob.glob(f"data/final{service_date}_*.csv"))
final_csv = pd.read_csv(csv_list[-1])

In [None]:
# Refresh the applicant counts (신청자수) of the latest final CSV from the fresh crawl.
# Counts are matched on href_id instead of by row position, so a listing that was
# added, removed or reordered between crawls cannot shift counts onto other rows.
# Listings missing from the fresh crawl end up with NaN.
# NOTE(review): assumes href_id identifies a listing uniquely — confirm.
latest_applicants = (
    final_data
    .drop_duplicates(subset='href_id')
    .set_index('href_id')['신청자수']
)
final_csv0 = final_csv.drop(columns = '신청자수')
final_csv0['신청자수'] = final_csv0['href_id'].map(latest_applicants)
final_csv0.to_csv(f'data/final{service_date}_{current_str}.csv', index = False, encoding='utf-8-sig')

## 이미지 다운로드 및 LLM 기반 방구조도 분석

### 이미지 다운로드 

In [None]:
def download_image(img_url, save_path, filename, max_retries=3, timeout=30):
    """Download ``img_url`` into ``save_path/filename``, retrying on failure.

    Args:
        img_url: URL of the image to fetch.
        save_path: Target directory; created if it does not exist.
        filename: File name to write inside ``save_path``.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        bool: True once the file has been written; False when every attempt
        failed, or when ``max_retries`` allows no attempt at all (this case
        previously fell through and returned None).
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            # NOTE(security): certificate verification is deliberately disabled
            # here (verify=False); the response is not authenticated.
            response = requests.get(img_url, verify=False, timeout=timeout)
            response.raise_for_status()

            os.makedirs(save_path, exist_ok=True)
            file_path = os.path.join(save_path, filename)

            with open(file_path, 'wb') as f:
                f.write(response.content)
            return True

        except requests.Timeout:
            if is_last_attempt:
                print(f"최대 타임아웃 재시도 횟수 초과 ({filename})")
                return False
            print(f"타임아웃 발생 ({filename}), {attempt + 1}/{max_retries} 재시도")
            time.sleep(2)  # wait longer after a timeout than after other errors

        except Exception as e:
            if is_last_attempt:
                print(f"최대 재시도 횟수 초과 ({filename}): {e}")
                return False
            print(f"다운로드 시도 {attempt + 1}/{max_retries} 실패: {e}")
            time.sleep(1)

    # No attempt was made (max_retries <= 0): report failure explicitly.
    return False

def download_images_batch(df, batch_size=2, save_path='downloaded_images', max_retries=3, 
                         timeout=30, delay_between_batches=3):
    """Download the image of every row in ``df``, pausing between batches.

    Args:
        df: DataFrame with an ``img`` column (image URL) and a ``번호`` column
            (used as the file name, ``<번호>.jpg``).
        batch_size: Number of rows processed before each pause.
        save_path: Directory the images are written to.
        max_retries: Forwarded to ``download_image``.
        timeout: Forwarded to ``download_image``.
        delay_between_batches: Seconds to sleep after each batch.

    Returns:
        list[str]: Paths of the images that were downloaded successfully.
    """
    downloaded_paths = []

    for start in tqdm(range(0, df.shape[0], batch_size), desc="이미지 다운로드 중"):
        # Positional, end-exclusive slice: exactly batch_size rows per batch, no
        # overlap between batches, and independent of the frame's index labels.
        # The previous label-based .loc slice included its end label, so boundary
        # rows were downloaded twice.
        batch = df.iloc[start:start + batch_size]

        for row in batch.itertuples():
            img_url = row.img
            # A missing link read back from CSV is NaN, which is truthy; skip it
            # explicitly along with None and empty strings.
            if pd.isna(img_url) or not img_url:
                continue
            filename = f"{row.번호}.jpg"
            if download_image(img_url, save_path, filename, max_retries=max_retries, timeout=timeout):
                downloaded_paths.append(os.path.join(save_path, filename))
            else:
                print(f"이미지 다운로드 실패: {filename}")

        # Pause after each batch
        time.sleep(delay_between_batches)

    return downloaded_paths



In [None]:
# Run the batch download.
# Note: .loc[:30, :] is a label-based slice that includes label 30.
downloaded_paths = download_images_batch(
    final_csv.loc[:30,:], 
    batch_size=3,  # rows per batch
    max_retries=3,
    timeout=30,    # per-request timeout (seconds)
    delay_between_batches=5  # pause between batches (seconds)
)

### LLM 기반 구조도 분석

In [None]:
from custom_modules.room_analyzer.nodes import logger
import base64
def save_image(state: "GraphState"):
    """Graph node: decode the base64 image held in the state and write it as a PNG.

    The annotation is a string forward reference because ``GraphState`` is only
    imported in a later cell; an unquoted annotation is evaluated when the
    function is defined and raises NameError on a fresh kernel.

    Args:
        state (GraphState): The current graph state. Reads ``image_id``
            (formatted as a zero-padded 3-digit file name) and ``image_str``
            (base64 text; if it contains a comma, only the part after the first
            comma is decoded, which drops a data-URI style prefix).
    """

    logger.info("---SAVE IMAGE ---")
    save_path = "downloaded_images"
    os.makedirs(save_path, exist_ok=True)
    filename = f"{state['image_id']:03d}.png"
    file_path = os.path.join(save_path, filename)

    base64_string = state['image_str']
    if ',' in base64_string:
        base64_string = base64_string.split(',')[1]
    # Decode before opening the file so invalid input does not leave an empty file behind.
    image_data = base64.b64decode(base64_string)

    with open(file_path, 'wb') as f:
        f.write(image_data)

    # Report the path actually written (the extension was previously omitted).
    print("Image Saved in", file_path)

In [None]:
from langgraph.graph import StateGraph, START, END
from custom_modules.room_analyzer.nodes import download_image, describe_image, check_image_description
from custom_modules.room_analyzer.edges import decide_to_generate, decide_to_regenerate
from custom_modules.room_analyzer.models import GraphState

# Define a new graph over the project's GraphState schema
workflow = StateGraph(GraphState)

# Register the nodes.
# Note: `download_image` here is the node imported from
# custom_modules.room_analyzer.nodes in this cell; it shadows the notebook-level
# download_image helper defined in the image-download section.
workflow.add_node("download_image", download_image)
workflow.add_node("save_image", save_image)
workflow.add_node("describe_image", describe_image)
workflow.add_node("check_image_description", check_image_description)

# Add edges: entry point, then an unconditional hop from download to save
workflow.add_edge(START, "download_image")
workflow.add_edge("download_image", "save_image")


# After download, decide_to_generate picks between ending the run ("end") and
# describing the image ("generate").
# NOTE(review): "download_image" has both the unconditional edge to "save_image"
# above and this conditional edge, and "save_image" has no outgoing edge —
# confirm this branching is intended.
workflow.add_conditional_edges(
    "download_image",
    decide_to_generate,
    {
        "end": END,
        "generate": "describe_image"
    }
)

# Every description is passed to the checker
workflow.add_edge("describe_image", "check_image_description")

# After the check, decide_to_regenerate either loops back to describe_image
# ("regenerate") or finishes; both "nextstep" and "end" map to END.
workflow.add_conditional_edges(
    "check_image_description",
    decide_to_regenerate,
    {
        "nextstep": END,
        "regenerate": "describe_image",
        "end": END
    }
)

# Compile
graph = workflow.compile()

In [None]:
from IPython.display import Image, display

# Show the compiled graph as a Mermaid-rendered PNG; if rendering raises,
# print the ASCII drawing instead.
try:
    display(
        Image(
            graph.get_graph(xray=True).draw_mermaid_png()
        )
    )
except Exception:
    print(graph.get_graph().draw_ascii())

In [None]:
# Run the graph once for the first listing: its image URL and its 번호 (as image_id).
result_end = graph.invoke({"image_url":final_csv.loc[0,"img"], "image_id":final_csv.loc[0,"번호"]})