In [45]:
import datetime as dt
import os
import re
import time

import numpy as np
import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [46]:
def get_seoul_time():
    """Return the current time as a timezone-aware datetime in Asia/Seoul."""
    return dt.datetime.now(pytz.timezone('Asia/Seoul'))
# Timestamp of this run in Seoul time, plus its YYMMDD form used as a suffix in output file names.
current_date = get_seoul_time()
current_str = f'{current_date:%y%m%d}'
current_str

'241107'

In [47]:
def parse_oneline(txt):
    """Split ``txt`` on newline characters and return the pieces as a list."""
    return txt.split('\n')

def parse_onepage(page_num, max_tries=10):
    """Scrape one page of the KHUG jeonse listing into a DataFrame.

    Parameters
    ----------
    page_num : int
        Value of the ``cur_page`` query parameter.
    max_tries : int, default 10
        Number of *retries* allowed after the first attempt, so the page is
        requested at most ``max_tries + 1`` times (the original attempt budget).

    Returns
    -------
    pandas.DataFrame
        One row per listing. Columns come from the table header, plus
        ``href_id`` holding the ``no=<10 digits>`` fragment of each row link.

    Raises
    ------
    RuntimeError
        If no ``<table>`` is found within the attempt budget.
    """
    page_url = f"https://www.khug.or.kr/jeonse/web/s07/s070102.jsp?cur_page={page_num}"

    table = None
    for _ in range(max_tries + 1):
        # Re-request on every attempt: re-parsing the same response can never
        # turn a page without a table into one with a table.
        html = requests.get(page_url, timeout=30)
        time.sleep(1)  # pause between requests to avoid hammering the site
        table = BeautifulSoup(html.text, 'html.parser').find('table')
        if table is not None:
            break
    if table is None:
        raise RuntimeError(
            f"no <table> found on listing page {page_num} after {max_tries + 1} attempts"
        )

    tab_cols = parse_oneline(table.find('thead').get_text().strip())
    tbody = table.find('tbody')
    # Rows are separated by a blank line in the body text; cells by a single newline.
    rows = [parse_oneline(row.strip()) for row in tbody.get_text().strip().split('\n\n')]
    output_tab = pd.DataFrame(rows, columns=tab_cols)
    # Raw string so that \d is a regex digit class, not an invalid string escape.
    href_ids = [re.search(r'no=\d{10}', link['href']).group() for link in tbody.find_all(href=True)]
    return output_tab.assign(href_id=href_ids)

In [49]:
# Fetch listing page 1 once and take the announcement date (공고일자) of its first row.
# service_date is reused below in output file names and as the `dt` parameter of detail pages.
sample = parse_onepage(1) 
service_date = sample.loc[0, '공고일자']

In [50]:
# Number of listing pages to crawl (hard-coded); pages are requested as 1..list_num.
list_num = 13
datas = list(map(parse_onepage, tqdm(range(1, list_num + 1))))

  0%|          | 0/13 [00:00<?, ?it/s]

In [51]:
# Stack all crawled pages and derive a geocodable `address` from the raw 주소 text:
# keep what precedes the first double space, then what precedes the first comma
# (multi-lot addresses list several lot numbers separated by commas).
final_data = (
    pd.concat(datas)
    .reset_index(drop=True)
    .assign(address = lambda df: df['주소'].map(lambda raw: raw.split('  ')[0].split(',')[0]))
)
final_data.to_csv(f'data/crawling{service_date}_{current_str}.csv', index = False, encoding = 'utf-8-sig')

In [52]:
# Reload the crawl snapshot written above; this rebinds final_data, and column dtypes are now
# whatever read_csv infers from the file rather than the scraped strings.
final_data=pd.read_csv(f'data/crawling{service_date}_{current_str}.csv')

In [None]:
# def find_coordinates(address):
#     time.sleep(1)
#     apiurl = "https://api.vworld.kr/req/address?"
#     params = {
#     	"service": "address",
#     	"request": "getcoord",
#     	"crs": "epsg:5186",
#     	"address": address,
#     	"format": "json",
#     	"type": "parcel",
#     	"key": ""
#     }
#     response = requests.get(apiurl, params=params)
#     if response.status_code == 200:
#         result = response.json()['response']
#         # print(result)
#         if 'result' in result.keys():
#             return result['result']['point']['x'], result['result']['point']['y']
#         else:
#             return 0,0


import requests

def set_rest_api(rest_api_token):
    """Build an address geocoder bound to a Kakao REST API key.

    Parameters
    ----------
    rest_api_token : str
        Kakao REST API key, sent as ``Authorization: KakaoAK <key>``.

    Returns
    -------
    callable
        ``convert_address_to_coordinates(address)``; see its docstring.
    """
    url = 'https://dapi.kakao.com/v2/local/search/address.json'
    header = {'Authorization': 'KakaoAK ' + rest_api_token}

    def convert_address_to_coordinates(address):
        """
        Convert the given address to WGS84 coordinates (Kakao Maps API).

        Returns ``(lat, lng)`` as floats for the first matching document, or
        ``None`` when the response status is not 200 or nothing matched.
        """
        # Pass the address via `params` so requests percent-encodes it; gluing it
        # onto the URL breaks on characters such as '&' or '#'.
        r = requests.get(url, params={'query': address}, headers=header, timeout=10)
        if r.status_code != 200:
            return None

        documents = r.json()["documents"]  # parse the body once
        if len(documents) == 0:
            return None

        first = documents[0]
        # Prefer the lot-number `address` object as before. When it is null,
        # fall back to the document-level x/y.
        # NOTE(review): document-level x/y is taken from the Kakao Local API
        # response schema — confirm against the current docs.
        point = first.get("address") or first
        lng = float(point['x'])
        lat = float(point['y'])
        return lat, lng

    return convert_address_to_coordinates

# Kakao REST API key. Read it from the KAKAO_REST_API_KEY environment variable so the secret
# never has to be pasted into the notebook; falls back to the previous blank value when unset
# (with a blank key the Kakao endpoints reject every request).
kakao_api_key = os.environ.get("KAKAO_REST_API_KEY", "")
convert_address_to_coordinates = set_rest_api(kakao_api_key)

In [54]:
# Geocode every cleaned address (one Kakao request per row); an entry is None when the lookup
# returned a non-200 status or no matching document.
coordinates = [convert_address_to_coordinates(i) for i in tqdm(final_data.address.values)]

  0%|          | 0/122 [00:00<?, ?it/s]

In [55]:
# Attach the geocoding result column-wise. The geocoder returns (lat, lng), so despite the
# labels 'x' holds latitude and 'y' holds longitude; later cells rely on that convention.
# Failed lookups come back as None: expand them to an explicit (NaN, NaN) row so the frame
# always has two float columns (a list of bare None values cannot be shaped into two columns).
final_data0 = pd.concat(
    [
        final_data,
        pd.DataFrame(
            [(np.nan, np.nan) if coord is None else coord for coord in coordinates],
            columns = ['x','y'],
        ),
    ],
    axis=1,
)

In [56]:
# Inspect the crawl table with the geocoded x / y columns attached.
final_data0

Unnamed: 0,번호,공고일자,청약 접수기간,시도,시군구,주소,주택유형,전용면적(m2),임대보증금액,신청자수,href_id,address,x,y
0,122,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강동구,"서울 강동구 천호동 562, 562-1 스카이캐슬라 8층 802호",다세대주택,15.21,152100000,668,no=2023040345,서울 강동구 천호동 562,37.539597,127.130622
1,121,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강서구,서울 강서구 등촌동 637-19 라빌라스 101동 4층 402호,오피스텔(주거용),29.99,210600000,161,no=2022367225,서울 강서구 등촌동 637-19 라빌라스 101동 4층 402호,37.556256,126.859242
2,120,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강서구,"서울 강서구 등촌동 643-16, 643-17 라테라스 4층 404호",다세대주택,33.51,251100000,133,no=2022362498,서울 강서구 등촌동 643-16,37.555297,126.859798
3,119,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강서구,서울 강서구 화곡동 105-207 바로크빌 4층 402호,다세대주택,29.90,142200000,62,no=2022348197,서울 강서구 화곡동 105-207 바로크빌 4층 402호,37.539887,126.844564
4,118,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강서구,"서울 강서구 화곡동 1111, 1111-1 에스제이라벨라 10층 1002호",오피스텔(주거용),20.57,143100000,196,no=2023198064,서울 강서구 화곡동 1111,37.554948,126.852357
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,5,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,경기도,경기 부천시 원미구,경기 부천시 원미구 원미동 169-1 수팰리스 B동 4층 403호,다세대주택,34.92,144000000,6,no=2022363045,경기 부천시 원미구 원미동 169-1 수팰리스 B동 4층 403호,37.494132,126.792039
118,4,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,경기도,경기 부천시 원미구,"경기 부천시 원미구 원미동 200-2, 200-4 송원리치빌 4층 402호",다세대주택,58.20,154800000,11,no=2023204224,경기 부천시 원미구 원미동 200-2,37.488685,126.789119
119,3,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,경기도,경기 부천시 원미구,"경기 부천시 원미구 원미동 201-14, 201-17, 201-18 수팰리스 2층 ...",다세대주택,67.15,190800000,15,no=2022388671,경기 부천시 원미구 원미동 201-14,37.488593,126.789558
120,2,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,경기도,경기 부천시 원미구,경기 부천시 원미구 원미동 42-1 해냄스토리주건축물 1동 10층 1002호,오피스텔(주거용),57.87,226800000,36,no=2023205918,경기 부천시 원미구 원미동 42-1 해냄스토리주건축물 1동 10층 1002호,37.498914,126.789699


In [57]:
def get_img_link(href_id, dt = '20241007', max_tries = 20):
    """Return the ``src`` of the first image on a listing's detail page.

    Parameters
    ----------
    href_id : str
        Query fragment identifying the listing, e.g. ``'no=2023040345'``.
    dt : str, default '20241007'
        Announcement date sent as the ``dt`` query parameter. Inside this
        function the name shadows the module-level ``datetime`` alias.
    max_tries : int, default 20
        Maximum number of page requests before giving up.

    Returns
    -------
    str or None
        The ``src`` attribute of the element with id ``imgSor0``, or ``None``
        when that element was not found within ``max_tries`` requests (the
        previous implementation looped forever in that case).
    """
    page_url = f"https://www.khug.or.kr/jeonse/web/s07/s070103.jsp?dt={dt}&{href_id}"
    for _ in range(max_tries):
        time.sleep(0.5)  # pause before each request to avoid hammering the site
        html = requests.get(page_url, timeout=30)
        img_src = BeautifulSoup(html.content, 'html.parser').find(id = 'imgSor0')
        if img_src is not None:
            return img_src.get('src')
    return None

In [58]:
# Look up the first detail-page image URL for every listing, passing the announcement date as `dt`.
imgs = [get_img_link(i, dt = service_date) for i in tqdm(final_data0.href_id.values)]

  0%|          | 0/122 [00:00<?, ?it/s]

In [59]:
# Add the image links. Note this rebinds final_data, which until now was the crawl table
# without coordinates.
final_data = final_data0.assign(img = imgs)

In [60]:
# Persist the enriched table (crawl + coordinates + image link); utf-8-sig writes a BOM.
final_data.to_csv(f'data/data{service_date}_{current_str}.csv', index = False, encoding = 'utf-8-sig')

In [61]:
# Spot-check the first enriched row.
final_data.head(1)

Unnamed: 0,번호,공고일자,청약 접수기간,시도,시군구,주소,주택유형,전용면적(m2),임대보증금액,신청자수,href_id,address,x,y,img
0,122,20241031,2024.10.31. 10:00 ~ 2024.11.14. 17:00,서울특별시,서울 강동구,"서울 강동구 천호동 562, 562-1 스카이캐슬라 8층 802호",다세대주택,15.21,152100000,668,no=2023040345,서울 강동구 천호동 562,37.539597,127.130622,http://www.khug.or.kr/updata/khgc/khgccms/cms/...


In [62]:
# Reload the enriched snapshot written above as the input of the geo-processing stage.
processing_data = pd.read_csv(f'data/data{service_date}_{current_str}.csv')

In [63]:
import geopandas as gpd
from shapely.geometry import Point
import shapely

In [64]:
processing_final = processing_data.copy()
# The geocoded 'x' column stores latitude and 'y' longitude, so Point((y, x)) gives the
# usual (longitude, latitude) coordinate order.
processing_geo = gpd.GeoDataFrame(
    processing_final.assign(
        geometry = lambda df: df.apply(lambda row: shapely.geometry.Point((row.y, row.x)), axis=1)
    )
)


def _sgg_full_name(sido, sigungu):
    """Build the '<시도> <시군구>' label used to select districts by SGG_NM.

    ``sigungu`` starts with an abbreviated province token (e.g. '서울 강동구',
    '경기 부천시 원미구'); that token is dropped and everything after it is kept.
    Keeping every remaining token generalizes the former 부천-only special case to
    any city that is subdivided into districts, which was previously cut down to
    its last token. Two-token values and 부천 values give the same label as before.
    NOTE(review): assumes SGG_NM spells city + district in full for every such
    city, as the original 부천 handling implies — confirm against the shapefiles.
    """
    parts = sigungu.split()
    district = ' '.join(parts[1:]) if len(parts) > 1 else parts[-1]
    return f"{sido} {district}"


sgg_select = (
    processing_geo.filter(regex='시도|시군구')
    .drop_duplicates()
    .apply(lambda row: _sgg_full_name(row['시도'], row['시군구']), axis=1)
    .tolist()
)

In [65]:
# Load the district (SGG) boundary layers for Seoul, Gyeonggi and Incheon, reprojected to
# EPSG 4326; the *_geo0 frames keep every district of the region.
seoul_geo0, gyeonggi_geo0, incheon_geo0 = (
    gpd.read_file(shx_path, encoding = 'cp949').to_crs('4326')
    for shx_path in (
        'data/seoul_geo/LARD_ADM_SECT_SGG_11_202405.shx',
        'data/gyeonggi_geo/LARD_ADM_SECT_SGG_41_202405.shx',
        'data/incheon_geo/LARD_ADM_SECT_SGG_28_202405.shx',
    )
)

# The frames without the trailing 0 keep only districts that appear in the crawled listings.
seoul_geo = seoul_geo0.query("SGG_NM in @sgg_select")
gyeonggi_geo = gyeonggi_geo0.query("SGG_NM in @sgg_select")
incheon_geo = incheon_geo0.query("SGG_NM in @sgg_select")


In [66]:
# Merge each region into a single geometry, then merge the three regions together.
# total_geo0 covers every district; total_geo only the districts that have listings.
total_geo0 = shapely.union_all(
    [shapely.union_all(region.geometry.values) for region in (seoul_geo0, incheon_geo0, gyeonggi_geo0)]
)
total_geo = shapely.union_all(
    [shapely.union_all(region.geometry.values) for region in (seoul_geo, incheon_geo, gyeonggi_geo)]
)

In [67]:
# Station table, presumably produced by a separate preprocessing step (not shown here).
# The cells below read its x / y columns and the 역명(한글) station-name column.
station_preprocessed = pd.read_csv('data/preprocessed_241007.csv')

In [68]:
# Turn each station's x / y pair into a point geometry.
stations_geo0 = gpd.GeoDataFrame(
    station_preprocessed.assign(
        geometry = lambda df: df.apply(lambda row: shapely.geometry.Point(row.x, row.y), axis=1)
    )
)
# Flag stations lying inside the merged listing districts and keep only those.
stations_geo = (
    stations_geo0
    .assign(filter_TF = lambda df: df.geometry.apply(lambda pt: shapely.within(pt, total_geo)))
    .loc[lambda df: df['filter_TF']]
)

In [69]:
# Pair every listing with every retained station (cross join) and measure the distance between
# the two points; after the merge the listing point is geometry_x and the station point geometry_y.
cross_tab = (
    processing_geo.filter(regex='번호|geometry')
    .merge(stations_geo.filter(regex='역명|geometry'), how='cross')
    .assign(
        distance = lambda df: df.apply(
            lambda pair: shapely.distance(pair.geometry_x, pair.geometry_y), axis=1
        )
    )
    .reset_index(drop=True)
)

In [70]:
# For each listing keep its nearest station (row with the minimum distance per 번호), then
# derive the analysis columns and drop the raw inputs.
final = (
    processing_geo
    .merge(
        cross_tab.loc[cross_tab.groupby('번호').distance.idxmin(), ['번호', '역명(한글)', 'distance']],
        on='번호',
    )
    .assign(
        # Deposit divided by 10,000. The column may hold comma-formatted strings or, after the
        # CSV round-trip, plain numbers; normalizing through str handles both, whereas calling
        # .replace on a numeric value raised AttributeError.
        deposit = lambda df: pd.to_numeric(
            df['임대보증금액'].astype(str).str.replace(',', '', regex=False)
        ) / 10000,
        m2 = lambda df: df['전용면적(m2)'],
        deposit_m2 = lambda df: df.deposit/df.m2,
        # NOTE(review): distance is measured in coordinate degrees; the 100000 factor is a rough
        # degrees-to-metres conversion, not a geodesic distance — confirm this is acceptable.
        distanceM_near_station = lambda df: df['distance'] * 100000.0,
        near_station = lambda df: df['역명(한글)'],
    )
    .drop(columns = ['임대보증금액', 'distance', '공고일자', '청약 접수기간', '전용면적(m2)', 'href_id', '역명(한글)', 'address'])
)

In [43]:
# Reference location as a point; in the visible notebook it is only used by the
# commented-out straight-line estimate in the next cell.
company_pos = shapely.Point(127.040806473603, 37.5058315272521)

In [44]:
# ffinal = gpd.GeoDataFrame(final).assign(
#     distance_comp = lambda df: df.geometry.apply(lambda x: shapely.distance(x, company_pos)*100),
#     expected_time = lambda df: df.distance_comp*3.3)
# ffinal.to_csv(f'data/final{service_date}_{current_str}.csv', index = False, encoding='utf-8-sig')

In [86]:
from dateutil import relativedelta
# One day after the run timestamp. NOTE(review): not referenced by any later cell visible here.
tomorrow = current_date+relativedelta.relativedelta(days=1)

In [138]:
def calculate_transit_time(origin_y, origin_x, dest_y, dest_x):
    """Query the Kakao Mobility directions API for one origin/destination pair.

    Despite the name, the request goes to the car ``/v1/directions`` endpoint
    with car options, so the result is a driving estimate.

    Parameters
    ----------
    origin_y, origin_x, dest_y, dest_x
        Coordinates; each point is sent to the API as ``"<y>,<x>"``.

    Returns
    -------
    tuple or None
        ``(duration / 60, distance)`` taken from the first route's summary, or
        ``None`` when the request fails, the status is not 200, or the payload
        has no usable route summary.
    """
    url = "https://apis-navi.kakaomobility.com/v1/directions"
    headers = {"Authorization": f"KakaoAK {kakao_api_key}"}
    params = {
        "origin": f"{origin_y},{origin_x}",
        "destination": f"{dest_y},{dest_x}",
        "priority": "RECOMMEND",
        "car_fuel": "GASOLINE",
        # requests serializes Python booleans as "True"/"False"; send the
        # lowercase literals explicitly.
        "car_hipass": "true",
        "alternatives": "false",
        "road_details": "false",
        "roadevent": 2
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        if response.status_code != 200:
            return None
        summary = response.json()['routes'][0]['summary']
        return summary['duration'] / 60, summary['distance']
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort, as before: network errors, undecodable bodies and routes
        # without a summary all yield None. Unlike the former bare `except`,
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return None

In [118]:
# Destination coordinates. Same convention as the listing x / y columns: comp_x is the
# latitude and comp_y the longitude (the same point as company_pos above).
comp_x, comp_y =  37.5058315272521, 127.040806473603


In [139]:
# One directions request per listing, from the listing point to the company point;
# each entry is (minutes, distance) or None when the lookup failed.
expected_times = [calculate_transit_time(i.y, i.x, comp_y, comp_x) for i in tqdm(final.itertuples(), total = final.shape[0])]

  0%|          | 0/122 [00:00<?, ?it/s]

In [144]:
# Attach the directions result column-wise and write the final table.
# Failed lookups are None: expand them to (NaN, NaN) so the frame always has two numeric
# columns (a list of bare None values cannot be shaped into two columns).
final_csv = pd.concat(
    [
        final,
        pd.DataFrame(
            [(np.nan, np.nan) if result is None else result for result in expected_times],
            columns = ['expected_time', 'distance_comp'],
        ),
    ],
    axis=1,
)
final_csv.to_csv(f'data/final{service_date}_{current_str}.csv', index = False, encoding='utf-8-sig')