In [1]:
#!pip install retry

import functools
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from retry import retry

# LIFULL HOME'S の Web クローリング

## 1. 設定
 - 物件一覧URL
 - ロボット対策


In [2]:
## Listing-page URL
# HOME'S rental listings (apartments / condominiums) for Bunkyo-ku, Tokyo.
# The `{}` placeholder is filled with the page number via `base_url.format(page)`.
base_url = "https://www.homes.co.jp/chintai/tokyo/bunkyo-city/list/?page={}"

In [3]:
## Anti-bot measures

# Request headers: a desktop Chrome User-Agent string that get_html passes to
# requests.get instead of the library's default User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

# Fetch a URL and return its parsed HTML
def get_html(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    A random pause of 1-2 seconds precedes every request so that consecutive
    calls are spaced out.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely, so a single
            unresponsive connection would stall the whole crawl.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        or the parsing failed (the error is printed, not raised).
    """
    try:
        # Random delay between requests
        time.sleep(random.uniform(1, 2))

        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()  # turn 4xx / 5xx responses into exceptions
        return BeautifulSoup(r.content, "html.parser")
    except Exception as e:
        # Deliberately best-effort: callers (e.g. the retry wrapper) rely on
        # a None return value to detect a failed fetch.
        print("Exception occurred:", e)
        return None

# Fetch the parsed HTML of one listing page
def get_page_html(page_num):
    """Return the result of ``get_html`` for listing page ``page_num``.

    The page number is substituted into ``base_url``; the return value is
    whatever ``get_html`` returns for that URL (parsed HTML or ``None``).
    """
    return get_html(base_url.format(page_num))

# Retry decorator factory
def retry(tries, delay, backoff):
    """Build a decorator that re-invokes a function until it returns a truthy value.

    Note: this definition shadows the ``retry`` name imported from the
    ``retry`` package at the top of the notebook; after this cell runs,
    ``retry`` refers to the function below.

    Args:
        tries: Maximum number of calls to make. If it is not positive the
            wrapped function is never called and ``None`` is returned.
        delay: Seconds to sleep after the first unsuccessful call.
        backoff: Factor the sleep time is multiplied by after every wait.

    Returns:
        A decorator. The decorated function returns the first truthy result
        of the wrapped function, or ``None`` once every attempt has produced
        a falsy one. Exceptions raised by the wrapped function are not
        caught here.
    """
    def deco_retry(f):
        @functools.wraps(f)  # keep the wrapped function's name and docstring
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 0:
                result = f(*args, **kwargs)
                if result:
                    return result
                mtries -= 1
                # Only wait when another attempt follows; sleeping after the
                # final failure would merely delay returning None.
                if mtries > 0:
                    time.sleep(mdelay)
                    mdelay *= backoff
            return None
        return f_retry
    return deco_retry

# get_html wrapped with the retry decorator
@retry(tries=3, delay=10, backoff=2)
def get_html_with_retry(url):
    """Call ``get_html(url)`` under ``retry(tries=3, delay=10, backoff=2)``.

    Returns whatever the retry wrapper yields for ``get_html``'s result.
    """
    soup = get_html(url)
    return soup

## 2. 生データ取得

In [4]:
## Connection test: fetch the first listing page
url = base_url.format(1)
soup = get_html_with_retry(url)
if soup:
    print("Successfully fetched HTML content.")
else:
    print("Failed to fetch HTML content.")
    # Stop with an explicit error instead of failing below with an opaque
    # AttributeError on None.
    raise RuntimeError(f"Could not fetch the first listing page: {url}")

## Determine the number of listing pages from the pagination element
last_page_item = soup.find("li", {"class": "lastPage"})
if last_page_item is None:
    raise RuntimeError(f'No <li class="lastPage"> element found on {url}')
max_page = int(last_page_item.get_text())
print(f'最大ページ数: {max_page}')

Successfully fetched HTML content.
最大ページ数: 74


In [5]:
## Data collection (df_all: building info + basic room info)
# One DataFrame per building is collected here and concatenated once at the
# end; concatenating inside the loop copies the accumulated frame every time.
building_frames = []

for page in range(1, max_page + 1):
    url = base_url.format(page)
    try:
        soup = get_html_with_retry(url)
        if soup is None:
            raise RuntimeError('no HTML returned')

        # Building information
        for soup_building in soup.find_all("div", {"class": "moduleInner prg-building"}):
            cells = soup_building.find_all("td")

            building_name = soup_building.find("span", {"class": "bukkenName prg-detailLinkTrigger"}).get_text()
            building_address = cells[0].get_text()
            building_stationText = ';'.join(
                x.get_text() for x in soup_building.find_all("span", {"class": "prg-stationText"}))
            building_AFText = cells[2].get_text()

            # Room information (one entry per room listed under the building)
            room_floors = [x.get_text() for x in soup_building.find_all("li", {"class": "roomKaisuu"})]
            room_layouts = [x.contents[0] for x in soup_building.find_all("td", {"class": "layout"})]
            room_urls = [x['href'] for x in soup_building.find_all("a", {"class": "anchor prg-detailAnchor"})]

            # The scalar building fields are broadcast across the room rows,
            # giving one row per room. Room lists of unequal length raise
            # ValueError, which fails the page like any other parse error.
            df = pd.DataFrame({
                'building_name': building_name,
                'building_address': building_address,
                'building_stationText': building_stationText,
                'building_AFText': building_AFText,
                'room_floor': room_floors,
                'room_layout': room_layouts,
                'room_url': room_urls})
            building_frames.append(df)

    except Exception as e:
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops the crawl; buildings parsed before the error are kept.
        print(f'fail. url: {url} ({type(e).__name__}: {e})')

df_all = pd.concat(building_frames, ignore_index=True) if building_frames else pd.DataFrame({})

print(df_all.shape)
df_all.to_csv('../../data/temp/df_all_0.csv', encoding='utf-8-sig', index=False)

(2767, 7)


In [None]:
## Data collection (df_details: per-room detail pages)
df_all = pd.read_csv('../../data/temp/df_all_0.csv')
# One dict per successfully parsed room page; the DataFrame is built once at
# the end instead of being re-concatenated on every iteration.
detail_records = []

urls = df_all['room_url'].tolist()
for i, url in enumerate(urls):

    if i % 10 == 0:
        print(f'now: row_{i}')

    try:
        soup = get_html_with_retry(url)
        if soup is None:
            raise RuntimeError('no HTML returned')

        # Basic information
        basic_section = soup.find("section", {"class": 'py-4 lg:py-8 bg-mono-50'})
        room_price = basic_section.find("dd", {"class": "flex items-center grow py-3 lg:py-2 pl-3 pr-4 text-primary font-bold"}).get_text()
        section1_dd = [x.get_text().strip() for x in basic_section.find_all("dd", {"class": "flex items-center grow py-3 lg:py-2 pl-3 pr-4"})]

        # Property features / equipment / conditions
        feature_list = soup.find("ul", {"class": 'mt-3 lg:mt-5 w-full flex flex-wrap border-b border-mono-200 text-sm'})
        room_detail = '、'.join(x.get_text().strip() for x in feature_list.find_all("div", {"class": 'grow py-3 lg:py-2 pl-3 pr-4'}))

        # Property overview
        overview_list = soup.find("dl", {"class": '-mx-px flex flex-wrap border-b border-mono-200 text-sm'})
        room_info = [x.get_text().strip() for x in overview_list.find_all("dd", {"class": 'flex items-center grow py-3 lg:py-2 pl-3 pr-4'})]

        # Publication / last-update dates (first 10 characters of the value)
        room_begin = soup.find("dt", string="情報公開日").find_next_sibling().get_text().strip()[:10]
        room_update = soup.find("dt", string="情報更新日").find_next_sibling().get_text().strip()[:10]

        # Fields are picked by position, so the mapping assumes the page lists
        # them in a fixed order (TODO confirm against the live page layout).
        # section1_dd[3] is not exported. The key 'room_tatalRooms' is kept
        # as-is so the CSV schema stays unchanged.
        detail_records.append({
            'room_price': room_price,
            'room_commonFee': section1_dd[0],
            'room_initialFeeText': section1_dd[1],
            'room_initialOtherFeeText': section1_dd[2],
            'building_year': section1_dd[4],
            'room_layoutText': section1_dd[5],
            'room_facing': section1_dd[6],
            'room_area': section1_dd[7],
            'room_detailText': room_detail,
            'building_structure': room_info[0],
            'room_parking': room_info[1],
            'room_tatalRooms': room_info[2],
            'room_contract': room_info[3],
            'room_period': room_info[4],
            'room_renewalFeeText': room_info[5],
            'room_otherFeeText': room_info[6],
            'room_guarantyText': room_info[7],
            'room_insuranceText': room_info[8],
            'room_manage': room_info[9],
            'room_state': room_info[10],
            'room_move-in': room_info[11],
            'room_lifullID': room_info[12],
            'room_trans': room_info[13],
            'room_begin': room_begin,
            'room_update': room_update,
            'room_url': url,
        })

    except Exception as e:
        # `Exception` (not a bare except) so that Ctrl-C / kernel interrupt
        # still stops this long-running loop; the failed row is skipped.
        print(f'fail. Url: {url}. Row {i} ({type(e).__name__}: {e})')

df_details = pd.DataFrame(detail_records)

print(df_details.shape)
df_details.to_csv('../../data/temp/df_details_raw.csv', encoding='utf-8-sig', index=False)

In [27]:
# Reload the listing table, attach the detail columns by room URL (inner
# join, so only rooms present in df_details are kept) and save the result.
df_all = pd.read_csv('../../data/temp/df_all_0.csv')
df_test = pd.merge(df_all, df_details, on='room_url', how='inner')
df_test.to_csv('../../data/0320/df_raw.csv', index=0, encoding='utf-8-sig')

(2767, 7)