In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install bs4

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import random
import re

In [4]:
# Sale-listing URL template. "{}" is filled with the page number via str.format,
# so page 1 ends in "i31", page 2 in "i32", and so on.
SECOND_HAND_URL = "https://esf.fang.com/house-a010-b01515/i3{}"
# Output CSV for the sale listings.
# NOTE(review): absolute Windows path, so the notebook is not portable as written —
# confirm the target machine or make this configurable.
SECOND_HAND_CSV = r"D:\pythonai\yizhuang_sell.csv"

# Rental-listing URL template; "{}" is filled with the page number in the same way.
RENTAL_URL = "https://zu.fang.com/house-a010-b01515/i3{}"
# Output CSV for the rental listings (same absolute-path caveat as above).
RENTAL_CSV = r"D:\pythonai\yizhuang_rent.csv"

In [5]:
# HTTP request headers sent with every page fetch.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/128.0.0.0 Safari/537.36"
    ),
    # NOTE(review): these cookie values (including a csrfToken) are hardcoded in the
    # notebook; they presumably came from a browser session and may expire —
    # consider loading them from an environment variable instead of committing them.
    "Cookie": "; ".join((
        "city=www",
        "city.sig=OGYSb1kOr8YVFH0wBEXukpoi1DeOqwvdseB7aTrJ-zE",
        "csrfToken=8VgsPpoSBQ1HMYiz1N5Lf5cu",
        "global_cookie=wpb8xtl8e0x9pqia4hva7kkcl2qmgqb5rtt",
        "__utma=147393320.941255132.1760430996.1760430996.1760430996.1",
        "__utmc=147393320",
        "__utmz=147393320.1760430996.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        "g_sourcepage=esf_fy%5Elb_pc",
        "otherid=379970e2caba3e43e708a017bd4ae66b",
        "unique_cookie=U_wpb8xtl8e0x9pqia4hva7kkcl2qmgqb5rtt*10",
    )),
    "Accept": ",".join((
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/avif",
        "*/*;q=0.8",
    )),
    # Present each request as a navigation from the esf.fang.com home page,
    # intended to make the crawler look less like a bot.
    "Referer": "https://esf.fang.com/",
}

# Number of listing pages to crawl (pages 1..PAGE_COUNT).
PAGE_COUNT = 20


In [6]:
def create_dir(path):
    """Make sure the directory that will contain ``path`` exists.

    Args:
        path: Path of a file about to be written. Only its directory part is used.

    Returns:
        None. Prints a message when a directory is actually created.
    """
    dir_path = os.path.dirname(path)
    if not dir_path:
        # A bare file name lives in the current working directory;
        # os.makedirs("") would raise FileNotFoundError.
        return
    if not os.path.exists(dir_path):
        # exist_ok=True tolerates the directory appearing between the check and the call.
        os.makedirs(dir_path, exist_ok=True)
        print(f"Created folder：{dir_path}")


In [7]:
def parse_second_hand(html):
    """Parse one sale-listing page into ``[area, unit_price]`` rows.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of ``[area, price_per_sqm]`` pairs. Both values are the digit
        strings captured from the page text (not converted to numbers).
        A listing is skipped when either value cannot be found.
    """
    def search_text(node, pattern):
        # A missing node is treated the same as a failed regex search.
        return re.search(pattern, node.get_text(strip=True)) if node else None

    soup = BeautifulSoup(html, "html.parser")
    rows = []
    for listing in soup.select("div.shop_list.shop_list_4 dl.clearfix"):
        # Area: the number directly in front of "㎡" inside p.tel_shop.
        area_hit = search_text(listing.select_one("p.tel_shop"), r"(\d+\.?\d*)㎡")
        # Unit price: the digits directly in front of "元/㎡" inside dd.price_right.
        price_hit = search_text(listing.select_one("dd.price_right"), r"(\d+)元/㎡")

        if not (area_hit and price_hit):
            continue
        rows.append([area_hit.group(1), price_hit.group(1)])
    return rows


def parse_rental(html):
    """Parse one rental-listing page into ``[area, rent]`` rows.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of ``[area, rent]`` pairs. ``area`` is the number captured in
        front of "㎡"; ``rent`` is the stripped text of the ``span.price``
        element. A listing is skipped when either value is missing or empty.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows = []

    # The selector targets the <p> elements whose text carries the area.
    for summary in soup.select("dd.info.rel p.font15.mt12.bold"):
        area_hit = re.search(r"(\d+\.?\d*)㎡", summary.get_text(strip=True))

        # NOTE(review): find_next searches forward through the rest of the document,
        # not just this listing — confirm a listing without its own "moreInfo"
        # block cannot pick up the rent of the following listing.
        rent = None
        price_box = summary.find_next("div", class_="moreInfo")
        if price_box:
            price_tag = price_box.select_one("span.price")
            if price_tag:
                rent = price_tag.get_text(strip=True)

        if area_hit and rent:
            rows.append([area_hit.group(1), rent])
    return rows




In [10]:
def crawl_data(url_template, parse_func, csv_path, page_count, header=None):
    """Crawl ``page_count`` listing pages and store the parsed rows in a CSV file.

    The CSV is recreated on every call: the header row is written first, then the
    rows of each successfully fetched page are appended, so pages already
    crawled are on disk even if a later page fails.

    Args:
        url_template: URL containing one ``{}`` placeholder for the page number.
        parse_func: Callable taking the page HTML and returning a list of rows.
        csv_path: Destination CSV file; its parent directory is created if needed.
        page_count: Number of pages to request, starting from page 1.
        header: Optional list of column names. When omitted, the header is chosen
            from the file name: a name containing "sell" gets the sale columns,
            anything else gets the rental columns.

    Returns:
        None. Progress and a final summary are printed.
    """
    create_dir(csv_path)

    if header is None:
        # Look at the file name only: a directory such as "sell_data" in the
        # path must not turn a rental file into a sale file.
        if "sell" in os.path.basename(csv_path):
            header = ["area_sqm", "unit_price_yuan_per_sqm"]
        else:
            header = ["area_sqm", "rent_yuan_per_month"]

    with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
        csv.writer(f).writerow(header)

    # Pagination crawling
    total_data = 0
    for page in range(1, page_count + 1):
        url = url_template.format(page)    # e.g. page 1 -> "...i31", page 2 -> "...i32"
        try:
            # Random delay before every request to reduce the chance of anti-crawling blocks
            time.sleep(random.uniform(1.5, 3.5))
            response = requests.get(url, headers=HEADERS, timeout=15)
            response.encoding = "utf-8"    # Force UTF-8 decoding to prevent garbled Chinese characters

            # Only a 200 response is parsed; any other status is reported and skipped
            if response.status_code == 200:
                page_data = parse_func(response.text)
                # Append this page's rows right away so they survive later failures
                with open(csv_path, "a", newline="", encoding="utf-8-sig") as f:
                    csv.writer(f).writerows(page_data)
                # Count rows only once they have been written
                total_data += len(page_data)
                print(f" The crawling of page {page:2d} is completed | New {len(page_data)} data entries have been added | The total number of {total_data} entries has been accumulated")
            else:
                print(f" {page:2d} page request failed |  Status code: {response.status_code}")

        except Exception as e:
            # Deliberately best-effort: one failing page must not abort the whole crawl
            print(f" Error was retrieved on page {page:2d} | Error message:{str(e)[:50]}")
            time.sleep(5)  # Back off longer after an error to avoid repeatedly triggering anti-crawling

    print(f"\n The crawling has ended | Crawl the {page_count} page in total | A total of {total_data} pieces of data | Storage path:{csv_path}\n")




In [11]:
if __name__ == "__main__":
    rule = "=" * 30

    # Sale listings first, then rentals; each crawl writes to its own CSV file.
    print(f"\n{rule} Start crawling the data of house sales {rule}")
    crawl_data(SECOND_HAND_URL, parse_second_hand, SECOND_HAND_CSV, PAGE_COUNT)

    print(f"{rule} Start crawling rental data {rule}")
    crawl_data(RENTAL_URL, parse_rental, RENTAL_CSV, PAGE_COUNT)

    print("All crawling tasks have been completed! The data has been saved to the folder D:/pythonai.")


 The crawling of page  1 is completed | New 60 data entries have been added | The total number of 60 entries has been accumulated
 The crawling of page  2 is completed | New 60 data entries have been added | The total number of 120 entries has been accumulated
 The crawling of page  3 is completed | New 60 data entries have been added | The total number of 180 entries has been accumulated
 The crawling of page  4 is completed | New 60 data entries have been added | The total number of 240 entries has been accumulated
 The crawling of page  5 is completed | New 60 data entries have been added | The total number of 300 entries has been accumulated
 The crawling of page  6 is completed | New 60 data entries have been added | The total number of 360 entries has been accumulated
 The crawling of page  7 is completed | New 60 data entries have been added | The total number of 420 entries has been accumulated
 The crawling of page  8 is completed | New 60 data entries have been added | The t