## homework 3-2: data mining

### prepare browser

In [None]:
%pip install selenium 

In [2]:
import time
import re

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [3]:
def setup_driver():
    """Start an Edge WebDriver session configured for scraping.

    Returns:
        The ``webdriver.Edge`` instance on success, or ``None`` when the
        browser could not be started (the reason is printed).
    """
    # Flags that improve browser stability/compatibility, plus a desktop
    # Edge user-agent string.
    launch_flags = (
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0',
    )

    browser_options = Options()
    for flag in launch_flags:
        browser_options.add_argument(flag)

    try:
        return webdriver.Edge(options=browser_options)
    except Exception as e:
        print(f"Failed to auto-locate EdgeDriver: {e}")
        print("Please specify EdgeDriver path manually")
        return None


### prepare data-parsing functions

In [4]:
def parse_sell_data(driver, file_name):
    """Parse the sales listing page loaded in ``driver`` and append CSV rows.

    Every ``dl.clearfix`` element is treated as one listing. Area, room
    layout, floor, total floors, direction, address and unit price are
    extracted with regular expressions; listings are de-duplicated within
    the page, and each kept listing is appended to ``file_name`` as an
    ``area_sqm,unit_price_yuan_per_sqm`` line.

    Args:
        driver: Selenium WebDriver whose current page is a sales listing page.
        file_name: Path of the CSV file to append to.

    Returns:
        int: Number of rows appended for this page. 0 means the page yielded
        no usable data (or the rows could not be written).
    """
    floor_labels = {"低层": "low", "中层": "medium", "高层": "high"}
    compass = str.maketrans("南北东西", "SNEW")

    rows = []      # CSV lines for this page, written in one go at the end
    seen = set()   # de-duplication keys; scope is this page only

    try:
        # One <dl class="clearfix"> per listing.
        for dl in driver.find_elements(By.CSS_SELECTOR, "dl.clearfix"):
            # ------------------- House details -------------------
            try:
                tel_text = dl.find_element(By.CSS_SELECTOR, "p.tel_shop").text.strip()
            except NoSuchElementException:
                # With empty text every pattern below misses, leaving all
                # detail fields as "".
                tel_text = ""

            area_match = re.search(r"([\d.]+)\s*㎡", tel_text)
            area_sqm = area_match.group(1) if area_match else ""

            # Room layout (rooms and halls), e.g. "3室2厅".
            room_match = re.search(r"(\d+室\d+厅)", tel_text)
            room = room_match.group(1) if room_match else ""

            # Floor: a low/medium/high label or a numeric floor. The
            # lookbehind stops the numeric alternative from matching the
            # digits of "共N层" (total floors), which is captured separately.
            floor_match = re.search(r"(低层|中层|高层|(?<![共\d])\d+层)", tel_text)
            floor_raw = floor_match.group(1) if floor_match else ""
            floor = floor_labels.get(floor_raw, floor_raw)  # numeric kept as-is

            total_floor_match = re.search(r"共(\d+)层", tel_text)
            total_floor = total_floor_match.group(1) if total_floor_match else ""

            # Direction: keep only 东南西北 characters and map them to E/S/W/N.
            direction = "".join(re.findall(r"[东南西北]", tel_text)).translate(compass)

            # ------------------- Address -------------------
            try:
                addr_text = (
                    dl.find_element(By.CSS_SELECTOR, "p.add_shop")
                    .text.strip()
                    .replace("\n", " ")
                )
            except NoSuchElementException:
                addr_text = ""

            # ------------------- Unit price -------------------
            try:
                price_text = dl.find_element(
                    By.CSS_SELECTOR, "dd.price_right span:last-child"
                ).text.strip()
            except NoSuchElementException:
                price_text = ""
            price_match = re.search(r"([\d,\.]+)\s*元/㎡", price_text)
            unit_price_yuan_per_sqm = (
                price_match.group(1).replace(",", "") if price_match else ""
            )

            # Keep only listings with both area and price, once per page.
            key = (room, area_sqm, floor, total_floor, direction, addr_text)
            if area_sqm and unit_price_yuan_per_sqm and key not in seen:
                seen.add(key)
                rows.append(f"{area_sqm},{unit_price_yuan_per_sqm}\n")

    except Exception as e:
        # Best effort: report the problem and keep whatever was parsed so far.
        print(f"Error parsing sales data: {e}")

    if rows:
        try:
            # Single append per page instead of reopening the file per record.
            with open(file_name, 'a', encoding='utf-8') as f:
                f.writelines(rows)
        except OSError as e:
            print(f"Error parsing sales data: {e}")
            return 0

    return len(rows)

In [5]:
def parse_rent_data(driver, file_name):
    """Parse the rental listing page loaded in ``driver`` and append CSV rows.

    Every ``dl.list.hiddenMap.rel`` element is treated as one listing. Area,
    room layout, direction, address and monthly rent are extracted; listings
    are de-duplicated within the page, and each kept listing is appended to
    ``file_name`` as an ``area_sqm,rent_yuan_per_month`` line.

    Args:
        driver: Selenium WebDriver whose current page is a rental listing page.
        file_name: Path of the CSV file to append to.

    Returns:
        int: Number of rows appended for this page. 0 means the page yielded
        no usable data (or the rows could not be written).
    """
    compass = str.maketrans("南北东西", "SNEW")

    rows = []      # CSV lines for this page, written in one go at the end
    seen = set()   # de-duplication keys; scope is this page only

    try:
        # One matching <dl> per rental listing.
        for dl in driver.find_elements(By.CSS_SELECTOR, "dl.list.hiddenMap.rel"):
            # ------------------- House details -------------------
            try:
                summary = dl.find_element(By.CSS_SELECTOR, "p.font15.mt12.bold").text.strip()
            except NoSuchElementException:
                # With empty text every pattern below misses, leaving the
                # detail fields as "".
                summary = ""

            area_match = re.search(r'([\d.]+)\s*㎡', summary)
            area_sqm = area_match.group(1) if area_match else ""

            # Room layout, e.g. "3室2厅".
            room_match = re.search(r'(\d+室\d+厅)', summary)
            room = room_match.group(1) if room_match else ""

            # Direction: keep only 东南西北 characters and map them to E/S/W/N.
            direction = "".join(re.findall(r"[东南西北]", summary)).translate(compass)

            # ------------------- Address -------------------
            addr_text = ""
            try:
                addr_elem = dl.find_element(By.CSS_SELECTOR, "p.gray6.mt12")
                anchors = addr_elem.find_elements(By.TAG_NAME, "a")
                if anchors:
                    # Prefer the link texts, skipping empty ones.
                    parts = [t for t in (a.text.strip() for a in anchors) if t]
                else:
                    # Fallback: use the raw text, drop parentheses, then
                    # split on runs of '-', '>' or '|'.
                    raw = re.sub(r'[\(\)（）]', '', addr_elem.text.strip())
                    parts = [p.strip() for p in re.split(r'[->|]+', raw) if p.strip()]
                addr_text = "_".join(parts)
            except NoSuchElementException:
                addr_text = ""

            # ------------------- Monthly rent -------------------
            rent_yuan_per_month = ""
            try:
                price_text = dl.find_element(
                    By.CSS_SELECTOR, "div.moreInfo p.mt5.alingC span.price"
                ).text.strip()
                # Require a leading digit so stray punctuation before the
                # number cannot be taken as the price.
                price_match = re.search(r'\d[\d,.]*', price_text)
                if price_match:
                    rent_yuan_per_month = price_match.group(0).replace(",", "")
            except NoSuchElementException:
                rent_yuan_per_month = ""

            # Keep only listings with both area and rent, once per page.
            key = (room, area_sqm, direction, addr_text)
            if area_sqm and rent_yuan_per_month and key not in seen:
                seen.add(key)
                rows.append(f"{area_sqm},{rent_yuan_per_month}\n")

    except Exception as e:
        # Best effort: report the problem and keep whatever was parsed so far.
        print(f"Error parsing rental data: {e}")

    if rows:
        try:
            # Single append per page instead of reopening the file per record.
            with open(file_name, 'a', encoding='utf-8') as f:
                f.writelines(rows)
        except OSError as e:
            print(f"Error parsing rental data: {e}")
            return 0

    return len(rows)

In [None]:
def fetch_data(data_type, max_pages=20, max_empty_pages=3):
    """Scrape paginated listing data with Selenium into a CSV file.

    The CSV file is recreated with a header row, then pages ``1..max_pages``
    are visited in order and handed to ``parse_sell_data`` or
    ``parse_rent_data``. Scraping stops early on a 404 page, on a redirect
    away from the listing URL, or after ``max_empty_pages`` consecutive pages
    without usable data. A page that fails to load is retried; from the
    second consecutive failure on, the user is prompted to change the system
    proxy before the retry.

    Args:
        data_type: ``"sell"`` for sales listings or ``"rent"`` for rentals.
        max_pages: Highest page number to visit (default 20).
        max_empty_pages: Number of consecutive empty pages that ends the run
            (default 3).

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # data_type -> (listing URL prefix, output file, CSV header line)
    targets = {
        "sell": (
            "https://lf.esf.fang.com/house-a014822/i3",
            "yanjiao_sell.csv",
            "area_sqm,unit_price_yuan_per_sqm\n",
        ),
        "rent": (
            "https://lf.zu.fang.com/house-a014822/i3",
            "yanjiao_rent.csv",
            "area_sqm,rent_yuan_per_month\n",
        ),
    }

    # Validate before launching a browser so bad input costs nothing.
    target = targets.get(data_type) if isinstance(data_type, str) else None
    if target is None:
        print("Unexpected Input!")
        return
    base_url, file_name, header = target

    driver = setup_driver()
    if not driver:
        print("Cannot start Edge browser, please check EdgeDriver configuration")
        return

    try:
        # Start from a fresh file containing only the header row.
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(header)

        parse_page = parse_sell_data if data_type == "sell" else parse_rent_data

        start_num = 1
        failure_time = 0       # consecutive load failures for the current page
        empty_page_count = 0   # consecutive pages with no usable rows

        while start_num <= max_pages and empty_page_count < max_empty_pages:
            url = f"{base_url}{start_num}"
            print(f"Visiting: {url}")

            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                if "404" in driver.title or "Page Not Found" in driver.page_source or "页面不存在" in driver.page_source:
                    print("Info: 404 page, stopping scraping")
                    break

                # A redirect away from the listing URL implies no more data.
                current_url = driver.current_url
                if base_url not in current_url:
                    print(f"Redirect detected: {current_url}, stopping scraping")
                    break

            # Likely anti-bot blocking: back off, and on repeated failure ask
            # the user to switch the system proxy before retrying this page.
            except Exception as e:
                failure_time += 1
                print(f"Page load failed: {e}")
                time.sleep(6)
                if failure_time >= 2:
                    print("\a")  # terminal bell to get the user's attention
                    time.sleep(3)
                    print("\a")
                    print("Current failure index: ", start_num)
                    input("Press Enter after changing system proxy: ")
                continue  # retry the same page number

            data_count = parse_page(driver, file_name)

            if data_count == 0:
                empty_page_count += 1
                print(f"Page {start_num} got no valid data, empty count: {empty_page_count}")
            else:
                empty_page_count = 0
                print(f"Page {start_num} got {data_count} records")

            if empty_page_count >= max_empty_pages:
                print(f"Reached {max_empty_pages} consecutive empty pages, stopping scraping")
                break

            failure_time = 0
            start_num += 1
            time.sleep(3)  # pause between page requests

    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        # Always release the browser, even after an error or early stop.
        driver.quit()


### fetch data

In [None]:
# Scrape the sales listings. fetch_data reports its own errors via print and
# returns nothing, so the final message marks the end of the run rather than
# confirming success.
print("Start scraping sales data...")
fetch_data("sell")
print("Sell data is completed!")

In [None]:
# Scrape the rental listings. fetch_data reports its own errors via print and
# returns nothing, so the final message marks the end of the run rather than
# confirming success.
print("Start scraping rental data...")
fetch_data("rent")
print("Rental data is completed!")