In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from time import sleep
import pandas as pd
from io import StringIO
from functools import partial
import matplotlib.pyplot as plt
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common import NoSuchElementException,StaleElementReferenceException
from matplotlib import style
from webdriver_manager.chrome import ChromeDriverManager
import re

# Initialize Edge driver.
# Everything below needs a live browser session, so any failure here is fatal:
# report it and stop instead of continuing without a `driver`.
try:
    driver = webdriver.Edge()
    print("Edge WebDriver initialized successfully")
except Exception as e:
    print(f"Edge WebDriver initialization failed: {e}")
    # `exit()` is a helper meant for the interactive shell and, called without an
    # argument, reports success (status 0). Raise SystemExit with a non-zero
    # status so the failure is visible to whatever launched this code.
    raise SystemExit(1) from e

# Locations to scrape. The sale and rent URL lists are index-aligned with this
# list: entry k of each list belongs to location k.
location_list = ['Xisanqi', 'Qinghe']

# Path segment of the listing URL that differs between the two locations
area_slugs = ('a015277-b02314', 'a015277-b02315')

# Listing pages for sales (esf.fang.com) and rentals (zu.fang.com)
url_price_list = [f'https://esf.fang.com/house-{slug}/i31/' for slug in area_slugs]
url_rent_list = [f'https://zu.fang.com/house-{slug}/' for slug in area_slugs]

# Upper bound on the number of result pages visited per location
Num_Pages = 20


# Scrape price info
# For each location: walk up to Num_Pages result pages, collect one
# [m2, price, location] row per listing, and write them to "<location>_Price.csv".

# The listing text gives the floor area as "<number>㎡"; compile the pattern once.
area_pattern = re.compile(r'(\d+(?:\.\d+)?)㎡')

for location, url in zip(location_list, url_price_list):
    # Open the first results page for this location
    driver.get(url)

    # Rows collected across all visited pages
    data = []

    i = 0  # number of pages scraped so far
    while i < Num_Pages:
        # Find the container with house listings. find_elements returns an empty
        # list instead of raising, so a page without the container ends the
        # pagination for this location while the rows gathered so far are still
        # written to the CSV below.
        containers = driver.find_elements(By.CLASS_NAME, 'shop_list')
        if not containers:
            print(f"Listing container not found on page {i + 1}; stopping")
            break
        shop_list = containers[0]

        # Extract all house items
        house_items = shop_list.find_elements(By.TAG_NAME, 'dl')

        for item in house_items:
            try:
                # Extract area (m2); 'N/A' when the text has no "<number>㎡" part
                tel_shop = item.find_element(By.CLASS_NAME, 'tel_shop').text
                m2_match = area_pattern.search(tel_shop)
                m2 = m2_match.group(1) if m2_match else 'N/A'

                # Extract price info
                price_right = item.find_element(By.CLASS_NAME, 'price_right')
                price = price_right.find_element(By.CSS_SELECTOR, 'span.red b').text

                # Organize extracted data into list
                data.append([m2, price, location])

            except Exception as e:
                # Best effort: report and skip any <dl> that lacks the expected fields
                print(f"Error extracting data: {e}")
                continue

        i += 1

        # Once the page limit is reached, stop here: clicking "next" again would
        # load a page that is never scraped.
        if i >= Num_Pages:
            print(f"Finished page {i}. Reached page limit.")
            break

        # Click next page button
        try:
            next_page_element = None

            # The "next page" link sits inside an element of class 'last' whose
            # text contains "下一页"
            for last_element in driver.find_elements(By.CLASS_NAME, 'last'):
                if "下一页" in last_element.text:
                    next_page_element = last_element.find_element(By.TAG_NAME, 'a')
                    break

            if next_page_element is None:
                print("Next page link not found")
                break

            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_page_element)
            sleep(0.5)  # Wait for page load
            print(f"Finished page {i}. Moving to next page...")

        except Exception as e:
            print(f"No more pages or error finding next page: {e}")
            break

    # Convert data to dataframe
    columns = ['m2', 'price', 'location']
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV file (utf-8-sig writes a BOM at the start of the file)
    df.to_csv(f'{location}_Price.csv', index=False, encoding='utf-8-sig')
    print(f"Successfully scraped {len(data)} records")


# Scrape rent info
# For each location: walk up to Num_Pages result pages, collect one
# [m2, rent, location] row per listing, and write them to "<location>_Rent.csv".

# The listing text gives the floor area as "<number>㎡"; compile the pattern once.
rent_area_pattern = re.compile(r'(\d+(?:\.\d+)?)㎡')

for location, url in zip(location_list, url_rent_list):
    # Open the first results page for this location
    driver.get(url)

    # Rows collected across all visited pages
    data = []

    i = 0  # number of pages scraped so far
    while i < Num_Pages:
        # Find the container with house listings. find_elements returns an empty
        # list instead of raising, so a page without the container ends the
        # pagination for this location while the rows gathered so far are still
        # written to the CSV below.
        containers = driver.find_elements(By.CLASS_NAME, 'houseList')
        if not containers:
            print(f"Listing container not found on page {i + 1}; stopping")
            break
        houseList = containers[0]

        # Extract all house items
        house_items = houseList.find_elements(By.TAG_NAME, 'dl')

        for item in house_items:
            try:
                # Extract area (m2); 'N/A' when the text has no "<number>㎡" part
                info = item.find_element(By.CSS_SELECTOR, '.font15.mt12.bold').text
                m2_match = rent_area_pattern.search(info)
                m2 = m2_match.group(1) if m2_match else 'N/A'

                # Extract rent info
                rent = item.find_element(By.CLASS_NAME, 'price').text

                # Organize extracted data into list
                data.append([m2, rent, location])

            except Exception as e:
                # Best effort: report and skip any <dl> that lacks the expected fields
                print(f"Error extracting data: {e}")
                continue

        i += 1

        # Once the page limit is reached, stop here: clicking "next" again would
        # load a page that is never scraped.
        if i >= Num_Pages:
            print(f"Finished page {i}. Reached page limit.")
            break

        # Click next page button
        try:
            # Pagination links live inside the element of class 'fanye'; the
            # "next page" link is the one whose text is exactly "下一页"
            fanye = driver.find_element(By.CLASS_NAME, 'fanye')
            next_page_element = None

            for link in fanye.find_elements(By.TAG_NAME, 'a'):
                if link.text == "下一页":
                    next_page_element = link
                    break

            if next_page_element is None:
                print("Next page link not found")
                break

            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", next_page_element)
            sleep(0.5)  # Wait for page load
            print(f"Finished page {i}. Moving to next page...")

        except Exception as e:
            print(f"No more pages or error finding next page: {e}")
            break

    # Convert data to dataframe
    columns = ['m2', 'rent', 'location']
    df = pd.DataFrame(data, columns=columns)

    # Save to CSV file (utf-8-sig writes a BOM at the start of the file)
    df.to_csv(f'{location}_Rent.csv', index=False, encoding='utf-8-sig')
    print(f"Successfully scraped {len(data)} records")


# Close browser once both scraping passes are done.
# NOTE(review): this line is only reached when nothing above raised; an uncaught
# error earlier leaves the browser session open.
driver.quit()

Edge WebDriver initialized successfully
Finished page 1. Moving to next page...
Finished page 2. Moving to next page...
Finished page 3. Moving to next page...
Finished page 4. Moving to next page...
Finished page 5. Moving to next page...
Finished page 6. Moving to next page...
Finished page 7. Moving to next page...
Finished page 8. Moving to next page...
Finished page 9. Moving to next page...
Finished page 10. Moving to next page...
Finished page 11. Moving to next page...
Finished page 12. Moving to next page...
Finished page 13. Moving to next page...
Finished page 14. Moving to next page...
Finished page 15. Moving to next page...
Finished page 16. Moving to next page...
Finished page 17. Moving to next page...
Finished page 18. Moving to next page...
Finished page 19. Moving to next page...
Finished page 20. Moving to next page...
Successfully scraped 1200 records
Finished page 1. Moving to next page...
Finished page 2. Moving to next page...
Finished page 3. Moving to next pag