In [1]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Create a WebDriver instance
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to one machine's layout;
# when the variable is unset the original Windows install path is used.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:/Tools/chromedriver-win64/chromedriver.exe"
)
print("Initializing Chrome browser...")
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

Initializing Chrome browser...


In [3]:
# Open the listings page that the scraper starts from.
url = "https://cq.esf.fang.com/house-a058-b04846/"
driver.get(url)
print("Page loaded")

Page loaded


In [4]:
# From here on, element lookups made through this driver poll for up to
# 5 seconds before giving up.
# NOTE(review): Selenium's documentation warns against combining an implicit
# wait with explicit WebDriverWait calls (total wait times become
# unpredictable) - consider standardising on one of the two.
driver.implicitly_wait(5)

# Block for at most 10 seconds until a <dl> element (the tag scrape_page reads
# listings from) is present; a failure is reported but does not stop the notebook.
try:
    listing_locator = (By.TAG_NAME, 'dl')
    page_wait = WebDriverWait(driver, 10)
    page_wait.until(EC.presence_of_element_located(listing_locator))
    print("Page loaded successfully")
except Exception as e:
    print(f"Error waiting for page to load: {e}")

Page loaded successfully


In [5]:
# Accumulates one [title, area_info, location, subway_info, price] record per
# scraped listing; scrape_page() appends to it on every call.
data = []


def _field_text(row, selector):
    """Return the stripped text of the first element in ``row`` matching ``selector``.

    ``selector`` is a ``(by, value)`` locator tuple. An empty string is returned
    when nothing matches, so one absent field (for example a listing without a
    subway tag) no longer raises and discards the whole listing.
    """
    matches = row.find_elements(*selector)
    return matches[0].text.strip() if matches else ""


# Function to scrape data from the current page
def scrape_page():
    """Extract every listing on the page currently loaded in ``driver``.

    Each ``dl`` element inside the ``shop_list`` container is treated as one
    listing. For every listing with at least one non-empty field, a record
    ``[title, area_info, location, subway_info, price]`` is appended to the
    module-level ``data`` list, where ``price`` keeps only the digits and dots
    of the raw price text. Fields whose element is missing are stored as ``""``.

    Returns:
        int: number of records appended by this call; ``0`` if the page-level
        lookup fails. Errors are printed, never raised.
    """
    try:
        # Locate rows
        listings_container = driver.find_element(By.CLASS_NAME, 'shop_list')
        rows = listings_container.find_elements(By.TAG_NAME, 'dl')

        # Define selectors
        selectors = {
            'title': (By.CSS_SELECTOR, '.tit_shop'),
            'area': (By.CSS_SELECTOR, '.tel_shop'),
            'location': (By.CSS_SELECTOR, '.add_shop'),
            'subway': (By.CSS_SELECTOR, '.bg_none.icon_dt'),
            'price': (By.CLASS_NAME, 'red')
        }

        print(f"Extracting {len(rows)} listings...")
        success_count = 0

        # Process each listing
        for i, row in enumerate(rows, 1):
            try:
                # Each lookup tolerates a missing element (see _field_text)
                title = _field_text(row, selectors['title'])
                area_info = _field_text(row, selectors['area'])
                location = _field_text(row, selectors['location'])
                subway_info = _field_text(row, selectors['subway'])
                price_info = _field_text(row, selectors['price'])

                # Extract price with decimal points
                price = ''.join(c for c in price_info if c.isdigit() or c == '.')

                # Keep the listing as long as at least one field carries text
                if any([title, area_info, location, subway_info, price_info]):
                    data.append([title, area_info, location, subway_info, price])
                    success_count += 1

                # Print progress every 10 listings
                if i % 10 == 0:
                    print(f"Processed {i}/{len(rows)} listings")

            except Exception as e:
                # Any other per-row failure (e.g. a stale element) skips only that row
                print(f"Error extracting data from row {i}: {e}")

        print(f"Successfully extracted {success_count}/{len(rows)} valid listings")
        return success_count
    except Exception as e:
        print(f"Error extracting data from page: {e}")
        return 0

In [6]:
# Main scraping loop with pagination
def extract_area(area_info):
    """Pull the numeric floor area out of a raw '|'-separated listing description.

    The first segment containing the '㎡' unit is reduced to its digits and
    dots and returned as a string.

    Args:
        area_info: raw text of the listing's description field.

    Returns:
        str: the extracted number (e.g. "85.5"), or "N/A" when the input is
        empty, is already "N/A", has no '㎡' segment, has a '㎡' segment without
        digits, or cannot be processed at all.
    """
    try:
        if not area_info or str(area_info).strip() == "N/A":
            return "N/A"
        # Ensure area_info is of string type
        for part in str(area_info).strip().split('|'):
            if '㎡' in part:
                # Extract the numeric part containing decimal points
                extracted = ''.join(c for c in part if c.isdigit() or c == '.')
                return extracted if extracted else "N/A"
    except Exception:
        return "N/A"
    # No segment carried the '㎡' unit: return the same placeholder as every
    # other failure path instead of falling through to None.
    return "N/A"


try:
    total_pages_to_scrape = 20
    current_page = 1
    output_path = 'Longtousi_sale_data_20pages.parquet'

    # Running total of listings extracted across all pages
    total_listings = 0

    print(f"Starting to scrape {total_pages_to_scrape} pages...")

    while current_page <= total_pages_to_scrape:
        print(f"\nScraping page {current_page}/{total_pages_to_scrape}")

        # Extract data from the current page
        listings_extracted = scrape_page()
        total_listings += listings_extracted

        print(f"Page {current_page} completed. Successfully extracted {listings_extracted} listings.")

        # The last requested page has been scraped; nothing left to navigate to
        if current_page >= total_pages_to_scrape:
            break

        try:
            # Locate and click the next page button
            next_page_button = WebDriverWait(driver, 8).until(
                EC.element_to_be_clickable((By.XPATH, '//div[@class="page_al"]//a[contains(text(), "下一页")]'))
            )
            # Scroll to the button and click
            driver.execute_script("arguments[0].scrollIntoView(false);", next_page_button)
            next_page_button.click()
            # Only count the page once the click went through, so that
            # current_page always equals the number of pages actually visited
            current_page += 1

            # Fixed pause to let the next page load before it is scraped
            time.sleep(3)

        except Exception as e:
            print(f"Error navigating to next page: {e}")
            print("Exiting pagination.")
            break

    # Save the data to a Parquet file
    if data:
        # Convert to DataFrame
        df = pd.DataFrame(data, columns=['Title', 'Area Info', 'Location', 'Subway Info', 'Price'])

        # Apply area extraction to all rows
        df['Area'] = df['Area Info'].apply(extract_area)

        # Save the DataFrame to a Parquet file
        df.to_parquet(output_path, index=False)

        print("\nScraping completed!")
        print(f"Successfully scraped {len(df)} listings from {current_page} pages")
        print(f"Data saved to {output_path}")
    else:
        print("No data was scraped from the website")

except Exception as e:
    print(f"Error during scraping: {e}")
finally:
    # Close the browser session
    print("\nClosing browser...")
    driver.quit()

Starting to scrape 20 pages...

Scraping page 1/20
Extracting 60 listings...
Processed 10/60 listings
Processed 20/60 listings
Processed 30/60 listings
Processed 40/60 listings
Processed 50/60 listings
Processed 60/60 listings
Successfully extracted 60/60 valid listings
Page 1 completed. Successfully extracted 60 listings.

Scraping page 2/20
Extracting 60 listings...
Processed 10/60 listings
Processed 20/60 listings
Processed 30/60 listings
Processed 40/60 listings
Processed 50/60 listings
Processed 60/60 listings
Successfully extracted 60/60 valid listings
Page 2 completed. Successfully extracted 60 listings.

Scraping page 3/20
Extracting 60 listings...
Processed 10/60 listings
Processed 20/60 listings
Processed 30/60 listings
Processed 40/60 listings
Processed 50/60 listings
Processed 60/60 listings
Successfully extracted 60/60 valid listings
Page 3 completed. Successfully extracted 60 listings.

Scraping page 4/20
Extracting 60 listings...
Processed 10/60 listings
Processed 20/60 