In [None]:
import csv
import random
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
# Request headers sent with every HTTP call so the request looks like it
# comes from a regular desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

def get_property_links(page_url, timeout=30):
    """Return the property links found on one listing page.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can wait indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        A list with the ``href`` of every ``a.js-listing-link`` anchor whose
        URL contains ``bproperty.com``, in document order. An empty list is
        returned when fetching or parsing fails; the error is printed
        instead of raised so the caller can carry on with other pages.
    """
    try:
        response = requests.get(page_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Anchors without an href yield None and are filtered out below.
        hrefs = (
            anchor.get('href')
            for anchor in soup.select('a.js-listing-link')
        )
        return [href for href in hrefs if href and 'bproperty.com' in href]
    except Exception as e:
        # Best-effort crawl: report the failure and return no links.
        print(f"Error fetching page {page_url}: {e}")
        return []

def get_next_page_url(page_url, timeout=30):
    """Return the absolute URL of the next listing page, if there is one.

    Args:
        page_url: URL of the listing page currently being processed.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        The URL taken from the first ``<a>`` inside the first ``div.next``
        element, resolved against ``page_url`` so that a relative link
        (for example ``/buy/.../page-2/``) can be fetched directly.
        ``None`` when there is no such link, the link has no ``href``, or
        the page cannot be fetched or parsed (the error is printed).
    """
    try:
        response = requests.get(page_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        next_page_div = soup.select_one('div.next')
        next_page_a = next_page_div.find('a') if next_page_div else None
        href = next_page_a.get('href') if next_page_a else None
        if not href:
            # urljoin() would return the base URL for an empty href, which
            # would make the caller re-fetch the same page; stop instead.
            return None
        # Absolute hrefs pass through unchanged; relative ones are resolved.
        return urljoin(page_url, href)
    except Exception as e:
        # Best-effort crawl: report the failure and end pagination.
        print(f"Error finding next page from {page_url}: {e}")
        return None

def clean_text(text):
    """Collapse every run of whitespace into one space and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if text:
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def extract_price(price_text):
    """Pull the digits-and-commas amount that follows the taka sign.

    Returns the matched amount (commas kept, e.g. ``"1,200,000"``) when the
    text contains ``৳`` immediately followed by digits/commas. Otherwise the
    stripped input text is returned unchanged, and falsy input gives ``""``.
    """
    if price_text:
        found = re.search(r'৳([\d,]+)', price_text)
        return found.group(1) if found else price_text.strip()
    return ""

def scrape_property_details(property_url, timeout=30):
    """Scrape the details of a single property page.

    Args:
        property_url: URL of the property detail page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A dict with the keys ``title``, ``location``, ``price``, ``url`` and
        ``amenities`` (comma-separated string), plus one key per row of the
        listing-details table (label lower-cased, spaces replaced by
        underscores). ``None`` when the page cannot be fetched or parsed;
        the error is printed instead of raised.
    """
    try:
        response = requests.get(property_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title
        title_element = soup.select_one('h1.Title-pdp-title span')
        title = clean_text(title_element.text) if title_element else ""

        # Location
        location_element = soup.select_one('h3.Title-pdp-address')
        location = ""
        if location_element:
            # Drop the pin-icon span so its text is not part of the location.
            icon_span = location_element.select_one('span.icon-pin')
            if icon_span:
                icon_span.extract()
            location = clean_text(location_element.text)

        # Price
        price_element = soup.select_one('div.Title-pdp-price span.FirstPrice')
        price = extract_price(price_element.text) if price_element else ""

        # Label/value rows of the details table
        details = {}
        for detail in soup.select('div.listing-details div.columns-2'):
            label_element = detail.select_one('div.listing-details-label')
            value_element = detail.select_one('div.last')
            if not (label_element and value_element):
                continue

            # Normalise the label into a column-name style key.
            label_text = label_element.get_text(strip=True)
            label = clean_text(label_text).lower().replace(' ', '_')
            # Strip any leading characters that are not a-z (icon residue).
            label = re.sub(r'^[^a-z]*', '', label)

            details[label] = clean_text(value_element.text)

        # Amenities
        amenities = [
            clean_text(amenity.text)
            for amenity in soup.select(
                'div.listing-amenities-list-item span.listing-amenities-name'
            )
        ]

        property_data = {
            'title': title,
            'location': location,
            'price': price,
            'url': property_url,
            'amenities': ', '.join(amenities)
        }

        # NOTE: a details row whose normalised label equals one of the keys
        # above replaces that value, because update() overwrites.
        property_data.update(details)

        return property_data
    except Exception as e:
        # Best-effort crawl: report the failure and skip this property.
        print(f"Error scraping property {property_url}: {e}")
        return None

def main(base_url="https://www.bproperty.com/buy/dhaka/residential/apartments/",
         csv_file="bproperty_apartments_dhaka.csv",
         max_pages=50):
    """Crawl listing pages, scrape each property and write the rows to CSV.

    Args:
        base_url: First listing page to crawl.
        csv_file: Path of the CSV file to write (UTF-8, overwritten).
        max_pages: Upper bound on the number of listing pages visited.

    The crawl follows "next page" links from ``base_url`` until there is no
    next page or ``max_pages`` pages have been visited. Every distinct
    property link is scraped once; properties that fail to scrape are
    skipped. Columns are the union of all scraped keys, with a fixed set of
    key fields first and the remaining fields in alphabetical order.
    """
    # Collect property links from the listing pages.
    all_property_links = []
    current_page_url = base_url
    page_count = 1

    while current_page_url and page_count <= max_pages:
        print(f"Fetching page {page_count}: {current_page_url}")
        property_links = get_property_links(current_page_url)
        all_property_links.extend(property_links)
        print(f"Found {len(property_links)} properties on page {page_count}")

        current_page_url = get_next_page_url(current_page_url)
        page_count += 1

        # Random delay to avoid rate limiting; pointless after the last page.
        if current_page_url and page_count <= max_pages:
            time.sleep(random.uniform(1, 3))

    # The same listing can appear on several pages; scrape each URL once
    # while keeping first-seen order.
    all_property_links = list(dict.fromkeys(all_property_links))
    print(f"Total properties found: {len(all_property_links)}")

    # Scrape details for each property.
    all_property_data = []
    total_properties = len(all_property_links)

    for i, property_url in enumerate(all_property_links, 1):
        print(f"Scraping property {i}/{total_properties}: {property_url}")
        property_data = scrape_property_details(property_url)

        if property_data:
            all_property_data.append(property_data)

        # Random delay to avoid rate limiting; pointless after the last one.
        if i < total_properties:
            time.sleep(random.uniform(2, 5))

    # Union of every key seen, since detail rows differ between properties.
    all_fields = set()
    for property_data in all_property_data:
        all_fields.update(property_data)

    # Key fields first (in this order, when present), the rest sorted.
    key_fields = ['title', 'location', 'price', 'url', 'bedrooms', 'bathrooms', 'floor_area', 'date', 'builtin_year', 'occupancy_status', 'amenities']
    leading_fields = [field for field in key_fields if field in all_fields]
    field_names = leading_fields + sorted(all_fields.difference(key_fields))

    # Write data to CSV; restval fills fields a property does not have.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=field_names, restval='')
        writer.writeheader()
        writer.writerows(all_property_data)

    print(f"Successfully scraped {len(all_property_data)} properties")
    print(f"Data saved to {csv_file}")

if __name__ == "__main__":
    main()