### ENVIRONMENT SETUP

In [5]:

!pip install playwright
!playwright install chromium
!playwright install-deps chromium
!pip install nest_asyncio

Installing dependencies...
Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-freefont-ttf is already the newest version (20120503-10build1

### SCRAPING

In [8]:
import asyncio
import random
from datetime import datetime
from urllib.parse import urljoin

import nest_asyncio
import pandas as pd
from playwright.async_api import async_playwright

# Patch asyncio via nest_asyncio; per the original author's note this is what
# allows the scraper to run in a Jupyter/Colab environment (where an event
# loop is presumably already running — TODO confirm it is still required).
nest_asyncio.apply()

# Site root, used to build search URLs and to absolutise relative listing links.
_KPC_ROOT = "https://kenyapropertycentre.com"
# Step applied to the `limitstart` query parameter for each successive page.
_PAGE_SIZE = 20
# A page with fewer cards than this is treated as the last page of results.
_MIN_FULL_PAGE = 15
# CSS selector matching one listing card on a search-results page.
_CARD_SELECTOR = ".property-list-item, [itemprop='itemListElement']"


async def _extract_listing(card, neighborhood, scraped_at):
    """Turn one listing card element into a record dict, or None if unusable.

    A card is unusable when it has no title element, no link element, or a
    link element without an ``href`` attribute.
    """
    title_elem = await card.query_selector("h3, h4")
    price_elem = await card.query_selector(".price")
    link_elem = await card.query_selector("a")
    if not (title_elem and link_elem):
        return None

    href = await link_elem.get_attribute("href")
    if not href:
        # An anchor without an href cannot be identified or de-duplicated.
        return None

    title = (await title_elem.inner_text()).strip()
    price = (await price_elem.inner_text()).strip() if price_elem else "N/A"

    return {
        "source": "KPC",
        "neighborhood": neighborhood,
        "title": title,
        "price": price,
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # correctly whether or not they start with a slash.
        "url": urljoin(_KPC_ROOT + "/", href),
        "scraped_at": scraped_at,
    }


async def scrape_neighborhood(page, neighborhood, max_pages=8):
    """Scrape for-sale listings of one neighborhood from Kenya Property Centre.

    Two URL patterns are tried in order; the first one that yields any listing
    cards is paginated and the second is then skipped.

    Args:
        page: A Playwright async ``Page`` (or any object exposing awaitable
            ``goto`` and ``query_selector_all``).
        neighborhood: Human-readable area name, e.g. ``"Spring Valley"``; it is
            lower-cased and hyphenated to form the URL slug.
        max_pages: Maximum number of result pages to request per URL pattern.

    Returns:
        A list of dicts with keys ``source``, ``neighborhood``, ``title``,
        ``price``, ``url`` and ``scraped_at``. URLs are unique within the
        returned list.

    Pagination stops early when a page has no cards, has fewer than
    ``_MIN_FULL_PAGE`` cards, raises during navigation, or contributes no URL
    that has not been seen already (which happens when the site ignores the
    offset parameter and keeps serving the same page).
    """
    listings = []
    seen_urls = set()
    # split/join also collapses repeated or surrounding whitespace.
    slug = "-".join(neighborhood.lower().split())

    # Dual-URL pattern to catch all possible KPC listing structures
    url_patterns = [
        f"{_KPC_ROOT}/for-sale/nairobi/{slug}",
        f"{_KPC_ROOT}/for-sale/{slug}",
    ]
    scraped_at = datetime.now().strftime("%Y-%m-%d")

    print(f"Processing: {neighborhood.upper()}")

    for base_url in url_patterns:
        pattern_matched = False

        for page_num in range(max_pages):
            # NOTE(review): `limitstart` is kept from the original code; if the
            # site paginates with a different parameter every request returns
            # page one, which the "no new listings" guard below detects.
            target_url = f"{base_url}?limitstart={page_num * _PAGE_SIZE}"

            try:
                await page.goto(target_url, wait_until="domcontentloaded", timeout=30000)
                # Randomised pause between page loads.
                await asyncio.sleep(random.uniform(1.5, 3.0))
                cards = await page.query_selector_all(_CARD_SELECTOR)
            except Exception as e:
                print(f"   - Error on page {page_num + 1}: {str(e)[:50]}...")
                break

            if not cards:
                break
            pattern_matched = True

            parsed = added = failed = 0
            for card in cards:
                try:
                    listing = await _extract_listing(card, neighborhood, scraped_at)
                except Exception:
                    # Best effort per card: one broken card must not lose the
                    # page. `Exception` (not a bare except) keeps
                    # KeyboardInterrupt / task cancellation working.
                    failed += 1
                    continue
                if listing is None:
                    continue
                parsed += 1
                if listing["url"] in seen_urls:
                    continue
                seen_urls.add(listing["url"])
                listings.append(listing)
                added += 1

            print(f"   - Page {page_num + 1}: Found {len(cards)} items")
            if failed:
                print(f"     ({failed} cards could not be parsed and were skipped)")

            if parsed and not added:
                # Every listing on this page was already collected.
                print("     No new listings on this page; stopping pagination")
                break
            if len(cards) < _MIN_FULL_PAGE:
                break

        if pattern_matched:
            break

    return listings

async def run_master_scraper():
    """Scrape every configured Nairobi neighborhood and save the results to CSV.

    Launches one headless Chromium page, calls ``scrape_neighborhood`` for each
    area in turn, and after every area that produced listings rewrites
    ``nairobi_master_backup.csv`` with the de-duplicated results so far, so a
    dropped session loses at most the area in progress.

    When at least one listing was collected, the de-duplicated set (by ``url``)
    is written to ``raw_listings_<YYYYMMDD>.csv``; when running in Google Colab
    a browser download of that file is also triggered. Returns ``None``.
    """
    neighborhoods = [
        "Kilimani", "Kileleshwa", "Lavington", "Karen", "Westlands", "Runda", "Muthaiga", "Spring Valley", "Kitisuru",

        "Nairobi West", "South B", "South C", "Madaraka", "Nairobi South", "Parklands", "Ngara", "Pangani", "Woodley", "Mountain View",

        "Ruaka", "Roysambu", "Kasarani", "Embakasi", "Imara Daima", "Nyayo Estate", "Utawala", "Chokaa", "Ruai", "Githurai",

        "Kiambu Road", "Garden Estate", "Thika Road", "Lower Kabete", "Kahawa West", "Kahawa North", "Riruta", "Riruta Satellite",

        "Nairobi Central", "Eastleigh North", "Eastleigh South", "Airbase", "Landi Mawe", "Makadara", "Komarock", "Ngando", "Upper Savannah"
    ]

    all_listings = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox"])
        try:
            context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0")
            page = await context.new_page()

            for area in neighborhoods:
                batch = await scrape_neighborhood(page, area)
                all_listings.extend(batch)

                # Prevents loss if the session times out. Only rewritten when
                # this area added rows; otherwise the file is already current.
                if batch:
                    pd.DataFrame(all_listings).drop_duplicates(subset=['url']).to_csv("nairobi_master_backup.csv", index=False)

                # Fixed pause between neighborhoods.
                await asyncio.sleep(2)
        finally:
            # Close the browser even if a neighborhood raised or the run was
            # interrupted.
            await browser.close()

    if all_listings:
        final_df = pd.DataFrame(all_listings).drop_duplicates(subset=['url'])
        filename = f"raw_listings_{datetime.now().strftime('%Y%m%d')}.csv"
        final_df.to_csv(filename, index=False)
        print(f"\nSCRAPE COMPLETE! Total Unique Properties: {len(final_df)}")
        try:
            # Only importable inside Google Colab; imported lazily so the
            # scraper still completes in other environments.
            from google.colab import files
        except ImportError:
            print(f"Saved {filename} locally (google.colab not available, skipping download).")
        else:
            files.download(filename)
    else:
        print("\n No data collected. Check internet connection or site availability.")

# Execute the master scraper. This uses a top-level `await`, so the cell is
# meant to be run by a notebook/IPython kernel, not as a plain Python script.
await run_master_scraper()

Processing: KILIMANI
   - Page 1: Found 23 items
   - Page 2: Found 23 items
   - Page 3: Found 23 items
   - Page 4: Found 23 items
   - Page 5: Found 23 items
   - Page 6: Found 23 items
   - Page 7: Found 23 items
   - Page 8: Found 23 items
Processing: KILELESHWA
   - Page 1: Found 23 items
   - Page 2: Found 23 items
   - Page 3: Found 23 items
   - Page 4: Found 23 items
   - Page 5: Found 23 items
   - Page 6: Found 23 items
   - Page 7: Found 23 items
   - Page 8: Found 23 items
Processing: LAVINGTON
   - Page 1: Found 23 items
   - Page 2: Found 23 items
   - Page 3: Found 23 items
   - Page 4: Found 23 items
   - Page 5: Found 23 items
   - Page 6: Found 23 items
   - Page 7: Found 23 items
   - Page 8: Found 23 items
Processing: KAREN
   - Page 1: Found 23 items
   - Page 2: Found 23 items
   - Page 3: Found 23 items
   - Page 4: Found 23 items
   - Page 5: Found 23 items
   - Page 6: Found 23 items
   - Page 7: Found 23 items
   - Page 8: Found 23 items
Processing: WESTLAND

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>