In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from datetime import datetime
for dirname, _, filenames in os.walk("./"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

./Project_2.ipynb
./.ipynb_checkpoints/Project_2-checkpoint.ipynb


# 1. Frame the problem
Using the customer description, Define the problem your trying to solve in your own words (remember this is not technial but must be specific so the customer understands the project

In [None]:
"Predict home prices in your specific city or area "

# 2. Get the Data 
Define how you recieved the data (provided, gathered..)

In [20]:
def get_sold_year(soldDate):
    """soldDate may be 'YYYY-MM-DD' or epoch ms/sec; return year or None."""
    if soldDate is None:
        return None
    try:
        iv = int(soldDate)
        # epoch ms vs s
        if iv > 10**12:
            return datetime.utcfromtimestamp(iv/1000.0).year
        if iv > 10**9:
            return datetime.utcfromtimestamp(iv).year
    except Exception:
        pass
    try:
        return datetime.fromisoformat(str(val)[:10]).year
    except Exception:
        return None

def scrape_zillow_data(city='boston', state='ma', max_pages=24, pause=1.5):
    """
    Returns a DataFrame of mostly-raw fields + a derived 'Age' column.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Sec-Ch-Ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    }
    all_props, page_number = [], 1

    while page_number <= max_pages:
        url = f"https://www.zillow.com/{city}-{state}/sold/" if page_number == 1 else f"https://www.zillow.com/{city}-{state}/sold/{page_number}_p/"
        print(f"[{city},{state}] page {page_number}: {url}")
        try:
            r = requests.get(url, headers=headers, timeout=25)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"  request error on page {page_number}: {e}")
            break

        soup = BeautifulSoup(r.content, 'html.parser')
        script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
        if not script_tag:
            print("  __NEXT_DATA__ not found; structure may have changed or throttled.")
            break

        try:
            jd = json.loads(script_tag.string)
            results = jd['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
            if not results:
                print("  no results on this page; stopping this city.")
                break

            page_props = []
            for item in results:
                hdp = item.get('hdpData', {}) or {}
                home = hdp.get('homeInfo', {}) or {}

                year_built = home.get('yearBuilt')
                sold_date_raw = item.get('soldDate') or home.get('dateSold')
                sold_year = get_sold_year(sold_date_raw)
                current_year = datetime.now().year

                age = None
                

                page_props.append({
                    'City': city,
                    'State': state,
                    'Address': item.get('address', 'N/A'),
                    'AIN': home.get('parcelId'),
                    'Sold Price': item.get('soldPrice', item.get('price')),
                    'Bedrooms': item.get('beds'),
                    'Bathrooms': item.get('baths'),
                    'Area (SqFt)': item.get('area'),
                    'Property Type': home.get('homeType', 'N/A'),
                    'Year Built': home.get('yearBuilt'),
                    'Age': age,                       # derived (post-capture)
                    'Sold Date (raw)': sold_date_raw, # keep raw for audit/2yr filter later
                    'Lot Size (val)': home.get('lotAreaValue'),
                    'Lot Size (unit)': home.get('lotAreaUnit'),
                    'HOA Fee': home.get('hoaFee'),
                    'Latitude': (item.get('latLong') or {}).get('latitude'),
                    'Longitude': (item.get('latLong') or {}).get('longitude'),
                    'ZPID': item.get('zpid') or home.get('zpid'),
                    'DetailURL' : item.get('detailUrl') or item.get('hdpUrl')
                })

            all_props.extend(page_props)
            print(f"  +{len(page_props)} this page (total {len(all_props)})")
            page_number += 1
            time.sleep(pause)

        except (KeyError, json.JSONDecodeError) as e:
            print(f"  parse error: {e}")
            break

    return pd.DataFrame(all_props)

def full_scrape(city_state, target_rows=1500, max_pages_per_city=200, pause=2.0):
    frames, total = [], 0
    for (city, state) in city_state:
        if total >= target_rows:
            break
        df = scrape_zillow_data(city=city, 
                                state=state, 
                                max_pages=max_pages_per_city, 
                                pause=pause)
        if not df.empty:
            frames.append(df)
            total += len(df)
            print(f"Accumulate: {total} rows so far")
        time.sleep(pause * 1.5)
    if frames:
        out = pd.concat(frames, ignore_index=True)
        # Remove repeats
        out.drop_duplicates(subset=['Address','Sold Price','Year Built','Area (SqFt)'], inplace=True)
        return out
    return pd.DataFrame()

if __name__ == "__main__":
    # Greater Boston area, gets just under 3k
    Cities = [
        ('boston','ma'),
        ('cambridge','ma'),
        ('somerville','ma')
    ]

    df = full_scrape(Cities, target_rows=2000, max_pages_per_city=250, pause=2.0)

    if not df.empty:

        df_links = df[['Address', 'DetailURL', 'ZPID']].copy()
        df = df.drop(columns=['DetailURL'])
    
        two_years_ago = datetime.now().year - 2
        df['_sold_year'] = df['Sold Date (raw)'].apply(get_sold_year)
        df2 = df[df['_sold_year'].fillna(0) >= two_years_ago].drop(columns=['_sold_year'])
        df2.to_csv('TEST_zillow_sold_greater_boston_last2yrs.csv', index=False)
    else:
        print("\nNo data scraped.")

[boston,ma] page 1: https://www.zillow.com/boston-ma/sold/
  +41 this page (total 41)
[boston,ma] page 2: https://www.zillow.com/boston-ma/sold/2_p/
  +41 this page (total 82)
[boston,ma] page 3: https://www.zillow.com/boston-ma/sold/3_p/
  +41 this page (total 123)
[boston,ma] page 4: https://www.zillow.com/boston-ma/sold/4_p/
  +41 this page (total 164)
[boston,ma] page 5: https://www.zillow.com/boston-ma/sold/5_p/
  +41 this page (total 205)
[boston,ma] page 6: https://www.zillow.com/boston-ma/sold/6_p/
  +41 this page (total 246)
[boston,ma] page 7: https://www.zillow.com/boston-ma/sold/7_p/
  +41 this page (total 287)
[boston,ma] page 8: https://www.zillow.com/boston-ma/sold/8_p/
  +41 this page (total 328)
[boston,ma] page 9: https://www.zillow.com/boston-ma/sold/9_p/
  +41 this page (total 369)
[boston,ma] page 10: https://www.zillow.com/boston-ma/sold/10_p/
  +41 this page (total 410)
[boston,ma] page 11: https://www.zillow.com/boston-ma/sold/11_p/
  +41 this page (total 451)
[

TypeError: unsupported operand type(s) for -: 'datetime.datetime' and 'int'

# 3. Explore the Data
Gain insights into the data you have from step 2, making sure to identify any bias

# 4.Prepare the Data


Apply any data transformations and explain what and why


In [15]:
def price_to_numeric(price_str):
    # Changes Price to integer value
    if price_str is None:
        return None
    if isinstance(price_str, (int, float)):
        return int(price_str)
    s = str(price_str).strip().upper().replace('$','').replace(',','')
    if 'M' in s:
        return int(float(s.replace('M','')) * 1_000_000)
    if 'K' in s:
        return int(float(s.replace('K','')) * 1_000)
    try:
        return int(float(s))
    except Exception:
        return None


# 5. Model the data
Using selected ML models, experment with your choices and describe your findings. Finish by selecting a Model to continue with


# 6. Fine Tune the Model

With the select model descibe the steps taken to acheve the best rusults possiable 


# 7. Present
In a customer faceing Document provide summery of finding and detail approach taken


# 8. Launch the Model System
Define your production run code, This should be self susficent and require only your model pramaters 


In [None]:
def infrence(prams):
    results = m.run(prams)
    return results 