# PART 1 - PREPROCESSING

In [2]:
import sys
# Show which interpreter backs this kernel so the install target below is unambiguous.
print("Using interpreter:", sys.executable)

# Install into THIS exact interpreter: pip is invoked as a module of
# sys.executable (the path printed above) rather than as a bare `pip` command.
# NOTE(review): no versions are pinned here, so a later re-run may resolve
# different releases than the ones recorded in this cell's output.
!{sys.executable} -m pip install pandas numpy scikit-learn matplotlib seaborn

Using interpreter: /opt/homebrew/opt/python@3.10/bin/python3.10
Collecting pandas
  Using cached pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp310-cp310-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Col

In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed value. It is not referenced by any cell visible in this part of the
# notebook; presumably later stochastic steps consume it (TODO confirm).
RANDOM_STATE = 42

 ## Load all four city datasets

In [4]:
# City label -> listings CSV. Paths are relative, so they resolve against the
# notebook's current working directory.
data_paths = dict(
    [
        ("Portland", "portland_listings.csv"),
        # Spelling kept exactly as in the original cell, which notes that the
        # file on disk is named this way.
        ("Pacific Grove", "pacificigrove_listings.csv"),
        ("Albany", "albany_listing.csv"),
        ("Bozeman", "bozeman_listings.csv"),
    ]
)

# Unmodified frames as read from disk, keyed by city label.
raw_data = {}

for city, path in data_paths.items():
    df = pd.read_csv(path)
    raw_data[city] = df
    # One summary line per city so row/column counts can be compared at a glance.
    print(f"{city}: {len(df)} rows, {len(df.columns)} columns")

Portland: 4425 rows, 79 columns
Pacific Grove: 300 rows, 79 columns
Albany: 461 rows, 79 columns
Bozeman: 580 rows, 79 columns


 ## Define the required numeric columns

In [5]:
# Columns selected from every city's listings frame by preprocess_city.
# `price` and `bathrooms_text` are passed through clean_price / clean_bathrooms
# there before the whole selection is cast to float.
numeric_columns = [
    # price and capacity
    "price",
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms_text",
    # review scores
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    # activity and booking constraints
    "number_of_reviews",
    "availability_365",
    "minimum_nights",
    "maximum_nights",
]

## Create helper functions to clean messy fields

In [6]:
def clean_price(series):
    """Convert a column of price strings such as ``"$1,234.00"`` to floats.

    Dollar signs and thousands separators are stripped before parsing.
    Missing values, and any token that still cannot be read as a number
    (for example ``""``, ``"None"`` or ``"N/A"``), become ``NaN`` instead of
    raising, which matches how ``clean_bathrooms`` treats unparseable input.

    Parameters
    ----------
    series : pandas.Series
        Raw price values (strings, numbers, or missing).

    Returns
    -------
    pandas.Series
        Float series with the same index as ``series``.
    """
    stripped = (
        series.astype(str)
        .str.replace(r"[\$,]", "", regex=True)
        .str.strip()
    )
    # errors="coerce" turns "nan", "None", "" and any other non-numeric token
    # into NaN in one pass; the final cast guarantees a float dtype even when
    # every parsed value happens to be an integer.
    return pd.to_numeric(stripped, errors="coerce").astype(float)


def clean_bathrooms(series):
    """Convert free-text bathroom descriptions to a numeric bathroom count.

    The leading token is parsed as a number (``"1.5 shared baths"`` -> 1.5).
    When the text does not start with a number but mentions "half"
    (``"Half-bath"``, ``"Shared half-bath"``, ``"Private half-bath"``), the
    value is 0.5. Missing, empty, or otherwise unparseable entries map to NaN.

    Parameters
    ----------
    series : pandas.Series
        Raw bathroom descriptions (strings, numbers, or missing).

    Returns
    -------
    pandas.Series
        Result of mapping each element to a float or NaN.
    """
    def to_float(x):
        if pd.isna(x):
            return np.nan
        text = str(x).lower().strip()
        tokens = text.split()
        # Empty / whitespace-only text has no tokens; indexing [0] would raise.
        if not tokens:
            return np.nan
        try:
            return float(tokens[0])
        except ValueError:
            # No leading number: a "half" anywhere in the text means a half
            # bath, even when prefixed by a qualifier such as "shared".
            return 0.5 if "half" in text else np.nan
    return series.map(to_float)


## Build the preprocessing function

In [9]:
def preprocess_city(df, city_name, lower_q=0.01, upper_q=0.99):
    """Clean, impute, clip and standardise one city's listings.

    Steps, in order:
      1. select the columns named in ``numeric_columns``;
      2. parse ``price`` and ``bathrooms_text`` with ``clean_price`` /
         ``clean_bathrooms`` and cast everything to float;
      3. fill missing values with each column's median;
      4. clip every column to its ``[lower_q, upper_q]`` quantile range;
      5. fit a fresh ``StandardScaler`` on the result and transform it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings for one city; must contain every column in
        ``numeric_columns``.
    city_name : str
        Label used in the progress message and in error messages.
    lower_q, upper_q : float, optional
        Quantiles (between 0 and 1) used as clipping bounds. The defaults
        reproduce the original 1st-99th percentile clipping.

    Returns
    -------
    tuple
        ``(sub, X_scaled, scaler)``: the cleaned and clipped DataFrame, the
        output of ``scaler.fit_transform(sub)``, and the fitted scaler.

    Raises
    ------
    ValueError
        If the quantile bounds are not ordered within ``[0, 1]``.
    KeyError
        If ``df`` lacks any of the required columns.

    Notes
    -----
    A new scaler is fitted on every call, so scaled values are relative to the
    given frame only. Quantile bounds are interpolated, so clipped count-like
    columns can end up holding non-integer values.
    """
    print(f"\n Preprocessing: {city_name} ")

    if not 0.0 <= lower_q <= upper_q <= 1.0:
        raise ValueError(
            f"Clip quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q}, upper_q={upper_q}"
        )

    # Fail early with a message that says which city and which columns are
    # the problem, instead of pandas' generic "not in index" error.
    missing = [col for col in numeric_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{city_name}: missing required columns {missing}")

    # Keep only required columns (copy so the caller's frame is not modified)
    sub = df[numeric_columns].copy()

    # Clean special fields
    sub["price"] = clean_price(sub["price"])
    sub["bathrooms_text"] = clean_bathrooms(sub["bathrooms_text"])

    # Convert to float
    sub = sub.astype(float)

    # Fill missing with column medians
    sub = sub.fillna(sub.median())

    # Clip outliers column by column; bounds are computed after imputation
    lower = sub.quantile(lower_q)
    upper = sub.quantile(upper_q)
    sub = sub.clip(lower=lower, upper=upper, axis=1)

    # Standard scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(sub)

    return sub, X_scaled, scaler

## Run preprocessing for all four cities


In [10]:
# Per-city results, all keyed by the same city labels as raw_data:
#   city_processed -> first value returned by preprocess_city (cleaned frame)
#   city_scaled    -> second value (scaled features)
#   city_scalers   -> third value (the scaler object)
city_processed, city_scaled, city_scalers = {}, {}, {}

for city, df in raw_data.items():
    cleaned, scaled, scaler = preprocess_city(df, city)
    city_processed[city], city_scaled[city], city_scalers[city] = (
        cleaned,
        scaled,
        scaler,
    )


 Preprocessing: Portland 

 Preprocessing: Pacific Grove 

 Preprocessing: Albany 

 Preprocessing: Bozeman 


In [11]:
# Preview the first rows of each city's cleaned (unscaled) frame under its label.
for city, processed_frame in city_processed.items():
    print(city)
    display(processed_frame.head())

Portland


Unnamed: 0,price,accommodates,bedrooms,beds,bathrooms_text,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,number_of_reviews,availability_365,minimum_nights,maximum_nights
0,159.0,2.0,1.0,1.0,1.0,4.89,4.87,4.87,4.84,4.79,4.95,4.69,134.0,339.0,30.0,730.0
1,130.0,2.0,1.0,1.0,1.0,5.0,5.0,5.0,4.92,5.0,5.0,4.92,25.0,214.0,30.0,300.0
2,354.0,10.0,4.0,5.0,2.0,4.92,4.94,4.99,4.95,4.97,4.79,4.82,80.0,317.0,4.0,365.0
3,76.0,3.0,1.0,2.0,1.0,4.84,4.82,4.79,4.95,4.95,4.98,4.86,138.0,361.0,30.0,365.0
4,81.0,1.0,1.0,1.0,3.5,4.52,4.58,4.36,4.83,4.73,4.84,4.46,68.0,335.0,30.0,180.0


Pacific Grove


Unnamed: 0,price,accommodates,bedrooms,beds,bathrooms_text,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,number_of_reviews,availability_365,minimum_nights,maximum_nights
0,279.0,2.0,1.0,1.0,1.0,4.89,4.85,4.82,4.88,4.85,4.97,4.58,62.0,83.0,30.0,120.0
1,1024.21,12.0,5.0,7.0,4.0,4.94,4.92,4.96,4.97,4.95,4.98,4.78,121.0,162.0,5.0,1125.0
2,225.0,2.0,1.0,1.0,1.0,4.96,4.96,4.96,4.96,4.96,4.97,4.93,807.63,267.0,30.0,90.0
3,200.0,3.0,1.0,1.0,1.0,4.91,4.88,4.96,4.97,4.97,4.86,4.8,807.63,90.0,1.0,300.0
4,128.0,3.0,2.0,2.0,1.0,4.71,4.47,4.94,4.88,5.0,5.0,4.65,17.0,75.0,30.0,1125.0


Albany


Unnamed: 0,price,accommodates,bedrooms,beds,bathrooms_text,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,number_of_reviews,availability_365,minimum_nights,maximum_nights
0,93.0,4.0,2.0,2.0,1.0,3.56,3.44,3.56,4.22,4.56,3.22,3.67,9.0,188.0,28.0,1125.0
1,91.0,3.0,2.0,1.0,1.0,4.74,4.88,4.87,4.85,4.81,4.81,4.77,311.0,354.0,2.0,1125.0
2,64.0,2.0,0.0,2.0,1.0,4.52,4.62,4.45,4.81,4.88,4.76,4.64,373.0,12.0,2.0,45.0
3,95.0,2.0,1.0,1.0,1.0,4.73,4.83,4.71,4.83,4.7,4.8,4.72,332.0,335.0,2.0,1125.0
4,89.0,4.0,1.0,2.0,1.0,4.79,4.86,4.82,4.95,4.93,4.87,4.77,563.0,246.0,1.0,1125.0


Bozeman


Unnamed: 0,price,accommodates,bedrooms,beds,bathrooms_text,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,number_of_reviews,availability_365,minimum_nights,maximum_nights
0,96.0,2.0,1.0,1.0,1.0,4.98,4.98,4.95,4.97,4.99,4.93,4.94,606.0,259.0,3.0,1125.0
1,172.0,4.0,1.0,2.0,1.5,4.8,4.89,4.84,4.95,4.98,4.98,4.7,44.0,350.0,2.0,1125.0
2,200.0,2.0,1.0,1.0,1.0,4.96,5.0,5.0,4.98,4.98,5.0,4.71,52.0,364.0,3.0,7.0
3,150.0,6.0,3.0,3.0,2.0,4.78,4.87,4.76,4.91,4.82,4.89,4.73,344.0,317.0,3.0,31.0
4,489.0,4.0,2.0,2.0,1.0,4.97,4.94,4.99,4.98,5.0,5.0,4.74,70.0,362.0,3.0,10.0
