# Use Open Data From City of Philadelphia

Use Dataset From https://cityofphiladelphia.github.io/carto-api-explorer/#opa_properties_public

The website's features for predicting the cost of homes should include:
- **Select**: city, state, zip_code, property_type
- **Values**: square_ft / acres, *number_of_bedrooms, number_of_rooms, spaces_in_garage, number_of_floors, etc.*
- **Bools**: *pool, basement, fence, lake, etc.*

In [1]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
import requests

In [2]:
def fetch_philly_data(limit, max_records):
    """Download rows of ``opa_properties_public`` from the phl.carto.com SQL API.

    Rows are requested page by page with ``LIMIT``/``OFFSET``. Paging stops
    when ``max_records`` rows have been requested, when the API returns an
    empty page, or when a request fails. On failure the rows collected so far
    are still returned (best effort) and a message is printed.

    Parameters
    ----------
    limit : int
        Maximum number of rows to request per API call (page size).
    max_records : int
        Upper bound on the total number of rows to request.

    Returns
    -------
    pandas.DataFrame
        One row per record received; empty when nothing was fetched.
    """
    url = "https://phl.carto.com/api/v2/sql"
    all_data = []
    for offset in range(0, max_records, limit):
        # never ask for more rows than are still needed, so the last page
        # cannot push the total past max_records
        page_size = min(limit, max_records - offset)
        # NOTE(review): the query has no ORDER BY, so the database is free to
        # return pages in an unstable order (possible duplicates/gaps) —
        # consider ordering by a unique key once the schema is confirmed.
        sql = f"SELECT * FROM opa_properties_public LIMIT {page_size} OFFSET {offset}"
        try:
            # pass the SQL through `params` so requests URL-encodes it, and
            # set a timeout so a stalled connection cannot hang the notebook
            resp = requests.get(url, params={"q": sql}, timeout=60)
        except requests.RequestException as exc:
            # keep the pages already downloaded instead of losing them
            print(f"Error fetching offset {offset}: {exc}")
            break
        # check for error in fetching data
        if resp.status_code != 200:
            print(f"Error fetching offset {offset}")
            break
        # an empty page means the table has no more rows
        chunk = resp.json().get('rows', [])
        if not chunk:
            break
        # combine all chunks of data
        all_data.extend(chunk)
    return pd.DataFrame(all_data)

# download up to 20,000 records from the open-data API, 1,000 rows per request
page_size, record_cap = 1000, 20000
df = fetch_philly_data(limit=page_size, max_records=record_cap)

# check the structure of the dataframe
#df.info(memory_usage='deep')

In [3]:
# A column survives only if at least 25% of its rows are populated; rows
# without a zip_code are then discarded and the index is renumbered.
threshold = 0.25 * len(df)
df = (
    df.dropna(axis='columns', thresh=threshold)
    .dropna(subset=['zip_code'])
    .reset_index(drop=True)
)

# check the structure of the dataframe
#df.info(memory_usage='deep')

In [4]:
# Features a website user could supply, plus the key price/date columns.
columns_to_keep = [
    # location
    'zip_code',
    # size
    'total_livable_area', 'total_area',
    # layout
    'number_of_bedrooms', 'number_of_bathrooms', 'number_stories', 'garage_spaces',
    # coded attributes
    'basements', 'view_type', 'central_air', 'type_heater',
    'exterior_condition', 'interior_condition',
    # sale / valuation
    'sale_price', 'sale_date', 'market_value',
    # construction year and its estimate marker
    'year_built', 'year_built_estimate',
]

# work on an independent copy so later edits never touch `df`
df_model = df.loc[:, columns_to_keep].copy()

# check the structure of the dataframe
#df_model.info(memory_usage='deep')

In [5]:
# Add a boolean `has_<column>` flag for every column that is populated in
# fewer than 75% of rows, so the "value was present" signal is kept.
# The year_built columns are deliberately left out of this step.
exclude_cols = ['year_built', 'year_built_estimate']

non_null_share = df_model.notna().mean()
sparse_cols = [
    name for name, share in non_null_share.items()
    if name not in exclude_cols and share < 0.75
]
for name in sparse_cols:
    df_model[f'has_{name}'] = df_model[name].notna()
        
# check the structure of the dataframe
#df_model.info(memory_usage='deep')

In [6]:
# Some records carry a sale_price of 0 or 1. Anything at or below 1000 is
# flagged as a nominal sale and its price is blanked out.
nominal_mask = df_model['sale_price'] <= 1000
df_model['is_nominal_sale'] = nominal_mask
df_model.loc[nominal_mask, 'sale_price'] = pd.NA

# Replace year_built_estimate with a plain boolean flag.
# NOTE(review): the flag is True where the original value is MISSING —
# confirm that this polarity is intended. Re-running this cell on the
# already-converted column yields all False.
df_model['year_built_estimate'] = df_model['year_built_estimate'].isna().astype(bool)

#df_model.info(memory_usage='deep')

In [9]:
def _as_number(series):
    """Parse a column as numbers; entries that cannot be parsed become missing."""
    return pd.to_numeric(series, errors='coerce')


# store zip_code as text
df_model['zip_code'] = df_model['zip_code'].astype(str)

# whole-number features use the nullable Int64 dtype so gaps stay missing
int_cols = [
    'number_of_bedrooms',
    'number_of_bathrooms',
    'number_stories',
    'garage_spaces',
    'year_built',
]
# sale_price gets the same nullable-integer treatment as the count columns
for name in int_cols + ['sale_price']:
    df_model[name] = _as_number(df_model[name]).astype('Int64')

# areas and market_value stay floating point
for name in ('total_livable_area', 'total_area', 'market_value'):
    df_model[name] = _as_number(df_model[name]).astype(float)

# dates: unparseable entries become NaT
df_model['sale_date'] = pd.to_datetime(df_model['sale_date'], errors='coerce')

# coded / flag columns are stored as categoricals
cat_cols = [
    'basements', 'view_type', 'central_air',
    'type_heater', 'exterior_condition', 'interior_condition',
    'year_built_estimate'
]
for name in cat_cols:
    df_model[name] = df_model[name].astype('category')

df_model.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   zip_code             19998 non-null  object             
 1   total_livable_area   19045 non-null  float64            
 2   total_area           19995 non-null  float64            
 3   number_of_bedrooms   18037 non-null  Int64              
 4   number_of_bathrooms  17764 non-null  Int64              
 5   number_stories       18269 non-null  Int64              
 6   garage_spaces        17779 non-null  Int64              
 7   basements            13210 non-null  category           
 8   view_type            19496 non-null  category           
 9   central_air          12526 non-null  category           
 10  type_heater          13124 non-null  category           
 11  exterior_condition   19041 non-null  category           
 12  interior_condition

In [30]:
# Fill missing NUMERIC values with histogram-based gradient boosting.
# Each incomplete numeric column is predicted in turn from every other column,
# starting with the column that has the fewest gaps.

# make a copy to preserve original
df_filled = df_model.copy()

# split 'sale_date' into numeric parts the models can consume
df_filled['sale_date'] = pd.to_datetime(df_filled['sale_date'], errors='coerce')
df_filled['sale_year'] = df_filled['sale_date'].dt.year
df_filled['sale_month'] = df_filled['sale_date'].dt.month
df_filled['sale_day'] = df_filled['sale_date'].dt.day

# remove sale_date to avoid errors in model
df_filled = df_filled.drop(columns=['sale_date'])

# numeric columns that still contain at least one missing value
numeric_cols_to_fill = [
    col for col in df_filled.columns
    if df_filled[col].isna().sum() > 0 and pd.api.types.is_numeric_dtype(df_filled[col])
]

# sort by fewest missing values first
numeric_cols_to_fill = sorted(numeric_cols_to_fill, key=lambda c: df_filled[c].isna().sum())

categorical_cols = df_filled.select_dtypes(['category']).columns

# Boolean flags stored as categories (e.g. year_built_estimate) would come back
# from the str -> encode -> decode round trip below as the STRINGS
# 'True'/'False'. Remember their real values so they can be restored.
bool_flag_cols = [
    col for col in categorical_cols
    if pd.api.types.is_bool_dtype(df_filled[col].cat.categories)
    and df_filled[col].notna().all()
]
bool_flag_values = df_filled[bool_flag_cols].astype(bool)

# ordinal encode any categorical variables for modeling
# (astype(str) turns missing entries into the literal category 'nan')
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_filled[categorical_cols] = encoder.fit_transform(df_filled[categorical_cols].astype(str))

# impute one column at a time
# NOTE(review): every other column is used as a predictor, including
# sale_price, and sale_price itself is imputed here — confirm this is wanted
# if sale_price is the eventual modelling target.
for col in numeric_cols_to_fill:
    not_null = df_filled[df_filled[col].notna()]
    is_null = df_filled[df_filled[col].isna()]

    if not_null.shape[0] < 100:
        print(f"Skipping {col} (too few non-null rows)")
        continue

    features = df_filled.columns.drop([col])

    X_train = not_null[features]
    y_train = not_null[col]

    X_pred = is_null[features]

    model = HistGradientBoostingRegressor(random_state=0)
    model.fit(X_train, y_train)

    # predictions are rounded to whole numbers for every column, floats included
    predictions = model.predict(X_pred)
    predictions = np.round(predictions).astype('int64')
    df_filled.loc[df_filled[col].isna(), col] = predictions

    print(f"Filled {len(predictions)} rows in {col}")

# Restore categorical values back to their original (string) codes
df_filled[categorical_cols] = encoder.inverse_transform(df_filled[categorical_cols])

# put the genuine booleans back in place of their 'True'/'False' strings
if bool_flag_cols:
    df_filled[bool_flag_cols] = bool_flag_values

Filled 3 rows in total_area
Filled 953 rows in total_livable_area
Filled 953 rows in year_built
Filled 1729 rows in number_stories
Filled 1961 rows in number_of_bedrooms
Filled 2219 rows in garage_spaces
Filled 2234 rows in number_of_bathrooms
Filled 4690 rows in sale_price


In [31]:
#df_model['central_air'].cat.categories
#df_model['basements'].cat.categories
#df_model['type_heater'].cat.categories
#df_model.groupby('basements')['sale_price'].median().sort_values()
#df_model.groupby('type_heater')['sale_price'].median().sort_values()
#df_model.groupby('view_type')['sale_price'].median().sort_values()

In [32]:
def _raw_codes(col):
    """Return the original codes of ``col`` as whitespace-stripped strings.

    The codes are read from ``df_model`` rather than ``df_filled``, so this
    cell can be re-run safely: mapping an already-mapped column would turn
    every value into NaN. Missing entries come back as the string 'nan',
    which none of the mappings below contain.
    """
    return df_model[col].astype(str).str.strip()


# turn central_air into True/False; any other code becomes missing
df_filled['central_air'] = _raw_codes('central_air').map({'Y': True, 'N': False})

# map using grouped median values
# (codes that are not listed become missing)
basement_scale = {
    '0': 0,
    'D': 1,
    'C': 2,
    'B': 3,
    'A': 4,
    'H': 5,
    'E': 6,
    'F': 7,
    'G': 8,
}
df_filled['basements'] = _raw_codes('basements').map(basement_scale)

# map using grouped median values
type_heater_scale = {
    'G': 0,
    'H': 1,
    'F': 2,
    'E': 3,
    'B': 4,
    'C': 5,
    'D': 6,
    'A': 7,
}
df_filled['type_heater'] = _raw_codes('type_heater').map(type_heater_scale)

# map using grouped median values
view_type_scale = {
    '0': 0,
    'I': 1,
    'D': 2,
    'C': 3,
    'B': 4,
    'A': 5,
    'H': 6,
    'E': 7,
}
# 'nan' / 'None' / '<NA>' placeholders are not keys of the scale, so they map
# to missing without a separate replace step
df_filled['view_type'] = _raw_codes('view_type').map(view_type_scale).astype('Int64')

# define condition scale
valid_condition_values = {'1', '2', '3', '4', '5', '6', '7'}

# clean both columns: keep only codes '1'..'7', everything else becomes missing
for col in ['exterior_condition', 'interior_condition']:
    codes = _raw_codes(col)
    codes = codes.where(codes.isin(valid_condition_values), np.nan)
    df_filled[col] = pd.to_numeric(codes, errors='coerce').astype('Int64')

In [33]:
# Fill the remaining gaps in the coded columns with a gradient-boosting
# classifier, one target column at a time. Work happens on a scratch copy and
# only the filled target columns are written back to df_filled.
df_temp = df_filled.copy()

# convert booleans to int for modeling
bool_cols = df_temp.select_dtypes(include='bool').columns
df_temp[bool_cols] = df_temp[bool_cols].astype(int)

# central_air holds True/False/NaN objects. Store it as 1.0/0.0/NaN so that
# observed values and the classifier's predictions share ONE representation;
# otherwise later iterations would encode 'True', 'False', '1.0' and '0.0'
# as four different categories.
df_temp['central_air'] = df_temp['central_air'].astype(float)

# list of columns to fill
cols_to_fill = [
    'basements', 'central_air', 'type_heater',
    'exterior_condition', 'interior_condition',
    'view_type'
]

# loop through each column and fill with classification model
for target_col in cols_to_fill:
    print(f"\n Filling missing value in column {target_col}")

    # predictors: everything except the target itself and sale_price
    predictors = df_temp.columns.drop([target_col, 'sale_price'])

    # create encoder for predictors
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_raw = df_temp[predictors].copy()

    # text predictors: label gaps as "Missing", then ordinal encode
    cat_predictors = X_raw.select_dtypes(include=['object', 'category']).columns
    X_raw[cat_predictors] = X_raw[cat_predictors].fillna("Missing")
    X_raw[cat_predictors] = encoder.fit_transform(X_raw[cat_predictors].astype(str))

    # split data for training and prediction
    y = df_temp[target_col]
    mask_known = y.notna()
    mask_missing = ~mask_known

    if mask_missing.sum() == 0:
        print(f"No missing values")
        continue

    try:
        # train classifier on the rows where the target is known
        model = HistGradientBoostingClassifier(random_state=0)
        model.fit(X_raw.loc[mask_known], y[mask_known].astype(float))

        # predict and fill
        preds = model.predict(X_raw.loc[mask_missing])
        df_temp.loc[mask_missing, target_col] = preds

        print(f"Filled {mask_missing.sum()}")
    except Exception as e:
        # best effort: report the failure and leave this column's gaps in place
        print(f"Not filling {target_col}: {e}")

# copy to df_filled
df_filled[cols_to_fill] = df_temp[cols_to_fill]

if df_filled['central_air'].isna().any():
    # astype(bool) would silently turn every remaining NaN into True
    print("central_air still has missing values; keeping it as nullable boolean")
    df_filled['central_air'] = df_filled['central_air'].astype('boolean')
else:
    df_filled['central_air'] = df_filled['central_air'].astype(bool)


 Filling missing value in column basements
Filled 7140

 Filling missing value in column central_air
Filled 7527

 Filling missing value in column type_heater
Filled 6917

 Filling missing value in column exterior_condition
Filled 960

 Filling missing value in column interior_condition
Filled 959

 Filling missing value in column view_type
Filled 502


In [34]:
df_filled.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   zip_code             19998 non-null  object 
 1   total_livable_area   19998 non-null  float64
 2   total_area           19998 non-null  float64
 3   number_of_bedrooms   19998 non-null  Int64  
 4   number_of_bathrooms  19998 non-null  Int64  
 5   number_stories       19998 non-null  Int64  
 6   garage_spaces        19998 non-null  Int64  
 7   basements            19998 non-null  float64
 8   view_type            19998 non-null  Int64  
 9   central_air          19998 non-null  bool   
 10  type_heater          19998 non-null  float64
 11  exterior_condition   19998 non-null  Int64  
 12  interior_condition   19998 non-null  Int64  
 13  sale_price           19998 non-null  Int64  
 14  market_value         19998 non-null  float64
 15  year_built           19998 non-null 

In [35]:
# Work on an independent deep copy so the final clean-up steps can be re-run
# without repeating the imputation above.
df_cleaning = df_filled.copy(deep=True)

In [36]:
# Recombine the sale_year / sale_month / sale_day parts into one sale_date
# column (combinations that do not form a valid date become NaT), then drop
# the three helper columns.
date_part_cols = ['sale_year', 'sale_month', 'sale_day']
date_parts = df_cleaning[date_part_cols].astype('int')
date_parts.columns = ['year', 'month', 'day']

df_cleaning['sale_date'] = pd.to_datetime(date_parts, errors='coerce')
df_cleaning = df_cleaning.drop(columns=date_part_cols)

In [37]:
# Final column layout of the exported dataset.
order = [
    'zip_code',
    'sale_date',

    # property features
    'view_scale',
    'heater_scale',
    'exterior_condition',
    'interior_condition',
    'central_air',
    'basement_scale',
    'garage_spaces',
    'number_of_bedrooms',
    'number_of_bathrooms',
    'number_stories',
    'total_livable_area',
    'total_area',
    'year_built',
    'year_built_estimate',

    # boolean flags
    'has_basements',
    'has_central_air',
    'has_type_heater',
    'is_nominal_sale',

    # price-related
    'market_value',
    'sale_price'
]

# Build df_cleaned in one non-mutating chain. df_cleaning is left untouched,
# so this cell can be re-run (an in-place rename would remove 'basements' and
# make a second run fail with KeyError).
# NOTE(review): has_basements is recomputed here as "basement scale > 0",
# which replaces any earlier "value was present" meaning of that flag, while
# has_central_air / has_type_heater are passed through unchanged — confirm
# the difference is intended.
df_cleaned = (
    df_cleaning
    .assign(has_basements=lambda frame: frame['basements'].fillna(0).astype(float) > 0)
    .rename(columns={
        'view_type': 'view_scale',
        'type_heater': 'heater_scale',
        'basements': 'basement_scale'
    })
    .loc[:, order]
)

In [38]:
#df_cleaned.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19998 entries, 0 to 19997
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   zip_code             19998 non-null  object        
 1   sale_date            19998 non-null  datetime64[ns]
 2   view_scale           19998 non-null  Int64         
 3   heater_scale         19998 non-null  float64       
 4   exterior_condition   19998 non-null  Int64         
 5   interior_condition   19998 non-null  Int64         
 6   central_air          19998 non-null  bool          
 7   basement_scale       19998 non-null  float64       
 8   garage_spaces        19998 non-null  Int64         
 9   number_of_bedrooms   19998 non-null  Int64         
 10  number_of_bathrooms  19998 non-null  Int64         
 11  number_stories       19998 non-null  Int64         
 12  total_livable_area   19998 non-null  float64       
 13  total_area           19998 non-

In [None]:
# write the cleaned dataset to disk without the index column
output_path = "philly_property_data.csv"
df_cleaned.to_csv(output_path, index=False)