# Property value Prediction

## Introduction

This notebook documents the process of creating randomforest regression models to the value of various property types New York City based on the properties proximity to nearest trainstation.

---

## Setup and Installation

```python
# Install required libraries
!pip install pandas scikit-learn geopy numpy joblib time 
````
### Author: Aaron Mpuga 

In [25]:
import pandas as pd
import numpy as np
import joblib
import os 
import time
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split



In [26]:
#Removes the warning in output cell
import warnings
warnings.filterwarnings('ignore')

## Data Loading and Cleaning

In [86]:
'''
Iterates through all the housing sales data excel files in the "Annualized_Rolling_Sales_Update" folder and combines them into a single 
dataframe 

'''

# Path to "Annualized_Rolling_Sales_Update" folder
rolling_sales_directory = "/Users/atwoo/Documents/Fairly_even/data/raw_data/Annualized_Rolling_Sales_Update"

file_df_list = []

for file in os.listdir(sales_directory):
    if file.endswith('.xls') or file.endswith('.xlsx'):
        file_path = os.path.join(rolling_sales_directory, file)
        df = pd.read_excel(file_path)
        file_df_list.append(df)

housing_sales_df = pd.concat(file_df_list, ignore_index=True)
housing_sales_df.to_csv('housing_sales_data.csv', index=False)

print("Combined rolling sales CSV created!")


**Cleans MTA subway stations csv file taken from "data.ny.gov"** 


In [30]:
'''
Cleans MTA subway stations csv file taken from "data.ny.gov" 
'''

folder_directory = "/Users/atwoo/Documents/Fairly_even/data/raw_data"

station_data_filename = "MTA_Subway_Stations.csv"
station_data_path = os.path.join(folder_directory, station_data_filename)

station_data = pd.read_csv(station_data_path)
columns = ["Borough","Stop Name","GTFS Latitude", "GTFS Longitude", "Georeference"]
station_data = station_data[columns]

##Change borough names##
borough_map = {"M": "Manhattan", "Bk" : "Brooklyn", "Bx" : "Bronx", "Q" : "Queens", "SI" : "Staten Island"}

for row_idx, elem in enumerate(station_data["Borough"]):
    if elem in borough_map:
        station_data["Borough"][row_idx] = borough_map[elem]
        
#Change Latitude and Longitude index names
station_data.rename(columns= {"GTFS Latitude" : "Station_Latitude", "GTFS Longitude" : "Station_Longitude"}, inplace= True)

# Save Cleaned station Data as CSV
clean_station_data_filename = "clean_station_data.csv"
clean_path = os.path.join(folder_directory, clean_station_data_filename)
station_data.to_csv(clean_path, index= False)

**Cleans the combined housing data csv file that was just created.**

In [32]:
# Path to "housing_sales_data" file 
folder_directory = "/Users/atwoo/Documents/Fairly_even/data/interim_data"

housing_filename = "housing_sales_data.csv"
housing_data_path = os.path.join(folder_directory, housing_filename)

housing_data = pd.read_csv(housing_data_path, low_memory=False)
housing_data = housing_data.drop(housing_data.columns[21:], axis=1)
housing_data = housing_data.drop(axis = 0, index= [0,1,2])
new_column_names =["Borough","Neighborhood","Building_Class_Category","Tax_Class","Tax_Block","Tax_Lot","Easement", "Building_Classification_Code_At_Present","Address",
                    "Apartment_Number", "Zip_Code", "Residential_Units", "Commercial_Units", "Total_Units", "Land_Square_Feet", "Gross_Square_Feet", "Year_Built", 
                    "Tax_Class_At_Time_Of_Sale", "Building_Classification_Code_At_Time_Of_Sale", "Sale_Price", "Sale_Date"]

housing_data.columns = new_column_names
housing_data = housing_data.drop(axis=0, index=[3])
housing_data = housing_data.reset_index(drop= True)

# Re-order columns of dataframe
columns_kept = ["Borough", "Neighborhood", "Building_Class_Category", 
    "Building_Classification_Code_At_Time_Of_Sale", "Address", "Zip_Code", "Sale_Price", "Sale_Date"]
housing_data = housing_data[columns_kept]

# Used the "zip_borough.csv" CSV file to map each property to a borough, given the zip codes of each property in dataframe
zip_dir = "/Users/atwoo/Documents/Fairly_even/data/raw_data"

zip_borough_map_path = os.path.join(zip_dir, "zip_borough.csv")
zip_to_borough = pd.read_csv(zip_borough_map_path)

# Aligning column names and dtype of column cell data before merging
zip_to_borough.columns = ["Zip_Code", "Borough"]
housing_data["Zip_Code"] = pd.to_numeric(housing_data["Zip_Code"], errors = "coerce")
housing_data = housing_data.dropna(subset=["Zip_Code"])
housing_data["Zip_Code"] = housing_data["Zip_Code"].astype("int64")
housing_data = housing_data.merge(zip_to_borough, on="Zip_Code", how="left")

# Removing unmapped zipcode rows and re-ordering columns of dataframe
housing_data = housing_data.dropna(subset= ["Borough_y"])
housing_data = housing_data.drop(columns = ["Borough_x"])
housing_data = housing_data.rename(columns = {"Borough_y": "Borough"})
housing_data = housing_data[columns_kept]

# Removing rows where the sale price is zero (Indication of transfer of ownership involving no money according to glossary)
housing_data["Sale_Price"] = housing_data["Sale_Price"].astype("int64")
housing_data = housing_data[housing_data["Sale_Price"] != 0]
housing_data = housing_data.reset_index(drop= True)

# Formating date and time values
housing_data["Sale_Date"] = pd.to_datetime(housing_data["Sale_Date"])
housing_data["Year"] = housing_data["Sale_Date"].dt.year
housing_data = housing_data.sort_values(by= ["Address","Year"])

# Saving cleaned "housing_sales_data" dataframe as CSV file "cleaned_file_name"
cleaned_file_name = "Cleaned_NYC_Property_Sales_Data_2003_To_2015.csv"
cleaned_filepath = os.path.join(folder_directory, cleaned_file_name)

housing_data.to_csv(cleaned_filepath, index=False)

**Data/API Limitation and Scope Reduction:**

The original cleaned dataset contained approximately **900,000 rows** of housing sales data from the years **2003-2015**. To manage the project's scope because of how large the dataset was I initially wanted to limit the analysis to properties within the following building codes:

- **A**: One Family Dwellings (approx. 162,000 rows)
- **B**: Two Family Dwellings (approx. 142,000 rows)
- **D**: Elevator Apartments (approx. 166,000 rows)

However, after analyzing the number of **unique addresses** within these building codes, I encountered a limitation with the **free Nominatim API**. This API only allows **one address to be processed per second**, meaning that running the script to retrieve longitude and latitude values for all addresses would take about **~17.3 hours** = 62,481 (number of unique addresses) seconds. So, I decided to narrow the scope of the project further by focusing on just properties with building code **A**. Processing just these addresses would reduce the time to **6 hours** (24,205 seconds), making the task more manageable.


In [34]:
codes_start = ("A", "B", "D")
housing_data_codes = housing_data[housing_data["Building_Classification_Code_At_Time_Of_Sale"].str.startswith(codes_start)]
rows_to_drop = housing_data_codes[housing_data_codes["Address"].str.len() < 5].index
housing_data_codes = housing_data_codes.drop(rows_to_drop)

# Group by Address and filter out addresses that appear only once
address_duplicate_count = housing_data_codes['Address'].value_counts()
repeated_addresses = housing_data_codes['Address'].isin(address_duplicate_count[address_duplicate_count > 1].index)
housing_data_codes = housing_data_codes[repeated_addresses]
housing_data_codes.reset_index(drop=True, inplace=True)

# Create dataframe specific to housing data with building code A
housing_data_codeA = housing_data_codes[housing_data_codes["Building_Classification_Code_At_Time_Of_Sale"].str.startswith("A") == True]

In [35]:
count1 = housing_data_codes["Address"].nunique()
count2 = housing_data_codeA["Address"].nunique()
print(f"Number of unique addresses for building codes A, B and D is: {count1}")
print(f"Number of unique addresses for just buildings with code A is: {count2}")

Number of unique addresses for building codes A, B and D is: 62481
Number of unique addresses for just buildings with code A is: 24205


## Feature Engineering

**Geocodes (finds the latitude and longitude) for all addresses in the address column of the dataframe.**

In [38]:
# Path to save backup CSV file every 1800 addresses
save_directory = "/Users/atwoo/Documents/Fairly_even/data/interim_data"

backup_csv_path = os.path.join(save_directory, "geocoded_addresses_codeA_backup.csv")
final_csv_path = os.path.join(save_directory, "geocoded_housing_data_codeA_final.csv")
df_path = os.path.join(save_directory, "housing_data_codeA.csv")

housing_data_codeA = pd.read_csv(df_path)

geo = Nominatim(user_agent="my_housing_project_app")

# Address cache
address_dict = {}

def getMeridian(street: str, postalcode: int, city: str, index: int) -> tuple:
    """
    Determines the longitidue and latitude values for a given address
    Parameters
    ----------
    street : str
        Street address for the location of the desired latitude and longitude values.
    postalcode : int
        Postalcode or zipcode  for the location of the desired latitude and longitude values. 
    city : str
        City name for the location of the desired latitude and longitude values. 

    Returns
    -------
    tuple
        latitude and longitude pairing.

    """
    # Construct the structured query dictionary (defined by geopy)
    query = {
        'street': street,
        'postalcode': postalcode,
        'city' : city,
        'state' : "NY"
    }

    # Convert query dictionary to a string to use as a key for the cache
    query_key = str(query)
    
    # Check if the query has been seen before
    if query_key in address_dict:
        return address_dict[query_key]
    else:
        try:
            location = geo.geocode(query)
            if location:
                address_dict[query_key] = (location.latitude, location.longitude)
            else:
                address_dict[query_key] = (None, None)
                
        except Exception as e:
            print(f"Couldn't find latitude and longitude for {query_key}: {e}")
            address_dict[query_key] = (None, None)
                
    if index % 1800 == 0:
        print(f"Processed {index} addresses so far!")
        # Save progress to a backup CSV file every 1800 addresses (approximately every 30 minutes)
        pd.DataFrame.from_dict(address_dict, orient="index", columns=["Latitude", "Longitude"]).to_csv(backup_csv_path)
        
    # Rate limiting: pause between requests to avoid hitting the rate limit
    time.sleep(1)  
    
    return address_dict[query_key]

# Execute method 
housing_data_codeA["Latitude"], housing_data_codeA["Longitude"] = zip(*housing_data_codeA.apply(
    lambda row: getMeridian(row["Address"], row["Zip_Code"], row["Borough"], row.name), axis=1))

housing_data_codeA.to_csv(final_csv_path, index=False)
print(f"Completed! The final data has been saved to data folder")


**Creates a CSV file where for a given property row the nearest trainstation's name and its distance from the property are two additional columns added onto the dataframe.**

In [40]:
geo_codeA = pd.read_csv("/Users/atwoo/Documents/Fairly_even/data/interim_data/geocoded_housing_data_codeA_final.csv")

geo_codeA.dropna(axis= 0, subset= ["Latitude", "Longitude"], inplace= True)
geo_codeA = geo_codeA.reset_index()
station_data = pd.read_csv("/Users/atwoo/Documents/Fairly_even/data/processed_data/clean_station_data.csv")

def calculate_distance (latitude_1: float, longitude_1: float, latitude_2: float, longitude_2: float) -> float:
    """Calculate the distance in meters between two latitude and longitude coordinate points.

    Parameters
    ----------
    latitude_1 : float
        latitude value of first coordinate point
    longitude_1 : float
        longitude value of first coordinate point
    latitude_2 : float
        latitude value of second coordinate point
    longitude_2 : float
        longitude value of second coordinate point
        
    Returns
    -------
    float
        distance between coordinate points 
    """

    return geodesic((latitude_1, longitude_1), (latitude_2, longitude_2)).meters

def get_nearest_station (property_latitude: float, property_longitude: float, station_df) -> tuple:
    """Find the nearest train station name and its distance to the inputted property's latitude and longitude coordinate points

    Parameters
    ----------
    property_latitude : float
        latitude value of the property 
    property_longitude : float
        longitude value of the property        
    station_df : csv file 
        Csv file containing the station data for all train stations in new york. Each row must have a two columns correpsonding to a stations latitude and longitude, 
        titled Station_Latitude and Station_Longitude respectively.

    Returns
    -------
    tuple
        returns tuple of station name followed by distance of station to property

    """
    distances = station_df.apply(lambda row: calculate_distance(property_latitude, property_longitude, row["Station_Latitude"], row["Station_Longitude"]), axis=1)
    dist_idx = distances.idxmin()
    nearest_station = station_df.loc[dist_idx, "Stop Name"]
    nearest_station_dist = distances.min()

    return nearest_station, nearest_station_dist


# geo_codeA[["Nearest_Station", "Station_Distance"]] = geo_codeA.apply(lambda row: pd.Series(get_nearest_station(row["Latitude"], row["Longitude"], station_data)), axis=1)
geo_codeA = geo_codeA.drop(columns=["index"])

file_name = "combined_station_geocode.csv"
save_path = os.path.join("/Users/atwoo/Documents/Fairly_even/data/interim_data",file_name)
geo_codeA.to_csv(save_path)
print("CSV file created!")

**Creates a CSV file where 13 columns are added each representing the price change x amounts of years after the first sale of a property. Cell value is NaN if rows sale data doesn't correspond to x years ahead column**

In [44]:
# Load created CSV file that contains station name and distance 
combined_df = pd.read_csv("/Users/atwoo/Documents/Fairly_even/data/interim_data/combined_station_geocode.csv")

# Function to calculate lag and price change
def calculate_price_changes(group):
    """Calculate the price change  in meters between two latitude and longitude coordinate points.

    Parameters
    ----------
    group : dataframe 
        Dataframe where rows are grouped by Address name
        
    Returns
    -------
    dataframe
        returns dataframe where columns for price change after x years has been added
    """
    
    group = group.sort_values(by="Year")
        
    # Loop over all possible pairs of years in the group
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            year_diff = group.iloc[j]['Year'] - group.iloc[i]['Year']
            price_change = group.iloc[j]['Sale_Price'] - group.iloc[i]['Sale_Price']
            
            # Create a new column name based on the year difference
            col_name = f"Price_Change_{year_diff}_Years"
            group.loc[group.index[j], col_name] = price_change
    
    return group

# Apply the above function to each group of properties
df_result = combined_df.groupby("Address").apply(calculate_price_changes)

df_result = df_result.reset_index(drop=True)

print("DONE!")

#Re-order columns in ascending order  
price_idx = ["Price_Change_0_Years","Price_Change_1_Years","Price_Change_2_Years","Price_Change_3_Years","Price_Change_4_Years","Price_Change_5_Years","Price_Change_6_Years","Price_Change_7_Years","Price_Change_8_Years","Price_Change_9_Years","Price_Change_10_Years","Price_Change_11_Years","Price_Change_12_Years"]
reordered_cols = df_result[price_idx]
remaining_cols = df_result.drop(columns=price_idx)
df_result = pd.concat([remaining_cols, reordered_cols], axis=1)
df_result = df_result.drop(columns= ['Unnamed: 0', 'index'])

file_name = "final_df.csv"
save_path = os.path.join("/Users/atwoo/Documents/Fairly_even/data/processed_data",file_name)
df_result.to_csv(save_path)
print("CSV file created!")

DONE!


**The code below is a sanity check to see how many lag features have been created**

In [48]:
classification_codes = ["A1", "A2" "A5"]

counts = {}

# Iterate over each years_ahead lag feature
for i in range(13):
    feature_col = f'Price_Change_{i}_Years'
    
    filtered_df = df_result[
        (df_result["Building_Classification_Code_At_Time_Of_Sale"].isin(classification_codes)) & (df_result[feature_col].notna())]
    
    # Store the count of rows that meet the criteria
    counts[feature_col] = len(filtered_df)

# Print the results
for feature_col, count in counts.items():
    print(f"Number of rows with a value for the specified classification codes in column {feature_col} is: {count}")


Number of rows with a value for the specified classification codes in column Price_Change_0_Years is: 1766
Number of rows with a value for the specified classification codes in column Price_Change_1_Years is: 2665
Number of rows with a value for the specified classification codes in column Price_Change_2_Years is: 1432
Number of rows with a value for the specified classification codes in column Price_Change_3_Years is: 1135
Number of rows with a value for the specified classification codes in column Price_Change_4_Years is: 959
Number of rows with a value for the specified classification codes in column Price_Change_5_Years is: 766
Number of rows with a value for the specified classification codes in column Price_Change_6_Years is: 633
Number of rows with a value for the specified classification codes in column Price_Change_7_Years is: 601
Number of rows with a value for the specified classification codes in column Price_Change_8_Years is: 517
Number of rows with a value for the specif

**Due to the limited amount of housing sale price data the purpose of this for loop is to ensure a prediction value that is greater than the property value inputted by the user. The goal of this project is to predict property appreciation. As the year increases the less sale price data is available thus, having negative values will disproportionately show price depreciation of property values.**

In [50]:
for i in range(13):
    n_col = f'Price_Change_{i}_Years'
    print(f"year {i} has {(df_result[n_col] < 0).sum()} negative values") 

year 0 has 3032 negative values
year 1 has 1440 negative values
year 2 has 1178 negative values
year 3 has 1194 negative values
year 4 has 1190 negative values
year 5 has 1053 negative values
year 6 has 892 negative values
year 7 has 858 negative values
year 8 has 720 negative values
year 9 has 528 negative values
year 10 has 323 negative values
year 11 has 105 negative values
year 12 has 38 negative values


**Check to see which building classification codes have the most sale data avaliable.**

In [52]:
# Count the number of rows for each building classification code
building_code_counts = df_result.groupby('Building_Classification_Code_At_Time_Of_Sale').size()
building_code_counts

Building_Classification_Code_At_Time_Of_Sale
A0      500
A1    16626
A2     6228
A3      651
A4      760
A5    15264
A6      334
A7       20
A8        2
A9     3867
dtype: int64

**It appears that the majority of the buildings in the One Family Dwellings Building category (building classification codes that start with "A") fall under the classification codes of:**
- A1: TWO STORIES - DETACHED SM OR MID
- A2: ONE STORY - PERMANENT LIVING QUARTER
- A5: ONE FAMILY ATTACHED OR SEMI-DETACHED
- A9: MISCELLANEOUS ONE FAMILY

**Scope of project will now be limited to analyzing these properties due to lack of sale data for other housing types**


In [54]:
def categorize_station_distance(df, num_groups):
    """Group all the train station distance data into groups  

    Parameters
    ----------
    df : pandas dataframe
        the dataframe containing the station distance data to be parsed  
    num_groups : int
        the number of groups to split the station distance data into
        
    Returns
    -------
    dataframe
        returns dataframe where a new column is added corresponding to the current rows station group
    """
    
    # Create quantile-based bins
    bins = pd.qcut(df['Station_Distance'], num_groups, retbins=True, duplicates='drop')[1]
    
    # Round bins to the nearest whole number
    rounded_bins = np.round(bins)
    
    rounded_bins = np.unique(rounded_bins)  

    # Define labels as interval ranges
    interval_labels = [f'{rounded_bins[i]} - {rounded_bins[i+1]}' for i in range(len(rounded_bins) - 1)]
    
    # Create a new column in the DataFrame with the interval labels
    df['Station_Distance_Group'] = pd.cut(df['Station_Distance'], bins=rounded_bins, labels=interval_labels, include_lowest=True)
    
    return df

df_result = categorize_station_distance(df_result, 5)
group_counts = df_result['Station_Distance_Group'].value_counts().sort_index()
group_counts

9.0 - 531.0        8846
531.0 - 1092.0     8858
1092.0 - 1861.0    8845
1861.0 - 3357.0    8851
3357.0 - 8763.0    8852
Name: Station_Distance_Group, dtype: int64

**This checks to see how many feature rows will be used when training each corresponding regression model. Interestingly, Manhattan had little to know housing sale data for "A" Building classification codes. This suggests that Manhattan doesn't have two story, one story and one family attached homes.**

**As a result the scope of the project will be limited to focusing on New York City properties in Queens, Brooklyn, Staten Island and the Bronx** 

In [56]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib
import os

folder_directory = "/Users/atwoo/Documents/Fairly_even/data/processed_data"
file_name = "final_df.csv"
data_path = os.path.join(folder_directory, file_name)

df_result = pd.read_csv(data_path)

# List of popular building classification codes
popular_codes = ['A1', 'A2', 'A5']  # Replace with actual codes

# List of boroughs
boroughs = df_result['Borough'].unique()

# Iterate over each borough, building classification code, and years_ahead lag feature
for borough in boroughs:
    for building_code in popular_codes:
        for i in range(13):
            feature_col = f'Price_Change_{i}_Years'
            
            # Filter rows for the given borough and building code and where the feature is not NaN
            df_nonan = df_result.dropna(subset=[feature_col])
            df_filtered = df_nonan[
                (df_nonan['Building_Classification_Code_At_Time_Of_Sale'] == building_code) &
                (df_nonan['Borough'] == borough)
            ]

            # Define X (features) and y (target)
            numerical_features = df_filtered[['Sale_Price', 'Station_Distance']]  # Include numeric variables

            # Combine numerical and categorical features
            features = numerical_features

            # Target variable
            target = df_filtered[feature_col]  # Target is the price change

            # Ensure features and target have no missing values
            features = features.dropna()
            target = target.loc[features.index]

            # Print the number of samples
            num_samples = len(features)
            print(f"Number of samples for {borough}, {building_code}, {i} years ahead: {num_samples}")

Number of samples for Staten, A1, 0 years ahead: 347
Number of samples for Staten, A1, 1 years ahead: 655
Number of samples for Staten, A1, 2 years ahead: 447
Number of samples for Staten, A1, 3 years ahead: 345
Number of samples for Staten, A1, 4 years ahead: 298
Number of samples for Staten, A1, 5 years ahead: 245
Number of samples for Staten, A1, 6 years ahead: 220
Number of samples for Staten, A1, 7 years ahead: 203
Number of samples for Staten, A1, 8 years ahead: 185
Number of samples for Staten, A1, 9 years ahead: 163
Number of samples for Staten, A1, 10 years ahead: 143
Number of samples for Staten, A1, 11 years ahead: 76
Number of samples for Staten, A1, 12 years ahead: 40
Number of samples for Staten, A2, 0 years ahead: 148
Number of samples for Staten, A2, 1 years ahead: 278
Number of samples for Staten, A2, 2 years ahead: 180
Number of samples for Staten, A2, 3 years ahead: 139
Number of samples for Staten, A2, 4 years ahead: 115
Number of samples for Staten, A2, 5 years ahe

## Model Building

In [58]:
folder_directory = "/Users/atwoo/Documents/Fairly_even/data/processed_data"
file_name = "final_df.csv"
data_path = os.path.join(folder_directory, file_name)

df_result = pd.read_csv(data_path)

# List of popular building classification codes
popular_codes = ['A1', 'A5', 'A2']  # Replace with actual codes

# List of boroughs
boroughs = ["Staten", "Queens", "Bronx", "Brooklyn"]

#This removes all values in price change x years columns that are less than 0 
for i in range(13):
    n_col = f'Price_Change_{i}_Years'
    df_result[n_col] = df_result[n_col].apply(lambda x: 0 if x < 0 else x) 


# Iterate over each borough, building classification code, and years_ahead lag feature
for borough in boroughs:
    for building_code in popular_codes:
        for i in range(6):
            feature_col = f'Price_Change_{i}_Years'
            
            # Drop rows where price change column i year's value is NaN indicating that the dropped row didnt 
            # have a price change value for the desired i years ahead 
        
            df_nonan = df_result.dropna(subset=[feature_col])
            df_filtered = df_nonan[ (df_nonan['Building_Classification_Code_At_Time_Of_Sale'] == building_code) & 
                                        (df_nonan['Borough'] == borough)]

            # Define X (features) and y (target)
            numerical_features = df_filtered[['Sale_Price', 'Station_Distance']]

            # Combine numerical and categorical features
            features = numerical_features

            # Target variable is the values in price change i years column
            target = df_filtered[feature_col]  

            # Ensure features and target have no missing values
            features = features.dropna()
            target = target.loc[features.index]

            # Check if data is available after processing
            if features.shape[0] == 0:
                continue

            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=23)

            # Initialize and train the model
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)

            # Save the model with borough, building_code, and years_ahead in the filename
            model_filename = f'model_{borough}_{building_code}_years_ahead_{i}.pkl'
            model_dir = "/Users/atwoo/Documents/Fairly_even/models"
            model_filepath = os.path.join(model_dir, model_filename)
            joblib.dump(model, model_filepath)

            print(f"Model for {borough}, {building_code} {i} years ahead was saved.")


Model for Staten, A1 0 years ahead was saved.
Model for Staten, A1 1 years ahead was saved.
Model for Staten, A1 2 years ahead was saved.
Model for Staten, A1 3 years ahead was saved.
Model for Staten, A1 4 years ahead was saved.
Model for Staten, A1 5 years ahead was saved.
Model for Staten, A5 0 years ahead was saved.
Model for Staten, A5 1 years ahead was saved.
Model for Staten, A5 2 years ahead was saved.
Model for Staten, A5 3 years ahead was saved.
Model for Staten, A5 4 years ahead was saved.
Model for Staten, A5 5 years ahead was saved.
Model for Staten, A2 0 years ahead was saved.
Model for Staten, A2 1 years ahead was saved.
Model for Staten, A2 2 years ahead was saved.
Model for Staten, A2 3 years ahead was saved.
Model for Staten, A2 4 years ahead was saved.
Model for Staten, A2 5 years ahead was saved.
Model for Queens, A1 0 years ahead was saved.
Model for Queens, A1 1 years ahead was saved.
Model for Queens, A1 2 years ahead was saved.
Model for Queens, A1 3 years ahead

## Model Evaluation

In [92]:
import joblib
import pandas as pd
import os

def predict_property_value(current_property_value, years_ahead, building_code, borough, station_distance):
    """Predict the increase in property value

    Parameters
    ----------
    current_property_value : float
        Current value of property being passed in  
    year_ahead : int
        The number of years ahead that user wants to predict the value of their property will be         
    building_code : string
        The building classification code of the property to be predicted
    borough : string 
        The borough of where the property is located 

    Returns
    -------
    float
        returns the predicted property value 
    """

    # Validate the years_ahead parameter
    if years_ahead < 0 or years_ahead > 5:
        raise ValueError("Years ahead must be between 0 and 5.")

    # Load the corresponding model based on borough, building code, and years ahead
    model_filename = f'model_{borough}_{building_code}_years_ahead_{years_ahead}.pkl'
    model_path = os.path.join("/Users/atwoo/Documents/Fairly_even/models", model_filename)
    
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"The model for {borough}, {building_code}, {years_ahead} years ahead does not exist.")
    
    model = joblib.load(model_path)
    
    # Prepare the input data
    input_data = pd.DataFrame({
        'Sale_Price': [current_property_value],
        'Station_Distance': [station_distance],
    })
    
    # Make the prediction using the loaded model
    predicted_price_change = model.predict(input_data)[0]
    
    # Calculate the predicted future property value
    future_property_value = current_property_value + predicted_price_change
    
    return future_property_value

# Example usage:
curr_price = 300000  # Current property value input by the user
years = 5 # How many years ahead the user wants to predict
b_code = "A1"     # Building classification code of user's property
borough = "Queens"      # The borough the property is in
station_dist = 500   # Distance of the property from the station in meters

predicted_value = predict_property_value(curr_price, years, b_code, borough, station_dist)
print(f"The predicted property value in {years_ahead} years is: {predicted_value}")


The predicted property value in 5 years is: 400661.57
