In [25]:
#Necessary Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score

In [26]:
# Load the New York sales data and keep only the columns the model uses.
# .copy() makes the column subset an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
nyc_data = pd.read_csv('NYC_Property_Sales_Apartments.csv')
nyc_data = nyc_data[['borough', 'neighborhood', 'gross_square_feet', 'sale_price']].copy()
nyc_data['city'] = 'new york'  # text columns are stored lowercase so lookups are case-insensitive
nyc_data['borough'] = nyc_data['borough'].str.lower()
nyc_data['neighborhood'] = nyc_data['neighborhood'].str.lower()

# Fixed factor used to bring London sale prices in line with USD.
# NOTE(review): presumably a GBP -> USD rate frozen at analysis time; confirm.
LONDON_PRICE_TO_USD = 1.24

# Load the London data and reshape it to the New York schema: the columns are
# renamed to the names used above, and an empty 'borough' column is added
# because the London data has no boroughs. With identical columns the two
# data sets can be combined, trained on together, and queried with the same
# user-input fields.
london_data = (
    pd.read_csv('London.csv')
    .rename(columns={
        'City/County': 'city',
        'Location': 'neighborhood',
        'Area in sq ft': 'gross_square_feet',
        'Price': 'sale_price',
    })
    .assign(
        sale_price=lambda df: df['sale_price'] * LONDON_PRICE_TO_USD,
        borough='',
        city=lambda df: df['city'].str.lower(),
        neighborhood=lambda df: df['neighborhood'].str.lower(),
    )
    [['city', 'borough', 'neighborhood', 'gross_square_feet', 'sale_price']]
)

# Combine the two data sets into one frame with a fresh 0..n-1 index.
data = pd.concat([nyc_data, london_data], ignore_index=True)

In [27]:
# Preprocessing: drop likely outliers by keeping only rows whose square footage
# and sale price both fall strictly inside the bounds below.
sqft = data['gross_square_feet']
price = data['sale_price']
plausible_size = (sqft > 100) & (sqft < 20000)
plausible_price = (price > 50000) & (price < 10000000)
filtered_data = data[plausible_size & plausible_price]

target = filtered_data['sale_price']
features = filtered_data[['city', 'borough', 'neighborhood', 'gross_square_feet']]

# The three text columns are one-hot encoded so the model receives numeric
# input. Square footage is already numeric, so it is not listed here;
# remainder='passthrough' forwards it unchanged alongside the encoded columns.
# handle_unknown='ignore' means a category not seen during fitting is encoded
# as all zeros at transform time instead of raising an error.
categorical_columns = ['city', 'borough', 'neighborhood']
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
)

features_encoded = column_transformer.fit_transform(features)

In [28]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): features_encoded was produced by a transformer fitted on all
# rows before this split, so the encoder has already seen the test rows'
# categories. Consider fitting the encoder on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded, target, test_size=0.2, random_state=42
)

# Create the random forest and fit it on the training rows.
# n_jobs=-1 builds the 1000 trees on all available CPU cores; with a fixed
# random_state the fitted forest does not depend on the number of workers.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

In [29]:
# Predict sale prices for the held-out rows.
y_pred = model.predict(X_test)

# Evaluation: R^2 and the root of the mean squared error (same units as sale_price).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Random Forest RMSE: {rmse:.2f}")
print(f"Random Forest R^2: {r2:.2f}")

Random Forest RMSE: 658230.83
Random Forest R^2: 0.73


In [34]:
#allow user input

def predict_property_price(city, borough, neighborhood, square_feet):
    """Predict the sale price of a single property and return it as a message.

    The text inputs are lowercased before encoding, so matching against the
    fitted categories is case-insensitive. The row is encoded with the
    module-level ``column_transformer`` and scored with the module-level
    ``model``; both must already be fitted.

    Parameters
    ----------
    city : str
        City name, e.g. 'New York'.
    borough : str or None
        Borough name. Pass '' or None for properties without a borough.
    neighborhood : str
        Neighborhood name.
    square_feet : int or float
        Gross square footage of the property.

    Returns
    -------
    str
        A sentence containing the predicted price formatted as dollars with
        thousands separators and two decimals.
    """
    input_df = pd.DataFrame({
        'city': [city.lower()],
        # Treat None like '' so a missing borough does not raise AttributeError.
        'borough': [(borough or '').lower()],
        'neighborhood': [neighborhood.lower()],
        'gross_square_feet': [square_feet],
    })

    transformed_input = column_transformer.transform(input_df)

    predicted_price = model.predict(transformed_input)

    return f"Sale price prediction for the property: ${predicted_price[0]:,.2f}"

# Example queries: edit the tuples below to try other properties.
# Input is not case sensitive, so both rows describe the same property.
example_queries = [
    ('New York', 'Manhattan', 'Tribeca', 800),
    ('new york', 'manhattan', 'tribeca', 800),
]
for query in example_queries:
    print(predict_property_price(*query))


Sale price prediction for the property: $1,749,359.43
Sale price prediction for the property: $1,749,359.43
