In [77]:
#Necessary Imports
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [78]:
# Preprocessing: load the sales data, keep rows inside the plausible size and
# price ranges, and one-hot encode the categorical columns.
data_path = 'NYC_Property_Sales_Apartments.csv'
data = pd.read_csv(data_path)

# Filter out potential outliers. All four bounds are strict inequalities.
square_feet = data['gross_square_feet']
price_column = data['sale_price']
size_in_range = (square_feet > 100) & (square_feet < 20000)
price_in_range = (price_column > 50000) & (price_column < 10000000)
filtered_data = data[size_in_range & price_in_range]

target = filtered_data['sale_price']
features = filtered_data[['borough', 'neighborhood', 'gross_square_feet']]

# Categorical columns are one-hot encoded; the remaining column
# (gross_square_feet) is passed through unchanged.
categorical_columns = ['borough', 'neighborhood']
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ],
    remainder='passthrough',
)

# NOTE(review): the transformer is fit on every filtered row, i.e. before any
# train/test split is made -- confirm this is intended.
features_encoded = column_transformer.fit_transform(features)

In [79]:
# Hold out 20% of the encoded rows for testing; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=42,
)

In [84]:
# Train the random forest on the training split.
forest_params = dict(
    n_estimators=1000,
    max_depth=None,
    min_samples_split=20,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

[CV] END max_depth=None, max_features=log2, min_samples_leaf=10, min_samples_split=20, n_estimators=1000; total time=  17.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=20, min_samples_split=20, n_estimators=500; total time=   5.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  15.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=50; total time=   1.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=100; total time=   3.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=200; total time=   6.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=1000; total time=  30.6s
[CV] END max_depth=N

In [85]:
# Predict on the test split and report RMSE and R^2.
# Requires numpy to be imported as `np` (used for the square root below).
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target (sale price).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest RMSE: {rmse:.2f}")
print(f"Random Forest R^2: {r2:.2f}")

Random Forest RMSE: 698116.66
Random Forest R^2: 0.69


In [75]:
# Change the values in input_data to see the price the model predicts for a
# different property. Each value is a one-element list so the dict builds a
# single-row DataFrame with the same columns the transformer was fit on.

input_data = {
    'borough': ['Manhattan'],
    'neighborhood': ['Tribeca'],
    'gross_square_feet': [600]
}

input_df = pd.DataFrame(input_data)

# The encoder was created with handle_unknown='ignore', so a category it did
# not see during fit is encoded as all zeros without raising. Warn about that
# case so a typo or spelling/casing mismatch does not silently produce a
# misleading price.
_, fitted_encoder, encoded_columns = column_transformer.transformers_[0]
for column, known_values in zip(encoded_columns, fitted_encoder.categories_):
    unseen_values = set(input_df[column]) - set(known_values)
    if unseen_values:
        print(
            f"Warning: {column} value(s) {list(unseen_values)} were not seen "
            "during training and will be encoded as all zeros."
        )

transformed_input = column_transformer.transform(input_df)

predicted_price = model.predict(transformed_input)

print(f"Predicted Sale Price for the property: ${predicted_price[0]:,.2f}")

Predicted Sale Price for the property: $924,292.50
