In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import cross_val_score
import pickle
from google.colab import files

In [3]:
# Read the refined sample CSV straight from the project's GitHub repository.
# The first CSV column is used as the DataFrame index.
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/refined_data_sample_distancetesting.csv'
full_df = pd.read_csv(filepath_or_buffer=url, index_col=0)

In [4]:
# Show every column name of the loaded frame as a plain Python list.
list(full_df.columns)

['SOLD DATE',
 'PROPERTY TYPE',
 'ADDRESS',
 'CITY',
 'STATE OR PROVINCE',
 'ZIP OR POSTAL CODE',
 'PRICE',
 'BEDS',
 'BATHS',
 'LOCATION',
 'SQUARE FEET',
 'LOT SIZE',
 'YEAR BUILT',
 'HOA/MONTH',
 'LATITUDE',
 'LONGITUDE',
 'closest_greenspace_direct',
 'closest_greenspace_coords',
 'closest_greenspace_centercoord',
 'closest_greenspace_area',
 'closest_metro_direct',
 'closest_metro_loc',
 'closest_school_direct',
 'closest_school_loc',
 'closest_college_direct',
 'closest_college_loc',
 'closest_shop_direct',
 'closest_shop_loc',
 'closest_tourism_direct',
 'closest_tourism_loc',
 'closest_leisure_direct',
 'closest_leisure_loc',
 'all_greenspace_area_under0.5km',
 'all_greenspace_area_under0.75km',
 'all_greenspace_area_under1km',
 'all_greenspace_area_under1.25km',
 'all_greenspace_area_under1.5km',
 'all_greenspace_area_under1.75km',
 'all_greenspace_area_under2km',
 'all_greenspace_area_under2.25km',
 'all_greenspace_area_under2.5km',
 'crosses_highway_under0.5km',
 'crosses_hi

In [6]:
# Work on an independent copy of full_df that leaves out its last 18 columns.
# NOTE(review): presumably these are the per-radius greenspace / highway
# columns at the end of the column listing — confirm against the CSV, since
# the slice is positional and silently shifts if columns are added.
n_trailing_columns = 18
df = full_df.iloc[:, :-n_trailing_columns].copy()

In [7]:
# One-hot encode the property type into integer indicator columns.
df = pd.get_dummies(df, columns=['PROPERTY TYPE'], dtype=int)

# Load the separately stored frame that carries the target-encoded price column.
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/encoded_df.csv'
encoded_df = pd.read_csv(url, index_col=0)

# Attach the extra feature columns. Each assignment aligns on the row index,
# so rows missing from the source frame end up as NaN in df.
for source_frame, column_name in (
    (encoded_df, 'TARGET_ENCODED_PRICE_50'),
    (full_df, 'all_greenspace_area_under2.5km'),
    (full_df, 'crosses_highway_under2.5km'),
):
    df[column_name] = source_frame[column_name]

In [8]:
# Build a LOCATION -> TARGET_ENCODED_PRICE_50 lookup; when a LOCATION value
# appears on several rows, the value from the last such row is kept.
location_to_price_dict = {
    location: encoded_price
    for location, encoded_price in zip(df['LOCATION'], df['TARGET_ENCODED_PRICE_50'])
}

# Pickle the lookup to disk and trigger a download through Colab's helper.
with open('location_to_price_dict.pkl', 'wb') as pickle_out:
    pickle.dump(location_to_price_dict, pickle_out)
files.download('location_to_price_dict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# Columns removed before modelling: sale date, address/location identifiers,
# raw coordinates, and the "*_loc" / "*coord*" columns of the nearest amenities.
# Each name is listed exactly once (the list previously repeated
# 'closest_metro_loc').
dropped = [
    'SOLD DATE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE',
    'LOCATION', 'LATITUDE', 'LONGITUDE',
    'closest_greenspace_coords', 'closest_greenspace_centercoord',
    'closest_metro_loc', 'closest_school_loc', 'closest_college_loc',
    'closest_shop_loc', 'closest_tourism_loc', 'closest_leisure_loc',
]  # EDIT THIS TO DROP MORE IF NECESSARY
df = df.drop(columns=dropped)
# Every remaining missing value is replaced with 0.
df = df.fillna(0)

In [10]:
# The PRICE column is the regression target; all other remaining columns are features.
y = df['PRICE']
X = df.drop('PRICE', axis=1)

In [11]:
# Keep an unscaled snapshot of the feature matrix (needed for model deployment),
# write it to CSV and download it from Colab.
X_raw_final = X.copy(deep=True)
raw_features_csv = 'X_raw_final.csv'
X_raw_final.to_csv(raw_features_csv)
files.download(raw_features_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
# Indicator-style columns that are left unscaled.
exclude_columns = [
    'PROPERTY TYPE_Condo/Co-op',
    'PROPERTY TYPE_Single Family Residential',
    'PROPERTY TYPE_Townhouse',
    'crosses_highway_under2.5km',
]
all_columns = X.columns
# Careful: any column not listed above is scaled, so a newly added feature is
# sent through the robust scaler by default (probably the desired behaviour).
robust_columns = list(filter(lambda col: col not in exclude_columns, all_columns))

# NOTE(review): the scaler is fit on all rows, before the train/test split in
# the following cell, so test-set statistics influence the scaling.
# NOTE(review): X is overwritten in place; re-running this cell without
# rebuilding X scales the already-scaled values again.
scaler = RobustScaler().fit(X[robust_columns])
X[robust_columns] = scaler.transform(X[robust_columns])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Candidate values for each XGBoost hyperparameter. Because every entry is a
# list, RandomizedSearchCV samples combinations from this grid.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    # The last value used to repeat 0.01, which made the search draw duplicate
    # candidates; 0.1 continues the increasing sequence and is the likely intent.
    'learning_rate': [0.01, 0.025, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2]
}

xgb_model = XGBRegressor(random_state=42)

# 100 sampled candidates x 5-fold CV, scored by R^2 and run on all cores.
# Decrease n_iter to speed up the process.
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=100, scoring='r2', cv=5,
                                   random_state=42, verbose=1, n_jobs=-1)

random_search.fit(X_train, y_train)

print("Best Parameters found: ", random_search.best_params_)
# refit is left at its default (True), so best_estimator_ has already been
# retrained on the full training split with the best parameters.
xgb_model = random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters found:  {'subsample': 1.0, 'reg_lambda': 0.1, 'reg_alpha': 0.2, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.8}


In [15]:
# Bind xgb_model to the best estimator found by the randomized search
# (the same assignment that ends the search cell).
xgb_model = random_search.best_estimator_

In [16]:
# Train the selected estimator on the training split; the fitted model is the
# cell's displayed output.
# NOTE(review): random_search was built without a refit argument, and the
# default refit=True already trains best_estimator_ on the data given to
# random_search.fit, so this fit looks redundant — confirm before removing.
xgb_model.fit(X=X_train, y=y_train)

In [17]:
# Score the tuned model on the held-out test split.
y_pred = xgb_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")

# RMSE is reported as the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {mse**0.5}")

# Pair each feature column with the model's importance score and list the
# most important features first.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': xgb_model.feature_importances_})  # might need to replace
    .sort_values(by='Importance', ascending=False)
)
print("Feature Importances:")
print(feature_importances)

R-squared (R2): 0.8677887036222853
Root Mean Squared Error (RMSE): 245222.90994087284
Feature Importances:
                                    Feature  Importance
2                               SQUARE FEET    0.359509
1                                     BATHS    0.187807
18           all_greenspace_area_under2.5km    0.071262
10                   closest_college_direct    0.048910
19               crosses_highway_under2.5km    0.048626
3                                  LOT SIZE    0.045440
16                  PROPERTY TYPE_Townhouse    0.031379
17                  TARGET_ENCODED_PRICE_50    0.029003
14                PROPERTY TYPE_Condo/Co-op    0.028631
13                   closest_leisure_direct    0.020276
4                                YEAR BUILT    0.019122
11                      closest_shop_direct    0.016781
0                                      BEDS    0.016723
5                                 HOA/MONTH    0.014977
9                     closest_school_direct    0.0142

Save training data and XGB model

In [18]:
# Display the fitted model's representation as the cell output.
xgb_model

In [19]:
# Write the (scaled) training features to CSV and download the file from Colab.
train_features_csv = 'X_train_final.csv'
X_train.to_csv(train_features_csv)
files.download(train_features_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
# Pickle the tuned model to disk, then download the file from Colab.
model_pickle_path = 'xgb_model_final.pkl'
with open(model_pickle_path, mode='wb') as model_out:
    pickle.dump(xgb_model, model_out)
files.download(model_pickle_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Pickle the training targets to disk, then download the file from Colab.
target_pickle_path = 'y_train_final.pkl'
with open(target_pickle_path, mode='wb') as target_out:
    pickle.dump(y_train, target_out)
files.download(target_pickle_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>