In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import cross_val_score

In [41]:
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/refined_data_sample_distancetesting.csv'
full_df = pd.read_csv(url, index_col = 0)

In [42]:
full_df

Unnamed: 0,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,...,all_greenspace_area_under2.5km,crosses_highway_under0.5km,crosses_highway_under0.75km,crosses_highway_under1km,crosses_highway_under1.25km,crosses_highway_under1.5km,crosses_highway_under1.75km,crosses_highway_under2km,crosses_highway_under2.25km,crosses_highway_under2.5km
0,2024-04-10,Condo/Co-op,1024 N Utah St #223,Arlington,VA,22201.0,405000.0,1.0,1.0,WESTVIEW AT BALLSTON METRO,...,1.714021e+06,False,False,False,False,False,False,False,False,False
1,2024-03-29,Single Family Residential,4717 26th St N,Arlington,VA,22207.0,1600000.0,3.0,3.0,SHIRLEY WOODS,...,9.658134e+05,False,False,False,False,False,False,False,False,False
2,2024-03-14,Condo/Co-op,1029 N Stuart St N #216,Arlington,VA,22201.0,415951.0,1.0,1.0,SUMMERWALK I&II,...,1.695740e+06,False,False,False,False,False,False,False,False,False
3,2024-05-02,Condo/Co-op,851 N Glebe Rd #103,Arlington,VA,22203.0,450000.0,1.0,1.0,CONTINENTAL,...,1.823800e+06,False,False,False,False,False,False,False,False,False
4,2024-05-03,Single Family Residential,4224 23rd St N,Arlington,VA,22207.0,2100000.0,5.0,4.5,LORCOM GROVE,...,2.520036e+06,False,False,False,False,False,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2350,2024-05-31,Townhouse,1250 Delafield Pl NE,Washington,DC,20017.0,540000.0,3.0,2.0,Riggs Park,...,1.061562e+06,False,False,False,False,False,False,False,False,False
2351,2024-03-13,Townhouse,1271 Delafield Pl NE,Washington,DC,20017.0,640000.0,3.0,2.5,Riggs Park,...,9.774984e+05,False,False,False,False,False,False,False,False,False
2352,2024-05-01,Townhouse,1237 Emerson St NE,Washington,DC,20017.0,680000.0,3.0,3.5,Riggs Park,...,1.061562e+06,False,False,False,False,False,False,False,False,False
2353,2024-04-16,Single Family Residential,1704 Michigan Ave NE,Washington,DC,20017.0,799900.0,3.0,2.5,Michigan Park,...,1.026494e+06,False,False,False,False,False,False,False,False,False


In [43]:
full_df.columns.to_list()

['SOLD DATE',
 'PROPERTY TYPE',
 'ADDRESS',
 'CITY',
 'STATE OR PROVINCE',
 'ZIP OR POSTAL CODE',
 'PRICE',
 'BEDS',
 'BATHS',
 'LOCATION',
 'SQUARE FEET',
 'LOT SIZE',
 'YEAR BUILT',
 'HOA/MONTH',
 'LATITUDE',
 'LONGITUDE',
 'closest_greenspace_direct',
 'closest_greenspace_coords',
 'closest_greenspace_centercoord',
 'closest_greenspace_area',
 'closest_metro_direct',
 'closest_metro_loc',
 'closest_school_direct',
 'closest_school_loc',
 'closest_college_direct',
 'closest_college_loc',
 'closest_shop_direct',
 'closest_shop_loc',
 'closest_tourism_direct',
 'closest_tourism_loc',
 'closest_leisure_direct',
 'closest_leisure_loc',
 'all_greenspace_area_under0.5km',
 'all_greenspace_area_under0.75km',
 'all_greenspace_area_under1km',
 'all_greenspace_area_under1.25km',
 'all_greenspace_area_under1.5km',
 'all_greenspace_area_under1.75km',
 'all_greenspace_area_under2km',
 'all_greenspace_area_under2.25km',
 'all_greenspace_area_under2.5km',
 'crosses_highway_under0.5km',
 'crosses_hi

In [44]:
part_df = full_df.iloc[:, :-18]

In [45]:
part_df = pd.get_dummies(part_df, columns=['PROPERTY TYPE'], dtype = int)
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/encoded_df.csv'
encoded_df = pd.read_csv(url, index_col = 0)
part_df['TARGET_ENCODED_PRICE_50'] = encoded_df['TARGET_ENCODED_PRICE_50']

In [46]:
dropped = ['SOLD DATE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE', 'LOCATION', 'LATITUDE', 'LONGITUDE', 'closest_greenspace_coords', 'closest_greenspace_centercoord', 'closest_metro_loc',
           'closest_metro_loc', 'closest_school_loc', 'closest_college_loc', 'closest_shop_loc', 'closest_tourism_loc', 'closest_leisure_loc'] #EDIT THIS TO DROP MORE IF NECESSARY
part_df = part_df.drop(columns = dropped)

In [47]:
part_df = part_df.fillna(0)

In [48]:
thresholds = ['0.5', '0.75', '1', '1.25', '1.5', '1.75', '2', '2.25', '2.5']

In [49]:
dfs = {}
for threshold in thresholds:
    column_name1 = f'all_greenspace_area_under{threshold}km'
    column_name2 = f'crosses_highway_under{threshold}km'
    df_threshold = part_df.copy()
    df_threshold[column_name1] = full_df[column_name1]
    df_threshold[column_name2] = full_df[column_name2]
    dfs[f'df_{threshold}'] = df_threshold

In [50]:
dfs['df_1'].columns.to_list()

['PRICE',
 'BEDS',
 'BATHS',
 'SQUARE FEET',
 'LOT SIZE',
 'YEAR BUILT',
 'HOA/MONTH',
 'closest_greenspace_direct',
 'closest_greenspace_area',
 'closest_metro_direct',
 'closest_school_direct',
 'closest_college_direct',
 'closest_shop_direct',
 'closest_tourism_direct',
 'closest_leisure_direct',
 'PROPERTY TYPE_Condo/Co-op',
 'PROPERTY TYPE_Single Family Residential',
 'PROPERTY TYPE_Townhouse',
 'TARGET_ENCODED_PRICE_50',
 'all_greenspace_area_under1km',
 'crosses_highway_under1km']

In [51]:
y = part_df['PRICE']
for key in dfs:
  dfs[key] = dfs[key].drop(columns = ['PRICE'])

In [52]:
for key in dfs:
  print(key[3:])

0.5
0.75
1
1.25
1.5
1.75
2
2.25
2.5


In [53]:
for key in dfs:
  thresh = key[3:]
  exclude_columns = ['PROPERTY TYPE_Condo/Co-op', 'PROPERTY TYPE_Single Family Residential', 'PROPERTY TYPE_Townhouse', f'crosses_highway_under{thresh}km']
  all_columns = dfs[key].columns
  robust_columns = [col for col in all_columns if col not in exclude_columns]
  scaler = RobustScaler()
  dfs[key][robust_columns] = scaler.fit_transform(dfs[key][robust_columns])

At this point in the project we know that XGBoost gives us the best results

In [54]:
for key in dfs:
  X_train, X_test, y_train, y_test = train_test_split(dfs[key], y, test_size=0.2, random_state=42)
  param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.025, 0.05, 0.01],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2]
   }

  xgb_model = XGBRegressor(random_state=42)

  random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid,
                                   n_iter=100, scoring='r2', cv=5,
                                   random_state=42, verbose=1, n_jobs=-1)

  random_search.fit(X_train, y_train)

  print("Best Parameters found: ", random_search.best_params_)
  xgb_model = random_search.best_estimator_

  scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='r2')
  print(f"Score for xgb_model on {key} : {scores}")
  print(f"Mean score for xgb_model on {key} : {np.mean(scores)}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters found:  {'subsample': 0.8, 'reg_lambda': 0.2, 'reg_alpha': 0.2, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}
Score for xgb_model on df_0.5 : [0.81494497 0.82692651 0.85510832 0.77578823 0.8545382 ]
Mean score for xgb_model on df_0.5 : 0.8254612452611664
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters found:  {'subsample': 0.8, 'reg_lambda': 0.2, 'reg_alpha': 0.2, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.9}
Score for xgb_model on df_0.75 : [0.82009354 0.83636863 0.84249959 0.8186203  0.84855164]
Mean score for xgb_model on df_0.75 : 0.8332267420113642
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters found:  {'subsample': 0.9, 'reg_lambda': 0.2, 'reg_alpha': 0.2, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.2, 'colsam

Based on the results, we are going to go with a threshold of 2.5 km for all_greenspace_area and crosses highway. Let's look at feature importance

In [56]:
y_pred = xgb_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")

mse = mean_squared_error(y_test, y_pred)
print(f"Root Mean Squared Error (RMSE): {mse**0.5}")

feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': xgb_model.feature_importances_}) #might need to replace
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(feature_importances)

R-squared (R2): 0.8677887036222853
Root Mean Squared Error (RMSE): 245222.90994087284
Feature Importances:
                                    Feature  Importance
2                               SQUARE FEET    0.359509
1                                     BATHS    0.187807
18           all_greenspace_area_under2.5km    0.071262
10                   closest_college_direct    0.048910
19               crosses_highway_under2.5km    0.048626
3                                  LOT SIZE    0.045440
16                  PROPERTY TYPE_Townhouse    0.031379
17                  TARGET_ENCODED_PRICE_50    0.029003
14                PROPERTY TYPE_Condo/Co-op    0.028631
13                   closest_leisure_direct    0.020276
4                                YEAR BUILT    0.019122
11                      closest_shop_direct    0.016781
0                                      BEDS    0.016723
5                                 HOA/MONTH    0.014977
9                     closest_school_direct    0.0142