We will download data from Redfin on houses currently for sale in the area.
We will then pre-process the data, predict the sale price of each house, and compare the predictions to the listed prices.

In [3]:
!pip install osmnx

Collecting osmnx
  Downloading osmnx-1.9.4-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-1.9.4-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.5/107.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmnx
Successfully installed osmnx-1.9.4


In [74]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.model_selection import cross_val_score
import osmnx as ox
import networkx as nx
import geopandas
import folium
import numpy as np
from pyproj import Geod
from shapely.geometry import Polygon, MultiPolygon, Point, LineString
from shapely.ops import nearest_points
import warnings
import pickle

  and should_run_async(code)


In [122]:
# Suppress every Python warning from this point on so that cell output stays readable.
warnings.filterwarnings('ignore')

In [6]:
places = ["Washington, DC, USA", "Arlington, Virginia, USA", "Alexandria, Virginia, USA"] #Our places of interest. This cell has about a 3m runtime
# Fetch the 'walk' network graph of each place, then compose them into a single graph.
graphs = [ox.graph_from_place(place, network_type='walk') for place in places]
G_combined = nx.compose_all(graphs)

In [134]:
# Load the Redfin export from the project repository; the first CSV column becomes the index.
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/redfin_2024-07-29-11-03-05.csv'
df = pd.read_csv(url, index_col = 0)

In [135]:
#remove duplicate rows (the helper flag column is kept, as before)
df['duplicates'] = df.duplicated()
df = df[~df['duplicates']]

#reset index; the previous index is kept as a regular column
df = df.reset_index()

#Fill N/A HOA values with 0.
#Assign the result back instead of calling fillna(inplace=True) on the column
#selection: that form works on an intermediate object and does not reliably
#update df (it does nothing when pandas copy-on-write is enabled).
df['HOA/MONTH'] = df['HOA/MONTH'].fillna(0)

#remove parking lots and vacant land
df = df[df['PROPERTY TYPE'].isin(['Multi-Family (2-4 Unit)', 'Condo/Co-op', 'Townhouse', 'Single Family Residential'])]

In [136]:
# Keep only the columns that are used downstream.
kept_columns = [
    'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE',
    'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH',
    'LATITUDE', 'LONGITUDE',
]
df = df[kept_columns]

# Restrict to the three property types and the three cities of interest.
is_wanted_type = df['PROPERTY TYPE'].isin(['Condo/Co-op', 'Townhouse', 'Single Family Residential'])
is_wanted_city = df['CITY'].isin(['Washington', 'Arlington', 'Alexandria'])
df = df[is_wanted_type & is_wanted_city]

In [137]:
# Display the filtered listings.
df

Unnamed: 0,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LATITUDE,LONGITUDE
1,,Townhouse,649 Morton St NW,Washington,DC,20010.0,625000.0,2.0,1.5,Columbia Heights,1088.0,1388.0,1928.0,0.0,38.932268,-77.022599
2,,Single Family Residential,5886 14th St N,Arlington,VA,22205.0,1299000.0,5.0,3.5,WESTOVER,2313.0,7395.0,1939.0,0.0,38.882172,-77.143247
3,,Townhouse,1637 Irving St NW,Washington,DC,20010.0,1245000.0,4.0,2.0,Mount Pleasant,2486.0,2348.0,1917.0,0.0,38.928982,-77.038523
4,,Condo/Co-op,520 John Carlyle St #219,Alexandria,VA,22314.0,445000.0,1.0,1.0,CARLYLE SQUARE,760.0,,2007.0,590.0,38.801935,-77.061552
5,,Single Family Residential,5628 Gary Ave,Alexandria,VA,22311.0,799000.0,4.0,1.5,SHIRLEY FOREST,1862.0,10475.0,1959.0,0.0,38.835506,-77.126423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,,Townhouse,2817-A S Woodrow St #1,Arlington,VA,22206.0,675000.0,3.0,2.0,COURTBRDGE I&II,2066.0,,1982.0,581.0,38.840766,-77.097163
347,,Townhouse,545 S Saint Asaph St,Alexandria,VA,22314.0,985000.0,3.0,2.5,TANNERY YARD,1260.0,748.0,1976.0,100.0,38.799803,-77.047125
348,,Condo/Co-op,4550 Strutfield Ln #2223,Alexandria,VA,22311.0,319000.0,1.0,1.0,PALAZZO AT PARK CENTER,818.0,,2000.0,479.0,38.835587,-77.105740
349,,Single Family Residential,410 Aspen St NW,Washington,DC,20012.0,1050000.0,5.0,3.0,Brightwood,2224.0,8250.0,1913.0,0.0,38.972130,-77.019134


In [138]:
#Tags we're using
# Each dict maps a tag key to the value(s) to match; the dicts are passed to get_lat_longs below.
greenspace_tags = {'leisure': ['park', 'garden', 'nature_reserve']}
metro_tags = {'railway': 'subway_entrance'}
#Optional tags
school_tags = {'amenity': ['kindergarten', 'school', 'library']}
college_tags = {'amenity': ['university', 'college', 'research_institute']}
shop_tags = {'shop': ['department_store', 'mall']}
tourism_tags = {'tourism': ['aquarium', 'artwork', 'attraction', 'gallery']}
leisure_tags = {'leisure': ['disc_golf_course', 'dog_park', 'fishing', 'fitness_centre', 'horse_riding', 'ice_rink', 'miniature_golf', 'pitch', 'playground', 'stadium', 'swimming_pool', 'track']}

# Function to get nodes from a place based on tags
def get_nodes_from_place(place, tags):
    """Return whatever ``ox.features_from_place`` finds for ``place`` and ``tags``.

    Thin wrapper: both arguments are forwarded unchanged.
    """
    return ox.features_from_place(place, tags)

# Function to collect the features matching a tag set across all places
def get_lat_longs(tags):
  """Gather the features matching ``tags`` for every entry of the notebook-level ``places``.

  Returns a pair ``(coords, nodes_combined)``:
  - ``coords``: list of ``(geom.y, geom.x)`` tuples, one per Point feature (used for metro etc.)
  - ``nodes_combined``: the concatenated frame of all features, which may hold
    Point/Polygon/MultiPolygon geometries (used for greenspaces)
  """
  per_place = [get_nodes_from_place(place, tags) for place in places]
  nodes_combined = pd.concat(per_place)
  point_rows = nodes_combined[nodes_combined.geom_type == 'Point']
  coords = point_rows.geometry.apply(lambda geom: (geom.y, geom.x)).tolist()
  return coords, nodes_combined

#call the function for each group
# Each call returns (list of (y, x) tuples for the Point features, combined frame of all features).
greenspace_coords, nodes_combined_green = get_lat_longs(greenspace_tags)
metro_coords, nodes_combined_metro = get_lat_longs(metro_tags)
school_coords, nodes_combined_school = get_lat_longs(school_tags)
college_coords, nodes_combined_college = get_lat_longs(college_tags)
shop_coords, nodes_combined_shop = get_lat_longs(shop_tags)
tourism_coords, nodes_combined_tourism = get_lat_longs(tourism_tags)
leisure_coords, nodes_combined_leisure = get_lat_longs(leisure_tags)

In [139]:
#clean nodes_combined_green: keep only the geometry and leisure columns
nodes_combined_green=nodes_combined_green[['geometry','leisure']]

#remove Point parks (only Polygon and MultiPolygon geometries are kept)
nodes_combined_green=nodes_combined_green[nodes_combined_green.geom_type.isin(['Polygon', 'MultiPolygon'])]

#Fix Park 1685 (See Large_Park_Exploration notebook)
# NOTE(review): 1685 and the indices dropped below are row positions in the current
# query result; presumably they shift if the OSM data changes - verify before re-running.

#convert multipolygon into list of lists (exterior rings only; interior rings are not carried over)
geom=nodes_combined_green.iloc[1685,0]
mycoordslist = [list(x.exterior.coords) for x in geom.geoms]
#remove the vertices that lie west of longitude -77.118427 (coord[0] is the x value, i.e. the longitude)
newcoordslist = []
for coords in mycoordslist:
  lst1 = []
  for coord in coords:
    if coord[0] > -77.118427:
      lst1.append(coord)
  newcoordslist.append(lst1)
#clean new list: drop rings that lost all of their vertices
newcoordslist = [x for x in newcoordslist if x != []]
# Recreate the polygons
polygons = [Polygon(coords) for coords in newcoordslist]
# Create a MultiPolygon from the polygons
multipolygon = MultiPolygon(polygons)
#replace Park 1685's polygon with this polygon
nodes_combined_green.iloc[1685,0]=multipolygon

#remove parks that are far outside DC-metro borders
nodes_combined_green.drop(nodes_combined_green.index[[1726, 1696, 1729, 1964]], inplace=True) #See Large_Park_Exploration notebook for explanation of index choice

#add center coords as (centroid.y, centroid.x) tuples
nodes_combined_green['centercoords'] = nodes_combined_green.geometry.apply(lambda geom: (geom.centroid.y, geom.centroid.x)).tolist()

#clean the greenspace nodes combined dataframe: replace the index with 0..n-1 (later cells index parks by these numbers)
nodes_combined_green.reset_index(drop=True, inplace=True)

  and should_run_async(code)


In [140]:
# Geodesic calculator on the WGS84 ellipsoid
geod = Geod(ellps="WGS84")

# Area of every greenspace geometry, looked up by row label 0..n-1.
# geometry_area_perimeter returns (area, perimeter); the absolute value of the area is stored.
area_sqmeters = [
  abs(geod.geometry_area_perimeter(nodes_combined_green.geometry[row])[0])
  for row in range(0, len(nodes_combined_green))
]

nodes_combined_green['area_sq'] = area_sqmeters

  and should_run_async(code)


In [141]:
# Turn the listings into a GeoDataFrame with one point per house, built from
# LONGITUDE (x) and LATITUDE (y) and tagged as EPSG:4326.
gdf = geopandas.GeoDataFrame(df, geometry=geopandas.points_from_xy(df.LONGITUDE, df.LATITUDE), crs="EPSG:4326")

  and should_run_async(code)


In [142]:
#create a distance matrix of all the distances between every house and every park. Polygon/MultiPolygon parks will return distance to nearest edge
#we need to project onto utm to have the distances in meters
utm = gdf.estimate_utm_crs()
#project the parks once, up front: the projection does not depend on the house,
#so doing it inside the lambda repeated the same work for every house
parks_utm = nodes_combined_green.to_crs(utm).geometry
distancematrix = gdf.geometry.to_crs(utm).apply(lambda g: parks_utm.distance(g)/1000) #Rows = House, Columns = Parks, results in km

  and should_run_async(code)


In [143]:
# Keep only the house-park distances below 2.5 (km); every other cell becomes NaN.
dm_under2_5 = distancematrix[distancematrix<2.5]

  and should_run_async(code)


In [144]:
# For every house (row), the column labels of the parks that survived the 2.5 km mask
non_nan_indices = [
    house_row[house_row.notna()].index.tolist()
    for _, house_row in dm_under2_5.iterrows()
]

  and should_run_async(code)


In [145]:
# For every house, the value in column 3 of nodes_combined_green (the park area)
# of each park listed for it in non_nan_indices. Despite the variable name,
# these are areas, not distances.
distance_under2_5 = [
  [nodes_combined_green.iloc[park, 3] for park in parks_in_range]
  for parks_in_range in non_nan_indices
]

#add up the areas of all parks under 2.5 km for every house
distance_under2_5_sum = [sum(areas) for areas in distance_under2_5]

In [146]:
# Fetch the features tagged highway=motorway for all places; used by the highway-crossing check.
highway_tags = {'highway': ['motorway']}
hw_coords, nodes_combined_hw = get_lat_longs(highway_tags)

In [147]:
def get_crosses_highway_variable(area_underthresh, non_nan_indices):
  """Flag, per house, whether the line to its largest in-range park crosses a highway feature.

  Reads the notebook-level frames ``gdf`` (houses), ``nodes_combined_green`` (parks)
  and ``nodes_combined_hw`` (highway features).

  Parameters
  ----------
  area_underthresh : list of list
      Per house, the areas of the parks under the distance threshold.
  non_nan_indices : list of list
      Per house, the row positions in ``nodes_combined_green`` of those same parks,
      in the same order as ``area_underthresh``.

  Returns
  -------
  list of bool
      One entry per house. False when the house has no park under the threshold
      (or its largest park area is 0), or when no highway feature is crossed.
  """
  # Look the geometries up by name rather than by column position
  # (previously gdf.iloc[x, 16]), so extra columns cannot break the lookup.
  house_geoms = gdf.geometry
  park_geoms = nodes_combined_green.geometry
  # Materialise the highway geometries once instead of calling iterrows() per house.
  highway_geoms = list(nodes_combined_hw['geometry'])

  highwaycross = []
  for x, areas in enumerate(area_underthresh):
    # Area of the largest park under the threshold; 0 stands for "no parks".
    largest_area = max(areas) if areas != [] else 0
    if largest_area == 0:
      highwaycross.append(False)
      continue
    # Position of that park within this house's list, mapped to its row in nodes_combined_green
    park = park_geoms.iloc[non_nan_indices[x][areas.index(largest_area)]]
    house = house_geoms.iloc[x]
    # Straight line from the closest point of the park to the house
    p1, p2 = nearest_points(park, house)
    line = LineString([p1, p2])
    highwaycross.append(any(line.crosses(highway) for highway in highway_geoms))
  return highwaycross

  and should_run_async(code)


In [148]:
def get_all_greenspacearea_underthreshold(dm_underthreshold):
  """Collect, per house, the areas of the parks that remain in a thresholded distance matrix.

  Reads the notebook-level frame ``nodes_combined_green`` and its ``area_sq`` column.

  Parameters
  ----------
  dm_underthreshold : DataFrame
      Rows = houses, columns = parks; NaN marks parks outside the threshold.

  Returns
  -------
  tuple
      ``(area_underthreshold_sum, area_underthreshold, non_nan_indices)``: per house,
      the summed area, the list of individual areas, and the list of column labels
      of the non-NaN parks.
  """
  # Select the area column by name instead of by position 3, so the lookup
  # survives any change to the column layout of nodes_combined_green.
  park_areas = nodes_combined_green['area_sq']
  # Column label of every non-NaN value in each row of dm_underthreshold
  non_nan_indices = [row[~row.isna()].index.tolist() for _, row in dm_underthreshold.iterrows()]
  # Areas of all parks under the threshold; the labels are used as row positions
  area_underthreshold = [[park_areas.iloc[park] for park in parks] for parks in non_nan_indices]
  # Total area of parks under the threshold for every house
  area_underthreshold_sum = [sum(areas) for areas in area_underthreshold]
  return area_underthreshold_sum, area_underthreshold, non_nan_indices

In [149]:
# For the 2.5 km mask: per-house summed park area, per-house list of park areas, per-house park indices.
area_under2_5_sum, area_under_2_5, nni_2_5 = get_all_greenspacearea_underthreshold(dm_under2_5)

In [150]:
# Per-house flag: does the line to the largest park under 2.5 km cross a highway feature?
highwaycross_2_5 = get_crosses_highway_variable(area_under_2_5, nni_2_5)

In [151]:
# Rows of nodes_combined_green for the nearest park of every house (one lookup, reused below)
nearest_park_labels = distancematrix.idxmin(axis=1)
nearest_parks = nodes_combined_green.loc[nearest_park_labels]

df['closest_greenspace_direct'] = distancematrix.min(axis=1)
df['closest_greenspace_coords'] = nearest_parks.geometry.values
df['closest_greenspace_centercoord'] = nearest_parks.centercoords.values
df['closest_greenspace_area'] = nearest_parks.area_sq.values

In [152]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two latitude/longitude points given in degrees.

    Uses the haversine formula with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2.0
    half_dlon = np.radians(lon2 - lon1) / 2.0
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle

def closest_direct_distance(lat, lon, coords):
    """Return ``(distance, coord)`` for the entry of ``coords`` nearest to ``(lat, lon)``.

    ``coords`` is a sequence of ``(lat, lon)`` pairs; the distance comes from ``haversine``.
    On ties the first nearest entry wins.
    """
    distances = [haversine(lat, lon, other_lat, other_lon) for other_lat, other_lon in coords]
    nearest = min(range(len(distances)), key=distances.__getitem__)
    return distances[nearest], coords[nearest]

  and should_run_async(code)


In [153]:
# For every amenity kind, add closest_<kind>_direct (distance to the nearest one)
# and closest_<kind>_loc (its coordinates) to each listing.
amenity_coords = {
    'metro': metro_coords,
    'school': school_coords,
    'college': college_coords,
    'shop': shop_coords,
    'tourism': tourism_coords,
    'leisure': leisure_coords,
}
for amenity, amenity_points in amenity_coords.items():
    df[[f'closest_{amenity}_direct', f'closest_{amenity}_loc']] = df.apply(
        lambda row: closest_direct_distance(row['LATITUDE'], row['LONGITUDE'], amenity_points),
        axis=1,
        result_type='expand',
    )

In [154]:
# One-hot encode PROPERTY TYPE into integer indicator columns.
df = pd.get_dummies(df, columns=['PROPERTY TYPE'], dtype = int)

  and should_run_async(code)


In [155]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/location_to_price_dict.pkl

  and should_run_async(code)


--2024-07-29 18:59:06--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/location_to_price_dict.pkl
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/location_to_price_dict.pkl [following]
--2024-07-29 18:59:06--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/location_to_price_dict.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10157 (9.9K) [application/octet-stream]
Saving to: ‘location_to_price_dict.pkl.2’


2024-07-29 18:59:06 (80.3 MB/s) - ‘location_to_price_dict.pkl.2’ saved [10157/10157]



In [156]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only load pickles from a source you trust.
with open('location_to_price_dict.pkl', 'rb') as file:
    location_to_price_dict = pickle.load(file)
# Map each listing's LOCATION through the loaded mapping; locations it does not contain are expected to become NaN.
df['TARGET_ENCODED_PRICE_50'] = df['LOCATION'].map(location_to_price_dict)
# These lists hold one value per house in row order, so they are assigned positionally.
df['all_greenspace_area_under2.5km'] = area_under2_5_sum
df['crosses_highway_under2.5km'] = highwaycross_2_5

In [157]:
# Keep the listed prices aside before PRICE is dropped from the feature frame.
listed_price = df['PRICE'].copy()
# Columns that are not model features. A bit different from our previous dropped list:
# PRICE is dropped for deployment. ('closest_metro_loc' used to be listed twice.)
dropped = ['SOLD DATE', 'PRICE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE', 'LOCATION', 'LATITUDE', 'LONGITUDE', 'closest_greenspace_coords', 'closest_greenspace_centercoord',
           'closest_metro_loc', 'closest_school_loc', 'closest_college_loc', 'closest_shop_loc', 'closest_tourism_loc', 'closest_leisure_loc']
df = df.drop(columns = dropped)

  and should_run_async(code)


In [158]:
# Replace every remaining missing value with 0 (for example the empty LOT SIZE of condos).
df = df.fillna(0)

  and should_run_async(code)


In [159]:
# Load the raw feature frame (used below to fit the scaler) ...
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/X_raw_final.csv'
X_raw = pd.read_csv(url, index_col = 0)
# ... and the training feature frame (used below to fit the model).
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/X_train_final.csv'
X_train = pd.read_csv(url, index_col = 0)

In [160]:
# Columns that are left unscaled: the property-type indicator columns and the highway flag.
exclude_columns = ['PROPERTY TYPE_Condo/Co-op', 'PROPERTY TYPE_Single Family Residential', 'PROPERTY TYPE_Townhouse', 'crosses_highway_under2.5km'] #careful, this assumes any new column we add should be thrown into robust scaler (probably is the case)
all_columns = X_raw.columns
robust_columns = [col for col in all_columns if col not in exclude_columns]
# Fit the scaler on X_raw so the new listings are scaled with statistics learned from that frame.
scaler = RobustScaler()
scaler.fit(X_raw[robust_columns])

  and should_run_async(code)


In [161]:
# Apply the fitted scaler to the same columns of the new listings, overwriting them.
df[robust_columns] = scaler.transform(df[robust_columns])

  and should_run_async(code)


In [162]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/xgb_model_final.pkl

  and should_run_async(code)


--2024-07-29 18:59:07--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/xgb_model_final.pkl
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/xgb_model_final.pkl [following]
--2024-07-29 18:59:07--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/xgb_model_final.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 733613 (716K) [application/octet-stream]
Saving to: ‘xgb_model_final.pkl.1’


2024-07-29 18:59:07 (12.8 MB/s) - ‘xgb_model_final.pkl.1’ saved [733613/733613]



In [163]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only load pickles from a source you trust.
with open('xgb_model_final.pkl', 'rb') as file:
    xgb_model = pickle.load(file)

In [164]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/y_train_final.pkl

--2024-07-29 18:59:07--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/y_train_final.pkl
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/y_train_final.pkl [following]
--2024-07-29 18:59:07--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/y_train_final.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 45875 (45K) [application/octet-stream]
Saving to: ‘y_train_final.pkl.1’


2024-07-29 18:59:07 (3.95 MB/s) - ‘y_train_final.pkl.1’ saved [45875/45875]



In [165]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only load pickles from a source you trust.
with open('y_train_final.pkl', 'rb') as file:
    y_train = pickle.load(file)

In [166]:
# Fit the unpickled model on the training features and targets.
# NOTE(review): presumably this retrains the model, so any fitted state stored in the pickle is replaced - confirm this is intended.
xgb_model.fit(X_train, y_train)

  and should_run_async(code)


In [169]:
# Select the training columns explicitly so the features are passed in the same
# order the model was fitted with, regardless of the order in which the columns
# were added to df above.
y_pred = xgb_model.predict(df[X_train.columns])

  and should_run_async(code)


Now let's reload the original DataFrame and attach the predictions to see the results.

In [171]:
# Reload the raw listings and repeat the row filters used for the feature frame,
# so the predictions can be attached to human-readable rows.
url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/redfin_2024-07-29-11-03-05.csv'
df = pd.read_csv(url, index_col = 0)
#remove duplicate rows
df['duplicates'] = df.duplicated()
df = df[~df['duplicates']]

#reset index; the previous index is kept as a regular column
df = df.reset_index()

#Fill N/A HOA values with 0.
#Assign the result back instead of calling fillna(inplace=True) on the column
#selection: that form works on an intermediate object and does not reliably
#update df (it does nothing when pandas copy-on-write is enabled).
df['HOA/MONTH'] = df['HOA/MONTH'].fillna(0)

#remove parking lots and vacant land
df = df[df['PROPERTY TYPE'].isin(['Multi-Family (2-4 Unit)', 'Condo/Co-op', 'Townhouse', 'Single Family Residential'])]
df = df[['PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE']]
df = df[df['PROPERTY TYPE'].isin(['Condo/Co-op', 'Townhouse', 'Single Family Residential']) & df['CITY'].isin(['Washington', 'Arlington', 'Alexandria'])]

In [173]:
# Attach the predictions positionally: this relies on the reloaded frame having the
# same rows, in the same order, as the feature frame the predictions were made from.
df['pred_price'] = y_pred

  and should_run_async(code)


In [174]:
# Display the listings together with their predicted prices.
df

  and should_run_async(code)


Unnamed: 0,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LATITUDE,LONGITUDE,pred_price
1,Townhouse,649 Morton St NW,Washington,DC,20010.0,625000.0,2.0,1.5,Columbia Heights,1088.0,1388.0,1928.0,0.0,38.932268,-77.022599,6.393133e+05
2,Single Family Residential,5886 14th St N,Arlington,VA,22205.0,1299000.0,5.0,3.5,WESTOVER,2313.0,7395.0,1939.0,0.0,38.882172,-77.143247,1.355891e+06
3,Townhouse,1637 Irving St NW,Washington,DC,20010.0,1245000.0,4.0,2.0,Mount Pleasant,2486.0,2348.0,1917.0,0.0,38.928982,-77.038523,1.058641e+06
4,Condo/Co-op,520 John Carlyle St #219,Alexandria,VA,22314.0,445000.0,1.0,1.0,CARLYLE SQUARE,760.0,,2007.0,590.0,38.801935,-77.061552,4.323214e+05
5,Single Family Residential,5628 Gary Ave,Alexandria,VA,22311.0,799000.0,4.0,1.5,SHIRLEY FOREST,1862.0,10475.0,1959.0,0.0,38.835506,-77.126423,7.273896e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,Townhouse,2817-A S Woodrow St #1,Arlington,VA,22206.0,675000.0,3.0,2.0,COURTBRDGE I&II,2066.0,,1982.0,581.0,38.840766,-77.097163,6.128288e+05
347,Townhouse,545 S Saint Asaph St,Alexandria,VA,22314.0,985000.0,3.0,2.5,TANNERY YARD,1260.0,748.0,1976.0,100.0,38.799803,-77.047125,6.713867e+05
348,Condo/Co-op,4550 Strutfield Ln #2223,Alexandria,VA,22311.0,319000.0,1.0,1.0,PALAZZO AT PARK CENTER,818.0,,2000.0,479.0,38.835587,-77.105740,3.937866e+05
349,Single Family Residential,410 Aspen St NW,Washington,DC,20012.0,1050000.0,5.0,3.0,Brightwood,2224.0,8250.0,1913.0,0.0,38.972130,-77.019134,8.793669e+05


In [176]:
# Listed price (y1) versus predicted price (y2) for every listing
y1, y2 = df['PRICE'], df['pred_price']

r2 = r2_score(y1, y2)
print(f"R-squared (R2): {r2}")

mse = mean_squared_error(y1, y2)
print(f"Root Mean Squared Error (RMSE): {mse**0.5}")

# Feature importances of the fitted model, most important first
feature_importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': xgb_model.feature_importances_}
).sort_values(by='Importance', ascending=False)  #might need to replace
print("Feature Importances:")
print(feature_importances)

R-squared (R2): 0.8297541607262138
Root Mean Squared Error (RMSE): 283000.0372204725
Feature Importances:
                                    Feature  Importance
2                               SQUARE FEET    0.359509
1                                     BATHS    0.187807
18           all_greenspace_area_under2.5km    0.071262
10                   closest_college_direct    0.048910
19               crosses_highway_under2.5km    0.048626
3                                  LOT SIZE    0.045440
16                  PROPERTY TYPE_Townhouse    0.031379
17                  TARGET_ENCODED_PRICE_50    0.029003
14                PROPERTY TYPE_Condo/Co-op    0.028631
13                   closest_leisure_direct    0.020276
4                                YEAR BUILT    0.019122
11                      closest_shop_direct    0.016781
0                                      BEDS    0.016723
5                                 HOA/MONTH    0.014977
9                     closest_school_direct    0.01425

  and should_run_async(code)
