This notebook tests the trained model on a new address. First we gather the enriched variables for the address using OSMnx. Then we import the model and the training data so we can fit the scaler and the model. Finally, we make a prediction.

In [2]:
!pip install osmnx

Collecting osmnx
  Downloading osmnx-1.9.3-py3-none-any.whl (107 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/107.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.2/107.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: osmnx
Successfully installed osmnx-1.9.3


In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.model_selection import cross_val_score
import osmnx as ox
import networkx as nx
import geopandas
import folium
import numpy as np
from pyproj import Geod
from shapely.geometry import Polygon, MultiPolygon, Point
import warnings
import pickle

In [13]:
warnings.filterwarnings('ignore')

In [18]:
addy = '415 6th St NE, Washington, DC 20002'  # sample address; geocoded below and reused as the listing's ADDRESS

In [7]:
# Our places of interest. Downloading the walking networks takes roughly 3 minutes.
places = ["Washington, DC, USA", "Arlington, Virginia, USA", "Alexandria, Virginia, USA"]

# One walking-network graph per place, then merge them into a single graph.
graphs = []
for place_name in places:
    graphs.append(ox.graph_from_place(place_name, network_type='walk'))
G_combined = nx.compose_all(graphs)

In [8]:
# OSM tag filters used for the core features
greenspace_tags = {
    'leisure': ['park', 'garden', 'nature_reserve'],
}
metro_tags = {
    'railway': 'subway_entrance',
}

# Optional OSM tag filters
school_tags = {
    'amenity': ['kindergarten', 'school', 'library'],
}
college_tags = {
    'amenity': ['university', 'college', 'research_institute'],
}
shop_tags = {
    'shop': ['department_store', 'mall'],
}
tourism_tags = {
    'tourism': ['aquarium', 'artwork', 'attraction', 'gallery'],
}
leisure_tags = {
    'leisure': [
        'disc_golf_course', 'dog_park', 'fishing', 'fitness_centre',
        'horse_riding', 'ice_rink', 'miniature_golf', 'pitch',
        'playground', 'stadium', 'swimming_pool', 'track',
    ],
}

def get_nodes_from_place(place, tags):
    """Return the OSM features within `place` that match the `tags` filter.

    Thin wrapper around ``ox.features_from_place``; `place` and `tags` are
    passed through unchanged.
    """
    features = ox.features_from_place(place, tags)
    return features

# Collect the features matching a tag filter across all places.
def get_lat_longs(tags):
    """Query every entry of the module-level `places` list for `tags`.

    Returns a pair ``(coords, nodes_combined)``:
      * coords: list of (lat, lon) tuples for the Point features only
        (used for metro entrances and the other point amenities);
      * nodes_combined: the concatenated feature table holding every geometry
        type returned (used for the greenspaces).
    """
    per_place = [get_nodes_from_place(place, tags) for place in places]
    nodes_combined = pd.concat(per_place)

    point_rows = nodes_combined[nodes_combined.geom_type == 'Point']
    # shapely stores x = longitude and y = latitude, so swap to (lat, lon)
    coords = point_rows.geometry.apply(lambda geom: (geom.y, geom.x)).tolist()
    return coords, nodes_combined

# Query each tag group once; every call yields (point coordinates, full feature table).
greenspace_coords, nodes_combined_green = get_lat_longs(tags=greenspace_tags)
metro_coords, nodes_combined_metro = get_lat_longs(tags=metro_tags)
school_coords, nodes_combined_school = get_lat_longs(tags=school_tags)
college_coords, nodes_combined_college = get_lat_longs(tags=college_tags)
shop_coords, nodes_combined_shop = get_lat_longs(tags=shop_tags)
tourism_coords, nodes_combined_tourism = get_lat_longs(tags=tourism_tags)
leisure_coords, nodes_combined_leisure = get_lat_longs(tags=leisure_tags)

In [9]:
# Clean nodes_combined_green: keep only the geometry and the 'leisure' tag columns
nodes_combined_green=nodes_combined_green[['geometry','leisure']]

# Remove parks mapped as a single Point; only Polygon/MultiPolygon rows are kept
nodes_combined_green=nodes_combined_green[nodes_combined_green.geom_type.isin(['Polygon', 'MultiPolygon'])]
print(len(nodes_combined_green))

# Fix Park 1685 (see the Large_Park_Exploration notebook)
# NOTE(review): 1685 and the indices dropped further down are row POSITIONS in the current
# query result; presumably they shift whenever the OSM data changes — verify before re-running.

# Convert the multipolygon into a list of coordinate lists (exterior rings only)
geom=nodes_combined_green.iloc[1685,0]
mycoordslist = [list(x.exterior.coords) for x in geom.geoms]
# Keep only the vertices east of longitude -77.118427 (coord[0] is x, i.e. longitude)
newcoordslist = []
for coords in mycoordslist:
  lst1 = []
  for coord in coords:
    if coord[0] > -77.118427:
      lst1.append(coord)
  newcoordslist.append(lst1)
# Drop the rings that lost all of their vertices
newcoordslist = [x for x in newcoordslist if x != []]
# Recreate the polygons from the remaining vertices
polygons = [Polygon(coords) for coords in newcoordslist]
# Create a MultiPolygon from the polygons
multipolygon = MultiPolygon(polygons)
# Replace Park 1685's geometry (column position 0) with the trimmed multipolygon
nodes_combined_green.iloc[1685,0]=multipolygon

# Remove parks that are far outside the DC-metro borders
nodes_combined_green.drop(nodes_combined_green.index[[1726, 1696, 1729, 1964]], inplace=True) # See the Large_Park_Exploration notebook for an explanation of the index choice
print(len(nodes_combined_green))

# Add each park's centroid as a (lat, lon) tuple
nodes_combined_green['centercoords'] = nodes_combined_green.geometry.apply(lambda geom: (geom.centroid.y, geom.centroid.x)).tolist()

# Replace the index with 0..n-1 so that later cells can address parks by row number
nodes_combined_green.reset_index(drop=True, inplace=True)

  and should_run_async(code)


2159
2155


In [10]:
# Geodesic area of every greenspace on the WGS84 ellipsoid.
geod = Geod(ellps="WGS84")

# Walk the geometries in row order instead of looking them up by label
# (`geometry[x]`), so the result pairs with the rows even if the index is not 0..n-1.
# abs(): the sign of the returned area depends on the ring orientation.
area_sqmeters = [
    abs(geod.geometry_area_perimeter(poly)[0])
    for poly in nodes_combined_green.geometry
]

nodes_combined_green['area_sq'] = area_sqmeters

  and should_run_async(code)


In [19]:
# Geocode the sample address; the first two entries of the result are (lat, lon).
geocode_result = ox.geocode(addy)
latitude, longitude = geocode_result[:2]

In [20]:
latitude

38.89533

In [21]:
longitude

-76.99818706788449

In [25]:
# Create a GeoDataFrame with the specific point
gdf = geopandas.GeoDataFrame([{
    'LATITUDE': latitude,
    'LONGITUDE': longitude
}], geometry=[Point(longitude, latitude)], crs="EPSG:4326")

In [26]:
# Distance matrix between every house and every park: rows = houses, columns = parks.
# Polygon/MultiPolygon parks return the distance to their nearest edge.
# Both layers are projected onto the estimated UTM zone so distances come out in
# meters, then divided by 1000 to get kilometers.
utm = gdf.estimate_utm_crs()
# Reproject the parks once, outside the per-house lambda (the result does not depend on the house).
parks_utm = nodes_combined_green.to_crs(utm).geometry
distancematrix = gdf.geometry.to_crs(utm).apply(lambda g: parks_utm.distance(g) / 1000)

In [27]:
# Blank out (NaN) every distance of 1 km or more, then list, for each house,
# the labels of the parks that are still present.
dm_under1 = distancematrix.where(distancematrix < 1)
non_nan_indices = [
    house_row.dropna().index.tolist()
    for _, house_row in dm_under1.iterrows()
]

In [28]:
# For every house, collect the areas of the parks lying within 1 km.
# The entries of non_nan_indices are index LABELS of nodes_combined_green, so look
# them up by label and column name; `.iloc[y, 3]` only worked because the index had
# been reset and 'area_sq' happened to be the fourth column.
distance_under1 = [
    nodes_combined_green.loc[park_labels, 'area_sq'].tolist()
    for park_labels in non_nan_indices
]

# Add up all the park areas under 1 km for every house
distance_under1_sum = [sum(areas) for areas in distance_under1]

[3280444.114939552]


1

In [44]:
# Rows of the park table for the closest park of each house (looked up once).
nearest_park = nodes_combined_green.loc[distancematrix.idxmin(axis=1)]

# Engineered greenspace features for the address.
right_df = pd.DataFrame({
    'Address': [addy],
    'LATITUDE': [latitude],
    'LONGITUDE': [longitude],
})
right_df['closest_greenspace_direct'] = distancematrix.min(axis=1)
right_df['closest_greenspace_coords'] = nearest_park.geometry.values
right_df['closest_greenspace_centercoord'] = nearest_park.centercoords.values
right_df['closest_greenspace_area'] = nearest_park.area_sq.values
right_df['all_greenspace_area_under1km'] = distance_under1_sum

In [36]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometers between two lat/lon points.

    This is the haversine formula on a sphere of radius 6371 km, not a
    Euclidean distance. All four arguments are in decimal degrees.

    Returns the distance in kilometers as a NumPy float.
    """
    R = 6371  # Earth radius in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

def closest_direct_distance(lat, lon, coords):
    """Find the entry of `coords` nearest to (lat, lon).

    Parameters
    ----------
    lat, lon : coordinates of the query point, in decimal degrees.
    coords : sequence of (lat, lon) pairs to search.

    Returns
    -------
    (distance, coord) : the haversine distance in km to the nearest entry and
    that entry itself. The first entry wins when several are tied.

    Raises
    ------
    ValueError : if `coords` is empty.
    """
    if len(coords) == 0:
        raise ValueError("coords must contain at least one (lat, lon) pair")
    distances = [haversine(lat, lon, x_lat, y_lon) for x_lat, y_lon in coords]
    # Locate the minimum once and reuse its position for both return values.
    nearest = min(range(len(distances)), key=distances.__getitem__)
    return distances[nearest], coords[nearest]

In [46]:
# (distance column, location column, candidate coordinates) for each amenity type.
nearest_specs = [
    ('closest_metro_direct', 'closest_metro_loc', metro_coords),
    ('closest_school_direct', 'closest_school_loc', school_coords),
    ('closest_college_direct', 'closest_college_loc', college_coords),
    ('closest_shop_direct', 'closest_shop_loc', shop_coords),
    ('closest_tourism_direct', 'closest_tourism_loc', tourism_coords),
    ('closest_leisure_direct', 'closest_leisure_loc', leisure_coords),
]

# For every house, store the distance to and the location of the nearest amenity.
for dist_col, loc_col, candidates in nearest_specs:
    right_df[[dist_col, loc_col]] = right_df.apply(
        lambda row: closest_direct_distance(row['LATITUDE'], row['LONGITUDE'], candidates),
        axis=1,
        result_type='expand',
    )

In [47]:
right_df

Unnamed: 0,Address,LATITUDE,LONGITUDE,closest_greenspace_direct,closest_greenspace_coords,closest_greenspace_centercoord,closest_greenspace_area,all_greenspace_area_under1km,closest_metro_direct,closest_metro_loc,closest_school_direct,closest_school_loc,closest_college_direct,closest_college_loc,closest_shop_direct,closest_shop_loc,closest_tourism_direct,closest_tourism_loc,closest_leisure_direct,closest_leisure_loc
0,"415 6th St NE, Washington, DC 20002",38.89533,-76.998187,0.092965,"POLYGON ((-76.99742 38.89457, -76.99746 38.894...","(38.89466543333333, -76.99731686666667)",274.831126,3280444.0,0.821068,"(38.89743, -77.0072829)",0.188611,"(38.8944065, -76.996359)",2.167953,"(38.8860291, -77.0202022)",1.677912,"(38.9002563, -76.9798604)",0.227428,"(38.8935654, -76.9995158)",0.526958,"(38.9000638, -76.9979006)"


In [48]:
# Listing attributes for the sample address.

# Where it is
ADDRESS = addy
CITY = 'Washington'
STATE_OR_PROVINCE = 'DC'
ZIP_OR_POSTAL_CODE = 20002
LOCATION = 'Old City 1'

# What it is
PROPERTY_TYPE = 'Townhouse'
BEDS = 4
BATHS = 1.5
SQUARE_FEET = 1922
LOT_SIZE = 1830
YEAR_BUILT = 1911

# What it costs
PRICE = 975000
HOA_Month = np.nan

In [204]:
# Build the one-row listing record from the constants defined in the cell above,
# so every value lives in exactly one place instead of being repeated as a literal.
data = {
    'PROPERTY TYPE': [PROPERTY_TYPE],
    'ADDRESS': [ADDRESS],
    'CITY': [CITY],
    'STATE OR PROVINCE': [STATE_OR_PROVINCE],
    'ZIP OR POSTAL CODE': [ZIP_OR_POSTAL_CODE],
    'PRICE': [PRICE],
    'BEDS': [BEDS],
    'BATHS': [BATHS],
    'LOCATION': [LOCATION],
    'SQUARE FEET': [SQUARE_FEET],
    'LOT SIZE': [LOT_SIZE],
    'YEAR BUILT': [YEAR_BUILT],
    'HOA/MONTH': [HOA_Month]
}

In [205]:
# Listing attributes as a one-row frame (one column per key of `data`).
left_df = pd.DataFrame.from_dict(data)

In [206]:
left_df

Unnamed: 0,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH
0,Townhouse,"415 6th St NE, Washington, DC 20002",Washington,DC,20002,975000,4,1.5,Old City 1,1922,1830,1911,


In [83]:
# left_df already carries ADDRESS, so drop the duplicate here. errors='ignore'
# keeps the cell re-runnable (`del` raised KeyError on a second run).
right_df = right_df.drop(columns=['Address'], errors='ignore')

In [207]:
# Put the listing attributes and the engineered location features side by side.
df = pd.concat(objs=[left_df, right_df], axis='columns')

In [208]:
df

Unnamed: 0,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,...,closest_school_direct,closest_school_loc,closest_college_direct,closest_college_loc,closest_shop_direct,closest_shop_loc,closest_tourism_direct,closest_tourism_loc,closest_leisure_direct,closest_leisure_loc
0,Townhouse,"415 6th St NE, Washington, DC 20002",Washington,DC,20002,975000,4,1.5,Old City 1,1922,...,0.188611,"(38.8944065, -76.996359)",2.167953,"(38.8860291, -77.0202022)",1.677912,"(38.9002563, -76.9798604)",0.227428,"(38.8935654, -76.9995158)",0.526958,"(38.9000638, -76.9979006)"


In [209]:
# One-hot column names, written as '<source column>_<category>'.
property_types = [
    'PROPERTY TYPE_Condo/Co-op',
    'PROPERTY TYPE_Single Family Residential',
    'PROPERTY TYPE_Townhouse',
]

In [210]:
# Manual one-hot encoding: each dummy column is 1 when the listing's
# 'PROPERTY TYPE' equals the category named after the last underscore.
for dummy_col in property_types:
    category = dummy_col.split('_')[-1]
    df[dummy_col] = df['PROPERTY TYPE'].eq(category).astype(int)
df = df.drop(columns=['PROPERTY TYPE'])

In [211]:
df

Unnamed: 0,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,...,closest_college_loc,closest_shop_direct,closest_shop_loc,closest_tourism_direct,closest_tourism_loc,closest_leisure_direct,closest_leisure_loc,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse
0,"415 6th St NE, Washington, DC 20002",Washington,DC,20002,975000,4,1.5,Old City 1,1922,1830,...,"(38.8860291, -77.0202022)",1.677912,"(38.9002563, -76.9798604)",0.227428,"(38.8935654, -76.9995158)",0.526958,"(38.9000638, -76.9979006)",0,0,1


In [212]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/location_to_price_dict.pkl

--2024-07-20 20:35:32--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/location_to_price_dict.pkl
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/location_to_price_dict.pkl [following]
--2024-07-20 20:35:32--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/location_to_price_dict.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10157 (9.9K) [application/octet-stream]
Saving to: ‘location_to_price_dict.pkl.2’


2024-07-20 20:35:32 (79.6 MB/s) - ‘location_to_price_dict.pkl.2’ saved [10157/10157]



In [213]:
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('location_to_price_dict.pkl', 'rb') as pkl_file:
    location_to_price_dict = pickle.load(pkl_file)

In [214]:
# Target-encode LOCATION via the lookup dict. A LOCATION missing from the dict
# maps to NaN (and is later turned into 0 by the fillna cell).
location_price = df['LOCATION'].map(location_to_price_dict)
df['TARGET_ENCODED_PRICE_50'] = location_price

In [215]:
df

Unnamed: 0,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,SQUARE FEET,LOT SIZE,...,closest_shop_direct,closest_shop_loc,closest_tourism_direct,closest_tourism_loc,closest_leisure_direct,closest_leisure_loc,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse,TARGET_ENCODED_PRICE_50
0,"415 6th St NE, Washington, DC 20002",Washington,DC,20002,975000,4,1.5,Old City 1,1922,1830,...,1.677912,"(38.9002563, -76.9798604)",0.227428,"(38.8935654, -76.9995158)",0.526958,"(38.9000638, -76.9979006)",0,0,1,937493.0


In [216]:
df.columns.to_list()

['ADDRESS',
 'CITY',
 'STATE OR PROVINCE',
 'ZIP OR POSTAL CODE',
 'PRICE',
 'BEDS',
 'BATHS',
 'LOCATION',
 'SQUARE FEET',
 'LOT SIZE',
 'YEAR BUILT',
 'HOA/MONTH',
 'LATITUDE',
 'LONGITUDE',
 'closest_greenspace_direct',
 'closest_greenspace_coords',
 'closest_greenspace_centercoord',
 'closest_greenspace_area',
 'all_greenspace_area_under1km',
 'closest_metro_direct',
 'closest_metro_loc',
 'closest_school_direct',
 'closest_school_loc',
 'closest_college_direct',
 'closest_college_loc',
 'closest_shop_direct',
 'closest_shop_loc',
 'closest_tourism_direct',
 'closest_tourism_loc',
 'closest_leisure_direct',
 'closest_leisure_loc',
 'PROPERTY TYPE_Condo/Co-op',
 'PROPERTY TYPE_Single Family Residential',
 'PROPERTY TYPE_Townhouse',
 'TARGET_ENCODED_PRICE_50']

In [217]:
# Columns removed before prediction. A bit different than our previous dropped
# list: PRICE is dropped as well for deployment.
# ('closest_metro_loc' used to appear twice in this list; the duplicate is removed.)
dropped = ['PRICE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE', 'ZIP OR POSTAL CODE', 'LOCATION', 'LATITUDE', 'LONGITUDE',
           'closest_greenspace_coords', 'closest_greenspace_centercoord', 'closest_metro_loc',
           'closest_school_loc', 'closest_college_loc', 'closest_shop_loc', 'closest_tourism_loc', 'closest_leisure_loc']
df = df.drop(columns=dropped)

In [218]:
# Replace missing values (HOA/MONTH is NaN for this listing) with 0.
df = df.fillna(value=0)

In [219]:
df

Unnamed: 0,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,closest_greenspace_direct,closest_greenspace_area,all_greenspace_area_under1km,closest_metro_direct,closest_school_direct,closest_college_direct,closest_shop_direct,closest_tourism_direct,closest_leisure_direct,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse,TARGET_ENCODED_PRICE_50
0,4,1.5,1922,1830,1911,0.0,0.092965,274.831126,3280444.0,0.821068,0.188611,2.167953,1.677912,0.227428,0.526958,0,0,1,937493.0


In [220]:
# Training-side feature tables from the project repository; the first CSV
# column is the index. X_raw fits the scaler, X_train refits the model.
x_raw_url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/X_raw.csv'
x_train_url = 'https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/X_train.csv'

X_raw = pd.read_csv(x_raw_url, index_col=0)
X_train = pd.read_csv(x_train_url, index_col=0)

In [221]:
# The one-hot property-type columns are left unscaled.
# Careful: this assumes any new column we add should go through the robust
# scaler (probably the case).
exclude_columns = [
    'PROPERTY TYPE_Condo/Co-op',
    'PROPERTY TYPE_Single Family Residential',
    'PROPERTY TYPE_Townhouse',
]
all_columns = X_raw.columns
robust_columns = all_columns[~all_columns.isin(exclude_columns)].tolist()

# Fit the scaler on the raw training features.
scaler = RobustScaler()
scaler.fit(X_raw[robust_columns])

In [222]:
# Scale the columns listed in robust_columns with the fitted scaler.
# NOTE(review): this overwrites df in place, so re-running the cell scales the
# already-scaled values a second time; rebuild df before re-running.
df[robust_columns] = scaler.transform(df[robust_columns])

In [223]:
df

Unnamed: 0,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,closest_greenspace_direct,closest_greenspace_area,all_greenspace_area_under1km,closest_metro_direct,closest_school_direct,closest_college_direct,closest_shop_direct,closest_tourism_direct,closest_leisure_direct,PROPERTY TYPE_Condo/Co-op,PROPERTY TYPE_Single Family Residential,PROPERTY TYPE_Townhouse,TARGET_ENCODED_PRICE_50
0,0.5,-0.25,0.382236,0.286169,-0.634146,-0.252715,-0.299873,-0.188973,1.826285,-0.104913,-0.493995,-0.115694,0.117351,-0.435429,0.113884,0,0,1,0.335131


In [224]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/xgb_model.pkl

--2024-07-20 20:36:05--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/xgb_model.pkl
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/xgb_model.pkl [following]
--2024-07-20 20:36:05--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/xgb_model.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495125 (484K) [application/octet-stream]
Saving to: ‘xgb_model.pkl.1’


2024-07-20 20:36:05 (10.1 MB/s) - ‘xgb_model.pkl.1’ saved [495125/495125]



In [225]:
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('xgb_model.pkl', 'rb') as pkl_file:
    xgb_model = pickle.load(pkl_file)

In [226]:
!wget https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/y_train.pkl

--2024-07-20 20:36:11--  https://github.com/cbarnes5/DATA606CapstoneProject/raw/main/y_train.pkl
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/y_train.pkl [following]
--2024-07-20 20:36:11--  https://raw.githubusercontent.com/cbarnes5/DATA606CapstoneProject/main/y_train.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 45869 (45K) [application/octet-stream]
Saving to: ‘y_train.pkl.1’


2024-07-20 20:36:11 (3.99 MB/s) - ‘y_train.pkl.1’ saved [45869/45869]



In [227]:
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('y_train.pkl', 'rb') as pkl_file:
    y_train = pickle.load(pkl_file)

In [228]:
xgb_model

In [229]:
# Fit the unpickled model on the downloaded training data.
# NOTE(review): presumably this replaces whatever fitted state the pickle carried — confirm that is intended.
xgb_model.fit(X=X_train, y=y_train)

In [230]:
# Predicted price for the prepared, scaled feature row.
xgb_model.predict(X=df)

array([1014393.2], dtype=float32)