In [3]:
import os, tqdm

In [4]:
from sklearn.metrics import r2_score
import numpy as np

# metric
def metric(label, pred):
    """Score log-space predictions against log-space labels.

    Both inputs are exponentiated first, so they are expected to hold
    values on a natural-log scale.

    Parameters
    ----------
    label, pred : array-like
        Ground truth and predictions with identical shapes. Anything
        accepted by ``np.asarray`` works (lists, ndarrays, pandas Series).
        Inputs are converted to plain arrays so that pandas index alignment
        cannot pair up the wrong rows or introduce NaNs.

    Returns
    -------
    tuple
        ``(male, rmse, mape)``:
        mean absolute error of the logs, root mean squared error of the
        exponentiated values, and the *median* absolute percentage error
        of the exponentiated values.
    """
    label = np.asarray(label)
    pred = np.asarray(pred)
    assert label.shape == pred.shape
    label = np.exp(label)
    pred = np.exp(pred)

    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        # Weight that drops entries whose (exponentiated) label equals 0 and
        # rescales the remaining ones so that means stay comparable.
        mask = np.not_equal(label, 0).astype(np.float32)
        mask /= np.mean(mask)

        male = np.abs(np.subtract(np.log(pred), np.log(label))).astype(np.float32)
        male = np.mean(np.nan_to_num(male * mask))

        abs_err = np.abs(np.subtract(pred, label)).astype(np.float32)
        rmse = np.sqrt(np.mean(np.nan_to_num(np.square(abs_err) * mask)))

        mape = np.nan_to_num(np.divide(abs_err, label) * mask)
        mape = np.median(mape) # np.mean(mape) -- author leverages median
    return male, rmse, mape

In [5]:
# List the entries of the current working directory (displayed as the cell's value).
os.listdir('.')

['house-dataset-osm-road2vec-poa.ipynb',
 'house-dataset-feat-kde-fc-kc-poa-sp.ipynb',
 'mygeometries.py',
 'house-dataset-osm-neighbor-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process.ipynb',
 '.ipynb_checkpoints',
 'brazil_data',
 'osmnx-1.3.0.zip',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.html',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-sp.ipynb',
 'house-dataset-osm-road2vec-kc-POI-extraction.ipynb',
 'house-dataset-osm-sp-NodeEmbedding.ipynb',
 'house-dataset-osm-poa-NodeEmbedding.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-kc.ipynb',
 'house-dataset-osm-road2vec-sp-POI-extraction.ipynb',
 'house-dataset-POI-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process-Copy1.ipynb',
 'house-dataset-osm-kc-NodeEmbedding.ipynb',
 'osm_poi',
 'generateSE.py',
 'house-dataset-feat-kde+gaussianem_important-datagen-same-cluster.ipynb',
 'house-dataset-osm-road2vec-fc-POI-extraction.ipynb',
 'test-neighbor-mean-value.ipynb',
 'house-dataset-osm-roa

In [6]:
# Dataset identifiers.
# NOTE(review): no cell visible here reads `datasets`; the loading loop hardcodes ['kc'].
datasets = ['kc']

In [7]:
import numpy as np

In [8]:
# Mapbox basemap configurations: a style URL plus the access token.
# The token is read from the MAPBOX_TOKEN environment variable rather than
# being hardcoded, so the credential is not stored in (or shared with) the
# notebook. It falls back to an empty string when the variable is not set.
_mapbox_token = os.environ.get('MAPBOX_TOKEN', '')

streetmap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': _mapbox_token,
}
mybasemap = {
    #'style': 'mapbox://styles/mapbox/streets-v9',
    'style': 'mapbox://styles/mapbox/satellite-v9',
    'token': _mapbox_token,
}

In [9]:
from cartoframes.viz import *

Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [10]:
import pandas as pd

In [11]:
import geopandas as gpd

In [12]:
for dname in ['kc']:#['kc', 'fc', 'sp', 'poa']:
    print(dname)
    data = np.load(f'{dname}/data.npz')

    # Columns 0 and 1 of X_* are read as latitude / longitude; every other
    # column becomes an anonymous attribute named attr0, attr1, ...
    train_X = data['X_train']
    test_X = data['X_test']
    attr_names = [f'attr{col - 2}' for col in range(2, train_X.shape[1])]

    dict1 = {'lat': train_X[:, 0], 'lng': train_X[:, 1], 'price': data['y_train']}
    dict2 = {'lat': test_X[:, 0], 'lng': test_X[:, 1], 'price': data['y_test']}
    for col, attr_name in enumerate(attr_names, start=2):
        dict1[attr_name] = train_X[:, col]
        dict2[attr_name] = test_X[:, col]

    df1 = pd.DataFrame(dict1)
    df2 = pd.DataFrame(dict2)
    # Train rows first, test rows after; later cells split by len(train_gdf).
    df = pd.concat([df1, df2])

    # Point geometries (x = lng, y = lat) tagged as EPSG:4326.
    train_gdf = gpd.GeoDataFrame(
        df1.copy(), geometry=gpd.points_from_xy(x=df1.lng, y=df1.lat), crs='EPSG:4326')
    test_gdf = gpd.GeoDataFrame(
        df2.copy(), geometry=gpd.points_from_xy(x=df2.lng, y=df2.lat), crs='EPSG:4326')
    house_gdf = gpd.GeoDataFrame(
        df.copy(), geometry=gpd.points_from_xy(x=df.lng, y=df.lat), crs='EPSG:4326')
    #print(np.exp(df['price'].values).mean())

    # `gdf` is the same object as `house_gdf`, so the string conversion of
    # low-cardinality attributes below is visible through both names.
    gdf = house_gdf
    for attr in attr_names:
        n_unique = gdf[attr].nunique()
        print(attr, n_unique)
        if n_unique < 30:
            gdf[attr] = gdf[attr].astype(str)
    gdfcpy = gdf.copy()


#     display(Map(
#         [
#             Layer(gdfcpy, color_category_style(tattr, cat=cat, palette='cb_blues'), encode_data=False),
#             Layer(gdf, color_continuous_style('price', palette='sunset'), encode_data=False),
#         ],
#         basemap=mybasemap))

    break

kc
attr0 13
attr1 30
attr2 1038
attr3 9782
attr4 6
attr5 2
attr6 5
attr7 5
attr8 12
attr9 946
attr10 306
attr11 116
attr12 70
attr13 70
attr14 777
attr15 8689


In [13]:
# Build a map layer from the house points; the resulting object is the cell's displayed value.
# NOTE(review): `Layer` presumably comes from the `cartoframes.viz` star-import — confirm.
Layer(house_gdf)

In [14]:
import osmnx as ox
from shapely.geometry import *

In [15]:
# total_bounds yields (min x, min y, max x, max y) of all house geometries.
x1, y1, x2, y2 = gdf.total_bounds

# Midpoint of the bounding box, used as the centre of the OSM queries.
house_center_longitude = (x1 + x2) / 2
house_center_latitude = (y1 + y2) / 2

In [16]:
# Print the signature and docstring of the OSMnx function used by the download cell.
help(ox.geometries.geometries_from_point)

Help on function geometries_from_point in module osmnx.geometries:

geometries_from_point(center_point, tags, dist=1000)
    Create GeoDataFrame of OSM entities within some distance N, S, E, W of a point.
    
    Parameters
    ----------
    center_point : tuple
        the (lat, lng) center point around which to get the geometries
    tags : dict
        Dict of tags used for finding objects in the selected area. Results
        returned are the union, not intersection of each individual tag.
        Each result matches at least one given tag. The dict keys should be
        OSM tags, (e.g., `building`, `landuse`, `highway`, etc) and the dict
        values should be either `True` to retrieve all items with the given
        tag, or a string to get a single tag-value combination, or a list of
        strings to get multiple values for the given tag. For example,
        `tags = {'building': True}` would return all building footprints in
        the area. `tags = {'amenity':True, 'la

In [17]:
import hereosmnx
# Bounding-box centre as a one-row GeoDataFrame, reprojected from EPSG:4326 to EPSG:3310.
center_point = gpd.GeoDataFrame(
    geometry=[Point(house_center_longitude, house_center_latitude)],
    crs='epsg:4326',
).to_crs('epsg:3310')

# Largest house-to-centre distance in EPSG:3310 plus 1000; used as `dist` for the OSM query.
# NOTE(review): EPSG:3310 is a California-specific projection — confirm it suits every dataset.
houses_projected = gdf.to_crs('epsg:3310')
max_distance = houses_projected.distance(center_point.iloc[0].geometry).max() + 1000

In [18]:
# One single-entry {OSM key: value} dict per point-of-interest category.
tag_dict_list = [
    {osm_key: osm_value}
    for osm_key, osm_value in (
        ('amenity', 'hospital'),
        ('amenity', 'university'),
        ('amenity', 'school'),
        ('amenity', 'place_of_worship'),
        ('landuse', 'cemetery'),
        ('landuse', 'commercial'),
        ('landuse', 'industrial'),
        ('landuse', 'retail'),
        ('landuse', 'railway'),
        ('leisure', 'golf_course'),
        ('leisure', 'park'),
        ('leisure', 'sports_centre'),
        ('natural', 'water'),
        ('natural', 'wood'),
        ('aeroway', 'aerodrome'),
    )
]

In [19]:
import tqdm
# Download every POI category from OSM once and cache it as GeoJSON.
# Categories whose file already exists are skipped.
poi_dir = f'osm_poi/{dname}'
# Create the cache folder up front so a missing directory cannot make the
# save fail after the (slow) download has already been done.
os.makedirs(poi_dir, exist_ok=True)

for ttag_dict in tqdm.tqdm(tag_dict_list):
    # File stem is "key-value" (comma-joined if a dict ever holds several tags).
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    if os.path.isfile(f'{poi_dir}/{fname}.geojson'):
        continue
    buildings = ox.geometries.geometries_from_point((house_center_latitude, house_center_longitude), 
                                        tags=ttag_dict,
                                        dist=max_distance)
    
    buildings = buildings.reset_index().copy()

    # Cast every non-geometry column to str before writing the GeoJSON.
    for col in buildings.columns:
        if col != 'geometry':
            buildings[col] = buildings[col].astype(str)
            
    buildings.to_file(f'{poi_dir}/{fname}.geojson', driver='GeoJSON')
    print(fname)

100%|███████████████████████████████████████████| 15/15 [00:17<00:00,  1.16s/it]

landuse-railway





In [99]:
import tqdm
# Distance from every house to the nearest geometry of each POI category.
# NOTE(review): house_gdf is in EPSG:4326, so these distances are in degrees.
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join('-'.join(items) for items in ttag_dict.items())
    mbuildings = gpd.read_file(f'osm_poi/{dname}/{fname}.geojson')
    # Despite its name, `water_geo` is the union of whichever category is being processed.
    water_geo = mbuildings.unary_union
    house_gdf[f'{fname}_dist'] = house_gdf.distance(water_geo)
    print(fname)


  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
  7%|██████████▏                                                                                                                                   | 1/14 [00:02<00:33,  2.57s/it]

amenity-hospital



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 14%|████████████████████▎                                                                                                                         | 2/14 [00:04<00:24,  2.02s/it]

amenity-university



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 21%|██████████████████████████████▍                                                                                                               | 3/14 [00:34<02:43, 14.87s/it]

amenity-school



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 29%|████████████████████████████████████████▌                                                                                                     | 4/14 [01:16<04:17, 25.80s/it]

amenity-place_of_worship



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 36%|██████████████████████████████████████████████████▋                                                                                           | 5/14 [01:19<02:38, 17.58s/it]

landuse-cemetery



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 43%|████████████████████████████████████████████████████████████▊                                                                                 | 6/14 [02:23<04:26, 33.32s/it]

landuse-commercial



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 50%|███████████████████████████████████████████████████████████████████████                                                                       | 7/14 [02:58<03:55, 33.69s/it]

landuse-industrial



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 57%|█████████████████████████████████████████████████████████████████████████████████▏                                                            | 8/14 [03:34<03:26, 34.41s/it]

landuse-retail



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 64%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 9/14 [03:40<02:08, 25.60s/it]

leisure-golf_course



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 71%|████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 10/14 [05:28<03:24, 51.13s/it]

leisure-park



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 79%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 11/14 [05:35<01:52, 37.50s/it]

leisure-sports_centre



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                    | 12/14 [09:23<03:10, 95.44s/it]

natural-water



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 13/14 [12:09<01:56, 116.99s/it]

natural-wood



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [12:12<00:00, 52.30s/it]

aeroway-aerodrome





In [155]:
# Map([
#     Layer(mbuildings[mbuildings.geometry.type == 'Polygon']),
#     Layer(mbuildings[mbuildings.geometry.type == 'Point'])
    
# ])


In [20]:
import tqdm
# Same distance feature as before, but computed against 'Polygon' geometries only.
# NOTE(review): the filter also drops 'MultiPolygon' features — confirm that is intended.
# NOTE(review): house_gdf is in EPSG:4326, so these distances are in degrees.
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join('-'.join(items) for items in ttag_dict.items())
    mbuildings = gpd.read_file(f'osm_poi/{dname}/{fname}.geojson')
    is_polygon = mbuildings.geometry.type == 'Polygon'
    mbuildings = mbuildings[is_polygon].copy()
    # Despite its name, `water_geo` is the union of whichever category is being processed.
    water_geo = mbuildings.unary_union
    house_gdf[f'{fname}_dist'] = house_gdf.distance(water_geo)
    print(fname, len(mbuildings))


  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)

  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 13%|█████▊                                      | 2/15 [00:00<00:02,  4.93it/s]

amenity-hospital 39
amenity-university 11



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 20%|████████▊                                   | 3/15 [00:03<00:18,  1.50s/it]

amenity-school 1014



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 27%|███████████▋                                | 4/15 [00:07<00:29,  2.65s/it]

amenity-place_of_worship 1295



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 33%|██████████████▋                             | 5/15 [00:08<00:18,  1.81s/it]

landuse-cemetery 99



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 40%|█████████████████▌                          | 6/15 [00:15<00:33,  3.68s/it]

landuse-commercial 2717



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 47%|████████████████████▌                       | 7/15 [00:19<00:29,  3.74s/it]

landuse-industrial 1563



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 53%|███████████████████████▍                    | 8/15 [00:23<00:27,  3.94s/it]

landuse-retail 1510



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 60%|██████████████████████████▍                 | 9/15 [00:24<00:17,  2.91s/it]

landuse-railway 45



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 67%|████████████████████████████▋              | 10/15 [00:24<00:10,  2.19s/it]

leisure-golf_course 96



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 73%|███████████████████████████████▌           | 11/15 [00:36<00:19,  4.93s/it]

leisure-park 3267



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 80%|██████████████████████████████████▍        | 12/15 [00:36<00:10,  3.64s/it]

leisure-sports_centre 193



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 87%|█████████████████████████████████████▎     | 13/15 [01:02<00:20, 10.45s/it]

natural-water 4477



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 93%|████████████████████████████████████████▏  | 14/15 [01:20<00:12, 12.50s/it]

natural-wood 4534



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
100%|███████████████████████████████████████████| 15/15 [01:20<00:00,  5.36s/it]

aeroway-aerodrome 19





In [51]:

# Per-dataset bandwidth of the Gaussian kernel that turns a distance into a proximity.
beta = {'fc': 0.045, 'kc':0.035, 'sp': 0.020, 'poa': 0.025}

fname_list = []
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join('-'.join(items) for items in ttag_dict.items())
    fname_list.append(fname)

    dist = house_gdf[fname + '_dist']
    # proximity = exp(-(d / beta)^2): 1 at distance 0, decaying towards 0 far away.
    house_gdf[fname + '_proximity'] = np.exp(-(dist / beta[dname]) ** 2)
#     house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / np.std(house_gdf[fname + '_dist']))**2)
    print(fname, np.std(dist))

100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 937.09it/s]

amenity-hospital 0.03124324429638121
amenity-university 0.05630307198012221
amenity-school 0.007637229324726503
amenity-place_of_worship 0.01068897066031709
landuse-cemetery 0.025249522791088205
landuse-commercial 0.01588557896208886
landuse-industrial 0.012236312476212364
landuse-retail 0.012821461931258086
landuse-railway 0.05595083173288442
leisure-golf_course 0.022012051025430523
leisure-park 0.00599341970465253
leisure-sports_centre 0.026566966700692397
natural-water 0.005620632620887171
natural-wood 0.0057370892863312034
aeroway-aerodrome 0.04619138509911294





In [38]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

#X = house_gdf[attr_names + [fn + '_proximity' for fn in fname_list]].values
#y = house_gdf['price'].values

# Recursive feature elimination on the training rows only (the first
# len(train_gdf) rows of house_gdf), down to a single feature so that
# ranking_ orders all of them.
rfe_columns = attr_names + [fn + '_proximity' for fn in fname_list]
train_rows = house_gdf.iloc[:len(train_gdf)]

X = train_rows[rfe_columns].values
y = train_rows['price'].values

estimator = RandomForestRegressor(random_state=42)
selector = RFE(estimator, n_features_to_select=1, step=1).fit(X, y)

In [39]:
feat_list = attr_names + [fn + '_proximity' for fn in fname_list]

# RFE ranks of the proximity features only (they follow the attr columns in X).
poi_ranking = selector.ranking_[len(attr_names):]

# Map each POI category to its 1-based position among the POI features,
# printing the categories from best to worst.
rank_dict = dict()
for i, r in enumerate(np.argsort(poi_ranking), start=1):
    print(fname_list[r])
    rank_dict[fname_list[r]] = i

landuse-industrial
amenity-university
leisure-sports_centre
natural-water
aeroway-aerodrome
landuse-retail
landuse-commercial
leisure-golf_course
amenity-hospital
amenity-place_of_worship
landuse-railway
natural-wood
landuse-cemetery
amenity-school
leisure-park


In [40]:
# Tab-separated "category<TAB>rank", listed in fname_list order.
for k in fname_list:
    print(f'{k}\t{rank_dict[k]}')

amenity-hospital	9
amenity-university	2
amenity-school	14
amenity-place_of_worship	10
landuse-cemetery	13
landuse-commercial	7
landuse-industrial	1
landuse-retail	6
landuse-railway	11
leisure-golf_course	8
leisure-park	15
leisure-sports_centre	3
natural-water	4
natural-wood	12
aeroway-aerodrome	5


In [52]:
# Write house_gdf, with whatever derived columns it holds at this point, to GeoJSON.
house_gdf.to_file(f'{dname}/house_gdf_poi_processed.geojson', driver='GeoJSON')

In [47]:


import statsmodels.api as sm
import numpy as np
import pandas as pd


# Sweep the Gaussian bandwidth `zeta` and report the R^2 of an OLS fit of
# price on the resulting proximity features.
# NOTE(review): this kernel is exp(-(d/zeta)^2 / 2), whereas the `beta` cell
# uses exp(-(d/beta)^2) — confirm which form is intended.

# The category names do not depend on zeta, so build them once.
fname_list = [
    ','.join(['-'.join(items) for items in ttag_dict.items()])
    for ttag_dict in tag_dict_list
]
proximity_cols = [fn + '_proximity' for fn in fname_list]

for zeta in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5, 1]:
    # Work on a copy: writing the trial proximities into house_gdf would
    # leave it holding the last zeta's values for every later cell.
    df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
    for fname in fname_list:
        df[fname + '_proximity'] = np.exp(- (df[fname + '_dist'] / zeta)**2 / 2)

    # model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
    model = sm.OLS(df["price"], sm.add_constant(df[proximity_cols]), missing="drop").fit()
    print(zeta, model.rsquared, sep='\t')

0.01	0.22712296200342186
0.02	0.3145653102328885
0.03	0.34106813469890795
0.04	0.34100866253148054
0.05	0.32995600837647754
0.06	0.3155182983052407
0.07	0.301612558892446
0.08	0.2896525770027353
0.09	0.2798047268041174
0.1	0.2718138324095919
0.5	0.2319509269641895
1	0.23070589578494094


In [44]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


# OLS of price on the POI proximity features only (plus an intercept).
proximity_cols = [fn + '_proximity' for fn in fname_list]

df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
# Cast every attribute column to float (the loading cell turns some into str).
for attr in attr_names:
    df[attr] = df[attr].astype(float)

# model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
design = sm.add_constant(df[proximity_cols])
model = sm.OLS(df["price"], design, missing="drop").fit()
print(model.summary2())

                          Results: Ordinary least squares
Model:                    OLS                   Adj. R-squared:          0.182     
Dependent Variable:       price                 AIC:                     29287.5748
Date:                     2023-11-07 17:08      BIC:                     29415.2679
No. Observations:         21608                 Log-Likelihood:          -14628.   
Df Model:                 15                    F-statistic:             321.8     
Df Residuals:             21592                 Prob (F-statistic):      0.00      
R-squared:                0.183                 Scale:                   0.22691   
-----------------------------------------------------------------------------------
                                    Coef.  Std.Err.    t     P>|t|   [0.025  0.975]
-----------------------------------------------------------------------------------
const                              13.1239   0.0155 844.3551 0.0000 13.0935 13.1544
amenity-hospital_p

In [46]:
# R^2 of whichever `model` was fitted most recently in this kernel.
print(model.rsquared)

0.1827038318952272


In [45]:
# Fitted coefficient of every proximity feature, one tab-separated line each.
dmp = dict(model.params)
print(dname)
for k in (fn + '_proximity' for fn in fname_list):
    print(k, dmp[k], sep='\t')

kc
amenity-hospital_proximity	-0.0891938692715123
amenity-university_proximity	0.5433497509034078
amenity-school_proximity	-0.10117189698845169
amenity-place_of_worship_proximity	-0.20855514235783618
landuse-cemetery_proximity	-0.046416551312900955
landuse-commercial_proximity	-0.17826309332949988
landuse-industrial_proximity	-0.4501302352472598
landuse-retail_proximity	0.029470011031664814
landuse-railway_proximity	0.192430046043305
leisure-golf_course_proximity	0.04252124774018293
leisure-park_proximity	0.21878411123630595
leisure-sports_centre_proximity	0.24515405941677773
natural-water_proximity	0.014267721009660825
natural-wood_proximity	0.08546377295600255
aeroway-aerodrome_proximity	-0.5088348551196258


In [172]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb  # Import XGBoost

# Train/test split of house_gdf: the first len(train_gdf) rows are the training set.
train_rows = house_gdf.iloc[:len(train_gdf)]
test_rows = house_gdf.iloc[len(train_gdf):]
proximity_features = attr_names + [fn + '_proximity' for fn in fname_list]

# Baseline: XGBoost on the house attributes only.
xgb_regressor = xgb.XGBRegressor()
xgb_regressor.fit(train_rows[attr_names].values, train_rows['price'].values)
pred_price = xgb_regressor.predict(test_rows[attr_names].values)
print(metric(test_rows['price'], pred_price))

# Same model with the POI proximity features appended.
xgb_regressor_proximity = xgb.XGBRegressor()
xgb_regressor_proximity.fit(train_rows[proximity_features].values, train_rows['price'].values)
pred_price_proximity = xgb_regressor_proximity.predict(test_rows[proximity_features].values)
print(metric(test_rows['price'], pred_price_proximity))


(0.13230416, 145130.1, 0.09641606569828381)
(0.12883827, 138498.73, 0.09394274778799841)


In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Random forest on the house attributes only; scored on the test rows.
train_rows = house_gdf.iloc[:len(train_gdf)]
test_rows = house_gdf.iloc[len(train_gdf):]

rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(train_rows[attr_names].values, train_rows['price'].values)

pred_price = rf_regressor.predict(test_rows[attr_names].values)
print(metric(test_rows['price'], pred_price))

#(0.1611773, 174137.61, 0.11322664344820148)


(0.1611773, 174137.61, 0.11322664344820148)


In [24]:
# Random forest on attributes plus POI proximity features; scored on the test rows.
train_rows = house_gdf.iloc[:len(train_gdf)]
test_rows = house_gdf.iloc[len(train_gdf):]
proximity_features = attr_names + [fn + '_proximity' for fn in fname_list]

rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(train_rows[proximity_features].values, train_rows['price'].values)

pred_price = rf_regressor.predict(test_rows[proximity_features].values)
print(metric(test_rows['price'], pred_price))

#(0.14807186, 163863.34, 0.10121785888935364)

(0.15113868, 165543.69, 0.10709395433930297)


In [170]:
from sklearn.linear_model import LinearRegression

# Linear regression on the house attributes only; scored on the test rows.
# (The variable keeps the name `rf_regressor` although it holds a linear model.)
train_rows = house_gdf.iloc[:len(train_gdf)]
test_rows = house_gdf.iloc[len(train_gdf):]

rf_regressor = LinearRegression()
rf_regressor.fit(train_rows[attr_names].values, train_rows['price'].values)

pred_price = rf_regressor.predict(test_rows[attr_names].values)
print(metric(test_rows['price'], pred_price))

# (0.2449086, 246864.64, 0.2019051282673396)


(0.2449086, 246864.64, 0.2019051282673396)


In [171]:
# Linear regression on attributes plus POI proximity features; scored on the test rows.
# (The variable keeps the name `rf_regressor` although it holds a linear model.)
train_rows = house_gdf.iloc[:len(train_gdf)]
test_rows = house_gdf.iloc[len(train_gdf):]
proximity_features = attr_names + [fn + '_proximity' for fn in fname_list]

rf_regressor = LinearRegression()
rf_regressor.fit(train_rows[proximity_features].values, train_rows['price'].values)

pred_price = rf_regressor.predict(test_rows[proximity_features].values)
print(metric(test_rows['price'], pred_price))

# (0.20207812, 213049.52, 0.15763048507885563)


(0.18905048, 198920.53, 0.14902903976320478)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR  # required: SVR is instantiated in the loop below
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Compare LR / SVR / RF on standardised house attributes (no POI features).
train_data = house_gdf.iloc[:len(train_gdf)][attr_names].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names].values.astype(float)

# Feature scaling uses statistics of the training rows only.
mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# Target statistics, again from the training rows only.
ymean = house_gdf.iloc[:len(train_gdf)]['price'].values.mean()
ystd = house_gdf.iloc[:len(train_gdf)]['price'].values.std()

for name, regressor in [('LR', LinearRegression()), ('SV', SVR(max_iter=1000)), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    # Standardise the target with (y - mean) / std so that the inverse
    # transform applied to the predictions below (pred * std + mean) matches.
    regressor.fit(
        train_data, 
        (house_gdf.iloc[:len(train_gdf)]['price'].values - ymean) / ystd)

    pred_price = regressor.predict(test_data)*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

In [None]:
# Imported here so the cell does not depend on names left behind by other cells.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Compare LR / SVR / RF on standardised attributes plus POI proximity features.
train_data = house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)

# Feature scaling uses statistics of the training rows only.
mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# Target statistics from the training rows, computed in this cell.
train_price = house_gdf.iloc[:len(train_gdf)]['price'].values
ymean = train_price.mean()
ystd = train_price.std()

for name, regressor in [('LR', LinearRegression()), ('SV', SVR(max_iter=1000)), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    # Fit on the standardised target: the predictions are mapped back with
    # pred * std + mean below, which is only valid for a standardised fit.
    regressor.fit(
        train_data, 
        (train_price - ymean) / ystd)

    pred_price = regressor.predict(test_data)*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR  # required: SVR is instantiated in the loop below
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Compare LR / SVR / RF on standardised house attributes (no POI features).
train_data = house_gdf.iloc[:len(train_gdf)][attr_names].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names].values.astype(float)

# Feature scaling uses statistics of the training rows only.
mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# Target statistics, again from the training rows only.
ymean = house_gdf.iloc[:len(train_gdf)]['price'].values.mean()
ystd = house_gdf.iloc[:len(train_gdf)]['price'].values.std()

for name, regressor in [('LR', LinearRegression()), ('SV', SVR(max_iter=1000)), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    # Standardise the target with (y - mean) / std so that the inverse
    # transform applied to the predictions below (pred * std + mean) matches.
    regressor.fit(
        train_data, 
        (house_gdf.iloc[:len(train_gdf)]['price'].values - ymean) / ystd)

    pred_price = regressor.predict(test_data)*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

In [None]:
# Imported here so the cell does not depend on names left behind by other cells.
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Compare LR / SVR / RF on standardised attributes plus POI proximity features.
train_data = house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)

# Feature scaling uses statistics of the training rows only.
mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# Target statistics from the training rows, computed in this cell.
train_price = house_gdf.iloc[:len(train_gdf)]['price'].values
ymean = train_price.mean()
ystd = train_price.std()

for name, regressor in [('LR', LinearRegression()), ('SV', SVR(max_iter=1000)), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    # Fit on the standardised target: the predictions are mapped back with
    # pred * std + mean below, which is only valid for a standardised fit.
    regressor.fit(
        train_data, 
        (train_price - ymean) / ystd)

    pred_price = regressor.predict(test_data)*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

In [53]:
# Assemble the train/test arrays (features, coordinates, price, neighbour
# indices/distances from `data`, POI proximities and distances) and save them.
n_train = len(train_gdf)
proximity_cols = [fn + '_proximity' for fn in fname_list]
distance_cols = [fn + '_dist' for fn in fname_list]

newdata = dict()
# Training rows are the first n_train rows; test rows are everything after.
for split, rows in (('Train', slice(None, n_train)), ('Test', slice(n_train, None))):
    part = house_gdf.iloc[rows]
    newdata[f'{split}_feat'] = part[attr_names].astype(np.float32).values
    newdata[f'{split}_latlon'] = part[['lat', 'lng']].values
    newdata[f'{split}_price'] = part[['price']].values
    # NOTE(review): assumes the neighbour arrays in `data` are ordered train-then-test — confirm.
    for key in ('idx_geo', 'dist_geo', 'idx_eucli', 'dist_eucli'):
        newdata[f'{split}_{key}'] = data[key][rows]
    newdata[f'{split}_poiprox'] = part[proximity_cols].values
    newdata[f'{split}_poidist'] = part[distance_cols].values


np.savez(f'processed/{dname}/processed_data_poi.npz', **newdata)

In [173]:
# Append the POI proximity columns to X_train / X_test of the original
# archive and store the result under a new file name.
newdata = dict(np.load(f'{dname}/data.npz'))
n_train = len(train_gdf)
proximity_cols = [fn + '_proximity' for fn in fname_list]

for key, rows in (('X_train', slice(None, n_train)), ('X_test', slice(n_train, None))):
    poi_block = house_gdf.iloc[rows][proximity_cols].values
    newdata[key] = np.concatenate((newdata[key], poi_block), -1)

np.savez(f'{dname}/data_poi.npz', **newdata)

In [128]:
# Add the POI proximity and distance arrays to processed_data2 and save as processed_data3.
newdata = dict(np.load(f'processed/{dname}/processed_data2.npz'))
n_train = len(train_gdf)

for key, suffix in (('poiprox', '_proximity'), ('poidist', '_dist')):
    poi_cols = [fn + suffix for fn in fname_list]
    newdata[f'Train_{key}'] = house_gdf.iloc[:n_train][poi_cols].values
    newdata[f'Test_{key}'] = house_gdf.iloc[n_train:][poi_cols].values

np.savez(f'processed/{dname}/processed_data3.npz', **newdata)

In [141]:
# Add the geo-neighbour index/distance arrays to processed_data3 and save as processed_data4.
newdata = dict(np.load(f'processed/{dname}/processed_data3.npz'))
n_train = len(train_gdf)

for split, rows in (('Train', slice(None, n_train)), ('Test', slice(n_train, None))):
    for key in ('idx_geo', 'dist_geo'):
        newdata[f'{split}_{key}'] = data[key][rows]
np.savez(f'processed/{dname}/processed_data4.npz', **newdata)

In [129]:
# Display the proximity columns of house_gdf, one per POI category in fname_list.
house_gdf[[fn + '_proximity' for fn in fname_list]]

Unnamed: 0,lat,lng,price,attr0,attr1,attr2,attr3,attr4,attr5,attr6,...,landuse-cemetery_proximity,landuse-commercial_proximity,landuse-industrial_proximity,landuse-retail_proximity,leisure-golf_course_proximity,leisure-park_proximity,leisure-sports_centre_proximity,natural-water_proximity,natural-wood_proximity,aeroway-aerodrome_proximity
0,47.6889,-122.342,13.345507,2.0,1.75,1990.0,4000.0,1.0,0.0,0.0,...,0.548085,0.998332,0.621970,0.977859,0.021328,0.310096,0.599061,0.333318,0.052321,6.772081e-02
1,47.3872,-122.154,12.720392,3.0,2.50,1990.0,3694.0,2.0,0.0,0.0,...,0.658805,0.653334,0.228623,0.629952,0.969664,0.965389,0.016008,0.946388,0.000003,3.439944e-02
2,47.6178,-122.055,13.735119,4.0,3.50,3560.0,4951.0,2.0,0.0,0.0,...,0.001307,0.217924,0.000860,0.155783,0.752802,0.998955,0.121354,0.918662,0.945833,6.023485e-09
3,47.6293,-122.121,13.023647,3.0,1.75,1600.0,7232.0,1.0,0.0,0.0,...,0.057296,0.865813,0.000011,0.499908,0.861253,0.219873,0.728765,0.837321,0.254394,1.027918e-03
4,47.6510,-122.119,13.304685,4.0,1.75,1700.0,7800.0,1.0,0.0,0.0,...,0.368928,0.898480,0.011013,0.152076,0.236763,0.746618,0.597413,0.490910,0.837636,1.521504e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4317,47.5467,-122.377,12.779873,2.0,1.00,900.0,6656.0,1.0,0.0,0.0,...,0.915444,0.999810,0.035225,0.996391,0.780280,0.510335,0.531248,0.224564,0.310409,6.564175e-02
4318,47.7605,-122.234,13.219490,3.0,2.50,2830.0,5802.0,2.0,0.0,0.0,...,0.110623,0.339958,0.685054,0.937366,0.457306,0.944631,0.890146,0.386316,0.019260,6.639110e-01
4319,47.6572,-122.346,13.652992,3.0,3.00,1910.0,4800.0,1.5,0.0,0.0,...,0.527270,0.964765,0.650201,0.939979,0.080932,0.695764,0.859457,0.065785,0.431994,5.025004e-01
4320,47.3203,-122.184,12.721886,3.0,2.50,2260.0,8040.0,2.0,0.0,0.0,...,0.412481,0.868185,0.113798,0.820614,0.354649,0.915776,0.156140,0.858627,0.976720,2.978773e-01


In [137]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


# OLS of price on all house attributes plus the POI proximity features (with intercept).
regressor_cols = attr_names + [fn + '_proximity' for fn in fname_list]

df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
# Cast every attribute column to float (the loading cell turns some into str).
for attr in attr_names:
    df[attr] = df[attr].astype(float)

design = sm.add_constant(df[regressor_cols])
model = sm.OLS(df["price"], design, missing="drop").fit()
print(model.summary2())

                          Results: Ordinary least squares
Model:                    OLS                    Adj. R-squared:           0.751    
Dependent Variable:       price                  AIC:                      3620.4868
Date:                     2023-11-03 11:41       BIC:                      3859.9114
No. Observations:         21608                  Log-Likelihood:           -1780.2  
Df Model:                 29                     F-statistic:              2246.    
Df Residuals:             21578                  Prob (F-statistic):       0.00     
R-squared:                0.751                  Scale:                    0.069134 
------------------------------------------------------------------------------------
                                    Coef.  Std.Err.    t     P>|t|   [0.025   0.975]
------------------------------------------------------------------------------------
const                              -5.4331   3.9619  -1.3713 0.1703 -13.1986  2.3325
attr0  