In [15]:
import os, tqdm

In [16]:
from sklearn.metrics import r2_score
import numpy as np

# metric
def metric(label, pred):
    assert label.shape == pred.shape
    label = np.exp(label)
    pred = np.exp(pred)
    
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        mask = np.not_equal(label, 0)
        mask = mask.astype(np.float32)
        mask /= np.mean(mask)
        male = np.abs(np.subtract(np.log(pred), np.log(label))).astype(np.float32)
        male = np.nan_to_num(male * mask)
        male = np.mean(male)
        mae = np.abs(np.subtract(pred, label)).astype(np.float32)
        rmse = np.square(mae)
        mape = np.divide(mae, label)
        mae = np.nan_to_num(mae * mask)
        mae = np.mean(mae)
        rmse = np.nan_to_num(rmse * mask)
        rmse = np.sqrt(np.mean(rmse))
        mape = np.nan_to_num(mape * mask)
        mape = np.median(mape) # np.mean(mape) -- author leverages median
    return male, rmse, mape

In [17]:
os.listdir('.')

['house-dataset-osm-road2vec-poa.ipynb',
 'house-dataset-feat-kde-fc-kc-poa-sp.ipynb',
 'mygeometries.py',
 'house-dataset-osm-neighbor-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process.ipynb',
 '.ipynb_checkpoints',
 'brazil_data',
 'osmnx-1.3.0.zip',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.html',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-sp.ipynb',
 'house-dataset-osm-road2vec-kc-POI-extraction.ipynb',
 'house-dataset-osm-sp-NodeEmbedding.ipynb',
 'house-dataset-osm-poa-NodeEmbedding.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-kc.ipynb',
 'house-dataset-osm-road2vec-sp-POI-extraction.ipynb',
 'house-dataset-POI-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process-Copy1.ipynb',
 'house-dataset-osm-kc-NodeEmbedding.ipynb',
 'osm_poi',
 'generateSE.py',
 'house-dataset-feat-kde+gaussianem_important-datagen-same-cluster.ipynb',
 'house-dataset-osm-road2vec-fc-POI-extraction.ipynb',
 'test-neighbor-mean-value.ipynb',
 'house-dataset-osm-roa

In [18]:
datasets = ['fc']

In [19]:
import numpy as np

In [20]:
streetmap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': 'pk.eyJ1IjoiaHNtNjkxMSIsImEiOiJjazl0and6aDUwOWF2M2RvemdrYjllczV3In0.qGmaAF6v-1LAF9C-dnMLBg'
}
mybasemap = {
    #'style': 'mapbox://styles/mapbox/streets-v9',
    'style': 'mapbox://styles/mapbox/satellite-v9',
    'token': 'pk.eyJ1IjoiaHNtNjkxMSIsImEiOiJjazl0and6aDUwOWF2M2RvemdrYjllczV3In0.qGmaAF6v-1LAF9C-dnMLBg'
}

In [21]:
from cartoframes.viz import *

In [22]:
import pandas as pd

In [23]:
import geopandas as gpd

In [24]:
for dname in ['fc']:#['kc', 'fc', 'sp', 'poa']:
    print(dname)
    data = np.load(f'{dname}/data.npz')
    
    dict1 = {'lat':data['X_train'][:,0], 'lng': data['X_train'][:,1], 'price': data['y_train']}
    dict2 = {'lat':data['X_test'][:,0], 'lng': data['X_test'][:,1], 'price': data['y_test']}
    attr_names = []
    for a in range(2, data['X_train'].shape[1]):
        dict1.update({f'attr{a-2}': data['X_train'][:, a]})
        dict2.update({f'attr{a-2}': data['X_test'][:, a]})
        attr_names.append(f'attr{a-2}')
    df1 = pd.DataFrame(dict1)
    df2 = pd.DataFrame(dict2)
    df = pd.concat([df1, df2])
    
    train_gdf = gpd.GeoDataFrame(df1.copy(), geometry=gpd.points_from_xy(x=df1.lng, y=df1.lat))
    train_gdf.crs = 'EPSG:4326'
    test_gdf = gpd.GeoDataFrame(df2.copy(), geometry=gpd.points_from_xy(x=df2.lng, y=df2.lat))
    test_gdf.crs = 'EPSG:4326'
    house_gdf = gpd.GeoDataFrame(df.copy(), geometry=gpd.points_from_xy(x=df.lng, y=df.lat))
    house_gdf.crs = 'EPSG:4326'
    #print(np.exp(df['price'].values).mean())
    gdf = house_gdf
    for attr in attr_names:
        print(attr, gdf[attr].nunique())
        if gdf[attr].nunique() < 30:
            gdf[attr] = gdf[attr].astype(str)
    gdfcpy = gdf.copy()
    
    
#     display(Map(
#         [
#             Layer(gdfcpy, color_category_style(tattr, cat=cat, palette='cb_blues'), encode_data=False),
#             Layer(gdf, color_continuous_style('price', palette='sunset'), encode_data=False),
#         ],
#         basemap=mybasemap))
    
    break

fc
attr0 6447
attr1 18
attr2 11
attr3 6447
attr4 16372
attr5 79
attr6 3230
attr7 8
attr8 6


In [25]:
house_gdf

Unnamed: 0,lat,lng,price,attr0,attr1,attr2,attr3,attr4,attr5,attr6,attr7,attr8,geometry
0,38.011523,-84.533286,11.744037,0.2506,3.0,0.0,0.2506,10917.0,0.0,1065.0,1.0,0.0,POINT (-84.53329 38.01152)
1,37.972752,-84.522255,11.443575,0.0891,3.0,0.0,0.0891,3880.0,0.0,1070.0,2.0,0.0,POINT (-84.52226 37.97275)
2,38.056814,-84.471052,11.034890,0.2583,3.0,2.0,0.2583,11250.0,36000.0,1165.0,1.0,0.0,POINT (-84.47105 38.05681)
3,38.014345,-84.518004,12.031719,0.1951,3.0,3.0,0.1951,8500.0,0.0,1980.0,1.0,0.0,POINT (-84.51800 38.01434)
4,38.014156,-84.390089,12.146853,0.1221,5.0,4.0,0.1221,5320.0,0.0,2528.0,2.0,1.0,POINT (-84.39009 38.01416)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16621,38.011661,-84.487875,12.502467,0.3696,3.0,0.0,0.3696,16100.0,36000.0,2170.0,2.0,0.0,POINT (-84.48788 38.01166)
16622,38.036378,-84.480754,11.669929,0.1136,3.0,3.0,0.1136,4950.0,0.0,1008.0,1.0,1.0,POINT (-84.48075 38.03638)
16623,38.038550,-84.496440,12.149502,0.1148,3.0,0.0,0.1148,5000.0,36000.0,1660.0,1.0,1.0,POINT (-84.49644 38.03855)
16624,38.023158,-84.395299,12.252479,0.1494,5.0,4.0,0.1494,6508.0,0.0,2533.0,2.0,1.0,POINT (-84.39530 38.02316)


In [26]:
Layer(house_gdf)

In [27]:
import osmnx as ox
from shapely.geometry import *

In [28]:
x1, y1, x2, y2 = gdf.total_bounds

house_center_latitude = (y1 + y2)/2 #sensor_hull.centroid.y
house_center_longitude = (x1 + x2)/2 #sensor_hull.centroid.x

In [29]:
import hereosmnx
center_point = gpd.GeoDataFrame(geometry = [Point(house_center_longitude, house_center_latitude)])
center_point.crs = 'epsg:4326'
center_point = center_point.to_crs('epsg:3310')
max_distance = gdf.to_crs('epsg:3310').distance(center_point.iloc[0].geometry).max()+1000

In [48]:
tag_dict_list = [
    {'amenity':'hospital'},
    {'amenity': 'university'},
    {'amenity': 'school'},
    {'amenity': 'place_of_worship'},
    {'landuse': 'cemetery'},
    {'landuse': 'commercial'},
    {'landuse': 'industrial'},
    {'landuse': 'retail'},
    {'landuse': 'railway'},
    {'leisure': 'golf_course'},
    {'leisure': 'park'},
    {'leisure': 'sports_centre'},
    {'natural': 'water'},
    {'natural': 'wood'},
    {'aeroway': 'aerodrome'}
]

In [131]:

for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    print(fname)

100%|███████████████████████████████████████| 15/15 [00:00<00:00, 108286.68it/s]

amenity-hospital
amenity-university
amenity-school
amenity-place_of_worship
landuse-cemetery
landuse-commercial
landuse-industrial
landuse-retail
landuse-railway
leisure-golf_course
leisure-park
leisure-sports_centre
natural-water
natural-wood
aeroway-aerodrome





In [49]:
import tqdm
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    if os.path.isfile(f'osm_poi/{dname}/{fname}.geojson'):
        continue
    buildings = ox.geometries.geometries_from_point((house_center_latitude, house_center_longitude), 
                                        tags=ttag_dict,
                                        dist=max_distance)
    
    buildings = buildings.reset_index().copy()

    for col in buildings.columns:
        if col != 'geometry':
            buildings[col] = buildings[col].astype(str)
            
    buildings.to_file(f'osm_poi/{dname}/{fname}.geojson', driver='GeoJSON')
    print(fname)

100%|████████████████████████████████████████| 15/15 [00:00<00:00, 48507.76it/s]


In [54]:
import tqdm
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    mbuildings = gpd.read_file(f'osm_poi/{dname}/{fname}.geojson')
    mbuildings = mbuildings[mbuildings.geometry.type == 'Polygon'].copy()
    water_geo = mbuildings.unary_union
    if water_geo:
        house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
    else:
        house_gdf[fname + '_dist'] = 0
    print(fname, len(mbuildings))


  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
  7%|██▉                                         | 1/15 [00:00<00:03,  3.56it/s]

amenity-hospital 13



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 13%|█████▊                                      | 2/15 [00:00<00:06,  2.00it/s]

amenity-university 48



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 20%|████████▊                                   | 3/15 [00:01<00:07,  1.57it/s]

amenity-school 67



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 27%|███████████▋                                | 4/15 [00:03<00:12,  1.12s/it]

amenity-place_of_worship 139



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 33%|██████████████▋                             | 5/15 [00:04<00:10,  1.05s/it]

landuse-cemetery 80



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 40%|█████████████████▌                          | 6/15 [00:05<00:08,  1.04it/s]

landuse-commercial 56



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 47%|████████████████████▌                       | 7/15 [00:06<00:08,  1.05s/it]

landuse-industrial 100



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 53%|███████████████████████▍                    | 8/15 [00:06<00:06,  1.16it/s]

landuse-retail 22
landuse-railway 0



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 67%|████████████████████████████▋              | 10/15 [00:07<00:02,  1.76it/s]

leisure-golf_course 23



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 73%|███████████████████████████████▌           | 11/15 [00:09<00:03,  1.07it/s]

leisure-park 157



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 80%|██████████████████████████████████▍        | 12/15 [00:10<00:02,  1.12it/s]

leisure-sports_centre 64



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 87%|█████████████████████████████████████▎     | 13/15 [00:15<00:04,  2.05s/it]

natural-water 378



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 93%|████████████████████████████████████████▏  | 14/15 [00:16<00:01,  1.89s/it]

natural-wood 78



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
100%|███████████████████████████████████████████| 15/15 [00:17<00:00,  1.15s/it]

aeroway-aerodrome 7





In [105]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


for zeta in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5, 1]:
    fname_list = []
    for ttag_dict in tag_dict_list:
        fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
        fname_list.append(fname)

        house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / zeta)**2 / 2)



    df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
    # model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
    model = sm.OLS(df["price"], sm.add_constant(df[[fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
    print(zeta, model.rsquared, sep='\t')

0.01	0.2509010516016037
0.02	0.23995886024060042
0.03	0.2607064045928177
0.04	0.2737492210333028
0.05	0.27493676762567665
0.06	0.2728586471125706
0.07	0.2706616989463577
0.08	0.26886763952426596
0.09	0.26743268839640433
0.1	0.2662643627241885
0.5	0.2590088369827629
1	0.258680867183972


In [106]:
best_zeta = 0.05
fname_list = []
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    fname_list.append(fname)
    
    house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / best_zeta)**2 / 2)
    print(fname, np.std(house_gdf[fname + '_dist']))

100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 619.99it/s]

amenity-hospital 0.018866326659274978
amenity-university 0.02542866205111766
amenity-school 0.010280676372390444
amenity-place_of_worship 0.008864649896908907
landuse-cemetery 0.010601878168740862
landuse-commercial 0.01337141837945649
landuse-industrial 0.009069582779319503
landuse-retail 0.019213919341159583
landuse-railway 0.0
leisure-golf_course 0.01104759021833181
leisure-park 0.005657033016724257
leisure-sports_centre 0.014238750581820864
natural-water 0.004820487889309123
natural-wood 0.013412820091462148
aeroway-aerodrome 0.024845408379460237





In [204]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb  # Import XGBoost

# Create the XGBoost regressor
xgb_regressor = xgb.XGBRegressor()

# Fit the XGBoost model using the training data
xgb_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

# Make predictions using the XGBoost model
pred_price = xgb_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)

# Print the evaluation metric (replace 'metric' with your actual metric function)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

# If you have additional features for proximity, you can add them like this:
xgb_regressor_proximity = xgb.XGBRegressor()

xgb_regressor_proximity.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price_proximity = xgb_regressor_proximity.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values)

print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price_proximity))


(0.18419792, 38736.055, 0.12594704792490116)
(0.116826065, 26086.83, 0.08000143168179157)


In [81]:
nfname_list = '''landuse-industrial
amenity-university
landuse-retail
landuse-commercial
amenity-hospital
leisure-sports_centre
landuse-cemetery
natural-wood
natural-water
leisure-golf_course'''.split()

In [82]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [83]:
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

# (0.1972145, 41769.594, 0.13155421489087882)


(0.1972145, 41769.594, 0.13155421489087882)


In [84]:
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in nfname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in nfname_list]].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

# (0.11372489, 27510.088, 0.07236947689749926)


(0.115460694, 27972.752, 0.07301202746261505)


In [202]:
from sklearn.linear_model import LinearRegression
#from sklearn.svm import SVR
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

train_data = house_gdf.iloc[:len(train_gdf)][attr_names].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names].values.astype(float)

mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

#ymean = house_gdf.iloc[:len(train_gdf)]['price'].values.mean()
#ystd = house_gdf.iloc[:len(train_gdf)]['price'].values.std()

for name, regressor in [('LR', LinearRegression()), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    regressor.fit(
        train_data, 
        house_gdf.iloc[:len(train_gdf)]['price'].values)
        #(house_gdf.iloc[:len(train_gdf)]['price'].values - ymean)*ystd)

    pred_price = regressor.predict(test_data)#*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

LR (0.22798902, 52084.39, 0.15595470214843754)
RF (0.19733766, 41791.465, 0.13171252663352273)


In [203]:
train_data = house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)
test_data = house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values.astype(float)

mean = np.mean(train_data, 0)[np.newaxis, :]
std = np.std(train_data, 0)[np.newaxis, :]
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std

# ymean = house_gdf.iloc[:len(train_gdf)]['price'].values.mean()
# ystd = house_gdf.iloc[:len(train_gdf)]['price'].values.std()

for name, regressor in [('LR', LinearRegression()), ('RF', RandomForestRegressor(n_estimators=10, random_state=42))]:
    regressor.fit(
        train_data, 
        house_gdf.iloc[:len(train_gdf)]['price'].values)
        #(house_gdf.iloc[:len(train_gdf)]['price'].values - ymean)*ystd)

    pred_price = regressor.predict(test_data)#*ystd + ymean
    print(name, metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

LR (0.2096063, 48716.37, 0.14575447060405353)
RF (0.1126573, 27034.625, 0.07140714355901628)


In [117]:
from sklearn.linear_model import LinearRegression

rf_regressor = LinearRegression()
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.22798902, 52084.39, 0.15595470214843754)


In [118]:
rf_regressor = LinearRegression()
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.21011578, 48526.336, 0.1481396684551604)


In [216]:
newdata = dict()
newdata['Train_feat'] = house_gdf.iloc[:len(train_gdf)][attr_names].astype(np.float32).values
newdata['Train_latlon'] = house_gdf.iloc[:len(train_gdf)][['lat', 'lng']].values
newdata['Train_price'] = house_gdf.iloc[:len(train_gdf)][['price']].values
newdata['Train_idx_geo'] = data['idx_geo'][:len(train_gdf)]
newdata['Train_dist_geo'] = data['dist_geo'][:len(train_gdf)]
newdata['Train_idx_eucli'] = data['idx_eucli'][:len(train_gdf)]
newdata['Train_dist_eucli'] = data['dist_eucli'][:len(train_gdf)]
newdata['Train_poiprox'] = house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values

newdata['Test_feat'] = house_gdf.iloc[len(train_gdf):][attr_names].astype(np.float32).values
newdata['Test_latlon'] = house_gdf.iloc[len(train_gdf):][['lat', 'lng']].values
newdata['Test_price'] = house_gdf.iloc[len(train_gdf):][['price']].values
newdata['Test_idx_geo'] = data['idx_geo'][len(train_gdf):]
newdata['Test_dist_geo'] = data['dist_geo'][len(train_gdf):]
newdata['Test_idx_eucli'] = data['idx_eucli'][len(train_gdf):]
newdata['Test_dist_eucli'] = data['dist_eucli'][len(train_gdf):]
newdata['Test_poiprox'] = house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values


np.savez(f'processed/{dname}/processed_data_poi.npz', **newdata)

In [211]:
newdata = dict(np.load(f'{dname}/data.npz'))
newdata['X_train'] = np.concatenate((newdata['X_train'], 
                                     house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values), -1)
newdata['X_test'] = np.concatenate((newdata['X_test'], 
                                     house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values), -1)

np.savez(f'{dname}/data_poi.npz', **newdata)

(66510, 14)

In [114]:
newdata = dict(np.load(f'processed/{dname}/processed_data2.npz'))
newdata['Train_poiprox'] = house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values
newdata['Test_poiprox'] = house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values

newdata['Train_poidist'] = house_gdf.iloc[:len(train_gdf)][[fn + '_dist' for fn in fname_list]].values
newdata['Test_poidist'] = house_gdf.iloc[len(train_gdf):][[fn + '_dist' for fn in fname_list]].values

np.savez(f'processed/{dname}/processed_data3.npz', **newdata)

In [181]:
newdata = dict(np.load(f'processed/{dname}/processed_data3.npz'))
newdata['Train_idx_geo'] = data['idx_geo'][:len(train_gdf)]
newdata['Train_dist_geo'] = data['dist_geo'][:len(train_gdf)]
newdata['Test_idx_geo'] = data['idx_geo'][len(train_gdf):]
newdata['Test_dist_geo'] = data['dist_geo'][len(train_gdf):]
np.savez(f'processed/{dname}/processed_data4.npz', **newdata)

In [189]:
np.exp(-(data['dist_geo']/1)**2)

array([[0.99951459, 0.99910566, 0.99839537, ..., 0.96287884, 0.96278539,
        0.96268793],
       [0.99983089, 0.99979902, 0.99940404, ..., 0.98506274, 0.98410642,
        0.98365536],
       [0.99962202, 0.99939626, 0.9985964 , ..., 0.97703415, 0.9749153 ,
        0.97487228],
       ...,
       [0.99934545, 0.99855746, 0.9960146 , ..., 0.96017339, 0.95882167,
        0.9586799 ],
       [0.99980621, 0.99929914, 0.99879563, ..., 0.97386593, 0.97322085,
        0.97271677],
       [0.99898233, 0.99856187, 0.99839015, ..., 0.96666185, 0.96662011,
        0.96640474]])

In [185]:
data['dist_geo'].max()

11.081784934445057

In [186]:
data['dist_geo'].min()

0.0024337476260169666

In [190]:
data['dist_geo'].mean()

0.13911421143473146

In [188]:
np.median(data['dist_geo'])

0.12129168573342938

In [192]:
np.percentile(data['dist_geo'], 80)

0.16785058825424531

In [183]:
data['dist_geo'].std()

0.2338238117108864

0.01	0.2509010516016037
0.02	0.23995886024060042
0.03	0.2607064045928177
0.04	0.2737492210333028
0.05	0.27493676762567665
0.06	0.2728586471125706
0.07	0.2706616989463577
0.08	0.26886763952426596
0.09	0.26743268839640433
0.1	0.2662643627241885
0.5	0.2590088369827629
1	0.258680867183972


In [101]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
for attr in attr_names:
    df[attr] = df[attr].astype(float)
# model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
model = sm.OLS(df["price"], sm.add_constant(df[[fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
print(model.summary2())

                          Results: Ordinary least squares
Model:                    OLS                   Adj. R-squared:          0.236      
Dependent Variable:       price                 AIC:                     109513.4729
Date:                     2023-11-07 17:08      BIC:                     109653.3964
No. Observations:         83136                 Log-Likelihood:          -54742.    
Df Model:                 14                    F-statistic:             1839.      
Df Residuals:             83121                 Prob (F-statistic):      0.00       
R-squared:                0.236                 Scale:                   0.21854    
------------------------------------------------------------------------------------
                                    Coef.  Std.Err.     t     P>|t|   [0.025  0.975]
------------------------------------------------------------------------------------
amenity-hospital_proximity          0.2197   0.0093   23.7389 0.0000  0.2015  0.2378
amenity

In [103]:
print(model.rsquared)

0.2364752529370847


In [102]:
dmp = dict(model.params)
print(dname)
for k in fname_list:
    k = k + '_proximity'
    print(k, dmp[k], sep='\t')

fc
amenity-hospital_proximity	0.21967631080221914
amenity-university_proximity	0.13998542794654706
amenity-school_proximity	-0.07250987700301359
amenity-place_of_worship_proximity	-0.3237454797777217
landuse-cemetery_proximity	-0.5067687675172838
landuse-commercial_proximity	-0.29105626521956945
landuse-industrial_proximity	-0.2745436820610975
landuse-retail_proximity	0.20234830009432406
landuse-railway_proximity	12.1474295805937
leisure-golf_course_proximity	-0.13845262363016192
leisure-park_proximity	-0.10086451691509346
leisure-sports_centre_proximity	0.009381313205464198
natural-water_proximity	0.21940260924211358
natural-wood_proximity	0.04243283383396385
aeroway-aerodrome_proximity	0.716289883223768


In [142]:
rf_regressor = LinearRegression()
rf_regressor.fit(
    house_gdf[attr_names].values, 
    np.exp(house_gdf['price'].values))

pred_price = rf_regressor.predict(house_gdf[attr_names].values)
house_gdf['pred_price'] = pred_price