In [1]:
import os, tqdm

In [2]:
from sklearn.metrics import r2_score
import numpy as np

# metric
def metric(label, pred):
    assert label.shape == pred.shape
    label = np.exp(label)
    pred = np.exp(pred)
    
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        mask = np.not_equal(label, 0)
        mask = mask.astype(np.float32)
        mask /= np.mean(mask)
        male = np.abs(np.subtract(np.log(pred), np.log(label))).astype(np.float32)
        male = np.nan_to_num(male * mask)
        male = np.mean(male)
        mae = np.abs(np.subtract(pred, label)).astype(np.float32)
        rmse = np.square(mae)
        mape = np.divide(mae, label)
        mae = np.nan_to_num(mae * mask)
        mae = np.mean(mae)
        rmse = np.nan_to_num(rmse * mask)
        rmse = np.sqrt(np.mean(rmse))
        mape = np.nan_to_num(mape * mask)
        mape = np.median(mape) # np.mean(mape) -- author leverages median
    return male, rmse, mape

In [3]:
os.listdir('.')

['house-dataset-osm-road2vec-poa.ipynb',
 'house-dataset-feat-kde-fc-kc-poa-sp.ipynb',
 'mygeometries.py',
 'house-dataset-osm-neighbor-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process.ipynb',
 '.ipynb_checkpoints',
 'brazil_data',
 'osmnx-1.3.0.zip',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.html',
 'house-dataset-feat-kde-fc-kc-poa-sp-mapviz.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-sp.ipynb',
 'house-dataset-osm-road2vec-kc-POI-extraction.ipynb',
 'house-dataset-osm-sp-NodeEmbedding.ipynb',
 'house-dataset-osm-poa-NodeEmbedding.ipynb',
 'house-dataset-osm-AREA-EMBEDDING-kc.ipynb',
 'house-dataset-osm-road2vec-sp-POI-extraction.ipynb',
 'house-dataset-POI-sp.ipynb',
 'house-dataset-osm-neighbor-kc-process-Copy1.ipynb',
 'house-dataset-osm-kc-NodeEmbedding.ipynb',
 'osm_poi',
 'generateSE.py',
 'house-dataset-feat-kde+gaussianem_important-datagen-same-cluster.ipynb',
 'house-dataset-osm-road2vec-fc-POI-extraction.ipynb',
 'test-neighbor-mean-value.ipynb',
 'house-dataset-osm-roa

In [4]:
datasets = ['sp']

In [5]:
import numpy as np

In [6]:
streetmap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': 'pk.eyJ1IjoiaHNtNjkxMSIsImEiOiJjazl0and6aDUwOWF2M2RvemdrYjllczV3In0.qGmaAF6v-1LAF9C-dnMLBg'
}
mybasemap = {
    #'style': 'mapbox://styles/mapbox/streets-v9',
    'style': 'mapbox://styles/mapbox/satellite-v9',
    'token': 'pk.eyJ1IjoiaHNtNjkxMSIsImEiOiJjazl0and6aDUwOWF2M2RvemdrYjllczV3In0.qGmaAF6v-1LAF9C-dnMLBg'
}

In [7]:
from cartoframes.viz import *

Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [8]:
import pandas as pd

In [9]:
import geopandas as gpd

In [10]:
for dname in ['sp']:#['kc', 'fc', 'sp', 'poa']:
    print(dname)
    data = np.load(f'{dname}/data.npz')
    
    dict1 = {'lat':data['X_train'][:,0], 'lng': data['X_train'][:,1], 'price': data['y_train']}
    dict2 = {'lat':data['X_test'][:,0], 'lng': data['X_test'][:,1], 'price': data['y_test']}
    attr_names = []
    for a in range(2, data['X_train'].shape[1]):
        dict1.update({f'attr{a-2}': data['X_train'][:, a]})
        dict2.update({f'attr{a-2}': data['X_test'][:, a]})
        attr_names.append(f'attr{a-2}')
    df1 = pd.DataFrame(dict1)
    df2 = pd.DataFrame(dict2)
    df = pd.concat([df1, df2])
    
    train_gdf = gpd.GeoDataFrame(df1.copy(), geometry=gpd.points_from_xy(x=df1.lng, y=df1.lat))
    train_gdf.crs = 'EPSG:4326'
    test_gdf = gpd.GeoDataFrame(df2.copy(), geometry=gpd.points_from_xy(x=df2.lng, y=df2.lat))
    test_gdf.crs = 'EPSG:4326'
    house_gdf = gpd.GeoDataFrame(df.copy(), geometry=gpd.points_from_xy(x=df.lng, y=df.lat))
    house_gdf.crs = 'EPSG:4326'
    #print(np.exp(df['price'].values).mean())
    gdf = house_gdf
    for attr in attr_names:
        print(attr, gdf[attr].nunique())
        if gdf[attr].nunique() < 30:
            gdf[attr] = gdf[attr].astype(str)
    gdfcpy = gdf.copy()
    
    
#     display(Map(
#         [
#             Layer(gdfcpy, color_category_style(tattr, cat=cat, palette='cb_blues'), encode_data=False),
#             Layer(gdf, color_continuous_style('price', palette='sunset'), encode_data=False),
#         ],
#         basemap=mybasemap))
    
    break

sp
attr0 10
attr1 467
attr2 10
attr3 2
attr4 2


In [11]:
Layer(house_gdf)

In [12]:
# gdf = gpd.read_file('BR_UF_2021.zip')
# gdf = gpd.read_file('BR_Municipios_2021.zip')
mgdf = gpd.read_file('brazil_data/SP_Municipios_2021.zip')

In [13]:
cmgdf = mgdf.copy()
cmgdf.geometry = cmgdf.geometry.centroid


  cmgdf.geometry = cmgdf.geometry.centroid


In [15]:
import osmnx as ox
from shapely.geometry import *

In [16]:
x1, y1, x2, y2 = gdf.total_bounds

house_center_latitude = (y1 + y2)/2 #sensor_hull.centroid.y
house_center_longitude = (x1 + x2)/2 #sensor_hull.centroid.x

In [17]:
import hereosmnx
center_point = gpd.GeoDataFrame(geometry = [Point(house_center_longitude, house_center_latitude)])
center_point.crs = 'epsg:4326'
center_point = center_point.to_crs('epsg:3310')
max_distance = gdf.to_crs('epsg:3310').distance(center_point.iloc[0].geometry).max()+1000

In [18]:
tag_dict_list = [
    {'amenity':'hospital'},
    {'amenity': 'university'},
    {'amenity': 'school'},
    {'amenity': 'place_of_worship'},
    {'landuse': 'cemetery'},
    {'landuse': 'commercial'},
    {'landuse': 'industrial'},
    {'landuse': 'retail'},
    {'landuse': 'railway'},
    {'leisure': 'golf_course'},
    {'leisure': 'park'},
    {'leisure': 'sports_centre'},
    {'natural': 'water'},
    {'natural': 'wood'},
    {'aeroway': 'aerodrome'}
]

In [19]:
# os.mkdir(f'osm_poi/{dname}')

In [20]:
import tqdm
for ttag_dict in tqdm.tqdm(tag_dict_list):
    buildings = ox.geometries.geometries_from_point((house_center_latitude, house_center_longitude), 
                                        tags=ttag_dict,
                                        dist=max_distance)
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    
    buildings = buildings.reset_index().copy()

    for col in buildings.columns:
        if col != 'geometry':
            buildings[col] = buildings[col].astype(str)
            
    buildings.to_file(f'osm_poi/{dname}/{fname}.geojson', driver='GeoJSON')
    print(fname)

  7%|██▉                                         | 1/15 [00:01<00:20,  1.48s/it]

amenity-hospital


 13%|█████▊                                      | 2/15 [00:02<00:14,  1.13s/it]

amenity-university


 20%|████████▊                                   | 3/15 [00:14<01:13,  6.17s/it]

amenity-school


 27%|███████████▋                                | 4/15 [00:25<01:26,  7.87s/it]

amenity-place_of_worship


 33%|██████████████▋                             | 5/15 [00:25<00:52,  5.25s/it]

landuse-cemetery


 40%|█████████████████▌                          | 6/15 [00:26<00:35,  3.90s/it]

landuse-commercial


 47%|████████████████████▌                       | 7/15 [00:29<00:26,  3.32s/it]

landuse-industrial


 53%|███████████████████████▍                    | 8/15 [00:30<00:18,  2.66s/it]

landuse-retail


 60%|██████████████████████████▍                 | 9/15 [00:38<00:26,  4.37s/it]

landuse-railway


 67%|████████████████████████████▋              | 10/15 [00:38<00:15,  3.18s/it]

leisure-golf_course


 73%|███████████████████████████████▌           | 11/15 [01:06<00:43, 10.80s/it]

leisure-park


 80%|██████████████████████████████████▍        | 12/15 [01:08<00:23,  7.92s/it]

leisure-sports_centre


 87%|█████████████████████████████████████▎     | 13/15 [01:14<00:14,  7.32s/it]

natural-water


 93%|████████████████████████████████████████▏  | 14/15 [01:23<00:07,  7.92s/it]

natural-wood


100%|███████████████████████████████████████████| 15/15 [01:24<00:00,  5.61s/it]

aeroway-aerodrome





In [54]:
import tqdm
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    mbuildings = gpd.read_file(f'osm_poi/{dname}/{fname}.geojson')
    mbuildings = mbuildings[mbuildings.geometry.type == 'Polygon'].copy()
    water_geo = mbuildings.unary_union
    house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
    print(fname, len(mbuildings))


  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
  7%|██▉                                         | 1/15 [00:03<00:42,  3.02s/it]

amenity-hospital 354



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 13%|█████▊                                      | 2/15 [00:04<00:29,  2.30s/it]

amenity-university 185



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 20%|████████▊                                   | 3/15 [00:26<02:15, 11.29s/it]

amenity-school 2526



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 27%|███████████▋                                | 4/15 [00:43<02:27, 13.43s/it]

amenity-place_of_worship 1900



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 33%|██████████████▋                             | 5/15 [00:44<01:30,  9.09s/it]

landuse-cemetery 121



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 40%|█████████████████▌                          | 6/15 [00:48<01:04,  7.12s/it]

landuse-commercial 456



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 47%|████████████████████▌                       | 7/15 [00:56<00:59,  7.47s/it]

landuse-industrial 1067



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 53%|███████████████████████▍                    | 8/15 [00:59<00:42,  6.01s/it]

landuse-retail 357



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 60%|██████████████████████████▍                 | 9/15 [00:59<00:25,  4.30s/it]

landuse-railway 37



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 67%|████████████████████████████▋              | 10/15 [01:00<00:15,  3.15s/it]

leisure-golf_course 10



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 73%|███████████████████████████████▌           | 11/15 [02:08<01:32, 23.14s/it]

leisure-park 7281



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 80%|██████████████████████████████████▍        | 12/15 [02:11<00:50, 16.91s/it]

leisure-sports_centre 276



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 87%|█████████████████████████████████████▎     | 13/15 [02:52<00:48, 24.26s/it]

natural-water 3715



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
 93%|████████████████████████████████████████▏  | 14/15 [03:44<00:32, 32.58s/it]

natural-wood 4609



  house_gdf[fname + '_dist'] = house_gdf.distance(water_geo)
100%|███████████████████████████████████████████| 15/15 [03:44<00:00, 15.00s/it]

aeroway-aerodrome 5





In [58]:

beta = {'fc': 0.045, 'kc':0.035, 'sp': 0.020, 'poa': 0.025}
    
fname_list = []
for ttag_dict in tqdm.tqdm(tag_dict_list):
    fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
    fname_list.append(fname)
    
    house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / beta[dname])**2)
#     house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / np.std(house_gdf[fname + '_dist']))**2)
    print(fname, np.std(house_gdf[fname + '_dist']))

100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 540.62it/s]

amenity-hospital 0.006851214118582728
amenity-university 0.010809549690809153
amenity-school 0.002952618197339736
amenity-place_of_worship 0.004390402404467587
landuse-cemetery 0.011996529311678137
landuse-commercial 0.013545987817012078
landuse-industrial 0.011406628890376235
landuse-retail 0.009633689368010418
landuse-railway 0.019024479196720845
leisure-golf_course 0.049208428118357135
leisure-park 0.0015988540410076927
leisure-sports_centre 0.007218246882933727
natural-water 0.006691173303733249
natural-wood 0.0074608963500871145
aeroway-aerodrome 0.029060474052784106





In [43]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

#X = house_gdf[attr_names + [fn + '_proximity' for fn in fname_list]].values
#y = house_gdf['price'].values


X = house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values
y = house_gdf.iloc[:len(train_gdf)]['price'].values

estimator = RandomForestRegressor(random_state=42)
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X, y)

In [44]:
feat_list = attr_names + [fn + '_proximity' for fn in fname_list]

rank_dict = dict()
i = 0
for r in np.argsort(selector.ranking_[len(attr_names):]):#selector.ranking_:
    print(fname_list[r])
    i += 1
    rank_dict[fname_list[r]] = i

landuse-commercial
landuse-industrial
amenity-university
landuse-retail
natural-wood
amenity-hospital
landuse-railway
leisure-sports_centre
landuse-cemetery
amenity-place_of_worship
natural-water
amenity-school
leisure-park
aeroway-aerodrome
leisure-golf_course


In [45]:
for k in fname_list:
    v = rank_dict[k]
    print(k, v, sep='\t')

amenity-hospital	6
amenity-university	3
amenity-school	12
amenity-place_of_worship	10
landuse-cemetery	9
landuse-commercial	1
landuse-industrial	2
landuse-retail	4
landuse-railway	7
leisure-golf_course	15
leisure-park	13
leisure-sports_centre	8
natural-water	11
natural-wood	5
aeroway-aerodrome	14


In [51]:


import statsmodels.api as sm
import numpy as np
import pandas as pd


for zeta in [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.5, 1]:
    fname_list = []
    for ttag_dict in tag_dict_list:
        fname = ','.join(['-'.join(items) for items in ttag_dict.items()])
        fname_list.append(fname)

        house_gdf[fname + '_proximity'] = np.exp(- (house_gdf[fname + '_dist'] / zeta)**2 / 2)



    df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
    # model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
    model = sm.OLS(df["price"], sm.add_constant(df[[fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
    print(zeta, model.rsquared, sep='\t')

0.01	0.23123992395765647
0.02	0.2582881052523279
0.03	0.25618701820576995
0.04	0.25126770076950533
0.05	0.24706482772022287
0.06	0.24366288064031438
0.07	0.24090500728178477
0.08	0.23867624063896709
0.09	0.23687581216717213
0.1	0.23541541795304977
0.5	0.2270413923803325
1	0.22668545679364782


In [48]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
for attr in attr_names:
    df[attr] = df[attr].astype(float)
# model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
model = sm.OLS(df["price"], sm.add_constant(df[[fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
print(model.summary2())

                          Results: Ordinary least squares
Model:                    OLS                   Adj. R-squared:          0.201     
Dependent Variable:       price                 AIC:                     99370.8875
Date:                     2023-11-07 17:08      BIC:                     99517.1220
No. Observations:         68848                 Log-Likelihood:          -49669.   
Df Model:                 15                    F-statistic:             1157.     
Df Residuals:             68832                 Prob (F-statistic):      0.00      
R-squared:                0.201                 Scale:                   0.24789   
-----------------------------------------------------------------------------------
                                    Coef.  Std.Err.    t     P>|t|   [0.025  0.975]
-----------------------------------------------------------------------------------
const                              13.1277   0.0245 534.7747 0.0000 13.0795 13.1758
amenity-hospital_p

In [50]:
print(model.rsquared)

0.2013253560955387


In [49]:
dmp = dict(model.params)
print(dname)
for k in fname_list:
    k = k + '_proximity'
    print(k, dmp[k], sep='\t')

sp
amenity-hospital_proximity	0.04892534116646502
amenity-university_proximity	0.10255941527810017
amenity-school_proximity	-0.1497505022765199
amenity-place_of_worship_proximity	-0.013367318153897759
landuse-cemetery_proximity	0.07975808720637464
landuse-commercial_proximity	0.4725034499755887
landuse-industrial_proximity	-0.29140702244495365
landuse-retail_proximity	0.00643703445053749
landuse-railway_proximity	0.057487795059480165
leisure-golf_course_proximity	0.13250249845706422
leisure-park_proximity	0.13651365755055478
leisure-sports_centre_proximity	0.170530099300368
natural-water_proximity	-0.0018366603404632656
natural-wood_proximity	-0.1251036116020519
aeroway-aerodrome_proximity	0.06591510347727306


In [60]:
house_gdf.to_file(f'{dname}/house_gdf_poi_processed.geojson', driver='GeoJSON')

In [44]:
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb  # Import XGBoost

# Create the XGBoost regressor
xgb_regressor = xgb.XGBRegressor()

# Fit the XGBoost model using the training data
xgb_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

# Make predictions using the XGBoost model
pred_price = xgb_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)

# Print the evaluation metric (replace 'metric' with your actual metric function)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

# If you have additional features for proximity, you can add them like this:
xgb_regressor_proximity = xgb.XGBRegressor()

xgb_regressor_proximity.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price_proximity = xgb_regressor_proximity.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values)

print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price_proximity))


(0.24064553, 244169.61, 0.19732134295886072)
(0.15990165, 172914.66, 0.1260420833333333)


In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [24]:
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.24345116, 250505.45, 0.19720707145539446)


In [25]:
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.15467195, 172443.19, 0.11407246131006968)


In [45]:
from sklearn.linear_model import LinearRegression

rf_regressor = LinearRegression()
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.2713994, 267317.1, 0.23117117300724635)


In [46]:
rf_regressor = LinearRegression()
rf_regressor.fit(
    house_gdf.iloc[:len(train_gdf)][attr_names + [fn + '_proximity' for fn in fname_list]].values, 
    house_gdf.iloc[:len(train_gdf)]['price'].values)

pred_price = rf_regressor.predict(house_gdf.iloc[len(train_gdf):][attr_names + [fn + '_proximity' for fn in fname_list]].values)
print(metric(house_gdf.iloc[len(train_gdf):]['price'], pred_price))

(0.22703628, 235775.69, 0.18907240808823533)


In [51]:
#os.mkdir(f'processed/{dname}')

In [59]:
newdata = dict()
newdata['Train_feat'] = house_gdf.iloc[:len(train_gdf)][attr_names].astype(np.float32).values
newdata['Train_latlon'] = house_gdf.iloc[:len(train_gdf)][['lat', 'lng']].values
newdata['Train_price'] = house_gdf.iloc[:len(train_gdf)][['price']].values
newdata['Train_idx_geo'] = data['idx_geo'][:len(train_gdf)]
newdata['Train_dist_geo'] = data['dist_geo'][:len(train_gdf)]
newdata['Train_idx_eucli'] = data['idx_eucli'][:len(train_gdf)]
newdata['Train_dist_eucli'] = data['dist_eucli'][:len(train_gdf)]
newdata['Train_poiprox'] = house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values
newdata['Train_poidist'] = house_gdf.iloc[:len(train_gdf)][[fn + '_dist' for fn in fname_list]].values

newdata['Test_feat'] = house_gdf.iloc[len(train_gdf):][attr_names].astype(np.float32).values
newdata['Test_latlon'] = house_gdf.iloc[len(train_gdf):][['lat', 'lng']].values
newdata['Test_price'] = house_gdf.iloc[len(train_gdf):][['price']].values
newdata['Test_idx_geo'] = data['idx_geo'][len(train_gdf):]
newdata['Test_dist_geo'] = data['dist_geo'][len(train_gdf):]
newdata['Test_idx_eucli'] = data['idx_eucli'][len(train_gdf):]
newdata['Test_dist_eucli'] = data['dist_eucli'][len(train_gdf):]
newdata['Test_poiprox'] = house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values
newdata['Test_poidist'] = house_gdf.iloc[len(train_gdf):][[fn + '_dist' for fn in fname_list]].values


np.savez(f'processed/{dname}/processed_data_poi.npz', **newdata)

In [56]:
data1 = np.load(f'processed/{dname}/processed_data_poi.npz')
data2 = np.load(f'{dname}/data_poi.npz')

In [68]:
data1['Train_feat']

array([[2.       , 3.6888795, 1.       , 1.       , 0.       ],
       [3.       , 4.65396  , 2.       , 1.       , 0.       ],
       [2.       , 4.543295 , 2.       , 1.       , 0.       ],
       ...,
       [3.       , 5.0998664, 3.       , 1.       , 0.       ],
       [3.       , 4.787492 , 2.       , 0.       , 1.       ],
       [2.       , 4.248495 , 2.       , 0.       , 1.       ]],
      dtype=float32)

In [69]:
data2['X_train'][:, 2:2+5]

array([[2.        , 3.68887945, 1.        , 1.        , 0.        ],
       [3.        , 4.65396035, 2.        , 1.        , 0.        ],
       [2.        , 4.54329478, 2.        , 1.        , 0.        ],
       ...,
       [3.        , 5.09986643, 3.        , 1.        , 0.        ],
       [3.        , 4.78749174, 2.        , 0.        , 1.        ],
       [2.        , 4.24849524, 2.        , 0.        , 1.        ]])

In [74]:
data1['Train_price']

array([[12.2010601 ],
       [13.5670492 ],
       [13.80546022],
       ...,
       [14.15198279],
       [13.21767356],
       [12.7628272 ]])

array([12.2010601 , 13.5670492 , 13.80546022, ..., 14.15198279,
       13.21767356, 12.7628272 ])

In [77]:
np.sum(data1['Train_price'][:, 0] - data2['y_train'])

0.0

In [47]:
newdata = dict(np.load(f'{dname}/data.npz'))
newdata['X_train'] = np.concatenate((newdata['X_train'], 
                                     house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values), -1)
newdata['X_test'] = np.concatenate((newdata['X_test'], 
                                     house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values), -1)

np.savez(f'{dname}/data_poi.npz', **newdata)

In [None]:
newdata = dict()
newdata['Train_feat'] = house_gdf.iloc[:len(train_gdf)][attr_names].astype(np.float32).values
newdata['Train_latlon'] = house_gdf.iloc[:len(train_gdf)][['lat', 'lng']].astype(np.float32).values
newdata['Train_price'] = house_gdf.iloc[:len(train_gdf)][['price']].astype(np.float32).values
#newdata['Train_haddr'] = house_gdf.iloc[:len(train_gdf)][['haddr1', 'haddr2', 'haddr3']].values
#newdata['Train_saddr'] = house_gdf.iloc[:len(train_gdf)][['saddr1', 'saddr2', 'saddr3']].values

newdata['Test_feat'] = house_gdf.iloc[len(train_gdf):][attr_names].astype(np.float32).values
newdata['Test_latlon'] = house_gdf.iloc[len(train_gdf):][['lat', 'lng']].astype(np.float32).values
newdata['Test_price'] = house_gdf.iloc[len(train_gdf):][['price']].astype(np.float32).values
#newdata['Test_haddr'] = house_gdf.iloc[len(train_gdf):][['haddr1', 'haddr2', 'haddr3']].values
#newdata['Test_saddr'] = house_gdf.iloc[len(train_gdf):][['saddr1', 'saddr2', 'saddr3']].values

#newdata['distmx_saddr2'] = dist_mx
#newdata['haddr_nums'] = [len(haddr1dict), len(haddr2dict), len(haddr3dict)]
#newdata['saddr_nums'] = [len(saddr1dict), len(saddr2dict), len(saddr3dict)]

np.savez(f'processed/{dname}/processed_data2.npz', **newdata)

In [None]:
newdata = dict(np.load(f'processed/{dname}/processed_data2.npz'))
newdata['Train_poiprox'] = house_gdf.iloc[:len(train_gdf)][[fn + '_proximity' for fn in fname_list]].values
newdata['Test_poiprox'] = house_gdf.iloc[len(train_gdf):][[fn + '_proximity' for fn in fname_list]].values

newdata['Train_poidist'] = house_gdf.iloc[:len(train_gdf)][[fn + '_dist' for fn in fname_list]].values
newdata['Test_poidist'] = house_gdf.iloc[len(train_gdf):][[fn + '_dist' for fn in fname_list]].values

np.savez(f'processed/{dname}/processed_data3.npz', **newdata)

In [None]:
newdata = dict(np.load(f'processed/{dname}/processed_data3.npz'))
newdata['Train_idx_geo'] = data['idx_geo'][:len(train_gdf)]
newdata['Train_dist_geo'] = data['dist_geo'][:len(train_gdf)]
newdata['Test_idx_geo'] = data['idx_geo'][len(train_gdf):]
newdata['Test_dist_geo'] = data['dist_geo'][len(train_gdf):]
np.savez(f'processed/{dname}/processed_data4.npz', **newdata)

In [None]:
house_gdf[[fn + '_proximity' for fn in fname_list]]

In [None]:
import statsmodels.api as sm
import numpy as np
import pandas as pd


df = house_gdf.copy()#pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4, X5=X5, Y=Y0+err))
for attr in attr_names:
    df[attr] = df[attr].astype(float)
model = sm.OLS(df["price"], sm.add_constant(df[attr_names + [fn + '_proximity' for fn in fname_list]]), missing="drop").fit()
print(model.summary2())