In [1]:
import os, tqdm

In [2]:
from sklearn.metrics import r2_score
import numpy as np

# metric
def metric(label, pred):
    """Compute NaN-masked error metrics between targets and predictions.

    Positions where either ``label`` or ``pred`` is NaN are excluded from
    every metric.

    Args:
        label: numpy array of ground-truth values.
        pred: numpy array of predictions with the same shape as ``label``.

    Returns:
        A tuple ``(male, rmse, mape)`` where

        * ``male`` is the mean absolute difference of ``log(pred)`` and
          ``log(label)``,
        * ``rmse`` is the root of the mean squared element-wise error,
        * ``mape`` is the median of ``|pred - label| / label`` over the
          valid positions (``0.0`` if there are none).

    Raises:
        AssertionError: if ``label`` and ``pred`` have different shapes.
    """
    assert label.shape == pred.shape

    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        # NaN is the only value that differs from itself, so a
        # self-comparison marks the usable positions.
        valid = (label == label) & (pred == pred)
        # Rescaling by the valid fraction makes ``mean(x * mask)`` equal to
        # the mean of ``x`` over the valid positions only.
        mask = valid.astype(np.float32)
        mask /= np.mean(mask)

        log_err = np.abs(np.subtract(np.log(pred), np.log(label))).astype(np.float32)
        abs_err = np.abs(np.subtract(pred, label)).astype(np.float32)

        male = np.mean(np.nan_to_num(log_err * mask))

        # Square the per-element errors. The earlier version squared the
        # already averaged MAE, which made rmse just another copy of the MAE.
        rmse = np.sqrt(np.mean(np.nan_to_num(np.square(abs_err) * mask)))

        # Per-element relative error. A median is not a weighted statistic,
        # so invalid positions are dropped instead of multiplying by the
        # rescaled mask (which used to be applied twice).
        ape = np.nan_to_num(np.divide(abs_err, label))
        mape = np.median(ape[valid]) if valid.any() else 0.0

        print('masked:', np.sum(mask == 0))
    return male, rmse, mape

In [3]:
# Show what is in the working directory (dataset folders, sibling notebooks).
os.listdir(os.curdir)

['house-dataset-osm-road2vec-poa.ipynb',
 'house-dataset-visualization.html',
 'house-dataset-visualization.ipynb',
 '.ipynb_checkpoints',
 'house-lgbm.ipynb',
 'generateSE.py',
 'sp',
 'house-dataset-visualization-Copy1.ipynb',
 'house-dataset-osm-road2vec-dbscan.ipynb',
 'node2vec.py',
 '.gitkeep ',
 'poa',
 'house-dataset-osm-road2vec-Copy2.ipynb',
 'fc',
 'kc',
 'house-dataset-osm-road2vec-fc.ipynb',
 'house-dataset-osm-road2vec-kc.ipynb',
 'house_reverse_geocoding.py',
 '__pycache__',
 'house-lgbm-sonia.ipynb',
 'house-dataset-osm-road2vec-Copy1.ipynb',
 'house-dataset-osm-road2vec-sp.ipynb',
 'osmdata',
 'house-dataset-osm-road2vec.ipynb',
 'house-dataset-osm-buildings.ipynb',
 'house-dataset-osm-buildings-Copy1.ipynb',
 'cache']

In [6]:
# Dataset folder names.
# NOTE(review): the loading loop further down hard-codes its own ['fc'] list
# instead of reading this variable — confirm whether it is still needed.
datasets = ['fc']

In [7]:
import numpy as np

In [8]:
# Mapbox basemap configurations (style URL + access token).
#
# The access token is read from the MAPBOX_TOKEN environment variable instead
# of being written into the notebook. A token was previously committed in this
# cell, so it should be treated as exposed and rotated in the Mapbox account.
# If the variable is not set the token is an empty string.
MAPBOX_TOKEN = os.environ.get('MAPBOX_TOKEN', '')

streetmap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': MAPBOX_TOKEN,
}
mybasemap = {
    #'style': 'mapbox://styles/mapbox/streets-v9',
    'style': 'mapbox://styles/mapbox/satellite-v9',
    'token': MAPBOX_TOKEN,
}

In [9]:
from cartoframes.viz import *

Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [10]:
import pandas as pd

In [11]:
import geopandas as gpd

In [12]:
def _as_wgs84_points(frame):
    """Copy ``frame`` into a GeoDataFrame of lng/lat points tagged as EPSG:4326."""
    points = gpd.points_from_xy(x=frame.lng, y=frame.lat)
    geo_frame = gpd.GeoDataFrame(frame.copy(), geometry=points)
    geo_frame.crs = 'EPSG:4326'
    return geo_frame


# Only the first listed dataset is processed: the loop body ends with `break`.
for dname in ['fc']:#['kc', 'fc', 'sp', 'poa']:
    print(dname)
    data = np.load(f'{dname}/data.npz')
    train_features, test_features = data['X_train'], data['X_test']

    # Columns 0 and 1 are used as lat / lng; every further column becomes attr0, attr1, ...
    attr_names = [f'attr{col - 2}' for col in range(2, train_features.shape[1])]

    dict1 = {'lat': train_features[:, 0], 'lng': train_features[:, 1], 'price': data['y_train']}
    dict2 = {'lat': test_features[:, 0], 'lng': test_features[:, 1], 'price': data['y_test']}
    for col, name in enumerate(attr_names, start=2):
        dict1[name] = train_features[:, col]
        dict2[name] = test_features[:, col]

    df1 = pd.DataFrame(dict1)
    df2 = pd.DataFrame(dict2)
    # Training rows come first, test rows follow (index labels are not reset).
    df = pd.concat([df1, df2])

    train_gdf = _as_wgs84_points(df1)
    test_gdf = _as_wgs84_points(df2)
    house_gdf = _as_wgs84_points(df)
    #print(np.exp(df['price'].values).mean())

    # `gdf` is an alias of `house_gdf`, so the string conversion below also
    # changes `house_gdf`.
    gdf = house_gdf
    for attr in attr_names:
        n_unique = gdf[attr].nunique()
        print(attr, n_unique)
        # Attributes with fewer than 30 distinct values are stored as strings.
        if n_unique < 30:
            gdf[attr] = gdf[attr].astype(str)
    gdfcpy = gdf.copy()


#     display(Map(
#         [
#             Layer(gdfcpy, color_category_style(tattr, cat=cat, palette='cb_blues'), encode_data=False),
#             Layer(gdf, color_continuous_style('price', palette='sunset'), encode_data=False),
#         ],
#         basemap=mybasemap))

    break

fc
attr0 6447
attr1 18
attr2 11
attr3 6447
attr4 16372
attr5 79
attr6 3230
attr7 8
attr8 6


In [13]:
#vectors = np.load(f'{dname}/road2vec_n2v_32_nodes.npy')
# Persist the raw house table (`df`: lat, lng, price and the attr* columns,
# training rows first) as a plain array.
# NOTE(review): despite the "n2v_32" in the file name these are the original
# features, not node2vec embeddings — confirm the name is intentional.
np.save(f'{dname}/feat2vec_n2v_32_nodes.npy', df.values)

In [11]:
import osmnx as ox
from shapely.geometry import *

# Folder that caches downloaded OSM street networks.
os.makedirs('osmdata', exist_ok=True)

DATASET_NAME = dname
OSM_FILE_PATH = f'osmdata/{DATASET_NAME}.graphml'

from shapely.geometry import MultiPoint

# Centre of the bounding box around all houses (x = lng, y = lat).
x1, y1, x2, y2 = gdf.total_bounds
house_center_latitude = (y1 + y2)/2 #sensor_hull.centroid.y
house_center_longitude = (x1 + x2)/2 #sensor_hull.centroid.x

graphs = dict()

if os.path.isfile(OSM_FILE_PATH):
    # Reuse the street network saved by an earlier run.
    graph = ox.load_graphml(filepath=OSM_FILE_PATH)
else:
    # Radius = distance from the centre to the farthest house, plus a margin
    # of 1000, both measured in the EPSG:3310 projection.
    center_point = gpd.GeoDataFrame(geometry = [Point(house_center_longitude, house_center_latitude)])
    center_point.crs = 'epsg:4326'
    center_point = center_point.to_crs('epsg:3310')
    center_geometry = center_point.iloc[0].geometry
    max_distance = gdf.to_crs('epsg:3310').distance(center_geometry).max()+1000
    print('max_distance:', max_distance)

    # Download the street network around the centre and cache it as GraphML.
    graph = ox.graph_from_point((house_center_latitude, house_center_longitude), dist=max_distance)
    ox.save_graphml(graph, filepath=OSM_FILE_PATH)
    

# buildings = buildings.reset_index()
# buildings.geometry = buildings.geometry.centroid

In [12]:
# graph2 = ox.graph_from_place('vashon')
# osm_nodes2, osm_edges2 = ox.graph_to_gdfs(graph2)
# osm_nodes1, osm_edges1 = ox.graph_to_gdfs(graph)

# osm_nodes = pd.concat((osm_nodes1, osm_nodes2))
# osm_edges = pd.concat((osm_edges1, osm_edges2))

# Split the street graph into two GeoDataFrames: one of nodes, one of edges.
osm_nodes, osm_edges = ox.graph_to_gdfs(graph)

In [13]:



# Expose the node index as a regular column, plus a string version of it
# (the string form is what the embedding vocabulary is looked up with later).
osm_nodes['osmidn'] = osm_nodes.index
osm_nodes['osmidstr'] = osm_nodes['osmidn'].astype(str)

# Turn the edge index levels into ordinary columns.
osm_edges = osm_edges.reset_index()

# Keep only edges whose `highway` value is a plain str; any other type
# (for example a list of tags) is dropped.
highway_type_names = [str(type(tag)) for tag in osm_edges['highway']]
cond = np.array(highway_type_names) == "<class 'str'>"
osm_edges = osm_edges[cond]

In [14]:
# Plain Python list of the node geometries, used for a quick visual sample.
alist = osm_nodes.geometry.tolist()

In [15]:
# Shuffle in place so the slice taken for the preview map is a random sample.
# NOTE(review): no random seed is set in the cells shown, so the sample
# differs between runs.
np.random.shuffle(alist)

In [16]:
# Preview layer built from the first 1000 geometries of the shuffled list.
Layer(gpd.GeoDataFrame(geometry=alist[:1000]))

In [17]:
# Recompute the search radius around the house bounding-box centre: distance
# to the farthest house plus a margin of 1000, measured in EPSG:3310.
center_point = gpd.GeoDataFrame(geometry = [Point(house_center_longitude, house_center_latitude)])
center_point.crs = 'epsg:4326'
center_point = center_point.to_crs('epsg:3310')
center_geometry = center_point.iloc[0].geometry
house_distances = gdf.to_crs('epsg:3310').distance(center_geometry)
max_distance = house_distances.max()+1000

# Fetch every OSM feature tagged `building` within that radius.
buildings = ox.geometries.geometries_from_point(
    (house_center_latitude, house_center_longitude),
    tags = {'building': True},
    dist=max_distance,
)

In [18]:
# Half-width of the lng/lat search window around each house and the minimum
# footprint area (as measured in EPSG:3310) a building needs to be a candidate.
SEARCH_HALF_WIDTH = 0.002
MIN_BUILDING_AREA = 70

# `rbuildings` keeps the original footprints; `fbuildings` carries their
# centroids and areas. Both come from the same reset_index() so rows line up.
rbuildings = buildings.reset_index()
fbuildings = buildings.reset_index()
fbuildings.geometry = fbuildings.geometry.centroid
fbuildings['nx'] = fbuildings.geometry.x
fbuildings['ny'] = fbuildings.geometry.y

fbuildings['barea'] = buildings.reset_index().to_crs('epsg:3310').area

# The area filter does not depend on the house, so build it once.
large_enough = fbuildings['barea'] > MIN_BUILDING_AREA

corr_osmid = []
for _, htem in tqdm.tqdm(house_gdf.iterrows(), total=len(house_gdf)):
    in_window = (
        (fbuildings['nx'] > htem.lng - SEARCH_HALF_WIDTH)
        & (fbuildings['nx'] < htem.lng + SEARCH_HALF_WIDTH)
        & (fbuildings['ny'] > htem.lat - SEARCH_HALF_WIDTH)
        & (fbuildings['ny'] < htem.lat + SEARCH_HALF_WIDTH)
    )
    ffbs = rbuildings[in_window & large_enough]
    if len(ffbs) == 0:
        # No candidate footprint near this house.
        corr_osmid.append(-1)
        continue

    # Choose the candidate footprint closest to the house point.
    nearest_pos = ffbs.distance(htem.geometry).values.argmin()
    target_bd = ffbs.iloc[nearest_pos]
    corr_osmid.append(target_bd['osmid'])

# Copy of the houses with the matched building id (-1 where none was found).
nhouse_gdf = house_gdf.copy()
nhouse_gdf['osmid'] = corr_osmid


  fbuildings.geometry = fbuildings.geometry.centroid

  target_bd = ffbs.iloc[ffbs.distance(htem.geometry).values.argmin()]
100%|█████████████████████████████████████████████████████████████████████████████| 68848/68848 [15:42<00:00, 73.06it/s]


In [19]:
# Lookup from building id to its footprint geometry.
osmid2geo = dict(zip(rbuildings['osmid'], rbuildings['geometry']))

# Houses with a matched building take that footprint; the others (id <= 0,
# i.e. the -1 placeholder) keep their original point.
geos = [
    osmid2geo[osmid] if osmid > 0 else house_gdf.iloc[i].geometry
    for i, osmid in enumerate(nhouse_gdf['osmid'])
]

# Collapse every geometry to its centroid so each house is a single point again.
nhouse_gdf.geometry = geos
nhouse_gdf.geometry = nhouse_gdf.geometry.centroid


  nhouse_gdf.geometry = nhouse_gdf.geometry.centroid


# Next

In [20]:
import node2vec
import numpy as np
import networkx as nx
from gensim.models import Word2Vec

# Dump the street edges as a "u v weight" edge list; every edge gets weight 1.0.
with open(f'{dname}/Adj.txt', 'w') as fp:
    fp.writelines(
        f'{u} {v} 1.0\n'
        for u, v in zip(osm_edges['u'], osm_edges['v'])
    )
        
        
def read_graph(edgelist):
    """Load a weighted, directed graph from an edge-list file.

    Args:
        edgelist: path (or file object) of a "u v weight" edge list.

    Returns:
        A ``networkx.DiGraph`` whose node labels are strings and whose edges
        carry a float ``weight`` attribute.
    """
    return nx.read_edgelist(
        edgelist,
        create_using=nx.DiGraph(),
        nodetype=str,
        data=(('weight', float),),
    )

# Read the street-network edge list from `{dname}/Adj.txt` as a directed,
# weighted graph.
Adj_file = f'{dname}/Adj.txt'
nx_G = read_graph(Adj_file)

# Random-walk settings passed to the local `node2vec` module.
# NOTE(review): p and q are presumably node2vec's return and in-out
# parameters — confirm against node2vec.py.
num_walks = 10
walk_length = 20
p = 2
q = 1
is_directed = True

# Precompute transition probabilities, then simulate `num_walks` walks of
# length `walk_length`.
# NOTE(review): no random seed is set in the cells shown, so the walks (and
# every embedding derived from them) are likely to differ between runs.
G = node2vec.Graph(nx_G, is_directed, p, q)
G.preprocess_transition_probs()
node2vec_walks = G.simulate_walks(num_walks, walk_length)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  J = np.zeros(K, dtype=np.int)


Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [21]:
from gensim.models import Word2Vec

vector_size = 32

# Node embeddings: each walk is a sentence, each node id a word.
# min_count=0 keeps every node that appears in a walk.
sentences = node2vec_walks
model_node = Word2Vec(sentences, window=5, min_count=0, workers=4, vector_size=vector_size)

# Edge embeddings: rewrite each walk as its sequence of consecutive
# "u-v" node pairs and train a second model on those tokens.
edge_discover_path_list = [
    [f'{u}-{v}' for u, v in zip(path[:-1], path[1:])]
    for path in node2vec_walks  # + discover_path_list
]
sentences = edge_discover_path_list
model_edge = Word2Vec(sentences, window=5, min_count=1, workers=4, vector_size=vector_size)

collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 199808 words, keeping 106558 word types
PROGRESS: at sentence #20000, processed 399588 words, keeping 183431 word types
PROGRESS: at sentence #30000, processed 599401 words, keeping 238593 word types
PROGRESS: at sentence #40000, processed 799160 words, keeping 279826 word types
PROGRESS: at sentence #50000, processed 998927 words, keeping 309845 word types
PROGRESS: at sentence #60000, processed 1198674 words, keeping 331613 word types
PROGRESS: at sentence #70000, processed 1398570 words, keeping 348720 word types
PROGRESS: at sentence #80000, processed 1598291 words, keeping 361554 word types
PROGRESS: at sentence #90000, processed 1798061 words, keeping 371319 word types
PROGRESS: at sentence #100000, processed 1997873 words, keeping 378698 word types
PROGRESS: at sentence #110000, processed 2197514 words, keeping 384478 word types
PROGRESS

PROGRESS: at sentence #1000000, processed 19976121 words, keeping 407505 word types
PROGRESS: at sentence #1010000, processed 20175789 words, keeping 407505 word types
PROGRESS: at sentence #1020000, processed 20375565 words, keeping 407505 word types
PROGRESS: at sentence #1030000, processed 20575266 words, keeping 407505 word types
PROGRESS: at sentence #1040000, processed 20775012 words, keeping 407505 word types
PROGRESS: at sentence #1050000, processed 20974826 words, keeping 407505 word types
PROGRESS: at sentence #1060000, processed 21174499 words, keeping 407505 word types
PROGRESS: at sentence #1070000, processed 21374227 words, keeping 407505 word types
PROGRESS: at sentence #1080000, processed 21574032 words, keeping 407505 word types
PROGRESS: at sentence #1090000, processed 21773828 words, keeping 407505 word types
PROGRESS: at sentence #1100000, processed 21973455 words, keeping 407505 word types
PROGRESS: at sentence #1110000, processed 22173239 words, keeping 407505 wor

PROGRESS: at sentence #1980000, processed 39551919 words, keeping 407505 word types
PROGRESS: at sentence #1990000, processed 39751585 words, keeping 407505 word types
PROGRESS: at sentence #2000000, processed 39951286 words, keeping 407505 word types
PROGRESS: at sentence #2010000, processed 40151125 words, keeping 407505 word types
PROGRESS: at sentence #2020000, processed 40350945 words, keeping 407505 word types
PROGRESS: at sentence #2030000, processed 40550671 words, keeping 407505 word types
PROGRESS: at sentence #2040000, processed 40750362 words, keeping 407505 word types
PROGRESS: at sentence #2050000, processed 40950130 words, keeping 407505 word types
PROGRESS: at sentence #2060000, processed 41149920 words, keeping 407505 word types
PROGRESS: at sentence #2070000, processed 41349721 words, keeping 407505 word types
PROGRESS: at sentence #2080000, processed 41549508 words, keeping 407505 word types
PROGRESS: at sentence #2090000, processed 41749198 words, keeping 407505 wor

PROGRESS: at sentence #2960000, processed 59127818 words, keeping 407505 word types
PROGRESS: at sentence #2970000, processed 59327554 words, keeping 407505 word types
PROGRESS: at sentence #2980000, processed 59527186 words, keeping 407505 word types
PROGRESS: at sentence #2990000, processed 59726949 words, keeping 407505 word types
PROGRESS: at sentence #3000000, processed 59926764 words, keeping 407505 word types
PROGRESS: at sentence #3010000, processed 60126492 words, keeping 407505 word types
PROGRESS: at sentence #3020000, processed 60326261 words, keeping 407505 word types
PROGRESS: at sentence #3030000, processed 60525997 words, keeping 407505 word types
PROGRESS: at sentence #3040000, processed 60725746 words, keeping 407505 word types
PROGRESS: at sentence #3050000, processed 60925407 words, keeping 407505 word types
PROGRESS: at sentence #3060000, processed 61125203 words, keeping 407505 word types
PROGRESS: at sentence #3070000, processed 61324949 words, keeping 407505 wor

PROGRESS: at sentence #3940000, processed 78704204 words, keeping 407505 word types
PROGRESS: at sentence #3950000, processed 78903934 words, keeping 407505 word types
PROGRESS: at sentence #3960000, processed 79103561 words, keeping 407505 word types
PROGRESS: at sentence #3970000, processed 79303348 words, keeping 407505 word types
PROGRESS: at sentence #3980000, processed 79503145 words, keeping 407505 word types
PROGRESS: at sentence #3990000, processed 79702941 words, keeping 407505 word types
PROGRESS: at sentence #4000000, processed 79902644 words, keeping 407505 word types
PROGRESS: at sentence #4010000, processed 80102526 words, keeping 407505 word types
PROGRESS: at sentence #4020000, processed 80302253 words, keeping 407505 word types
PROGRESS: at sentence #4030000, processed 80501914 words, keeping 407505 word types
PROGRESS: at sentence #4040000, processed 80701679 words, keeping 407505 word types
PROGRESS: at sentence #4050000, processed 80901397 words, keeping 407505 wor

EPOCH 1 - PROGRESS: at 35.82% examples, 1815466 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 38.06% examples, 1815525 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 40.32% examples, 1816466 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 42.53% examples, 1815180 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 44.77% examples, 1815603 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 47.03% examples, 1816690 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 49.33% examples, 1818785 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 51.50% examples, 1816360 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 53.76% examples, 1817264 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 56.05% examples, 1819183 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 58.36% examples, 1820905 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 60.66% examples, 1822605 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 62.96

EPOCH 3 - PROGRESS: at 96.87% examples, 2246817 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 99.62% examples, 2246431 words/s, in_qsize 7, out_qsize 0
EPOCH 3: training on 81401804 raw words (81401804 effective words) took 36.2s, 2246925 effective words/s
EPOCH 4 - PROGRESS: at 2.59% examples, 2107089 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 5.12% examples, 2082112 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 7.73% examples, 2095257 words/s, in_qsize 8, out_qsize 0
EPOCH 4 - PROGRESS: at 10.36% examples, 2104519 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 13.00% examples, 2110549 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 15.62% examples, 2112174 words/s, in_qsize 8, out_qsize 0
EPOCH 4 - PROGRESS: at 18.22% examples, 2112922 words/s, in_qsize 8, out_qsize 0
EPOCH 4 - PROGRESS: at 20.86% examples, 2116309 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 23.41% examples, 2111652 words/s, in_qsize 7, out_qsize 0
EPOCH 4

PROGRESS: at sentence #500000, processed 9487726 words, keeping 980214 word types
PROGRESS: at sentence #510000, processed 9677428 words, keeping 980267 word types
PROGRESS: at sentence #520000, processed 9867171 words, keeping 980317 word types
PROGRESS: at sentence #530000, processed 10056875 words, keeping 980372 word types
PROGRESS: at sentence #540000, processed 10246742 words, keeping 980422 word types
PROGRESS: at sentence #550000, processed 10436528 words, keeping 980468 word types
PROGRESS: at sentence #560000, processed 10626391 words, keeping 980506 word types
PROGRESS: at sentence #570000, processed 10816213 words, keeping 980548 word types
PROGRESS: at sentence #580000, processed 11005966 words, keeping 980601 word types
PROGRESS: at sentence #590000, processed 11195593 words, keeping 980630 word types
PROGRESS: at sentence #600000, processed 11385337 words, keeping 980662 word types
PROGRESS: at sentence #610000, processed 11575072 words, keeping 980684 word types
PROGRES

PROGRESS: at sentence #1490000, processed 28273950 words, keeping 980976 word types
PROGRESS: at sentence #1500000, processed 28463696 words, keeping 980976 word types
PROGRESS: at sentence #1510000, processed 28653421 words, keeping 980976 word types
PROGRESS: at sentence #1520000, processed 28843088 words, keeping 980976 word types
PROGRESS: at sentence #1530000, processed 29032882 words, keeping 980976 word types
PROGRESS: at sentence #1540000, processed 29222601 words, keeping 980977 word types
PROGRESS: at sentence #1550000, processed 29412339 words, keeping 980977 word types
PROGRESS: at sentence #1560000, processed 29602129 words, keeping 980977 word types
PROGRESS: at sentence #1570000, processed 29791948 words, keeping 980977 word types
PROGRESS: at sentence #1580000, processed 29981701 words, keeping 980977 word types
PROGRESS: at sentence #1590000, processed 30171514 words, keeping 980977 word types
PROGRESS: at sentence #1600000, processed 30361197 words, keeping 980977 wor

PROGRESS: at sentence #2470000, processed 46870127 words, keeping 980982 word types
PROGRESS: at sentence #2480000, processed 47059971 words, keeping 980982 word types
PROGRESS: at sentence #2490000, processed 47249792 words, keeping 980982 word types
PROGRESS: at sentence #2500000, processed 47439499 words, keeping 980982 word types
PROGRESS: at sentence #2510000, processed 47629165 words, keeping 980982 word types
PROGRESS: at sentence #2520000, processed 47818895 words, keeping 980982 word types
PROGRESS: at sentence #2530000, processed 48008604 words, keeping 980982 word types
PROGRESS: at sentence #2540000, processed 48198396 words, keeping 980983 word types
PROGRESS: at sentence #2550000, processed 48388108 words, keeping 980983 word types
PROGRESS: at sentence #2560000, processed 48577900 words, keeping 980983 word types
PROGRESS: at sentence #2570000, processed 48767634 words, keeping 980983 word types
PROGRESS: at sentence #2580000, processed 48957378 words, keeping 980983 wor

PROGRESS: at sentence #3450000, processed 65465987 words, keeping 980984 word types
PROGRESS: at sentence #3460000, processed 65655694 words, keeping 980984 word types
PROGRESS: at sentence #3470000, processed 65845455 words, keeping 980984 word types
PROGRESS: at sentence #3480000, processed 66035262 words, keeping 980984 word types
PROGRESS: at sentence #3490000, processed 66225019 words, keeping 980984 word types
PROGRESS: at sentence #3500000, processed 66414820 words, keeping 980984 word types
PROGRESS: at sentence #3510000, processed 66604629 words, keeping 980984 word types
PROGRESS: at sentence #3520000, processed 66794397 words, keeping 980984 word types
PROGRESS: at sentence #3530000, processed 66984122 words, keeping 980984 word types
PROGRESS: at sentence #3540000, processed 67173887 words, keeping 980984 word types
PROGRESS: at sentence #3550000, processed 67363565 words, keeping 980984 word types
PROGRESS: at sentence #3560000, processed 67553244 words, keeping 980984 wor

EPOCH 0 - PROGRESS: at 14.49% examples, 922437 words/s, in_qsize 8, out_qsize 2
EPOCH 0 - PROGRESS: at 15.74% examples, 924691 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 16.93% examples, 923278 words/s, in_qsize 8, out_qsize 1
EPOCH 0 - PROGRESS: at 18.43% examples, 938334 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 19.58% examples, 934504 words/s, in_qsize 8, out_qsize 2
EPOCH 0 - PROGRESS: at 20.74% examples, 931010 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 22.01% examples, 932617 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 23.44% examples, 941394 words/s, in_qsize 7, out_qsize 0
EPOCH 0 - PROGRESS: at 24.54% examples, 936643 words/s, in_qsize 7, out_qsize 0
EPOCH 0 - PROGRESS: at 25.84% examples, 939036 words/s, in_qsize 4, out_qsize 3
EPOCH 0 - PROGRESS: at 26.94% examples, 934281 words/s, in_qsize 5, out_qsize 2
EPOCH 0 - PROGRESS: at 28.12% examples, 932783 words/s, in_qsize 8, out_qsize 1
EPOCH 0 - PROGRESS: at 29.41% examples, 

EPOCH 1 - PROGRESS: at 48.68% examples, 1004499 words/s, in_qsize 8, out_qsize 3
EPOCH 1 - PROGRESS: at 50.10% examples, 1006815 words/s, in_qsize 8, out_qsize 0
EPOCH 1 - PROGRESS: at 51.62% examples, 1010830 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 52.93% examples, 1010704 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 54.47% examples, 1014627 words/s, in_qsize 8, out_qsize 1
EPOCH 1 - PROGRESS: at 56.04% examples, 1019295 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 57.22% examples, 1016716 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 58.76% examples, 1020559 words/s, in_qsize 8, out_qsize 1
EPOCH 1 - PROGRESS: at 59.91% examples, 1017514 words/s, in_qsize 4, out_qsize 3
EPOCH 1 - PROGRESS: at 61.04% examples, 1014429 words/s, in_qsize 4, out_qsize 3
EPOCH 1 - PROGRESS: at 62.59% examples, 1018102 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 64.12% examples, 1020695 words/s, in_qsize 7, out_qsize 3
EPOCH 1 - PROGRESS: at 65.62

EPOCH 2 - PROGRESS: at 86.29% examples, 1013589 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 87.69% examples, 1014519 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 89.05% examples, 1014873 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 90.54% examples, 1016808 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 92.04% examples, 1018793 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 93.39% examples, 1018780 words/s, in_qsize 5, out_qsize 2
EPOCH 2 - PROGRESS: at 94.89% examples, 1020580 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 96.36% examples, 1022111 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 97.83% examples, 1023607 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 99.33% examples, 1025244 words/s, in_qsize 8, out_qsize 1
EPOCH 2: training on 77326754 raw words (77326754 effective words) took 75.3s, 1026294 effective words/s
EPOCH 3 - PROGRESS: at 1.03% examples, 785300 words/s, in_qsize 4, out_qsize 3
EPOCH 

EPOCH 4 - PROGRESS: at 22.06% examples, 1052622 words/s, in_qsize 8, out_qsize 1
EPOCH 4 - PROGRESS: at 23.50% examples, 1056318 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 25.00% examples, 1061709 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 26.48% examples, 1065664 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 27.98% examples, 1069707 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 29.48% examples, 1073819 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 30.61% examples, 1064664 words/s, in_qsize 5, out_qsize 2
EPOCH 4 - PROGRESS: at 31.71% examples, 1053882 words/s, in_qsize 5, out_qsize 2
EPOCH 4 - PROGRESS: at 32.80% examples, 1044938 words/s, in_qsize 8, out_qsize 3
EPOCH 4 - PROGRESS: at 34.17% examples, 1045140 words/s, in_qsize 8, out_qsize 0
EPOCH 4 - PROGRESS: at 35.54% examples, 1044167 words/s, in_qsize 5, out_qsize 2
EPOCH 4 - PROGRESS: at 36.67% examples, 1037147 words/s, in_qsize 4, out_qsize 3
EPOCH 4 - PROGRESS: at 37.91

In [22]:
def _embedding_norms(keys, keyed_vectors):
    """Return the L2 norm of each key's embedding, or -1 for keys without one."""
    return [
        np.linalg.norm(keyed_vectors[key]) if key in keyed_vectors else -1
        for key in keys
    ]


# Norm of every node's embedding (-1 marks nodes missing from the node model).
magnitudes = _embedding_norms(osm_nodes['osmidstr'], model_node.wv)
osm_nodes['magnitude'] = magnitudes
# Layer(osm_nodes_3310[osm_nodes_3310['magnitude'] > 0], color_continuous_style('magnitude'))


# Same for edges, keyed by the "u-v" token used to train the edge model.
osm_edges['u-v'] = osm_edges['u'].astype(str) + '-' + osm_edges['v'].astype(str)
magnitudes = _embedding_norms(osm_edges['u-v'], model_edge.wv)
osm_edges['magnitude'] = magnitudes
# Layer(osm_edges_3310[osm_edges_3310['magnitude'] > 0], color_continuous_style('magnitude'))
# Layer(osm_edges_3310[osm_edges_3310['magnitude'] > 0], color_continuous_style('magnitude'))

In [23]:
# ftest_edges = osm_edges_3310[osm_edges_3310['magnitude'] > 0].copy()
# ftest_edges['bx'] = ftest_edges.geometry.centroid.x
# ftest_edges['by'] = ftest_edges.geometry.centroid.y

# house_gdf_3310 = nhouse_gdf.to_crs('epsg:3310')
# house_gdf_3310['bx'] = house_gdf_3310.geometry.centroid.x
# house_gdf_3310['by'] = house_gdf_3310.geometry.centroid.y

# vectors = []
# for _, item in tqdm.tqdm(house_gdf_3310.iterrows(), total=len(house_gdf)):
#     iftest_edges = ftest_edges[(ftest_edges['bx'] > item['bx'] - 500) & 
#                 (ftest_edges['bx'] < item['bx'] + 500) & 
#                 (ftest_edges['by'] > item['by'] - 500) & 
#                 (ftest_edges['by'] < item['by'] + 500)]
    
#     jtem = iftest_edges.iloc[iftest_edges.distance(item.geometry).argmin()]
#     vec = model.wv[jtem['u-v']]
#     vectors.append(vec)

In [24]:
def _add_centroid_xy(frame):
    """Store the x / y of each geometry's centroid in columns 'bx' and 'by'."""
    centroids = frame.geometry.centroid
    frame['bx'] = centroids.x
    frame['by'] = centroids.y
    return frame


# Nodes that have an embedding (magnitude > 0), projected to EPSG:3310,
# plus a lookup from string node id to projected geometry.
ftest_nodes = _add_centroid_xy(osm_nodes[osm_nodes['magnitude'] > 0].to_crs('epsg:3310'))
fosmid2geo = dict(zip(ftest_nodes['osmidstr'], ftest_nodes['geometry']))

# Edges that have an embedding, in the same projection.
ftest_edges = _add_centroid_xy(osm_edges[osm_edges['magnitude'] > 0].to_crs('epsg:3310'))

# Houses (snapped to building centroids where a match exists), same projection.
house_gdf_3310 = _add_centroid_xy(nhouse_gdf.to_crs('epsg:3310'))

In [25]:
# For every house: find the nearest embedded street edge and blend the
# embeddings of its two end nodes, weighting each node by the inverse of its
# distance to the house.
vectors = []
geos = []
for _, item in tqdm.tqdm(house_gdf_3310.iterrows(), total=len(house_gdf_3310)):
    # Cheap pre-filter: edges whose centroid lies within a 500-unit box around
    # the house (coordinates are in EPSG:3310).
    iftest_edges = ftest_edges[(ftest_edges['bx'] > item['bx'] - 500) &
                                (ftest_edges['bx'] < item['bx'] + 500) &
                                (ftest_edges['by'] > item['by'] - 500) &
                                (ftest_edges['by'] < item['by'] + 500)]
    if len(iftest_edges) == 0:
        # Nothing nearby: fall back to searching every edge.
        iftest_edges = ftest_edges

    jtem = iftest_edges.iloc[iftest_edges.distance(item.geometry).argmin()]
    u, v = str(jtem['u']), str(jtem['v'])
    vec_u, vec_v = model_node.wv[u], model_node.wv[v]
    dist_u, dist_v = item.geometry.distance(fosmid2geo[u]), item.geometry.distance(fosmid2geo[v])

    if dist_u == 0 or dist_v == 0:
        # The house sits exactly on an end node. 1/0 would raise
        # ZeroDivisionError, so give all weight to the coincident node(s).
        ratios = np.array([float(dist_u == 0), float(dist_v == 0)])
    else:
        ratios = np.array([1/dist_u, 1/dist_v])
    ratios /= np.sum(ratios)

    vec = vec_u*ratios[0] + vec_v*ratios[1]
    vectors.append(vec)

100%|████████████████████████████████████████████████████████████████████████████| 68848/68848 [03:21<00:00, 342.04it/s]


In [26]:
# Save the per-house road embeddings (one vector per house, in house order).
np.save(f'{dname}/road2vec_n2v_32_nodes.npy', vectors)

In [27]:
# Project all houses and the training houses to EPSG:3310.
# NOTE(review): this rebinds `house_gdf_3310` from `house_gdf` (the original
# house points), replacing the earlier version that was built from
# `nhouse_gdf` (building centroids) — confirm this switch is intended.
house_gdf_3310 = house_gdf.to_crs('epsg:3310')
thouse_gdf_3310 = train_gdf.to_crs('epsg:3310')

In [28]:
# Stack the per-house embeddings into one array.
n2v_vectors = np.array(vectors)
# The first len(train) rows are the training houses, because the house table
# was built by concatenating the training rows before the test rows.
n2v_vectors_train = n2v_vectors[:len(thouse_gdf_3310)]

In [29]:
# Training houses projected to EPSG:3310.
# NOTE(review): same expression as `thouse_gdf_3310` — the two names hold
# equal but separate frames.
train_gdf_3310 = train_gdf.to_crs('epsg:3310')

In [30]:
# Combined distance from every house (rows) to every training house (columns):
# embedding distance plus 10x the geographic distance, each divided by its own
# standard deviation within the row.
combined_rows = []
for i, (_, item) in tqdm.tqdm(enumerate(house_gdf_3310.iterrows()), total=len(house_gdf_3310)):
    # Distance in embedding space to all training houses.
    embedding_dist = np.linalg.norm(n2v_vectors[i] - n2v_vectors_train, axis=-1)
    embedding_dist = embedding_dist / embedding_dist.std()

    # Planar distance in EPSG:3310 to all training houses.
    geo_dist = train_gdf_3310.distance(item.geometry).values
    geo_dist = geo_dist / geo_dist.std()

    combined_rows.append(embedding_dist + geo_dist*10)
dist_mat = np.stack(combined_rows, 0)

100%|█████████████████████████████████████████████████████████████████████████████| 68848/68848 [20:25<00:00, 56.19it/s]


In [31]:
# Row of `dist_mat` to inspect visually.
j=100
# Copy of the training houses annotated with their combined distance to house j.
ahgdf = train_gdf.copy()
ahgdf['diststr'] = dist_mat[j]

In [32]:
# Sanity-check map: the training houses ranked 2nd to 61st by combined
# distance to house j (the closest one is skipped by the [1:61] slice),
# coloured by that distance, with house j itself drawn in green.
Map([
#     Layer(fhgdf, color_continuous_style('diststr', palette='sunset')),
#     Layer(gdf.iloc[data_ori['idx_geo'][j]], color_continuous_style('price', palette='sunset'))
    Layer(ahgdf.iloc[dist_mat[j].argsort()[1:61]], color_continuous_style('diststr', palette='sunset')),
    Layer(house_gdf.iloc[[j]], basic_style(color='green')),

])

  parameter_args = inspect.getargspec(decorated_function).args


In [33]:
# For every house collect 60 neighbouring training houses: their indices,
# their combined distances, and their distances in raw feature space.
new_idx = []
new_dist_eucli = []
new_dist_geo = []

for i, (_, item) in tqdm.tqdm(enumerate(house_gdf_3310.iterrows()), total=len(house_gdf_3310)):
    # Sort the row once (it used to be sorted three times per house) and keep
    # ranks 1..60; rank 0 is skipped.
    # NOTE(review): for a training house rank 0 is the house itself, but rows
    # of test houses contain no self-match, so their true nearest neighbour is
    # dropped as well — confirm this is intended.
    neighbor_idx = dist_mat[i].argsort()[1:61]

    # Drop the last column (geometry) to get the numeric feature vector.
    # NOTE(review): the remaining columns still include `price`, so the target
    # value takes part in the feature distance below — confirm this is intended.
    target = item.values[:-1]
    candidates = thouse_gdf_3310.iloc[neighbor_idx].values[:, :-1]

    new_idx.append(neighbor_idx)
    new_dist_geo.append(dist_mat[i][neighbor_idx])
    new_dist_eucli.append(np.linalg.norm(target.astype(float) - candidates.astype(float), axis=-1))

new_idx = np.stack(new_idx, 0)
new_dist_eucli = np.stack(new_dist_eucli, 0)
new_dist_geo = np.stack(new_dist_geo, 0)

100%|█████████████████████████████████████████████████████████████████████████████| 68848/68848 [16:34<00:00, 69.26it/s]


In [34]:
# Bundle the original train/test arrays with the new neighbour arrays.
# NOTE(review): 'idx_eucli' and 'idx_geo' both receive the same `new_idx`
# array — confirm that a separate feature-space ranking is not expected.
newdata = {
    'X_train': data['X_train'],
    'y_train': data['y_train'],
    'X_test': data['X_test'],
    'y_test': data['y_test'],
    'idx_eucli': new_idx,
    'idx_geo': new_idx,
    'dist_eucli': new_dist_eucli,
    'dist_geo': new_dist_geo,
}

In [35]:
# Show which dataset folder the archive below will be written into.
dname

'sp'

In [36]:
# Write every entry of `newdata` as a named array into one .npz archive.
np.savez(f'{dname}/data_n2v_nodes.npz', **newdata)