In [1]:
import os, tqdm

In [2]:
from sklearn.metrics import r2_score
import numpy as np

# metric
def metric(label, pred):
    """Compute NaN-masked regression error metrics.

    Pairs in which either ``label`` or ``pred`` is NaN are excluded from
    every statistic. The number of excluded pairs is printed.

    Parameters
    ----------
    label : array-like
        Ground-truth values.
    pred : array-like
        Predicted values; must have the same shape as ``label``.

    Returns
    -------
    tuple
        ``(male, rmse, mape)`` where

        * ``male`` is the mean absolute difference of the natural logs,
        * ``rmse`` is the root of the mean squared error,
        * ``mape`` is the median of ``|pred - label| / |label|`` (a fraction,
          not multiplied by 100).

        All three are NaN when no valid pair exists.

    Raises
    ------
    AssertionError
        If ``label`` and ``pred`` differ in shape.
    """
    label = np.asarray(label, dtype=np.float64)
    pred = np.asarray(pred, dtype=np.float64)
    assert label.shape == pred.shape

    valid = ~(np.isnan(label) | np.isnan(pred))
    print('masked:', np.sum(~valid))
    if not valid.any():
        # Nothing to average over; avoid dividing by an empty count.
        nan = np.float64('nan')
        return nan, nan, nan

    y_true = label[valid]
    y_pred = pred[valid]

    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        abs_err = np.abs(y_pred - y_true)

        # Logs of non-positive inputs give NaN/inf; nan_to_num turns those
        # terms into 0 / a large finite value instead of poisoning the mean.
        log_err = np.abs(np.log(y_pred) - np.log(y_true))
        male = np.mean(np.nan_to_num(log_err))

        # Square the per-element errors *before* averaging.
        rmse = np.sqrt(np.mean(np.square(abs_err)))

        # Per-element relative error; a zero label yields inf/NaN, which
        # nan_to_num makes finite so the median stays defined.
        mape = np.median(np.nan_to_num(abs_err / np.abs(y_true)))

    return male, rmse, mape

In [3]:
# List the entries of the current working directory.
os.listdir('.')

['house-dataset-osm-road2vec-poa.ipynb',
 'house-dataset-visualization.html',
 'house-dataset-visualization.ipynb',
 '.ipynb_checkpoints',
 'house-lgbm.ipynb',
 'generateSE.py',
 'sp',
 'house-dataset-visualization-Copy1.ipynb',
 'house-dataset-osm-road2vec-dbscan.ipynb',
 'node2vec.py',
 '.gitkeep ',
 'poa',
 'house-dataset-osm-road2vec-Copy2.ipynb',
 'fc',
 'kc',
 'house-dataset-osm-road2vec-kc.ipynb',
 'house_reverse_geocoding.py',
 '__pycache__',
 'house-lgbm-sonia.ipynb',
 'house-dataset-osm-road2vec-Copy1.ipynb',
 'house-dataset-osm-road2vec-sp.ipynb',
 'osmdata',
 'house-dataset-osm-road2vec.ipynb',
 'house-dataset-osm-buildings.ipynb',
 'house-dataset-osm-buildings-Copy1.ipynb',
 'cache']

In [4]:
# Dataset folder names.
# NOTE(review): not referenced by the other cells visible here — confirm it is still needed.
datasets = ['kc']

In [5]:
import numpy as np

In [6]:
# The Mapbox access token is read from the environment so that no credential
# is stored in the notebook. Set MAPBOX_TOKEN before starting Jupyter; it is
# an empty string when the variable is missing.
MAPBOX_TOKEN = os.environ.get('MAPBOX_TOKEN', '')

# Basemap definitions: a Mapbox style URL plus the access token.
streetmap = {
    'style': 'mapbox://styles/mapbox/streets-v9',
    'token': MAPBOX_TOKEN,
}
mybasemap = {
    #'style': 'mapbox://styles/mapbox/streets-v9',
    'style': 'mapbox://styles/mapbox/satellite-v9',
    'token': MAPBOX_TOKEN,
}

In [7]:
from cartoframes.viz import *

Note: NumExpr detected 64 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [8]:
import pandas as pd

In [9]:
import geopandas as gpd

In [10]:
# Load one dataset and build train / test / combined GeoDataFrames.
# The loop stops after the first name (see the trailing `break`); `dname`,
# `gdf` and `house_gdf` are used by later cells.
for dname in ['kc']:#['kc', 'fc', 'sp', 'poa']:
    print(dname)
    data = np.load(f'{dname}/data.npz')

    # An .npz archive is read lazily: every data[...] access loads the array
    # again, so fetch each array exactly once.
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']

    # Feature columns 0 and 1 are used as lat / lng; column k >= 2 is exposed
    # as 'attr{k-2}'.
    dict1 = {'lat': X_train[:, 0], 'lng': X_train[:, 1], 'price': y_train}
    dict2 = {'lat': X_test[:, 0], 'lng': X_test[:, 1], 'price': y_test}
    attr_names = []
    for a in range(2, X_train.shape[1]):
        name = f'attr{a-2}'
        dict1[name] = X_train[:, a]
        dict2[name] = X_test[:, a]
        attr_names.append(name)
    df1 = pd.DataFrame(dict1)
    df2 = pd.DataFrame(dict2)
    # Train rows first, then test rows; the original row labels are kept, so
    # index values repeat across the two parts.
    df = pd.concat([df1, df2])

    train_gdf = gpd.GeoDataFrame(
        df1.copy(), geometry=gpd.points_from_xy(x=df1.lng, y=df1.lat), crs='EPSG:4326')
    test_gdf = gpd.GeoDataFrame(
        df2.copy(), geometry=gpd.points_from_xy(x=df2.lng, y=df2.lat), crs='EPSG:4326')
    house_gdf = gpd.GeoDataFrame(
        df.copy(), geometry=gpd.points_from_xy(x=df.lng, y=df.lat), crs='EPSG:4326')
    #print(np.exp(df['price'].values).mean())

    # `gdf` is an alias of `house_gdf`, so the casts below modify both.
    gdf = house_gdf
    for attr in attr_names:
        n_unique = gdf[attr].nunique()
        print(attr, n_unique)
        # Attributes with fewer than 30 distinct values are stored as strings.
        if n_unique < 30:
            gdf[attr] = gdf[attr].astype(str)
    gdfcpy = gdf.copy()


    # Disabled map preview. NOTE(review): `tattr` and `cat` are not defined in
    # this cell, so the snippet cannot run as written.
#     display(Map(
#         [
#             Layer(gdfcpy, color_category_style(tattr, cat=cat, palette='cb_blues'), encode_data=False),
#             Layer(gdf, color_continuous_style('price', palette='sunset'), encode_data=False),
#         ],
#         basemap=mybasemap))

    break

kc
attr0 13
attr1 30
attr2 1038
attr3 9782
attr4 6
attr5 2
attr6 5
attr7 5
attr8 12
attr9 946
attr10 306
attr11 116
attr12 70
attr13 70
attr14 777
attr15 8689


In [11]:
import osmnx as ox
from shapely.geometry import *

# Create the OSM cache directory; exist_ok makes this safe to re-run without a
# separate existence check.
os.makedirs('osmdata', exist_ok=True)

DATASET_NAME = dname
OSM_FILE_PATH = f'osmdata/{DATASET_NAME}.graphml'

from shapely.geometry import MultiPoint


# total_bounds is (minx, miny, maxx, maxy); the midpoint of that box is the
# centre of the street-network query.
x1, y1, x2, y2 = gdf.total_bounds

house_center_latitude = (y1 + y2)/2 #sensor_hull.centroid.y
house_center_longitude = (x1 + x2)/2 #sensor_hull.centroid.x


graphs = dict()  # NOTE(review): not used by the cells visible here
# Download the street network only when no cached GraphML file exists.
if not os.path.isfile(OSM_FILE_PATH):
    # Query radius = distance from the centre to the farthest house, plus a
    # 1000-unit margin, both measured in EPSG:3310 (metres).
    # NOTE(review): EPSG:3310 is the California Albers projection — confirm it
    # is accurate enough for this dataset's region.
    center_point = gpd.GeoDataFrame(geometry = [Point(house_center_longitude, house_center_latitude)])
    center_point.crs = 'epsg:4326'
    center_point = center_point.to_crs('epsg:3310')
    max_distance = gdf.to_crs('epsg:3310').distance(center_point.iloc[0].geometry).max()+1000
    print('max_distance:', max_distance)
    graph = ox.graph_from_point((house_center_latitude, house_center_longitude), dist=max_distance)

    # Cache the street network as a GraphML file for later runs.
    ox.save_graphml(graph, filepath=OSM_FILE_PATH)
else:
    graph = ox.load_graphml(filepath=OSM_FILE_PATH)
    

# buildings = buildings.reset_index()
# buildings.geometry = buildings.geometry.centroid

In [12]:
# Fetch a second street network by place name and split it into node and edge
# GeoDataFrames.
# NOTE(review): the bare query 'vashon' is resolved by geocoding — presumably
# Vashon Island, WA; confirm it matches the intended place.
graph2 = ox.graph_from_place('vashon')
osm_nodes2, osm_edges2 = ox.graph_to_gdfs(graph2)

In [13]:
osm_nodes1, osm_edges1 = ox.graph_to_gdfs(graph)

# Stack the nodes / edges of both street networks.
# NOTE(review): elements present in both graphs are not de-duplicated here.
osm_nodes = pd.concat((osm_nodes1, osm_nodes2))
osm_edges = pd.concat((osm_edges1, osm_edges2))

# Expose the node index (OSM id) as columns, both as-is and as a string.
osm_nodes['osmidn'] = osm_nodes.index
osm_nodes['osmidstr'] = osm_nodes['osmidn'].astype(str)
osm_edges = osm_edges.reset_index()
# Keep only edges whose 'highway' value is a plain string; any other value
# type is dropped. dtype=bool keeps the mask valid for an empty edge table.
cond = np.array([isinstance(s, str) for s in osm_edges['highway']], dtype=bool)
osm_edges = osm_edges[cond]

In [14]:
# Copy the node geometries into a plain Python list.
alist = osm_nodes.geometry.tolist()

In [15]:
# Shuffle the list in place with NumPy's global RNG.
# NOTE(review): no seed is set in the cells shown, so the order presumably differs between runs.
np.random.shuffle(alist)

In [16]:
# Preview the first 100 geometries of the shuffled list as a map layer.
# NOTE(review): `Layer` presumably comes from the `cartoframes.viz` star import.
Layer(gpd.GeoDataFrame(geometry=alist[:100]))

In [17]:
# Centre of the house bounding box, re-projected from lon/lat to EPSG:3310 so
# that the search radius can be measured in projected units.
center_point = gpd.GeoDataFrame(
    geometry=[Point(house_center_longitude, house_center_latitude)],
    crs='epsg:4326',
).to_crs('epsg:3310')

# Radius = distance to the farthest house plus a 1000-unit margin.
max_distance = (
    gdf.to_crs('epsg:3310')
    .distance(center_point.geometry.iloc[0])
    .max()
    + 1000
)

# Every OSM element tagged as a building within that radius of the centre.
buildings = ox.geometries.geometries_from_point(
    (house_center_latitude, house_center_longitude),
    tags={'building': True},
    dist=max_distance,
)

In [18]:
# rbuildings keeps the original footprints; fbuildings replaces each footprint
# by its centroid and carries the centroid coordinates plus the footprint area.
rbuildings = buildings.reset_index()
fbuildings = buildings.reset_index()
fbuildings.geometry = fbuildings.geometry.centroid
fbuildings['nx'] = fbuildings.geometry.x
fbuildings['ny'] = fbuildings.geometry.y

# Footprint area measured after projecting to EPSG:3310.
fbuildings['barea'] = buildings.reset_index().to_crs('epsg:3310').area

# Matching rule: a building is a candidate for a house when its centroid lies
# strictly inside a +/-0.002 degree box around the house and its area exceeds
# 70; the candidate whose footprint is nearest to the house point wins.
SEARCH_HALF_WIDTH_DEG = 0.002
MIN_BUILDING_AREA = 70

# Loop-invariant work, done once: apply the area filter and pull the columns
# needed inside the loop out into NumPy arrays / a positional GeoSeries.
is_large = (fbuildings['barea'] > MIN_BUILDING_AREA).to_numpy()
cand_x = fbuildings['nx'].to_numpy()[is_large]
cand_y = fbuildings['ny'].to_numpy()[is_large]
cand_osmid = rbuildings['osmid'].to_numpy()[is_large]
cand_geom = rbuildings.geometry[is_large].reset_index(drop=True)

corr_osmid = []
# Iterate over the three needed columns directly instead of iterrows(), which
# builds a full Series for every house.
house_rows = zip(house_gdf['lng'], house_gdf['lat'], house_gdf.geometry)
for lng, lat, house_point in tqdm.tqdm(house_rows, total=len(house_gdf)):
    nearby = (cand_x > lng - SEARCH_HALF_WIDTH_DEG) & (cand_x < lng + SEARCH_HALF_WIDTH_DEG) & \
             (cand_y > lat - SEARCH_HALF_WIDTH_DEG) & (cand_y < lat + SEARCH_HALF_WIDTH_DEG)
    nearby_idx = np.flatnonzero(nearby)
    if len(nearby_idx) == 0:
        # -1 marks "no building matched".
        corr_osmid.append(-1)
    else:
        # NOTE(review): distances are computed on lon/lat degrees, not metres —
        # confirm this is acceptable for picking the nearest footprint.
        dists = cand_geom[nearby].distance(house_point).to_numpy()
        corr_osmid.append(cand_osmid[nearby_idx[dists.argmin()]])

nhouse_gdf = house_gdf.copy()
nhouse_gdf['osmid'] = corr_osmid


  fbuildings.geometry = fbuildings.geometry.centroid

  target_bd = ffbs.iloc[ffbs.distance(htem.geometry).values.argmin()]
100%|████████████████████████████████████████████████████████████████████████████| 21608/21608 [03:13<00:00, 111.74it/s]


In [19]:
# Lookup table: building OSM id -> footprint geometry.
osmid2geo = dict(zip(rbuildings['osmid'], rbuildings['geometry']))

# Houses with a positive osmid take the matched footprint; the rest (the -1
# "no match" marker fails the > 0 test) keep their original point.
geos = [
    osmid2geo[osmid] if osmid > 0 else house_gdf.iloc[i].geometry
    for i, osmid in enumerate(nhouse_gdf['osmid'])
]

# Install the new geometries, then reduce every one of them to its centroid.
nhouse_gdf.geometry = geos
nhouse_gdf.geometry = nhouse_gdf.geometry.centroid


  nhouse_gdf.geometry = nhouse_gdf.geometry.centroid


# Next: road-network embeddings (node2vec walks + Word2Vec)

In [20]:
import node2vec
import numpy as np
import networkx as nx
from gensim.models import Word2Vec

# Dump the edge table as "u v weight" lines with a constant weight of 1.0.
# The two id columns are zipped directly; DataFrame.iterrows() would build a
# full Series for every edge just to read these two values.
with open(f'{dname}/Adj.txt', 'w') as fp:
    fp.writelines(
        f'{u} {v} 1.0\n' for u, v in zip(osm_edges['u'], osm_edges['v'])
    )
        
        
def read_graph(edgelist):
    """Load an edge-list file as a directed, weighted graph.

    Each line is parsed as ``u v weight``: node ids are kept as strings and
    the third field is stored as the float edge attribute ``'weight'``.

    Parameters
    ----------
    edgelist : path or file handle accepted by ``nx.read_edgelist``.

    Returns
    -------
    networkx.DiGraph
    """
    return nx.read_edgelist(
        edgelist,
        create_using=nx.DiGraph(),
        nodetype=str,
        data=[('weight', float)],
    )

# Load the edge-list file of the current dataset as a directed, weighted graph.
Adj_file = f'{dname}/Adj.txt'
nx_G = read_graph(Adj_file)

# Walk settings handed to the local node2vec module.
# NOTE(review): p and q are presumably node2vec's return / in-out parameters —
# confirm against node2vec.py.
num_walks = 10
walk_length = 20
p = 2
q = 1
is_directed = True

G = node2vec.Graph(nx_G, is_directed, p, q)
G.preprocess_transition_probs()
# NOTE(review): no random seed is set in the cells shown, so the walks
# presumably differ between runs.
node2vec_walks = G.simulate_walks(num_walks, walk_length)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  J = np.zeros(K, dtype=np.int)


Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


In [21]:
from gensim.models import Word2Vec

# Dimensionality shared by the node and the edge embeddings.
vector_size = 32

# Node embeddings: every walk is a sentence whose tokens are node ids.
sentences = node2vec_walks
model_node = Word2Vec(
    sentences,
    vector_size=vector_size,
    window=5,
    min_count=0,
    workers=4,
)

# Edge embeddings: rewrite every walk as its sequence of consecutive
# 'u-v' hops and train a second model on those tokens.
edge_discover_path_list = [
    [f'{u}-{v}' for u, v in zip(path[:-1], path[1:])]
    for path in node2vec_walks  # + discover_path_list
]
sentences = edge_discover_path_list
model_edge = Word2Vec(
    sentences,
    vector_size=vector_size,
    window=5,
    min_count=1,
    workers=4,
)

collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 199927 words, keeping 103920 word types
PROGRESS: at sentence #20000, processed 399759 words, keeping 191375 word types
PROGRESS: at sentence #30000, processed 599642 words, keeping 263482 word types
PROGRESS: at sentence #40000, processed 799607 words, keeping 325022 word types
PROGRESS: at sentence #50000, processed 999502 words, keeping 376982 word types
PROGRESS: at sentence #60000, processed 1199425 words, keeping 420893 word types
PROGRESS: at sentence #70000, processed 1399233 words, keeping 457862 word types
PROGRESS: at sentence #80000, processed 1599127 words, keeping 489272 word types
PROGRESS: at sentence #90000, processed 1798952 words, keeping 515819 word types
PROGRESS: at sentence #100000, processed 1998868 words, keeping 538531 word types
PROGRESS: at sentence #110000, processed 2198744 words, keeping 557855 word types
PROGRESS

PROGRESS: at sentence #1000000, processed 19988103 words, keeping 687405 word types
PROGRESS: at sentence #1010000, processed 20188021 words, keeping 687405 word types
PROGRESS: at sentence #1020000, processed 20387899 words, keeping 687405 word types
PROGRESS: at sentence #1030000, processed 20587808 words, keeping 687405 word types
PROGRESS: at sentence #1040000, processed 20787634 words, keeping 687405 word types
PROGRESS: at sentence #1050000, processed 20987495 words, keeping 687405 word types
PROGRESS: at sentence #1060000, processed 21187345 words, keeping 687405 word types
PROGRESS: at sentence #1070000, processed 21387232 words, keeping 687405 word types
PROGRESS: at sentence #1080000, processed 21587168 words, keeping 687405 word types
PROGRESS: at sentence #1090000, processed 21787112 words, keeping 687405 word types
PROGRESS: at sentence #1100000, processed 21986973 words, keeping 687405 word types
PROGRESS: at sentence #1110000, processed 22186895 words, keeping 687405 wor

PROGRESS: at sentence #1980000, processed 39576656 words, keeping 687405 word types
PROGRESS: at sentence #1990000, processed 39776510 words, keeping 687405 word types
PROGRESS: at sentence #2000000, processed 39976437 words, keeping 687405 word types
PROGRESS: at sentence #2010000, processed 40176372 words, keeping 687405 word types
PROGRESS: at sentence #2020000, processed 40376283 words, keeping 687405 word types
PROGRESS: at sentence #2030000, processed 40576170 words, keeping 687405 word types
PROGRESS: at sentence #2040000, processed 40776041 words, keeping 687405 word types
PROGRESS: at sentence #2050000, processed 40976010 words, keeping 687405 word types
PROGRESS: at sentence #2060000, processed 41175914 words, keeping 687405 word types
PROGRESS: at sentence #2070000, processed 41375790 words, keeping 687405 word types
PROGRESS: at sentence #2080000, processed 41575678 words, keeping 687405 word types
PROGRESS: at sentence #2090000, processed 41775630 words, keeping 687405 wor

PROGRESS: at sentence #2960000, processed 59164859 words, keeping 687405 word types
PROGRESS: at sentence #2970000, processed 59364719 words, keeping 687405 word types
PROGRESS: at sentence #2980000, processed 59564550 words, keeping 687405 word types
PROGRESS: at sentence #2990000, processed 59764438 words, keeping 687405 word types
PROGRESS: at sentence #3000000, processed 59964285 words, keeping 687405 word types
PROGRESS: at sentence #3010000, processed 60164171 words, keeping 687405 word types
PROGRESS: at sentence #3020000, processed 60364031 words, keeping 687405 word types
PROGRESS: at sentence #3030000, processed 60563925 words, keeping 687405 word types
PROGRESS: at sentence #3040000, processed 60763799 words, keeping 687405 word types
PROGRESS: at sentence #3050000, processed 60963760 words, keeping 687405 word types
PROGRESS: at sentence #3060000, processed 61163629 words, keeping 687405 word types
PROGRESS: at sentence #3070000, processed 61363538 words, keeping 687405 wor

PROGRESS: at sentence #3940000, processed 78753096 words, keeping 687405 word types
PROGRESS: at sentence #3950000, processed 78953035 words, keeping 687405 word types
PROGRESS: at sentence #3960000, processed 79152918 words, keeping 687405 word types
PROGRESS: at sentence #3970000, processed 79352794 words, keeping 687405 word types
PROGRESS: at sentence #3980000, processed 79552754 words, keeping 687405 word types
PROGRESS: at sentence #3990000, processed 79752621 words, keeping 687405 word types
PROGRESS: at sentence #4000000, processed 79952558 words, keeping 687405 word types
PROGRESS: at sentence #4010000, processed 80152502 words, keeping 687405 word types
PROGRESS: at sentence #4020000, processed 80352410 words, keeping 687405 word types
PROGRESS: at sentence #4030000, processed 80552312 words, keeping 687405 word types
PROGRESS: at sentence #4040000, processed 80752244 words, keeping 687405 word types
PROGRESS: at sentence #4050000, processed 80952175 words, keeping 687405 wor

PROGRESS: at sentence #4920000, processed 98342265 words, keeping 687405 word types
PROGRESS: at sentence #4930000, processed 98542130 words, keeping 687405 word types
PROGRESS: at sentence #4940000, processed 98741928 words, keeping 687405 word types
PROGRESS: at sentence #4950000, processed 98941876 words, keeping 687405 word types
PROGRESS: at sentence #4960000, processed 99141716 words, keeping 687405 word types
PROGRESS: at sentence #4970000, processed 99341524 words, keeping 687405 word types
PROGRESS: at sentence #4980000, processed 99541385 words, keeping 687405 word types
PROGRESS: at sentence #4990000, processed 99741335 words, keeping 687405 word types
PROGRESS: at sentence #5000000, processed 99941253 words, keeping 687405 word types
PROGRESS: at sentence #5010000, processed 100141213 words, keeping 687405 word types
PROGRESS: at sentence #5020000, processed 100341116 words, keeping 687405 word types
PROGRESS: at sentence #5030000, processed 100541054 words, keeping 687405 

PROGRESS: at sentence #5890000, processed 117731170 words, keeping 687405 word types
PROGRESS: at sentence #5900000, processed 117931062 words, keeping 687405 word types
PROGRESS: at sentence #5910000, processed 118130953 words, keeping 687405 word types
PROGRESS: at sentence #5920000, processed 118330808 words, keeping 687405 word types
PROGRESS: at sentence #5930000, processed 118530637 words, keeping 687405 word types
PROGRESS: at sentence #5940000, processed 118730498 words, keeping 687405 word types
PROGRESS: at sentence #5950000, processed 118930434 words, keeping 687405 word types
PROGRESS: at sentence #5960000, processed 119130283 words, keeping 687405 word types
PROGRESS: at sentence #5970000, processed 119330183 words, keeping 687405 word types
PROGRESS: at sentence #5980000, processed 119530095 words, keeping 687405 word types
PROGRESS: at sentence #5990000, processed 119729964 words, keeping 687405 word types
PROGRESS: at sentence #6000000, processed 119929806 words, keepin

PROGRESS: at sentence #6860000, processed 137119843 words, keeping 687405 word types
PROGRESS: at sentence #6870000, processed 137319671 words, keeping 687405 word types
collected 687405 word types from a corpus of 137400662 raw words and 6874050 sentences
Creating a fresh vocabulary
Word2Vec lifecycle event {'msg': 'effective_min_count=0 retains 687405 unique words (100.00% of original 687405, drops 0)', 'datetime': '2023-08-21T17:56:16.961854', 'gensim': '4.3.1', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.15.0-76-generic-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
Word2Vec lifecycle event {'msg': 'effective_min_count=0 leaves 137400662 word corpus (100.00% of original 137400662, drops 0)', 'datetime': '2023-08-21T17:56:16.962759', 'gensim': '4.3.1', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.15.0-76-generic-x86_64-with-glibc2.31', 'event': 'p

EPOCH 0: training on 137400662 raw words (137400662 effective words) took 74.3s, 1849002 effective words/s
EPOCH 1 - PROGRESS: at 1.27% examples, 1742344 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 2.57% examples, 1758035 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 3.87% examples, 1765420 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 5.11% examples, 1750378 words/s, in_qsize 8, out_qsize 0
EPOCH 1 - PROGRESS: at 6.44% examples, 1763070 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 7.73% examples, 1765402 words/s, in_qsize 8, out_qsize 0
EPOCH 1 - PROGRESS: at 9.06% examples, 1773007 words/s, in_qsize 8, out_qsize 0
EPOCH 1 - PROGRESS: at 10.39% examples, 1778650 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 11.67% examples, 1776128 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 13.00% examples, 1781393 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 14.34% examples, 1786104 words/s, in_qsize 7, out_qsize 0
EPOCH 1 -

EPOCH 2 - PROGRESS: at 34.87% examples, 1837580 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 36.24% examples, 1839301 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 37.62% examples, 1841062 words/s, in_qsize 8, out_qsize 0
EPOCH 2 - PROGRESS: at 38.95% examples, 1840520 words/s, in_qsize 8, out_qsize 0
EPOCH 2 - PROGRESS: at 40.27% examples, 1839664 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 41.56% examples, 1837099 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 42.89% examples, 1836840 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 44.27% examples, 1838131 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 45.65% examples, 1839869 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 47.02% examples, 1840922 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 48.39% examples, 1842157 words/s, in_qsize 8, out_qsize 0
EPOCH 2 - PROGRESS: at 49.76% examples, 1843040 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 51.13

EPOCH 3 - PROGRESS: at 70.81% examples, 1830056 words/s, in_qsize 8, out_qsize 0
EPOCH 3 - PROGRESS: at 72.18% examples, 1830952 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 73.51% examples, 1830815 words/s, in_qsize 7, out_qsize 1
EPOCH 3 - PROGRESS: at 74.78% examples, 1829045 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 76.09% examples, 1828418 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 77.45% examples, 1829125 words/s, in_qsize 8, out_qsize 0
EPOCH 3 - PROGRESS: at 78.81% examples, 1829776 words/s, in_qsize 8, out_qsize 0
EPOCH 3 - PROGRESS: at 80.17% examples, 1830110 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 81.51% examples, 1830205 words/s, in_qsize 8, out_qsize 0
EPOCH 3 - PROGRESS: at 82.84% examples, 1830316 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 84.21% examples, 1830899 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 85.57% examples, 1831583 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 86.93

Word2Vec lifecycle event {'params': 'Word2Vec<vocab=687405, vector_size=32, alpha=0.025>', 'datetime': '2023-08-21T18:02:36.952973', 'gensim': '4.3.1', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:26:04) [GCC 10.4.0]', 'platform': 'Linux-5.15.0-76-generic-x86_64-with-glibc2.31', 'event': 'created'}
collecting all words and their counts
PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
PROGRESS: at sentence #10000, processed 189927 words, keeping 147108 word types
PROGRESS: at sentence #20000, processed 379759 words, keeping 282813 word types
PROGRESS: at sentence #30000, processed 569642 words, keeping 406210 word types
PROGRESS: at sentence #40000, processed 759607 words, keeping 520122 word types
PROGRESS: at sentence #50000, processed 949502 words, keeping 624389 word types
PROGRESS: at sentence #60000, processed 1139425 words, keeping 720526 word types
PROGRESS: at sentence #70000, processed 1329233 words, keeping 808698 word types
PROGRESS: 

PROGRESS: at sentence #950000, processed 18038681 words, keeping 1760720 word types
PROGRESS: at sentence #960000, processed 18228564 words, keeping 1760752 word types
PROGRESS: at sentence #970000, processed 18418453 words, keeping 1760776 word types
PROGRESS: at sentence #980000, processed 18608295 words, keeping 1760797 word types
PROGRESS: at sentence #990000, processed 18798220 words, keeping 1760838 word types
PROGRESS: at sentence #1000000, processed 18988103 words, keeping 1760864 word types
PROGRESS: at sentence #1010000, processed 19178021 words, keeping 1760891 word types
PROGRESS: at sentence #1020000, processed 19367899 words, keeping 1760910 word types
PROGRESS: at sentence #1030000, processed 19557808 words, keeping 1760928 word types
PROGRESS: at sentence #1040000, processed 19747634 words, keeping 1760953 word types
PROGRESS: at sentence #1050000, processed 19937495 words, keeping 1760967 word types
PROGRESS: at sentence #1060000, processed 20127345 words, keeping 1760

PROGRESS: at sentence #1920000, processed 36457385 words, keeping 1761255 word types
PROGRESS: at sentence #1930000, processed 36647323 words, keeping 1761255 word types
PROGRESS: at sentence #1940000, processed 36837143 words, keeping 1761255 word types
PROGRESS: at sentence #1950000, processed 37027056 words, keeping 1761256 word types
PROGRESS: at sentence #1960000, processed 37216941 words, keeping 1761256 word types
PROGRESS: at sentence #1970000, processed 37406784 words, keeping 1761257 word types
PROGRESS: at sentence #1980000, processed 37596656 words, keeping 1761257 word types
PROGRESS: at sentence #1990000, processed 37786510 words, keeping 1761258 word types
PROGRESS: at sentence #2000000, processed 37976437 words, keeping 1761259 word types
PROGRESS: at sentence #2010000, processed 38166372 words, keeping 1761259 word types
PROGRESS: at sentence #2020000, processed 38356283 words, keeping 1761260 word types
PROGRESS: at sentence #2030000, processed 38546170 words, keeping

PROGRESS: at sentence #2890000, processed 54875745 words, keeping 1761272 word types
PROGRESS: at sentence #2900000, processed 55065642 words, keeping 1761272 word types
PROGRESS: at sentence #2910000, processed 55255440 words, keeping 1761273 word types
PROGRESS: at sentence #2920000, processed 55445384 words, keeping 1761273 word types
PROGRESS: at sentence #2930000, processed 55635257 words, keeping 1761274 word types
PROGRESS: at sentence #2940000, processed 55825126 words, keeping 1761274 word types
PROGRESS: at sentence #2950000, processed 56014931 words, keeping 1761274 word types
PROGRESS: at sentence #2960000, processed 56204859 words, keeping 1761274 word types
PROGRESS: at sentence #2970000, processed 56394719 words, keeping 1761274 word types
PROGRESS: at sentence #2980000, processed 56584550 words, keeping 1761274 word types
PROGRESS: at sentence #2990000, processed 56774438 words, keeping 1761274 word types
PROGRESS: at sentence #3000000, processed 56964285 words, keeping

PROGRESS: at sentence #3860000, processed 73294085 words, keeping 1761275 word types
PROGRESS: at sentence #3870000, processed 73483978 words, keeping 1761275 word types
PROGRESS: at sentence #3880000, processed 73673802 words, keeping 1761275 word types
PROGRESS: at sentence #3890000, processed 73863688 words, keeping 1761275 word types
PROGRESS: at sentence #3900000, processed 74053571 words, keeping 1761275 word types
PROGRESS: at sentence #3910000, processed 74243485 words, keeping 1761275 word types
PROGRESS: at sentence #3920000, processed 74433352 words, keeping 1761275 word types
PROGRESS: at sentence #3930000, processed 74623245 words, keeping 1761275 word types
PROGRESS: at sentence #3940000, processed 74813096 words, keeping 1761275 word types
PROGRESS: at sentence #3950000, processed 75003035 words, keeping 1761275 word types
PROGRESS: at sentence #3960000, processed 75192918 words, keeping 1761275 word types
PROGRESS: at sentence #3970000, processed 75382794 words, keeping

PROGRESS: at sentence #4830000, processed 91713286 words, keeping 1761275 word types
PROGRESS: at sentence #4840000, processed 91903174 words, keeping 1761275 word types
PROGRESS: at sentence #4850000, processed 92093107 words, keeping 1761275 word types
PROGRESS: at sentence #4860000, processed 92282906 words, keeping 1761275 word types
PROGRESS: at sentence #4870000, processed 92472830 words, keeping 1761275 word types
PROGRESS: at sentence #4880000, processed 92662650 words, keeping 1761275 word types
PROGRESS: at sentence #4890000, processed 92852560 words, keeping 1761275 word types
PROGRESS: at sentence #4900000, processed 93042466 words, keeping 1761275 word types
PROGRESS: at sentence #4910000, processed 93232408 words, keeping 1761275 word types
PROGRESS: at sentence #4920000, processed 93422265 words, keeping 1761275 word types
PROGRESS: at sentence #4930000, processed 93612130 words, keeping 1761275 word types
PROGRESS: at sentence #4940000, processed 93801928 words, keeping

PROGRESS: at sentence #5790000, processed 109942156 words, keeping 1761276 word types
PROGRESS: at sentence #5800000, processed 110132053 words, keeping 1761276 word types
PROGRESS: at sentence #5810000, processed 110321964 words, keeping 1761276 word types
PROGRESS: at sentence #5820000, processed 110511888 words, keeping 1761276 word types
PROGRESS: at sentence #5830000, processed 110701735 words, keeping 1761276 word types
PROGRESS: at sentence #5840000, processed 110891630 words, keeping 1761276 word types
PROGRESS: at sentence #5850000, processed 111081525 words, keeping 1761276 word types
PROGRESS: at sentence #5860000, processed 111271476 words, keeping 1761276 word types
PROGRESS: at sentence #5870000, processed 111461383 words, keeping 1761276 word types
PROGRESS: at sentence #5880000, processed 111651257 words, keeping 1761276 word types
PROGRESS: at sentence #5890000, processed 111841170 words, keeping 1761276 word types
PROGRESS: at sentence #5900000, processed 112031062 wo

PROGRESS: at sentence #6750000, processed 128171177 words, keeping 1761276 word types
PROGRESS: at sentence #6760000, processed 128361077 words, keeping 1761276 word types
PROGRESS: at sentence #6770000, processed 128550968 words, keeping 1761276 word types
PROGRESS: at sentence #6780000, processed 128740862 words, keeping 1761276 word types
PROGRESS: at sentence #6790000, processed 128930694 words, keeping 1761276 word types
PROGRESS: at sentence #6800000, processed 129120610 words, keeping 1761276 word types
PROGRESS: at sentence #6810000, processed 129310456 words, keeping 1761276 word types
PROGRESS: at sentence #6820000, processed 129500328 words, keeping 1761276 word types
PROGRESS: at sentence #6830000, processed 129690203 words, keeping 1761276 word types
PROGRESS: at sentence #6840000, processed 129880106 words, keeping 1761276 word types
PROGRESS: at sentence #6850000, processed 130070011 words, keeping 1761276 word types
PROGRESS: at sentence #6860000, processed 130259843 wo

EPOCH 0 - PROGRESS: at 49.54% examples, 1017229 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 50.40% examples, 1018770 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 51.15% examples, 1018115 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 52.04% examples, 1020127 words/s, in_qsize 8, out_qsize 0
EPOCH 0 - PROGRESS: at 52.86% examples, 1020708 words/s, in_qsize 5, out_qsize 2
EPOCH 0 - PROGRESS: at 53.52% examples, 1018329 words/s, in_qsize 8, out_qsize 2
EPOCH 0 - PROGRESS: at 54.33% examples, 1018711 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 55.05% examples, 1017582 words/s, in_qsize 7, out_qsize 0
EPOCH 0 - PROGRESS: at 55.94% examples, 1019375 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 56.83% examples, 1021195 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 57.71% examples, 1022823 words/s, in_qsize 8, out_qsize 0
EPOCH 0 - PROGRESS: at 58.48% examples, 1022322 words/s, in_qsize 6, out_qsize 1
EPOCH 0 - PROGRESS: at 59.21

EPOCH 1 - PROGRESS: at 29.84% examples, 1041030 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 30.69% examples, 1042121 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 31.39% examples, 1038035 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 32.06% examples, 1033476 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 32.68% examples, 1027436 words/s, in_qsize 5, out_qsize 2
EPOCH 1 - PROGRESS: at 33.41% examples, 1025654 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 34.30% examples, 1028461 words/s, in_qsize 8, out_qsize 0
EPOCH 1 - PROGRESS: at 35.04% examples, 1026822 words/s, in_qsize 7, out_qsize 0
EPOCH 1 - PROGRESS: at 35.81% examples, 1025851 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 36.50% examples, 1022971 words/s, in_qsize 8, out_qsize 2
EPOCH 1 - PROGRESS: at 37.29% examples, 1023175 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 38.12% examples, 1023943 words/s, in_qsize 6, out_qsize 1
EPOCH 1 - PROGRESS: at 39.16

EPOCH 2 - PROGRESS: at 12.36% examples, 1056323 words/s, in_qsize 5, out_qsize 2
EPOCH 2 - PROGRESS: at 13.08% examples, 1047983 words/s, in_qsize 7, out_qsize 0
EPOCH 2 - PROGRESS: at 13.73% examples, 1036980 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 14.50% examples, 1034150 words/s, in_qsize 5, out_qsize 2
EPOCH 2 - PROGRESS: at 15.17% examples, 1025909 words/s, in_qsize 7, out_qsize 3
EPOCH 2 - PROGRESS: at 15.88% examples, 1018764 words/s, in_qsize 5, out_qsize 2
EPOCH 2 - PROGRESS: at 16.52% examples, 1009314 words/s, in_qsize 8, out_qsize 2
EPOCH 2 - PROGRESS: at 17.47% examples, 1018987 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 18.42% examples, 1028007 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 19.37% examples, 1036498 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 20.10% examples, 1033354 words/s, in_qsize 5, out_qsize 2
EPOCH 2 - PROGRESS: at 20.89% examples, 1032149 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 21.76

EPOCH 2 - PROGRESS: at 94.02% examples, 1035815 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 94.85% examples, 1036086 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 95.58% examples, 1035164 words/s, in_qsize 8, out_qsize 2
EPOCH 2 - PROGRESS: at 96.43% examples, 1035759 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 97.12% examples, 1034610 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 98.04% examples, 1035790 words/s, in_qsize 6, out_qsize 1
EPOCH 2 - PROGRESS: at 98.96% examples, 1036913 words/s, in_qsize 8, out_qsize 1
EPOCH 2 - PROGRESS: at 99.86% examples, 1038045 words/s, in_qsize 8, out_qsize 0
EPOCH 2: training on 130526612 raw words (130526612 effective words) took 125.7s, 1038387 effective words/s
EPOCH 3 - PROGRESS: at 0.73% examples, 933289 words/s, in_qsize 8, out_qsize 1
EPOCH 3 - PROGRESS: at 1.58% examples, 1019771 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 2.47% examples, 1061859 words/s, in_qsize 7, out_qsize 0
EPOCH

EPOCH 3 - PROGRESS: at 73.33% examples, 1015767 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 74.20% examples, 1016994 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 74.95% examples, 1016323 words/s, in_qsize 4, out_qsize 3
EPOCH 3 - PROGRESS: at 75.60% examples, 1014493 words/s, in_qsize 5, out_qsize 2
EPOCH 3 - PROGRESS: at 76.47% examples, 1015552 words/s, in_qsize 8, out_qsize 3
EPOCH 3 - PROGRESS: at 77.27% examples, 1015547 words/s, in_qsize 5, out_qsize 2
EPOCH 3 - PROGRESS: at 78.11% examples, 1016179 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 79.02% examples, 1017920 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 79.76% examples, 1016939 words/s, in_qsize 5, out_qsize 2
EPOCH 3 - PROGRESS: at 80.54% examples, 1016651 words/s, in_qsize 6, out_qsize 1
EPOCH 3 - PROGRESS: at 81.31% examples, 1016567 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 82.21% examples, 1017978 words/s, in_qsize 7, out_qsize 0
EPOCH 3 - PROGRESS: at 83.01

EPOCH 4 - PROGRESS: at 53.66% examples, 1033035 words/s, in_qsize 4, out_qsize 3
EPOCH 4 - PROGRESS: at 54.35% examples, 1030517 words/s, in_qsize 5, out_qsize 2
EPOCH 4 - PROGRESS: at 55.10% examples, 1029704 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 55.99% examples, 1031405 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 56.83% examples, 1031943 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 57.54% examples, 1030386 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 58.20% examples, 1027883 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 58.97% examples, 1027474 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 59.75% examples, 1027070 words/s, in_qsize 6, out_qsize 1
EPOCH 4 - PROGRESS: at 60.61% examples, 1028269 words/s, in_qsize 8, out_qsize 1
EPOCH 4 - PROGRESS: at 61.49% examples, 1029741 words/s, in_qsize 7, out_qsize 0
EPOCH 4 - PROGRESS: at 62.37% examples, 1031177 words/s, in_qsize 8, out_qsize 0
EPOCH 4 - PROGRESS: at 63.25

In [22]:
def _embedding_magnitudes(keys, keyed_vectors):
    """Return the L2 norm of each key's embedding, or -1 where the key is absent.

    Parameters
    ----------
    keys : iterable
        Lookup keys; each one is tested with ``key in keyed_vectors``.
    keyed_vectors
        Any object supporting ``in`` and ``[]`` lookups (here the ``.wv``
        attribute of a trained model).

    Returns
    -------
    list
        One entry per key, in input order: ``np.linalg.norm`` of the stored
        vector, or the sentinel ``-1`` when the key has no vector.
    """
    return [
        np.linalg.norm(keyed_vectors[key]) if key in keyed_vectors else -1
        for key in keys
    ]


# Nodes are looked up by the string id stored in the 'osmidstr' column.
magnitudes = _embedding_magnitudes(osm_nodes['osmidstr'], model_node.wv)
osm_nodes['magnitude'] = magnitudes
# Layer(osm_nodes_3310[osm_nodes_3310['magnitude'] > 0], color_continuous_style('magnitude'))


# Edges are looked up by a "<u>-<v>" key built from their endpoint ids.
osm_edges['u-v'] = osm_edges['u'].astype(str) + '-' + osm_edges['v'].astype(str)
magnitudes = _embedding_magnitudes(osm_edges['u-v'], model_edge.wv)
osm_edges['magnitude'] = magnitudes
# Layer(osm_edges_3310[osm_edges_3310['magnitude'] > 0], color_continuous_style('magnitude'))

In [23]:
# ftest_edges = osm_edges_3310[osm_edges_3310['magnitude'] > 0].copy()
# ftest_edges['bx'] = ftest_edges.geometry.centroid.x
# ftest_edges['by'] = ftest_edges.geometry.centroid.y

# house_gdf_3310 = nhouse_gdf.to_crs('epsg:3310')
# house_gdf_3310['bx'] = house_gdf_3310.geometry.centroid.x
# house_gdf_3310['by'] = house_gdf_3310.geometry.centroid.y

# vectors = []
# for _, item in tqdm.tqdm(house_gdf_3310.iterrows(), total=len(house_gdf)):
#     iftest_edges = ftest_edges[(ftest_edges['bx'] > item['bx'] - 500) & 
#                 (ftest_edges['bx'] < item['bx'] + 500) & 
#                 (ftest_edges['by'] > item['by'] - 500) & 
#                 (ftest_edges['by'] < item['by'] + 500)]
    
#     jtem = iftest_edges.iloc[iftest_edges.distance(item.geometry).argmin()]
#     vec = model.wv[jtem['u-v']]
#     vectors.append(vec)

In [24]:
# Keep only nodes/edges whose 'magnitude' is positive (rows without an
# embedding were given -1 when that column was built) and project them,
# together with the houses, to EPSG:3310.
ftest_nodes = osm_nodes.loc[osm_nodes['magnitude'] > 0].to_crs('epsg:3310')
ftest_edges = osm_edges.loc[osm_edges['magnitude'] > 0].to_crs('epsg:3310')
house_gdf_3310 = nhouse_gdf.to_crs('epsg:3310')

# Cache centroid coordinates as plain columns so later cells can do cheap
# bounding-box filtering on 'bx' / 'by'.
for projected in (ftest_nodes, ftest_edges, house_gdf_3310):
    centre = projected.geometry.centroid
    projected['bx'] = centre.x
    projected['by'] = centre.y

# Node id (string) -> projected geometry.
fosmid2geo = dict(zip(ftest_nodes['osmidstr'], ftest_nodes['geometry']))

In [26]:
# Build one embedding per house: find the nearest embedded road edge, then
# blend the embeddings of that edge's two endpoint nodes with
# inverse-distance weights.
vectors = []
geos = []  # not populated in this cell; kept so the name stays defined

half_window = 500  # half-width of the square candidate window, in CRS units
min_dist = 1e-9    # floor for node distances so 1/dist never divides by zero

# total now matches the frame actually being iterated.
for _, item in tqdm.tqdm(house_gdf_3310.iterrows(), total=len(house_gdf_3310)):
    # Cheap pre-filter on cached centroids before the exact distance query.
    in_window = (
        (ftest_edges['bx'] > item['bx'] - half_window) &
        (ftest_edges['bx'] < item['bx'] + half_window) &
        (ftest_edges['by'] > item['by'] - half_window) &
        (ftest_edges['by'] < item['by'] + half_window)
    )
    iftest_edges = ftest_edges[in_window]
    if len(iftest_edges) == 0:
        # Nothing inside the window: fall back to searching every edge.
        iftest_edges = ftest_edges

    # argmin is positional, hence .iloc.
    jtem = iftest_edges.iloc[iftest_edges.distance(item.geometry).argmin()]
    u, v = str(jtem['u']), str(jtem['v'])
    # NOTE(review): assumes both endpoints are present in model_node.wv and
    # in fosmid2geo; a missing endpoint raises KeyError here.
    vec_u, vec_v = model_node.wv[u], model_node.wv[v]

    # A house lying exactly on a node has distance 0; flooring the distance
    # makes that node take (almost) all of the weight instead of producing a
    # division by zero / NaN weights.
    dist_u = max(item.geometry.distance(fosmid2geo[u]), min_dist)
    dist_v = max(item.geometry.distance(fosmid2geo[v]), min_dist)
    ratios = np.array([1 / dist_u, 1 / dist_v])
    ratios /= np.sum(ratios)

    vec = vec_u*ratios[0] + vec_v*ratios[1]
    vectors.append(vec)

100%|████████████████████████████████████████████████████████████████████████████| 21608/21608 [01:38<00:00, 218.44it/s]


In [28]:
# Persist the per-house embeddings; np.save converts the list to an array.
np.save(file=f'{dname}/road2vec_n2v_32_nodes.npy', arr=vectors)

In [29]:
# Re-project the full house table and the training table to EPSG:3310.
# Note: this rebinds house_gdf_3310, which an earlier cell built from
# nhouse_gdf with extra 'bx'/'by' columns; the new frame does not have them.
house_gdf_3310, thouse_gdf_3310 = (
    frame.to_crs('epsg:3310') for frame in (house_gdf, train_gdf)
)

In [31]:
# Stack the per-house embeddings into one array and slice off the leading
# rows, one per training house.
# NOTE(review): assumes the training houses are the first rows of the frame
# the embeddings were computed for — confirm.
n_train_rows = len(thouse_gdf_3310)
n2v_vectors = np.array(vectors)
n2v_vectors_train = n2v_vectors[:n_train_rows]

In [32]:
# Training houses projected to EPSG:3310 (same projection call as the one
# that produced thouse_gdf_3310).
train_gdf_3310 = train_gdf.to_crs(crs='epsg:3310')

In [33]:
# Combined house -> training-house distance: embedding distance plus ten
# times the geometric distance, each first divided by its own standard
# deviation for that house. One row per house, one column per training house.
geo_weight = 10
dist_mat = []
house_geoms = tqdm.tqdm(house_gdf_3310['geometry'], total=len(house_gdf_3310))
for i, house_geom in enumerate(house_geoms):
    embed_gap = np.linalg.norm(n2v_vectors[i] - n2v_vectors_train, axis=-1)
    embed_gap = embed_gap / embed_gap.std()

    geo_gap = train_gdf_3310.distance(house_geom).values
    geo_gap = geo_gap / geo_gap.std()

    dist_mat.append(embed_gap + geo_gap * geo_weight)
# Rebinding the same name lets the list of rows be released after stacking.
dist_mat = np.stack(dist_mat, 0)

100%|████████████████████████████████████████████████████████████████████████████| 21608/21608 [01:43<00:00, 209.27it/s]


In [34]:
# Pick one house (row j of dist_mat) and attach its combined distance to
# every training house as a 'diststr' column on a copy of train_gdf.
j = 100
ahgdf = train_gdf.assign(diststr=dist_mat[j])

In [35]:
# Plot the training houses ranked 1..60 by combined distance to house j
# (rank 0 is skipped), coloured by that distance, with house j in green.
# Alternative layers tried earlier:
#     Layer(fhgdf, color_continuous_style('diststr', palette='sunset')),
#     Layer(gdf.iloc[data_ori['idx_geo'][j]], color_continuous_style('price', palette='sunset'))
neighbour_rows = dist_mat[j].argsort()[1:61]
Map([
    Layer(ahgdf.iloc[neighbour_rows], color_continuous_style('diststr', palette='sunset')),
    Layer(house_gdf.iloc[[j]], basic_style(color='green')),
])

  parameter_args = inspect.getargspec(decorated_function).args


In [36]:
# For every house, collect its 60 neighbouring training houses under the
# combined distance in dist_mat, plus two distance arrays for them:
#   new_dist_geo   - the dist_mat values themselves
#   new_dist_eucli - Euclidean distance over all columns except the last
new_idx = []
new_dist_eucli = []
new_dist_geo = []

# Ranks 1..60 of the sorted distances; rank 0 is skipped.
# NOTE(review): skipping rank 0 presumably removes the house itself; for a
# house that is not among the training rows this drops its true nearest
# neighbour instead — confirm this is intended.
neighbour_ranks = slice(1, 61)

for i, (_, item) in tqdm.tqdm(enumerate(house_gdf_3310.iterrows()), total=len(house_gdf_3310)):
    # Sort once per house (the ranking was previously recomputed three times).
    nearest = dist_mat[i].argsort()[neighbour_ranks]

    # NOTE(review): [:-1] assumes the last column is the non-numeric one
    # (presumably geometry) — confirm column order.
    target = item.values[:-1]
    candidates = thouse_gdf_3310.iloc[nearest].values[:, :-1]

    new_idx.append(nearest)
    new_dist_geo.append(dist_mat[i][nearest])
    new_dist_eucli.append(np.linalg.norm(target.astype(float) - candidates.astype(float), axis=-1))

new_idx = np.stack(new_idx, 0)
new_dist_eucli = np.stack(new_dist_eucli, 0)
new_dist_geo = np.stack(new_dist_geo, 0)

100%|████████████████████████████████████████████████████████████████████████████| 21608/21608 [01:31<00:00, 235.36it/s]


In [37]:
# Assemble the output bundle: the train/test arrays are passed through from
# `data` unchanged, and the neighbour arrays computed above are attached.
# 'idx_eucli' and 'idx_geo' both receive the same index array (new_idx);
# 'dist_geo' holds the combined-distance values taken from dist_mat.
newdata = {
    **{key: data[key] for key in ('X_train', 'y_train', 'X_test', 'y_test')},
    'idx_eucli': new_idx,
    'idx_geo': new_idx,
    'dist_eucli': new_dist_eucli,
    'dist_geo': new_dist_geo,
}

In [38]:
# Display the dataset directory name that the output files are written under.
dname

'kc'

In [39]:
# Write every entry of newdata into one .npz archive, one array per key.
npz_path = f'{dname}/data_n2v_nodes.npz'
np.savez(npz_path, **newdata)