# Geo Embeddings

Notebook to apply different embeddings to represent h3 cells

@roman

24 / 3 / 25

---
# Settings

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import h3
import os
import networkx as nx
from node2vec import Node2Vec
from sklearn.manifold import TSNE

from INEGIpy import MarcoGeoestadistico



In [None]:
# --- parameters ---
# H3 resolution constant for the hexagons handled in this notebook
HEX_RESOLUTION = 9

# --- display ---
# render up to 120 columns when a DataFrame is displayed
pd.options.display.max_columns = 120

# --- clients ---
# INEGIpy entry point, used further down to fetch the state geometries
geo_framework = MarcoGeoestadistico()

---
# Data

## Read

### Inegi


In [None]:
# get all mexico states
# NOTE(review): presumably a GeoDataFrame with one row per state (it is passed
# to .plot() as the base map layer further down) — confirm against INEGIpy docs
gdf_mex_states = geo_framework.Entidades()
# preview the first rows
gdf_mex_states.head()

### Hexagons

In [None]:
def hex_to_lat_lon(hex_id):
    """Return the ``(lat, lon)`` pair that ``h3.h3_to_geo`` gives for ``hex_id``."""
    latitude, longitude = h3.h3_to_geo(hex_id)
    return latitude, longitude


# element-wise wrapper: applied to a sequence of hex ids it yields one array
# per returned value (latitudes, longitudes)
v_hex_to_lat_lon = np.vectorize(hex_to_lat_lon)

In [None]:
# load the terrain-prices table (must contain a "hex_id" column)
terrain_prices_path = "../../data/misc/terrain_prices.parquet"
gdf_terrain_prices = pd.read_parquet(terrain_prices_path)

# lat/lon of every hex, stored as two new columns
hex_lat, hex_lon = v_hex_to_lat_lon(gdf_terrain_prices["hex_id"])
gdf_terrain_prices["latitude"] = hex_lat
gdf_terrain_prices["longitude"] = hex_lon

# promote to a GeoDataFrame with one point per row (x = longitude, y = latitude)
hex_points = gpd.points_from_xy(
    x=gdf_terrain_prices["longitude"],
    y=gdf_terrain_prices["latitude"],
)
gdf_terrain_prices = gpd.GeoDataFrame(gdf_terrain_prices, geometry=hex_points, crs="EPSG:4326")

# quick inspection: dimensions and first rows
print(gdf_terrain_prices.shape)
gdf_terrain_prices.head()

In [None]:
# single 10x10 canvas
fig, ax = plt.subplots(figsize=(10, 10))

# base layer: Mexican states as a semi-transparent gray fill (no boundaries drawn)
gdf_mex_states.plot(color="gray", alpha=0.5, ax=ax)

# overlay: terrain-price hexagon points in semi-transparent red
gdf_terrain_prices.plot(color="red", alpha=0.5, ax=ax)

# render
plt.show()

### Cities

In [None]:
# load the city geometries and reproject them to EPSG:4326 in one pass,
# so they share the CRS of the hexagon points
cities_path = "../../data/misc/cities.parquet"
gdf_cities = gpd.read_parquet(cities_path).to_crs(epsg=4326)

# quick inspection: dimensions and first rows
print(gdf_cities.shape)
gdf_cities.head()

In [None]:
# single 10x10 canvas
fig, ax = plt.subplots(figsize=(10, 10))

# base layer: Mexican states as a semi-transparent gray fill (no boundaries drawn)
gdf_mex_states.plot(color="gray", alpha=0.5, ax=ax)

# overlay: city geometries in solid red
gdf_cities.plot(color="red", ax=ax)

# render
plt.show()

## Wrangle

In [None]:
# left spatial join: every terrain row is kept and receives the columns of
# the city geometry it intersects (rows with no match keep missing values)
join_options = {"how": "left", "predicate": "intersects"}
gdf_terrain_prices = gdf_terrain_prices.sjoin(gdf_cities, **join_options)

# the positional index of the matched city is not needed afterwards
gdf_terrain_prices = gdf_terrain_prices.drop(columns="index_right")

# quick inspection: dimensions and first rows
print(gdf_terrain_prices.shape)
gdf_terrain_prices.head()

---
# Node2Vec

## S1: Get k-ring neighbors for each hex

In [None]:
def get_neighbors(hex_id, K):
    """Return, as a list, the cells ``h3.hex_ring`` reports for ``hex_id`` at ring ``K``."""
    ring = h3.hex_ring(hex_id, K)
    return list(ring)


# long-format table with one row per (center, neighbor) pair at ring size 1,
# indexed by the center hex
df_hex_neighbors = (
    gdf_terrain_prices[["hex_id"]]
    .rename(columns={"hex_id": "hex_center_id"})
    .assign(hex_neighbor_id=lambda frame: frame["hex_center_id"].apply(get_neighbors, K=1))
    .explode("hex_neighbor_id")
    .set_index("hex_center_id")
)

# inspect
df_hex_neighbors

In [None]:
# sanity check: True when the neighbor table has exactly 6 rows per terrain row
# NOTE(review): presumably relies on every cell having 6 ring-1 neighbors;
# H3 pentagon cells would break this — confirm
np.isclose(df_hex_neighbors.shape[0] / 6, gdf_terrain_prices.shape[0])

In [None]:
# number of unique hex_center_id values (the index of the neighbor table)
df_hex_neighbors.index.nunique()

## S2: Create Graph

In [None]:
# edges: one (center, neighbor) tuple per row of df_hex_neighbors.
# Zipping the index with the neighbor column avoids building a namedtuple per
# row, and the explicit total lets tqdm show a percentage and an ETA instead
# of a bare counter.
edges = list(
    tqdm(
        zip(df_hex_neighbors.index, df_hex_neighbors["hex_neighbor_id"]),
        total=len(df_hex_neighbors),
    )
)

# look edges
edges[:5]

In [None]:
# undirected graph built straight from the edge list; nodes are created
# implicitly from the edge endpoints and repeated pairs collapse into one edge
G = nx.Graph(edges)

# report the size of the resulting graph
print(f"Number of nodes: {G.number_of_nodes()}, Number of edges: {G.number_of_edges()}")

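Note: some hex IDs in the neighbor column do not exist in the main `hex_id` column, so the graph contains more nodes than there are hexagons in the terrain-prices table.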

## S3: Train Node2Vec

In [None]:
# random-walk settings: 16-dimensional embeddings, 10 walks of length 10 per
# node, generated with 4 workers
walk_settings = dict(dimensions=16, walk_length=10, num_walks=10, workers=4)
node2vec = Node2Vec(G, **walk_settings)

# training settings passed to fit()
# NOTE(review): presumably forwarded to the underlying gensim Word2Vec — confirm
training_settings = dict(window=5, min_count=1, batch_words=4)
model = node2vec.fit(**training_settings)

In [None]:
# Get embeddings: one row per graph node, one column per embedding dimension.
# The matrix is stacked row-wise and indexed by node id; this avoids building
# a frame with one column per node and transposing it, which gets very slow
# as the number of nodes grows.
graph_nodes = list(G.nodes)
df_hex_embeddings = pd.DataFrame(
    np.vstack([model.wv[hex_id] for hex_id in graph_nodes]),
    index=graph_nodes,
)

# rename columns to hex_embedding_1 ... hex_embedding_<n_dimensions>
df_hex_embeddings.columns = [f"hex_embedding_{i+1}" for i in range(df_hex_embeddings.shape[1])]

df_hex_embeddings

In [None]:
# names of the embedding columns, in their original order
cols_of_embeddings = [
    col for col in df_hex_embeddings.columns if "hex_embedding" in str(col)
]

# expose the node id (currently the index) as an explicit "hex_id" column
df_embeddings_by_hex = df_hex_embeddings.reset_index().rename(columns={"index": "hex_id"})

# join embeddings to gdf_terrain_prices: the inner join keeps only hexagons
# present in both tables, then only the id and the embedding columns are kept
gdf_terrain_prices_v2 = gdf_terrain_prices.merge(
    df_embeddings_by_hex,
    on="hex_id",
    how="inner",
).loc[:, ["hex_id"] + cols_of_embeddings]

# quick inspection: dimensions and first rows
print(gdf_terrain_prices_v2.shape)
gdf_terrain_prices_v2.head()

---
# Save

In [None]:
# persist the hex_id + embedding columns table as parquet
embeddings_path = "../../data/misc/hex_embeddings.parquet"
gdf_terrain_prices_v2.to_parquet(embeddings_path)

In [None]:
# persist the fitted model returned by node2vec.fit
model_path = "../../models/hex2vec.model"
model.save(model_path)

**Load full model**

from gensim.models import Word2Vec

model = Word2Vec.load("../../models/hex2vec.model")


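**Load just the word vectors**

(requires exporting them first with `model.wv.save_word2vec_format("hex_embeddings.bin", binary=True)`)

from gensim.models import KeyedVectors

model_wv = KeyedVectors.load_word2vec_format("hex_embeddings.bin", binary=True)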

**Get the embedding for a hex ID**

hex_id = "8928308280fffff"  # Example hex ID

embedding = model.wv[hex_id]

print(embedding)



---
# Sandbox

In [None]:
# display the terrain-prices frame for ad-hoc inspection
gdf_terrain_prices

In [None]:
# number of rows in gdf_terrain_prices whose hex_id repeats that of an earlier row
gdf_terrain_prices['hex_id'].duplicated().sum()