In [1]:
from geotessera import GeoTessera
import numpy as np
# GeoTessera client; later cells use it for registry look-ups and downloads.
gt = GeoTessera()

# Region of interest (example: a region in Spain), given as a bounding box
# in the order (min_lon, min_lat, max_lon, max_lat).
bbox = (
    -1.671467,  # min_lon
    42.804532,  # min_lat
    -1.599112,  # max_lon
    42.838276,  # max_lat
)
# Year passed to the registry query in the fetch cell.
year = 2024

# Echo the configuration so the rendered notebook records what was requested.
print(f"Bounding box: {bbox}", f"Year: {year}", sep="\n")

Could not check for updates: [WinError 183] No se puede crear un archivo que ya existe: 'C:\\Users\\2012m\\AppData\\Local\\geotessera\\.registry.parquet_tmp_k85e_dlm.parquet' -> 'C:\\Users\\2012m\\AppData\\Local\\geotessera\\registry.parquet'
Could not check for landmasks updates: [WinError 183] No se puede crear un archivo que ya existe: 'C:\\Users\\2012m\\AppData\\Local\\geotessera\\.landmasks.parquet_tmp_z7qp9oiy.parquet' -> 'C:\\Users\\2012m\\AppData\\Local\\geotessera\\landmasks.parquet'


Bounding box: (-1.671467, 42.804532, -1.599112, 42.838276)
Year: 2024


In [2]:
# Fetch embeddings for the region.
# Every pixel of every returned tile is kept; nothing in this cell crops the
# tiles back to `bbox`.
tiles_to_fetch = gt.registry.load_blocks_for_region(bounds=bbox, year=year)
embeddings_gen = gt.fetch_embeddings(tiles_to_fetch)

# Per-tile accumulators, concatenated once after the loop.
all_embeddings = []     # one (H*W, C) array per tile
coordinate_chunks = []  # one (H*W, 2) array per tile, same row order as above
tile_metadata = []

print("Fetching embeddings from GeoTessera...")
# The loop target is `tile_year` (not `year`) so the notebook-level `year`
# setting is not overwritten by the loop.
for tile_year, tile_lon, tile_lat, embedding_array, crs, transform in embeddings_gen:
    h, w, n_channels = embedding_array.shape

    # Flatten spatial dimensions: (H, W, C) -> (H*W, C), row-major.
    flat_emb = embedding_array.reshape(-1, n_channels)
    all_embeddings.append(flat_emb)

    # Apply the affine transform to all pixel indices at once. np.indices
    # yields row-major grids, so after ravel() entry k corresponds to
    # flat_emb[k]. Integer (col, row) indices are passed as-is, i.e. the
    # transform is evaluated at the pixel index itself with no half-pixel offset.
    # NOTE(review): the results are in the tile's `crs`. If that CRS is
    # projected (e.g. UTM) these are easting/northing, not degrees, even though
    # they are later stored as 'lon'/'lat' — confirm and reproject if needed.
    rows, cols = np.indices((h, w))
    xs, ys = transform * (cols.ravel(), rows.ravel())
    coordinate_chunks.append(np.column_stack((xs, ys)))

    tile_metadata.append({
        'lat': tile_lat,
        'lon': tile_lon,
        'year': tile_year,
        'shape': embedding_array.shape,
        'n_pixels': flat_emb.shape[0],
        'crs': crs,
        'transform': transform
    })
    print(f"  Tile ({tile_lat:.2f}, {tile_lon:.2f}): {embedding_array.shape}")

# Fail with a clear message instead of letting np.vstack choke on an empty list.
if not all_embeddings:
    raise ValueError(f"No embedding tiles were returned for bbox={bbox}, year={year}")

# Concatenate all tiles. `all_coordinates` is an (N, 2) array whose rows are
# (x, y) pairs; len() and per-row coord[0] / coord[1] access still work.
embeddings_array = np.vstack(all_embeddings)
all_coordinates = np.vstack(coordinate_chunks)

print(f"\n✓ Total embeddings: {embeddings_array.shape}")
print(f"✓ Number of tiles: {len(tile_metadata)}")
print(f"✓ Total pixels: {embeddings_array.shape[0]:,}")
print(f"✓ Embedding dimension: {embeddings_array.shape[1]}")
print(f"✓ Coordinates calculated: {len(all_coordinates):,}")


Fetching embeddings from GeoTessera...
  Tile (42.85, -1.65): (1124, 835, 128)
  Tile (42.85, -1.55): (1125, 836, 128)

✓ Total embeddings: (1879040, 128)
✓ Number of tiles: 2
✓ Total pixels: 1,879,040
✓ Embedding dimension: 128
✓ Coordinates calculated: 1,879,040


In [4]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build the export table: id, lon, lat, then one column per embedding dimension.

# Accept either a list of (x, y) tuples or an (N, 2) array of coordinates.
coords = np.asarray(all_coordinates, dtype=float).reshape(-1, 2)
if len(coords) != len(embeddings_array):
    raise ValueError(
        f"Coordinate count ({len(coords):,}) does not match "
        f"embedding count ({len(embeddings_array):,})"
    )

pdf = pd.DataFrame(embeddings_array)

# Insert ID and coordinates directly at the front. This gives the column order
# id, lon, lat, 0, 1, ... without re-indexing (and copying) the whole frame.
pdf.insert(0, 'id', np.arange(len(pdf), dtype=np.int64))
pdf.insert(1, 'lon', coords[:, 0])
pdf.insert(2, 'lat', coords[:, 1])

print(f"✓ Created Pandas DataFrame with {len(pdf):,} rows")
print(f"✓ Columns: {list(pdf.columns[:5])}... ({len(pdf.columns)} total)")

# Save to Parquet (efficient columnar format).
parquet_path = "GeoTessera_Pamplona_embeddings.parquet"
print(f"\nSaving to Parquet: {parquet_path}...")
# Parquet field names are strings, and some pandas versions reject non-string
# column labels. Write a shallow copy (no data copied) with stringified labels
# so the export does not depend on the mixed int/str labels of `pdf`, which is
# left unchanged in memory.
export_df = pdf.copy(deep=False)
export_df.columns = [str(label) for label in pdf.columns]
export_df.to_parquet(parquet_path, engine='pyarrow', index=False)
print("✓ Saved to Parquet with coordinates")


✓ Created Pandas DataFrame with 1,879,040 rows
✓ Columns: ['id', 'lon', 'lat', 0, 1]... (131 total)

Saving to Parquet: GeoTessera_Pamplona_embeddings.parquet...
✓ Saved to Parquet with coordinates
