# Data Processing

## Imports

In [None]:
import geopandas as gpd
import osmnx as ox
import pandas as pd

## Parameters

In [None]:
# Path of the raw CSV file read at the start of the notebook.
cantonal_facilities_filepath = "data/raw/cantonal-facilities.csv"
# Name of the CSV column holding the coordinates as a single "lat, lon" string.
gps_coords_col = "GPS Coordinates (Latitude, Longitude)"
# Name of the CSV column holding the address used for geocoding rows that
# have no coordinates.
address_col = "Postal Address"
# EPSG code the final geo data frame is reprojected to.
epsg = 2056
# Path the resulting geo data frame is written to.
# NOTE(review): "procesed" looks like a typo for "processed" -- confirm against
# the actual directory layout before changing it.
dst_filepath = "data/procesed/cantonal-facilities.gpkg"

## Load Data and get numeric lat/lon from string coordinates

In [None]:
# Load the raw facilities table.
df = pd.read_csv(cantonal_facilities_filepath)

# Split the single "lat, lon" string column into two pieces. The separator is
# a comma with optional surrounding whitespace (a multi-character pattern is
# interpreted as a regular expression by `str.split`), so both "46.5, 6.6" and
# "46.5,6.6" are accepted. `reindex` guarantees that exactly two columns exist
# even when some (or all) rows have no second component.
coords = (
    df[gps_coords_col]
    .str.split(r"\s*,\s*", n=1, expand=True)
    .reindex(columns=[0, 1])
)

# Convert to real floats so that later steps (missing-value checks, geocoded
# float coordinates, geometry creation, file export) operate on a homogeneous
# numeric dtype instead of strings. Unparsable values raise a ValueError
# rather than being silently carried along.
df["lat"] = pd.to_numeric(coords[0])
df["lon"] = pd.to_numeric(coords[1])

# The original string column is redundant once lat/lon have been extracted.
df = df.drop(columns=[gps_coords_col])

## Drop rows missing both the coordinates and the postal address

In [None]:
# A row can be located if it has an address (which can be geocoded) or a
# complete lat/lon pair. Rows with neither are dropped.
has_address = df[address_col].notna()
has_coords = df[["lat", "lon"]].notna().all(axis=1)
# Take an explicit copy: the filtered frame is modified in place afterwards
# (missing coordinates are filled in with `.loc`), which would otherwise
# trigger pandas' SettingWithCopyWarning.
df = df[has_address | has_coords].copy()

## Infer missing coordinates from address

In [None]:
# Rows for which at least one of lat/lon is missing.
nan_coords_cond = df[["lat", "lon"]].isna().any(axis=1)

# Only geocode when there is something to fill in: with no missing rows the
# cell becomes a no-op instead of failing on an empty selection.
if nan_coords_cond.any():
    # `ox.geocode` returns a (lat, lon) tuple for each address; any geocoding
    # error is left to propagate so that failures are not silently ignored.
    geocoded_coords = pd.DataFrame(
        [ox.geocode(address) for address in df.loc[nan_coords_cond, address_col]],
        index=df.index[nan_coords_cond],
        columns=["lat", "lon"],
    )
    # Index and column labels match the selection, so the assignment aligns
    # each geocoded pair with its originating row.
    df.loc[nan_coords_cond, ["lat", "lon"]] = geocoded_coords

In [None]:
# Sanity check: count the rows that still lack a latitude or a longitude
# (the expected result is 0).
df[["lat", "lon"]].isna().any(axis="columns").sum()

## Create geo data frame and dump to file

In [None]:
# Build point geometries from the coordinates (longitude is x, latitude is y),
# declared as EPSG:4326.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(x=df["lon"], y=df["lat"]),
    crs="EPSG:4326",
)
# Reproject to the target coordinate reference system set in the parameters.
gdf = gdf.to_crs(epsg=epsg)

In [None]:
# Write the reprojected geo data frame to the destination path set in the
# parameters; no driver is passed explicitly.
gdf.to_file(dst_filepath)