In [3]:
import geopandas as gpd
import pandas as pd

# Vectorised polygons: the positional index is turned into an explicit 'geom_id' column.
geom = (
    gpd.read_file('src/Vectorisation/')
    .reset_index()
    .rename(columns={'index': 'geom_id'})
)

# Register table read from the Excel legend.
df = pd.read_excel('src/legende_renove_2.xlsx')
# only one entry with no folio, we can drop it (only information is in column "*": 'MANQUE DES NUMEROS POUR FO. 10 (IL MANQUE PROBABLEMENT UNE PAGE)',
# i.e. "numbers missing for fo. 10, a page is probably missing").
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments done further down do not raise SettingWithCopyWarning.
df = df[df['folio'].notnull()].copy()

# Number points.
points = gpd.read_file('src/numeros_merged-v2.json') # no null values in the relevant part.
print(len(points))
# Without the 'id' column, rows that are otherwise identical collapse into one.
points = points.drop(columns=['id']).drop_duplicates()
print(len(points))

def check_decimal_vals(df, col) -> None:
    """Print a warning when column ``col`` of ``df`` contains non-integral numbers.

    The fractional part of each value is computed as ``value - int(value)``
    (``int`` truncates toward zero). Nothing is printed when every fractional
    part equals zero or when the column is empty; otherwise a warning and the
    list of ``(fractional_part, count)`` pairs are printed.

    Args:
        df: frame holding the column to inspect; it is not modified.
        col: name of a numeric column.

    Raises:
        ValueError / OverflowError: if a value is NaN or infinite, because
            ``int()`` cannot convert it.
    """
    # Work on the single column: no copy of the whole frame and no row-wise
    # DataFrame.apply, which is slow and unreliable on zero-row frames.
    fractional = df[col].map(lambda value: value - int(value))
    dec_vals = list(fractional.value_counts().items())

    all_integral = not dec_vals or (len(dec_vals) == 1 and dec_vals[0][0] == 0.0)
    if all_integral:
        return

    print(f'There are {col} with decimal parts. This is unexpected.')
    print(dec_vals)
    print('Please check the data.')

def number_to_parcel_id(number: float) -> str:
    """Turn a parcel number into its textual identifier.

    The value is converted with ``str()``. Text without a '.' is returned
    unchanged. Otherwise the text is split on '.', the first two fields are
    stripped of surrounding whitespace, and the result is:

    * ``main`` when the second field is exactly ``'0'`` (e.g. 12.0 -> '12');
    * ``main-decimal`` otherwise (e.g. 12.3 -> '12-3').

    Any field after the second one is ignored.
    """
    text = str(number)
    if '.' not in text:
        return text

    # A '.' is present, so the split always yields at least two fields.
    main_part, decim_part = (field.strip() for field in text.split('.')[:2])
    if decim_part == '0':
        return main_part
    return f'{main_part}-{decim_part}'

def fix_numerical_error(v:float) -> str:
    """Return ``v`` rounded to one decimal place, as the string of a float.

    ``v`` is first converted with ``float()``, so numeric strings are accepted.
    The value is rounded to the nearest tenth rather than truncated: a
    representation error that falls just below the intended value (for
    example 12.299999999 for 12.3) must come back as '12.3', not '12.2'.

    Examples: 5 -> '5.0', 12.299999999 -> '12.3', '7.0' -> '7.0'.
    """
    # Adding 0.0 turns a possible negative zero into plain 0.0, so the text
    # never starts with '-0'.
    return str(round(float(v), 1) + 0.0)

# Folio values are cast to int below, so first report any that are not whole numbers.
for frame in (df, points):
    check_decimal_vals(frame, 'folio')

# Register side: parcel id derived from 'nr', folio as the string of its integer value.
df['parcel_id'] = df['nr'].map(number_to_parcel_id)
df['folio'] = df['folio'].astype(int).astype(str)
df['merge_id'] = df['folio'] + 'f' + df['parcel_id']

# Points side: same key layout, with 'num' cleaned before the parcel id is derived from it.
points['folio'] = points['folio'].astype(int).astype(str)
points['num'] = points['num'].map(fix_numerical_error)  # fix numerical error
points['parcel_id'] = points['num'].map(number_to_parcel_id)
points['merge_id'] = points['folio'] + 'f' + points['parcel_id']


13887
13847


In [None]:
from shapely.geometry import Point, Polygon
from tqdm.notebook import tqdm
tqdm.pandas()

def locate_parcel_geom_id(geom: list[tuple[str, Polygon]], point: Point) -> str:
    """Return the identifier of the first geometry in ``geom`` that contains ``point``.

    ``geom`` is a sequence of ``(identifier, geometry)`` pairs and is scanned in
    order; each geometry is asked ``geometry.contains(point)``. ``None`` is
    returned when no geometry contains the point (or ``geom`` is empty).
    """
    containing_ids = (geom_id for geom_id, shape in geom if shape.contains(point))
    return next(containing_ids, None)

# (geom_id, polygon) pairs; despite its name, geom_dict is a list that is scanned in order.
polygons_by_id = geom.set_index('geom_id')['geometry'].to_dict()
geom_dict = list(polygons_by_id.items())

# this part is slow, around 25 to 30 minutes. Optimisation possible according to that link, but require numba: https://stackoverflow.com/questions/36399381/whats-the-fastest-way-of-checking-if-a-point-is-inside-a-polygon-in-python
containing_ids = points['geometry'].progress_apply(lambda pt: locate_parcel_geom_id(geom_dict, pt))
# Store the id as the string of its integer value; points outside every polygon get None.
points['geom_id'] = containing_ids.map(lambda gid: None if pd.isna(gid) else str(int(gid)))

  0%|          | 0/13847 [00:00<?, ?it/s]

In [None]:
# Cache both layers, reprojected to EPSG:4326, so the slow point-in-polygon
# assignment does not have to be recomputed.
cache_targets = (
    (points, 'src/numeros_merged-v3.geojson'),
    (geom, 'src/geometries_with_index.geojson'),
)
for layer, target in cache_targets:
    layer.to_crs('EPSG:4326').to_file(target, driver='GeoJSON')

In [None]:
# Reload the cached layers from the src/ directory, which is where the export
# cell writes them (the previous bare file names pointed at the working directory).
num_df = gpd.read_file('src/numeros_merged-v3.geojson')
# 'id' was dropped from the points before they were cached, so it may be absent here;
# errors='ignore' keeps the cell working whether or not the reader exposes such a column.
num_df = num_df.drop(columns=['id'], errors='ignore').drop_duplicates()
gdf = gpd.read_file('src/geometries_with_index.geojson')

In [None]:
# to prove Rémi's claim that the lowest page_number and folio_nmbr is always the first to appear in the lowest volume in the cadastral planches
check_number_order = False
if check_number_order:
    # The last two '_'-separated fields of 'layer' are taken as folio and page number.
    # They are computed on a temporary frame so gdf itself gets no helper columns.
    layer_order = gdf.assign(
        pg_nbr=gdf['layer'].apply(lambda s: s.split('_')[-1] if s else None),
        folio_nbr=gdf['layer'].apply(lambda s: s.split('_')[-2] if s else None),
    )
    # NOTE(review): both fields are strings, so the sort is lexicographic ('10' < '2');
    # confirm this is the intended order for the check.
    # An expression nested in an `if` is not echoed by the notebook, hence display().
    display(
        layer_order[['folio_nbr', 'pg_nbr']]
        .drop_duplicates()
        .sort_values(by=['folio_nbr', 'pg_nbr'])
        .iloc[50:]
    )

In [None]:
# Attach to every register row the point(s) sharing its merge_id. With how='left' every
# register row is kept: a row matching several points is repeated once per point, and a
# row without any match gets missing values for geom_id and geometry.
merged_df=df.merge(num_df[['merge_id', 'geom_id', 'geometry']], on='merge_id', how='left')
# One [geom_id, geometry] pair per merged row.
merged_df['geometries'] = merged_df.apply(lambda v: [v['geom_id'], v['geometry']] ,axis=1)
# The register is re-indexed by its '*' column.
# NOTE(review): this presumes '*' identifies each register row uniquely - TODO confirm.
df_final = df.set_index('*')
# Collect the pairs of each '*' value into a list; the assignment aligns that result
# with df_final on the '*' index.
df_final['geometries'] = merged_df[['*', 'geometries']].groupby('*').agg(list).reset_index().set_index('*')
# Distribution of the number of pairs per register row. A row with no matching point
# still holds one pair (made of missing values), so it is counted with length 1.
df_final['geometries'].apply(len).value_counts()

In [None]:
# Final export: polygons, register table and number points.
gdf['geom_id'] = gdf['geom_id'].astype(str)

# Remove the helper columns of the optional ordering check when they are present.
helper_columns = [name for name in ('pg_nbr', 'folio_nbr') if name in gdf.columns]
gdf = gdf.drop(columns=helper_columns)

gdf.to_file('lausanne-1888-cadastre-renove-geometries-20250311.geojson', driver='GeoJSON')

# The register is written without the columns that only served to build the join key.
register_out = df_final.drop(columns=['merge_id', 'nr'])
register_out.to_csv('lausanne-1888-cadastre-renove-registre-20250311.csv')

points_out = num_df.drop(columns=['num'])
points_out.to_file('lausanne-1888-cadastre-renove-points-20250311.geojson', driver='GeoJSON')
