# Creation of the merchants dataset, from the 1740 Catastici
To do so, we take the tenants (`ten_name`) who appear in more than one entry of the register, where at least one of those entries is a shop (bottega).

The final dataset is a csv containing:
- person (original name)
- shop_type
- shop_type_eng (original translation)
- shop_category (original metacategory)
- shop_place (wkt string)
- shop_lat/lng (float; the y/x coordinates of shop_place)
- house_place (wkt string)
- house_lat/lng (float; the y/x coordinates of house_place)

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [10]:
# Load the 1740 Catastici land register from its GeoJSON export and preview it.
landregister_1740 = gpd.read_file(
    "../public/1740_Catastici_2025-09-24.geojson"
)
# Row count of the loaded register, printed before the preview.
print("length of langregister ", landregister_1740.shape[0])
landregister_1740.head()

length of langregister  31743


Unnamed: 0,uid,author,owner_code,owner_count,PP_OwnerCode,PP_OwnerCode_SIMPL,owner_name,ten_name,function,an_rendi,...,PP_Bottega_TRAD,PP_Bottega_METACATEGORY,PP_Owner_Title,PP_Owner_Entity,PP_Owner_FirstName,PP_Owner_LastName,PP_Owner_Notes,tif_path_img,path_img,geometry
0,AGN-0001,Davide Drago,PPL,1,Private,Private,Paolina Franco,,casa in soler rovinosa,,...,,,,,Paolina,FRANCO,PERSON,/catastici/Catastici-439/8/1127.tif,439_Dorsoduro/8_SAgnese/SAgnese_0_1.png,POINT (291127.764 5034215.544)
1,AGN-0002,Davide Drago,PPL,1,Private,Private,Nobil homo Andrea Vendramin,Francesco Palamon,casa in soler,22.0,...,,,NOBIL HOMO,,Andrea,VENDRAMIN,PERSON,/catastici/Catastici-439/8/1127.tif,439_Dorsoduro/8_SAgnese/SAgnese_0_1.png,POINT (291129.343 5034214.754)
2,AGN-0003,Davide Drago,PPL,1,Private,Private,Nobil homo Andrea Vendramin,Rovo Bognolo,casa in soler,32.0,...,,,NOBIL HOMO,,Andrea,VENDRAMIN,PERSON,/catastici/Catastici-439/8/1127.tif,439_Dorsoduro/8_SAgnese/SAgnese_0_1.png,POINT (291131.001 5034214.201)
3,AGN-0004,Davide Drago,PPL,1,Private,Private,Nobil homo Andrea Vendramin,Zuanne Fanti,casa in soler,28.0,...,,,NOBIL HOMO,,Andrea,VENDRAMIN,PERSON,/catastici/Catastici-439/8/1127.tif,439_Dorsoduro/8_SAgnese/SAgnese_0_1.png,POINT (291135.422 5034211.596)
4,AGN-0005,Davide Drago,PPL,1,Private,Private,Nobil homo Andrea Vendramin,Michiel Gasparini,casa in soler,18.0,...,,,NOBIL HOMO,,Andrea,VENDRAMIN,PERSON,/catastici/Catastici-439/8/1127.tif,439_Dorsoduro/8_SAgnese/SAgnese_0_1.png,POINT (291136.685 5034210.728)


In [None]:
# Start from the register without the columns that are not used downstream.
df = landregister_1740.drop(
    columns=[
        "author", "owner_code", "owner_count", "place",
        "PP_OwnerCode", "PP_OwnerCode_SIMPL",
        "an_rendi", "id_napo", "quantity_income", "quality_income",
        "parish_std", "sestiere",
        "PP_Function_TOP", "PP_Function_MID", "PP_Function_PROPERTY", "PP_Function_GEOMETRY",
        "PP_Owner_Title", "PP_Owner_Entity", "PP_Owner_FirstName", "PP_Owner_LastName", "PP_Owner_Notes",
        "tif_path_img", "path_img",
    ]
)

# Normalize tenant names step by step: missing values stay missing, surrounding
# whitespace is stripped, case is folded, and empty strings become missing.
ten = df["ten_name"]
ten_norm = ten.where(ten.notna())
ten_norm = ten_norm.str.strip().str.lower()
ten_norm = ten_norm.replace("", pd.NA)

# Normalized names that occur on more than one row (missing names are not counted).
counts = ten_norm.dropna().value_counts()
dup_names = counts.loc[counts.gt(1)].index.tolist()

# Keep only the rows of repeated tenants and attach the normalized name,
# aligned on the row index.
is_dup = ten_norm.isin(dup_names)
df = df.loc[is_dup].copy().assign(ten_name_norm=ten_norm)
df.head()


Unnamed: 0,uid,owner_name,ten_name,function,PP_Bottega_STD,PP_Bottega_COUNT,PP_Bottega_TRAD,PP_Bottega_METACATEGORY,geometry,ten_name_norm
1,AGN-0002,Nobil homo Andrea Vendramin,Francesco Palamon,casa in soler,,,,,POINT (291129.343 5034214.754),francesco palamon
5,AGN-0006,Nobil homo Andrea Vendramin,Giovanni Battista Carleschi,casa in soler,,,,,POINT (291138.185 5034209.859),giovanni battista carleschi
6,AGN-0007,Nobil homo Andrea Vendramin,Francesco Martini,casa in soler,,,,,POINT (291145.37 5034206.148),francesco martini
12,AGN-0013,Innocente Nastasi,Antonio Buranello,casa pepian,,,,,POINT (291165.897 5034195.411),antonio buranello
15,AGN-0016,Testa Ruinato,Gasparo Moro,casa soler,,,,,POINT (291167.95 5034200.622),gasparo moro


In [None]:
# Bucket the remaining register rows by normalized tenant name.
grouped = df.groupby(by="ten_name_norm")

def keep_group(g: pd.DataFrame) -> bool:
    """Tell whether a group of register rows contains at least one shop.

    A row counts as a shop when its ``PP_Bottega_STD`` value is not missing.
    An empty group yields a falsy result.
    """
    shop_types = g["PP_Bottega_STD"]
    return pd.notna(shop_types).any()

# Map normalized tenant name -> that tenant's rows, restricted to the groups
# accepted by keep_group.
kept_groups = dict(
    (name, grp) for name, grp in grouped if keep_group(grp)
)

print(f"Total number of duplicated-tenant groups kept: {len(kept_groups)}")


Total number of duplicated-tenant groups kept: 968


In [47]:
# Build a single CSV with one row per person:
#  - person (original name)
#  - shop_type
#  - shop_type_eng (original translation)
#  - shop_category (original metacategory)
#  - shop_place (wkt string)
#  - shop_lat/lng (float)
#  - house_place (wkt string)
#  - house_lat/lng  (float)
#
# NOTE(review): *_lat / *_lng are the raw y / x of the register geometries. The
# points previewed from the register look like projected coordinates (metres),
# not degrees -- confirm the CRS and reproject (e.g. to EPSG:4326) if consumers
# of the CSV expect true latitude/longitude.

# Fixed output schema, so the CSV keeps its header even when no group is kept.
PERSON_COLUMNS = [
    "person", "shop_type", "shop_type_eng", "shop_category",
    "shop_place", "shop_lat", "shop_lng",
    "house_place", "house_lat", "house_lng",
]


def geom_to_latlng(geom):
    """Return ``(y, x)`` of a point geometry as floats, i.e. in (lat, lng) order.

    A missing (``None``) or empty geometry yields ``(nan, nan)`` so the
    coordinate columns stay numeric instead of falling back to object dtype.
    """
    if geom is None or geom.is_empty:
        return (float("nan"), float("nan"))
    return (float(geom.y), float(geom.x))


def geom_to_wkt(geom):
    """Return the WKT string of ``geom``, or ``None`` when the geometry is missing."""
    return None if geom is None else geom.wkt


persons = []
for name_norm, grp in kept_groups.items():
    g = gpd.GeoDataFrame(grp)
    # ensure proper geometry column
    if g.geometry.name not in g.columns:
        raise RuntimeError(f"group {name_norm} has no geometry")

    # Home: prefer the first row whose 'function' contains "casa"; otherwise
    # fall back to the first row of the group.
    # NOTE(review): the chosen home row can be the very same entry as the shop
    # (nothing here excludes shop rows) -- confirm this is intended.
    is_casa = g["function"].fillna("").astype(str).str.contains("casa", case=False, na=False)
    home_row = g[is_casa].iloc[0] if is_casa.any() else g.iloc[0]
    home_geom = home_row.geometry

    # Shop: first row with a standardized shop type. Every kept group has at
    # least one such row, so iloc[0] cannot fail here.
    shop_row = g[g["PP_Bottega_STD"].notna()].iloc[0]
    shop_type = shop_row["PP_Bottega_STD"]
    shop_geom = shop_row.geometry

    home_lat, home_lng = geom_to_latlng(home_geom)
    shop_lat, shop_lng = geom_to_latlng(shop_geom)

    persons.append({
        "person": home_row.get("ten_name"),
        "shop_type": shop_type,
        "shop_type_eng": shop_row.get("PP_Bottega_TRAD"),
        "shop_category": shop_row.get("PP_Bottega_METACATEGORY"),
        # Missing geometries become empty cells rather than crashing on `.wkt`.
        "shop_place": geom_to_wkt(shop_geom),
        "shop_lat": shop_lat,
        "shop_lng": shop_lng,
        "house_place": geom_to_wkt(home_geom),
        "house_lat": home_lat,
        "house_lng": home_lng
    })

# One row per person; geometries are stored as WKT strings, so a plain
# DataFrame written as CSV is enough.
persons_df = pd.DataFrame(persons, columns=PERSON_COLUMNS)
out_fp = "../public/merchants_dataset.csv"
persons_df.to_csv(out_fp, index=False)

print(f"Saved {len(persons_df)} persons -> {out_fp}")
persons_df.head()

Saved 968 persons -> ../public/merchants_dataset.csv


Unnamed: 0,person,shop_type,shop_type_eng,shop_category,shop_place,shop_lat,shop_lng,house_place,house_lat,house_lng
0,Agostin Cigaggia,FORMAGIER,(Person) Cheese,FOOD_DRINK,POINT (290857.01847614034 5035730.88015714),5035731.0,290857.018476,POINT (290857.01847614034 5035730.88015714),5035731.0,290857.018476
1,Agostin Fabris,FORNER,(Person) Oven,FOOD_DRINK,POINT (292354.62244628853 5035118.308935367),5035118.0,292354.622446,POINT (292354.62244628853 5035118.308935367),5035118.0,292354.622446
2,Agostin Lazari,FABRO,(Person) Forger,METAL_CRAFT,POINT (291466.5331725447 5035133.210374592),5035133.0,291466.533173,POINT (291898.1586415941 5035003.6428168155),5035004.0,291898.158642
3,Agostin Meneghetti,ORESE,(Object) Jewellery,LUXURY_ITEMS,POINT (291536.64384332683 5035425.966183828),5035426.0,291536.643843,POINT (291391.04315041157 5035483.791403256),5035484.0,291391.04315
4,Agostin Redolfi,SCALETER,(Person) Pastries,FOOD_DRINK,POINT (291390.2704865179 5035236.682245659),5035237.0,291390.270487,POINT (291390.2704865179 5035236.682245659),5035237.0,291390.270487
