# Farm Data Cleanup and Matching with Locality Geospatial Information

In [None]:
import pandas as pd
from pathlib import Path
import plotly.express as px
import numpy as np
import json
import geopandas as gp
gp.options.io_engine = "pyogrio"

Preprocessing and augmentation of farm data with geospatial information

In [None]:
# Workspace roots used to build data paths: `ws` lives in the home directory,
# `ws2` on the /mnt/wks3 mount.
ws = Path("/home/aschneuwl/workspace/")
ws2 = Path("/mnt/wks3/aschneuwl/workspace")

# Farm Data

In [None]:
# Load the farm table from the preprocessed dairy data under the ws2 workspace.
farms = pd.read_parquet(
    ws2 / "data/preprocessed/dairy" / "b01.parquet"
)

In [None]:
# Display the zip column of the freshly loaded farm table for a quick look.
farms["zip"]

In [None]:
# Keep a single row per farmId (drop_duplicates retains the first occurrence).
farms = farms.drop_duplicates(subset="farmId")

# Normalise the country code: rows tagged "CHE" are rewritten to "CH" so that
# the later `country == "CH"` filters catch them as well.
is_che = farms["country"] == "CHE"
farms.loc[is_che, "country"] = "CH"

# Geospatial Data

In [None]:
# Location of the GeoJSON file that is parsed in the next cell; judging by the
# file name it holds historicised municipality ("gemeinde") boundaries.
geodata_fpath = Path("/home/aschneuwl/workspace/data/geo/historisierte-administrative_grenzen_g0_1850-2015_gemeinde_2056.json")

In [None]:
# Parse the GeoJSON document at `geodata_fpath` into plain Python objects.
# The encoding is pinned to UTF-8 (the encoding JSON interchange uses) so that
# parsing does not depend on the platform's default locale encoding.
with open(geodata_fpath, encoding="utf-8") as fp:
    geojson_data = json.load(fp)

In [None]:
# Path of the swissBOUNDARIES3D shapefile and the table geopandas reads from it.
# Despite its name, `geojson_data_2024` holds the result of `gp.read_file` on a
# .shp file, not a parsed JSON document.
geodata_2024_fpath = Path("/home/aschneuwl/workspace/agecon-thesis/notebooks/swissBOUNDARIES3D_1_5_TLM_HOHEITSGEBIET.shp")
geojson_data_2024 = gp.read_file(geodata_2024_fpath)

In [None]:
# Load the preprocessed locality table (read with geopandas) from the ws2
# workspace; later cells use its ZIP4 and ZIP_ID columns.
localities_zip = gp.read_parquet(
    ws2 / "data/preprocessed/geo" / "swiss_localities_with_altitudes.parquet"
)

In [None]:
# JSON lookup table keyed by zip code; map_plz_to_bfs consults it before the
# manual mappings. (File name suggests: zip -> BFS number of the largest
# municipality in that zip area, 2024 state — TODO confirm.)
zip_to_largst_bfs_fpath = Path("/home/aschneuwl/workspace/agecon-thesis/notebooks/zip_largest_bfs_2024.json")

# Read with an explicit UTF-8 encoding so the result does not depend on the
# platform's default locale encoding.
with open(zip_to_largst_bfs_fpath, encoding="utf-8") as fp:
    zip_to_largst_bfs = json.load(fp)

# City Directory (Municipality Identification Codes) - Deprecated
BFS Nr: the municipality number assigned by the Swiss Federal Statistical Office (BFS, Bundesamt für Statistik).

In [None]:
# Read the semicolon-separated AMTOVZ locality directory with explicit column
# names.
# NOTE(review): because `names` is passed without `header`, pandas treats the
# first line of the file as data — confirm the CSV has no header row.
city_directory = pd.read_csv(
    "/home/aschneuwl/workspace/data/geo/AMTOVZ_CSV_LV95/AMTOVZ_CSV_LV95.csv",
    sep=";",
    names=["city", "plz", "addDigit", "municipality", "bfs", "kt", "e", "n", "lang", "val"],
)

# String version of the postal code, so it can be compared with string zips.
city_directory["plz_str"] = city_directory["plz"].apply(str)

In [None]:
### https://www.bfs.admin.ch/bfsstatic/dam/assets/343051/master

# Hand-maintained fallback: zip code -> BFS municipality number, both stored as
# strings. map_plz_to_bfs consults it for zip codes that the primary lookup
# does not contain. The comment above each entry names the locality.
manual_mappings_zip_bfs = {
    # Montagny-Chamard
    "1440": "5922",
    # Chavannes-sous-Orsonnens
    "1693": "2114",
    # Bouloz
    "1698": "2324",
    # Sion
    "1951": "6266",
    # Bienne
    "2500": "371",
    # Le Prédame
    "2711": "6748",
    # Montfaucon
    "2875": "6751",
    # Niesen Kulm
    "3712": "567",
    # Waldegg BE
    "3802": "571",
    # Montana-Vermala
    "3962": "6243",
    # Solothurn
    "4502": "2601",
    # Brugg AG
    "5201": "4095",
    # Linn (today Bötzberg)
    "5224": "4103",
    # Oberehrendingen
    "5422": "4049",
    # Unterehrendingen
    "5424": "4049",
    # Luzern
    "6000": "1061",
    # Engelberg (BFS number 1402; the entry previously repeated the zip code
    # "6391" as its own BFS number)
    "6391": "1402",
    # Schwyz
    "6431": "1372",
    # Campsut-Crüt
    "7446": "3681",
    # Zürich
    "8000": "261",
    # ETHZ
    "8092": "261",
    # Kindhausen AG
    "8963": "4023",
    # Rindal
    "9234": "3393",
    # Kronbühl
    "9302": "3204",
    # Chur
    "7005": "3901",
    "7007": "3901",
    # Landquart Fabriken
    "7207": "3942",
    # Fruthwilen
    "8559": "4851",
    # Wil SG
    "9501": "3427",
}

In [None]:
def map_plz_to_bfs(plz: str):
    """Look up the BFS municipality number for a postal code.

    The city directory is searched first (matching on its ``plz_str`` column)
    and the ``bfs`` value of the first matching row is returned. If the
    directory has no row for ``plz``, the hand-maintained
    ``manual_mappings_zip_bfs`` table is used instead.

    Note: a later cell in this file defines ``map_plz_to_bfs`` again, which
    replaces this version once that cell has run.

    Args:
        plz: Postal code as a string.

    Returns:
        The ``bfs`` value from the directory, or the string from the manual
        mapping, or ``None`` when neither source knows the postal code.
    """
    directory_hits = city_directory.loc[city_directory["plz_str"] == plz, "bfs"]
    if len(directory_hits) > 0:
        # Several localities may share a postal code; the first row wins.
        return directory_hits.values[0]

    # dict.get yields None for unknown postal codes.
    return manual_mappings_zip_bfs.get(plz)
        

In [None]:
def map_plz_to_bfs(plz: str):
    """Look up the BFS municipality number for a postal code.

    ``zip_to_largst_bfs`` (loaded from JSON) takes precedence; the
    hand-maintained ``manual_mappings_zip_bfs`` table is only consulted for
    postal codes missing from it.

    Args:
        plz: Postal code as a string.

    Returns:
        The value stored for ``plz`` in the first table that contains it, or
        ``None`` when neither table has an entry.
    """
    # Tables are listed in priority order; the first one holding the key wins.
    for lookup in (zip_to_largst_bfs, manual_mappings_zip_bfs):
        if plz in lookup:
            return lookup[plz]
    return None

## Countries

### Number of countries

In [None]:
# Count the distinct country codes present in the farm table.
farms["country"].nunique()

### Number of farms not in Switzerland

In [None]:
# Count the farms whose country code is anything other than "CH".
(farms["country"] != "CH").sum()

### Country Distribution

In [None]:
# Tabulate how many farms fall under each country code.
farms["country"].value_counts()

In [None]:
# NOTE(review): `farms_ch` is first assigned in the "Swiss Farms Data Cleaning"
# cell further down; that cell must run before this one, otherwise this line
# raises NameError on a fresh kernel.
farms_ch.columns

# Swiss Farms Data Cleaning

In [None]:
# Restrict to Swiss farms that carry a usable zip code: the empty string and
# the placeholder "0" both count as "no zip".
country_is_ch = farms["country"] == "CH"
zip_is_usable = (farms["zip"] != "") & (farms["zip"] != "0")
farms_ch = farms[country_is_ch & zip_is_usable]

In [None]:
# Zip codes used by Swiss farms that do not appear in the locality table.
# NOTE(review): farm zips are compared against string literals elsewhere in
# this notebook; the difference is only meaningful if ZIP4 holds strings too —
# confirm its dtype.
farm_zip_codes = set(farms_ch["zip"].unique())
known_zip_codes = set(localities_zip["ZIP4"].unique())
missing_zip_codes = farm_zip_codes - known_zip_codes

In [None]:
# For each unmatched zip code (in sorted order), print the city names the
# farms recorded for it.
for missing_zip in sorted(missing_zip_codes):
    recorded_cities = farms_ch.loc[farms_ch["zip"] == missing_zip, "city"].values
    print(missing_zip, recorded_cities)

In [None]:
# Count the distinct ZIP_ID values in the locality table.
localities_zip["ZIP_ID"].nunique()

In [None]:
# Display the (rows, columns) dimensions of the locality table.
localities_zip.shape

In [None]:
# Hand-maintained replacements: zip code as it appears in the farm data ->
# zip code to use instead, both as strings. replace_zip_code applies this
# table. The comment above each entry names the locality.
manual_mappings_zip_to_current_zip = {
    # Montagny-Chamard
    "1440": "1442",
    # Treytorrens
    "1488": "1538",
    # Chavannes-sous-Orsonnens
    "1693": "1694",
    # Bouloz
    "1698": "1699",
    # Sion
    "1951": "1950",
    # Le Col-des-Roches
    "2412": "2400",
    # Bienne
    "2500": "2504",
    # Brüttelen
    "2578": "3237",
    # Le Prédame
    "2711": "2714",
    # Montfaucon
    "2875": "2362",
    # Niesen Kulm
    "3712": "3713",
    # Waldegg BE
    "3802": "3800",
    # Montana-Vermala
    "3962": "3963",
    # Solothurn
    "4502": "4500",
    # Brugg AG
    "5201": "5200",
    # Linn (today Bötzberg)
    "5224": "5225",
    # Oberehrendingen
    "5422": "5420",
    # Unterehrendingen
    "5424": "5420",
    # Luzern
    "6000": "6003",
    # Engelberg
    "6391": "6390",
    # Schwyz
    "6431": "6430",
    # Chur
    "7005": "7000",
    "7007": "7000",
    # Landquart Fabriken
    "7207": "7206",
    # Campsut-Crüt
    "7446": "7447",
    # Zürich
    "8000": "8005",
    # ETHZ
    "8092": "8005",
    # Kindhausen AG
    "8963": "8962",
    # Rindal
    "9234": "9604",
    # Kronbühl (the comma after this entry was missing, which made the whole
    # literal a syntax error)
    "9302": "9300",
    # Fruthwilen
    "8559": "8269",
    # Wil SG
    "9501": "9500",
}

In [None]:
def replace_zip_code(zip_code: str) -> str:
    """Translate a zip code through the manual replacement table.

    Args:
        zip_code: Zip code as a string.

    Returns:
        The replacement listed in ``manual_mappings_zip_to_current_zip`` when
        ``zip_code`` is one of its keys; otherwise ``zip_code`` unchanged.
    """
    # Falling back to the input itself leaves unlisted zip codes untouched.
    return manual_mappings_zip_to_current_zip.get(zip_code, zip_code)