# Livestock dataset extraction

Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook retrieves the livestock dataset and concatenates it into a table for publication alongside the data used.

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas=2.1.3
* tqdm

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* GVE_Catchments.shp


**Directory:**

* Clone the GitHub directory locally
* Place any third-party data in its respective directory.
* ONLY update the "PATH" variable in the section "Configurations" with the relative path to your local EStreams directory. 


## References
* 
## Observations
* Part of the data is interpolated. 

# Import modules

In [None]:
import pandas as pd
import numpy as np
import tqdm as tqdm
import os
import warnings
import geopandas as gpd

# Configurations

In [None]:
# User-editable settings.
# PATH: relative path to your local directory (it becomes the working directory).
PATH = "../.."

# path_data: folder that holds the input shapefile and the station spreadsheet.
path_data = r"C:\Users\nascimth\Documents\data\CAMELS_CH_Chem\data"

# Silence every warning raised from here on.
warnings.filterwarnings(action="ignore")

* #### The users should NOT change anything in the code below here. 

In [None]:
# Non-editable variables:
# Output folder for the per-basin CSV files, relative to PATH. A raw string keeps
# the Windows backslashes literal instead of relying on the unrecognised escape
# sequences "\D", "\c" and "\l", which newer Python versions warn about.
PATH_OUTPUT = r"results\Dataset\catchment_aggregated_data\livestock_data"

# Set the directory: make PATH the working directory so that PATH_OUTPUT
# resolves against it.
os.chdir(PATH)

# Import data

In [None]:
# Read the catchment shapefile that carries the livestock (gve_*) attributes.
# The path suffix is a raw string: "\s" and "\G" are not valid escape sequences
# and trigger a warning in newer Python versions when written in a normal string.
catchments_gve = gpd.read_file(path_data + r'\shapefile_gve\GVE_Catchments.shp')

# Expose the gauge id under the name "bafu_id" so it can serve as a merge key
# against the station network table.
catchments_gve["bafu_id"] = catchments_gve["gauge_id"]
catchments_gve

In [None]:
# Network CAMELS_CH_Chem: station table read from the "all_5" sheet.
# The file name is a raw string because "\C" is not a valid escape sequence.
# "basin_id" is kept as a regular column; later cells select it by name.
network_camels_ch_chem = pd.read_excel(path_data + r"\CAMELS_CH_chem_stations_short_v3.xlsx", sheet_name='all_5')
network_camels_ch_chem

In [None]:
# Most stations are identified primarily by their BAFU id; list the ones that are NOT:
network_camels_ch_chem.loc[network_camels_ch_chem["bafu_id"].isna()]

In [None]:
# Look up the network "basin_id" once per identifier type (left joins, so every
# catchment row is kept). Because "basin_id" is merged in three times, pandas
# names the results "basin_id_x" (bafu_id match), "basin_id_y" (nawa_id match)
# and "basin_id" (naduf_id match).
for id_column in ("bafu_id", "nawa_id", "naduf_id"):
    catchments_gve = pd.merge(
        catchments_gve,
        network_camels_ch_chem[[id_column, "basin_id"]],
        on=id_column,
        how="left",
    )

catchments_gve

In [None]:
# In the three identifier columns a 0 is turned into NaN, so that the
# notna() checks further down treat it as a missing identifier.
id_columns = ['gauge_id', "naduf_id", "nawa_id"]
catchments_gve.loc[:, id_columns] = catchments_gve.loc[:, id_columns].replace(0, np.nan)

In [None]:
# Build "basin_id_new" by priority: rows with a gauge_id take "basin_id_x",
# otherwise rows with a nawa_id take "basin_id_y", and all remaining rows
# take "basin_id".
has_gauge_id = catchments_gve['gauge_id'].notna()
has_nawa_id = catchments_gve['nawa_id'].notna()

nawa_then_fallback = np.where(has_nawa_id, catchments_gve['basin_id_y'], catchments_gve['basin_id'])
catchments_gve['basin_id_new'] = np.where(has_gauge_id, catchments_gve['basin_id_x'], nawa_then_fallback)

# Show the table with the new column
catchments_gve

In [None]:
# Check: list the catchments that still have no basin_id_new
catchments_gve.loc[catchments_gve["basin_id_new"].isna()]

In [None]:
# Solve the remaining cases manually: row label -> basin_id_new to assign.
manual_basin_ids = {84: 2403.0, 110: 2622.0}
for row_label, manual_id in manual_basin_ids.items():
    catchments_gve.loc[row_label, "basin_id_new"] = manual_id

# Re-run the check for rows without a basin_id_new.
catchments_gve.loc[catchments_gve["basin_id_new"].isna()]

In [None]:
# Keep the first row for each 'basin_id_new' value and use that column as the index
catchments_gve_unique = (
    catchments_gve
    .drop_duplicates(subset="basin_id_new")
    .set_index("basin_id_new")
)
catchments_gve_unique

In [None]:
# Remove two basins from the table: 2403 (BAFU), which is empty for our time
# range, and 2622, which is merged with 2243.
for basin_to_remove in (2403.0, 2622.0):
    catchments_gve_unique.drop(basin_to_remove, axis=0, inplace=True)

In [None]:
# Display the de-duplicated catchment table (indexed by basin_id_new) for inspection.
catchments_gve_unique

In [None]:
# Years for which the catchment table has gve_* columns.
gve_years = [1980, 1985, 1990, *range(1996, 2023)]
# Column names in the original order: every "_S" column first, then every "_h"
# column. Generating them avoids keeping two hand-written 60-name lists in sync.
gve_columns = [f"gve_{year}_{suffix}" for suffix in ("S", "h") for year in gve_years]

# One row per basin of the station network. The ids are cast to float,
# presumably to match the float-valued basin_id_new index of
# catchments_gve_unique (verify against the data). The assignment aligns on the
# index, so basins without a catchment entry end up as NaN.
livestock_df = pd.DataFrame(index=network_camels_ch_chem.basin_id.astype(float))
livestock_df[gve_columns] = catchments_gve_unique[gve_columns]

In [None]:
# Now we can export the data file by file

In [None]:
# Names of the "_S" columns, generated from the year list instead of being
# written out twice by hand (same names, same order as before).
gve_years = [1980, 1985, 1990, *range(1996, 2023)]
gve_sum_columns = [f"gve_{year}_S" for year in gve_years]

# One row per network basin; values are aligned on the index, so basins that
# are absent from catchments_gve_unique stay NaN.
livestock_df_sum = pd.DataFrame(index=network_camels_ch_chem.basin_id.astype(float))
livestock_df_sum[gve_sum_columns] = catchments_gve_unique[gve_sum_columns]

livestock_df_sum

In [None]:
# Names of the "_h" columns, generated from the year list instead of being
# written out twice by hand (same names, same order as before).
gve_years = [1980, 1985, 1990, *range(1996, 2023)]
gve_ha_columns = [f"gve_{year}_h" for year in gve_years]

# One row per network basin; values are aligned on the index, so basins that
# are absent from catchments_gve_unique stay NaN.
livestock_df_ha = pd.DataFrame(index=network_camels_ch_chem.basin_id.astype(float))
livestock_df_ha[gve_ha_columns] = catchments_gve_unique[gve_ha_columns]

livestock_df_ha

In [None]:
# Write one CSV per basin with two columns: "gve_sum" (from the *_S columns)
# and "gve_ha" (from the *_h columns), one row per year.

# Target years: 1980 to 2020 inclusive (the stop value 2021 is excluded).
# The 2021 and 2022 source columns fall outside this range, so the reindex
# below drops them. Built once because it is the same for every basin.
export_years = pd.RangeIndex(1980, 2021)

for basin in tqdm.tqdm(livestock_df.index):
    # This basin's row of "_S" values as a one-column frame, indexed by column name.
    basin_table = pd.DataFrame(data=livestock_df_sum.loc[basin, :])
    basin_table.columns = ["gve_sum"]

    # Turn the column names ("gve_1980_S", ...) into integer years.
    # The pattern is a raw string: "\d" is not a valid escape in a normal string.
    year_labels = basin_table.index.to_series().str.extract(r'(\d+)', expand=False)
    basin_table.index = pd.to_numeric(year_labels).astype(int)
    basin_table.index.name = "date"

    # Positional assignment: relies on the "_h" columns being in the same year
    # order as the "_S" columns.
    basin_table["gve_ha"] = livestock_df_ha.loc[basin, :].values

    # Expand to every target year, then fill the years that have no source
    # column by linear interpolation (the years are consecutive, so equal
    # spacing is the correct weighting).
    basin_yearly = basin_table.reindex(export_years).interpolate(method='linear').round(4)
    basin_yearly.index.name = "date"

    basin_yearly.to_csv(PATH_OUTPUT + "\\camels_ch_chem_livestock_"+str(int(basin))+".csv", encoding='latin')

# End