# metrics - honduras

Abril 2023

### Diana Jaimes

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import glob
import datetime
from datetime import datetime
import re
from datetime import datetime, date, timedelta


##=====================================================
#metrics
##=====================================================


from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import kendalltau



In [2]:
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

# Parameters:

In [3]:
folder = 'Z:\\1.Data\\Honduras\\raw\\monthly\\'
feature = 'prec'
output_path=
overall_file=
monthly_file=

# Fuctions

In [6]:
def read_files(path_folder:str,
               feature
              )-> list:
    """
    function to identify the files of the observed stations and satellite data.

    Args:
        path_folder(str): Folder path
        feature(str): feature to read
    Returns:
        (list) : Lists with all concatenated files. 
        The first list corresponds to the observed data files and the second to the satellite data files.

    """
    filenames = glob.glob(
        path_folder + "/*.csv"
    )
    files_feat = [s for s in filenames if feature in s]
    obs_stations_file = [s for s in files_feat if 'obs' in s]
    satellites_file = [i for i in set(files_feat) if i not in obs_stations_file]
    return obs_stations_file, satellites_file 

def fun_get_data_grid(folder,
                      feature
                     )-> pd.DataFrame:
    """
    function to read data from both satellite and station files

    Args:
        path_folder(str): Folder path
        feature(str): feature to read
    Returns:
        df_stations(df) : dataframe with station data
        data_grid(df) :dataframe with satellital data
    """
    station_data, grid_data = read_files(folder, feature)
    df_stations = pd.read_csv(
        station_data[0]
    ).rename(
        columns={'Unnamed: 0':'date'}
    )
    df_stations['date'] = pd.to_datetime(df_stations['date'])
    df_stations.set_index('date', inplace=True)
    df_stations['source'] = 'obs_stations'
    
    data_grid = pd.DataFrame()
    for file in grid_data:
        df_gridded = pd.read_csv(file).rename(
            columns={'Unnamed: 0':'date'}
        )
        df_gridded['date'] = pd.to_datetime(df_gridded['date'])
        df_gridded.set_index('date', inplace=True)
        name = file[file.find(folder) + len(folder):file.find(feature)-1]
        df_gridded['source'] = name
        data_grid = pd.concat(
            [data_grid, df_gridded],
            axis=0
        )
    return df_stations, data_grid

def overall_metrics(df_reference,
                    df_comparison)-> pd.DataFrame:
    """
        Function that calculates comparison metrics between station data and satellite data.
        The data sets are within the same time period.
    Args:
        df_reference(str):Reference dataset, namely the data from the stations.
        df_comparison(str): Comparison dataset, namely the data from satellite sources.
    Returns:
        rta(df) : Dataframe with the obtained metrics, where the indices are the stations,
        the columns are the metrics and the compared satellite source.
    
    """
    metrics = ['r2', 'rmse', 'kendall', 'mape', 'source']
    results = pd.DataFrame(columns=metrics)
    rta = pd.DataFrame()
    for source in df_comparison.source.unique():
        for column in df_reference.columns[:-2]:
            ref = df_reference[column].dropna()
            comparison = df_comparison[df_comparison.source==source][column].dropna()
            #This guarantees that the dataframes are within the same range of dates
            common_dates = ref.index.intersection(comparison.index).to_list()
            if len(common_dates)!=0:
                ref_comm = ref.loc[common_dates]
                comparison_comm = comparison.loc[common_dates]
                r2 = r2_score(ref_comm, comparison_comm)
                rmse = mean_squared_error(ref_comm, comparison_comm)
                tau, pvalue = kendalltau(ref_comm, comparison_comm)
                #Division by zero is avoided by adding 0.1 to change the reference
                if ref_comm.min()==0 or comparison_comm.min()==0:
                    mape=np.mean(
                        np.abs(
                        ((ref_comm+0.1) - (comparison_comm+0.1)) / (ref_comm+0.1))
                    ) * 100
                    results.loc[column] = [r2, rmse, tau, mape, source]
                else:
                    mape=np.mean(
                        np.abs(
                            (ref_comm - comparison_comm) / ref_comm))*100
                    results.loc[column] = [r2, rmse, tau, mape, source]
            else:
                results.loc[column] = [np.nan, np.nan, np.nan, np.nan, source]
        rta =  pd.concat([rta, results], ignore_index=False, axis=0)
    return rta

def monthly_metrics(df_reference, df_comparison):
    """
        Function that calculates comparison metrics between station data and satellite data for each month
        (using the mean as the aggregation function). Datasets are within the same time period.
    Args:
        df_reference(str):Reference dataset, namely the data from the stations.
        df_comparison(str): Comparison dataset, namely the data from satellite sources.
    Returns:
        rta(df) : Dataframe with the obtained metrics, where the indices are the stations,
        the columns are the metrics and the compared satellite source.
    """
    metrics = ['r2', 'rmse', 'kendall', 'mape', 'source']
    results = pd.DataFrame(columns=metrics)
    rta = pd.DataFrame()
    for source in df_comparison.source.unique():
        for column in df_reference.columns[:-2]:
            ref = df_reference[column].dropna()
            comparison = df_comparison[df_comparison.source==source][column].dropna()
            common_dates = ref.index.intersection(comparison.index).to_list()
            if len(common_dates)!=0:
                #Method to aggregate data by month
                ref_comm = df_reference.loc[common_dates][['month', column]].groupby(['month']).mean()
                comparison_comm = df_comparison.loc[common_dates][['month', column]].groupby(['month']).mean()
                r2 = r2_score(ref_comm, comparison_comm)
                rmse=mean_squared_error(ref_comm, comparison_comm)
                tau, pvalue= kendalltau(ref_comm, comparison_comm)
                #Division by zero is avoided by adding 0.1 to change the reference
                if ref_comm.min()[0]==0 or comparison_comm.min()[0]==0:
                    mape=np.mean(
                        np.abs(
                        ((ref_comm+0.1) - (comparison_comm+0.1)) / (ref_comm+0.1))
                    ) * 100
                    results.loc[column] = [r2, rmse, tau, mape[0], source]
                else:
                    mape=np.mean(
                        np.abs(
                            (ref_comm - comparison_comm) / ref_comm))*100
                    results.loc[column] = [r2, rmse, tau, mape[0], source]
            else:
                results.loc[column] = [np.nan, np.nan, np.nan, np.nan, source]
        rta =  pd.concat([rta, results], ignore_index=False, axis=0)
    return rta

# Execution

In [7]:
df_stations, df_gridded = fun_get_data_grid(folder, feature)

In [8]:
df_stations['month'] = df_stations.index.month
df_gridded['month'] =df_gridded.index.month

In [9]:
overall_metrics(df_stations, df_gridded)

Unnamed: 0,R2,RMSE,Kendall Tau,MAPE,source
ESQUIAS,0.711642,2878.497210,0.649946,578.610062,imerg
LA ERMITA,0.616002,3737.155855,0.605415,477.667562,imerg
SULACO,0.098230,4965.811405,0.594611,3095.951397,imerg
PLAYITAS,0.566061,2700.370439,0.700130,848.096675,imerg
FLORES,0.651430,2136.559531,0.684505,1025.110281,imerg
...,...,...,...,...,...
SABANA GRANDE,-0.549002,46583.550818,0.141942,18332.345634,mswep
LA LABOR,-0.468646,17230.907449,0.202714,2914.902172,mswep
LA MESA,-2.992097,17253.896982,-0.146976,2768.459733,mswep
POTRERILLOS,-1.703969,18407.055441,0.012793,12401.959231,mswep


In [10]:
monthly_metrics(df_stations, df_gridded)

Unnamed: 0,R2,RMSE,Kendall Tau,MAPE,source
ESQUIAS,0.832095,1091.846431,0.757576,63.704706,imerg
LA ERMITA,0.823620,601.994829,0.666667,25.737709,imerg
SULACO,0.636921,1237.726135,0.636364,111.745053,imerg
PLAYITAS,0.500224,1921.841768,0.787879,175.517683,imerg
FLORES,0.724095,1094.509458,0.727273,151.178634,imerg
...,...,...,...,...,...
SABANA GRANDE,0.695092,5040.292476,0.757576,413.154959,mswep
LA LABOR,0.864968,1093.729485,0.818182,129.874907,mswep
LA MESA,-0.429773,2090.783853,0.636364,81.057903,mswep
POTRERILLOS,-0.032105,3443.312212,0.666667,325.773192,mswep
