# Intrawindow plot

In [5]:
import glob
import pandas as pd
from plotting_utils import (
    modify_score_dict,
    make_df_from_score_dict,
)

def score_improvement_bysite(model, targets_df, target_variable, suffix=""):
    '''
    This function collects the forecast scores for the specifed model and target variable.
    Then it returns a dataframe with columns for the difference in CRPS and RMSE
    compared to the historical and naive persistence null model (note that the naive will only be RMSE).
    '''
    score_dict = {}
    # For each site, score CRPS and RMSE individually and add to score_dict
    for site_id in targets_df.site_id.unique():
        site_dict = {}
        glob_prefix = f'forecasts/{site_id}/{target_variable}/{model}_{suffix}/forecast*'
        csv_list = sorted(glob.glob(glob_prefix))
        for csv in csv_list:
            site_dict = modify_score_dict(
                csv, 
                targets_df, 
                target_variable, 
                site_id, 
                suffix, 
                site_dict
            )
        score_dict[site_id] = site_dict

    # Producing a dataframe from the score dictionary, as df's are easier
    # to manipulate
    df = make_df_from_score_dict(score_dict)
    import pdb; pdb.set_trace()

    # Using the mean CRPS score over the forecast horizon
    df = df.groupby(['site_id', 'date', 'metric', 'model']).mean().reset_index()

    # Creating a CRPS and RMSE dataframe separately which is definitely
    # not the most elegant solution here
    crps_df = df[df['metric'] == 'crps']
    rmse_df = df[df['metric'] == 'rmse']
    
    forecast_dfs = [df_[df_['model'] == 'forecast'] for df_ in [crps_df, rmse_df]]
    historical_dfs = [df_[df_['model'] == 'historical'] for df_ in [crps_df, rmse_df]]
    naive_df = df[df['model'] == 'naive']
    naive_df = naive_df.rename(columns={'value': 'value_naive'})

    # Merge the two DataFrames on site_id, date, and metric
    crps_merged = pd.merge(
        forecast_dfs[0], 
        historical_dfs[0], 
        on=['site_id', 'date', 'metric'], 
        suffixes=('_forecast', '_historical')
    )

    rmse_merged = pd.merge(
        forecast_dfs[1], 
        historical_dfs[1], 
        on=['site_id', 'date', 'metric'], 
        suffixes=('_forecast', '_historical')
    )

    rmse_merged = pd.merge(
        rmse_merged, 
        naive_df, 
        on=['site_id', 'date', 'metric'], 
    )
    # Calculate percent improvement for each metric
    crps_merged['difference_historical_ml_crps'] = (
        crps_merged['value_forecast'] - crps_merged['value_historical']
    )
    
    rmse_merged['difference_historical_ml_rmse'] = (
        rmse_merged['value_forecast'] - rmse_merged['value_historical'] 
    ) 
    
    rmse_merged['difference_naive_ml_rmse'] = (
        rmse_merged['value_forecast'] - rmse_merged['value_naive'] 
    )

    rmse_merged['difference_naive_historical_rmse'] = (
        rmse_merged['value_historical'] - rmse_merged['value_naive']
    )

    # Deleting unnecessary columns
    rmse_merged = rmse_merged.drop(rmse_merged.filter(like='model').columns, axis=1)
    rmse_merged = rmse_merged.drop(rmse_merged.filter(like='value').columns, axis=1)
    crps_merged = crps_merged.drop(crps_merged.filter(like='model').columns, axis=1)
    crps_merged = crps_merged.drop(crps_merged.filter(like='value').columns, axis=1)

    # Joining the two df's along site id and date then adding a combined improvement column
    # for comparison against the climatology model
    merged_df = pd.merge(crps_merged, rmse_merged, on=['site_id', 'date'], how='inner')
    merged_df = merged_df.drop(merged_df.filter(like='metric').columns, axis=1)
    merged_df['model'] = model

    return merged_df

In [None]:
import warnings

targets = pd.read_csv("targets.csv.gz")
score_improvement_bysite('BlockRNN', targets, 'oxygen', suffix='default', )

> [0;32m/tmp/ipykernel_268510/3853195171.py[0m(37)[0;36mscore_improvement_bysite[0;34m()[0m
[0;32m     35 [0;31m[0;34m[0m[0m
[0m[0;32m     36 [0;31m    [0;31m# Using the mean CRPS score over the forecast horizon[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m    [0mdf[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mgroupby[0m[0;34m([0m[0;34m[[0m[0;34m'site_id'[0m[0;34m,[0m [0;34m'date'[0m[0;34m,[0m [0;34m'metric'[0m[0;34m,[0m [0;34m'model'[0m[0;34m][0m[0;34m)[0m[0;34m.[0m[0mmean[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mreset_index[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m[0;34m[0m[0m
[0m[0;32m     39 [0;31m    [0;31m# Creating a CRPS and RMSE dataframe separately which is definitely[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  df


      site_id        date metric       model     value     t
0        ARIK  2022_09_18   crps    forecast  2.889973  17.0
1        ARIK  2022_09_18   crps    forecast  3.725640  18.0
2        ARIK  2022_09_18   crps    forecast  4.316091  19.0
3        ARIK  2022_09_18   crps    forecast  4.640427  20.0
4        ARIK  2022_09_18   crps    forecast  4.422085  21.0
...       ...         ...    ...         ...       ...   ...
19075    WLOU  2023_06_15   crps  historical  0.056116  29.0
19076    WLOU  2023_06_15   crps  historical  0.069429  30.0
19077    WLOU  2023_06_15   rmse    forecast  0.537986   NaN
19078    WLOU  2023_06_15   rmse  historical  0.197400   NaN
19079    WLOU  2023_06_15   rmse       naive  0.595755   NaN

[19080 rows x 6 columns]


In [8]:
from plotting_utils import get_validation_series

get_validation_series(
        targets, 
        'POSE', 
        'oxygen', 
        '2022-08-19', 
        30,
    )