## ...

In [1]:
import xlrd
import os, sys
import re
import calendar
from glob import glob
import yaml

from functools import partial
from itertools import cycle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# plt.style.use('_classic_test')
# plt.style.use('seaborn-paper')
# plt.style.use('seaborn-pastel')
# plt.style.use('ggplot')


In [2]:
# Load the project settings once; the paths and time parameters used by this
# notebook are all read from CONFIG.
# NOTE(review): FullLoader is more permissive than the safe loader. For a plain
# settings file yaml.safe_load is normally sufficient - confirm that
# configurations.yaml uses no Python-specific tags before switching.
with open('configurations.yaml', 'r') as stream:
    CONFIG = yaml.load(stream, Loader=yaml.FullLoader)

In [3]:
# Input and output folders of the observed series (temperature, precipitation).
# NOTE(review): the key is spelled 'percepitation'; it presumably mirrors the
# spelling used in configurations.yaml - confirm before renaming either side.
temp_dataset_path = CONFIG['observations']['temperature']['dataset_path']
temp_output_path = CONFIG['observations']['temperature']['output_path']
prec_dataset_path = CONFIG['observations']['percepitation']['dataset_path']
prec_output_path = CONFIG['observations']['percepitation']['output_path']

# Input and output folders of the model runs.
models_dataset_path = CONFIG['models']['dataset_path']
models_output_path = CONFIG['models']['output_path']

# Default folder used by save_plot.
plots_output_path = CONFIG['plots']['output']

# Time axis of the model series: the start/end/freq values are handed to
# pd.date_range in get_data_models.
models_start_year = CONFIG['models']['time']['start']['year']
models_hour_interval = CONFIG['models']['time']['hour_interval']
models_start_timestamp = CONFIG['models']['time']['timestamp']['start']
models_end_timestamp = CONFIG['models']['time']['timestamp']['end']
models_freq_timestamp = CONFIG['models']['time']['timestamp']['freq']

# Name of the temperature file read (with np.loadtxt) from every model folder.
models_temp_filename = CONFIG['models']['variables']['temperature']['filename']

In [4]:
# seasons_dict: season name -> month numbers (the values are passed to
# Series.isin(...) by the season filters; the keys label the 'season' column).
seasons_dict = CONFIG['dates']['seasons_dict']
# period: text that a model folder name must contain to be selected
# (see select_models_folders).
period = CONFIG['period']['historical']

## Read observations

In [5]:
# OPTIMIZE!
# REDO


def get_xls_files(variable, obs_path):
    '''
        Collect the yearly observation workbooks of one variable.

        variable: file-name prefix of the variable, 'temp' or 'prec'
        obs_path: folder that holds the .xls files; it is joined to the
            current working directory (an absolute path is used as is)

        Returns a dict of all matching file paths in the following format
            file_path: (variable, year)
            item example: '..\\data\\observations\\PREC1979.XLS': ('prec', '1979')
        Two-digit years are expanded to 19yy.
    '''
    # os.path.join supplies the platform separator; the character classes
    # keep the extension match case-insensitive on case-sensitive file systems.
    glob_path = os.path.join(os.getcwd(), obs_path, '*.[xX][lL][sS]')
    print(glob_path)

    # <variable><optional blanks><digits>.xls - whole file name, any letter case
    file_pattern = re.compile(rf'{variable}\s*(?P<year>\d*)\.xls$', flags=re.IGNORECASE)

    files = {}
    for file_path in glob(glob_path):
        found = file_pattern.match(os.path.basename(file_path))
        if found is None:
            continue
        year = found.group('year')
        if len(year) == 2:          # short form 'yy' -> '19yy'
            year = '19' + year
        files[file_path] = (variable, year)
    return files


def get_data_from_xls(filename_path, year):
    '''
        Parse one yearly observation workbook into a table with one row per day.

        The first column of the sheet is scanned top to bottom. A cell that
        starts with the next expected month label ('jan', 'fev', ... 'dez',
        any letter case) opens that month; a cell holding a day number (1-31)
        is a data row whose next 24 cells are the values (presumably one per
        hour - confirm against the workbooks). Rows with an empty first cell
        are ignored.

        filename_path: path of the Excel file
        year: value written to the 'year' column of every returned row

        Returns a DataFrame with columns ['year', 'month', 'day', 1, ..., 24].
        Raises ValueError when a day row shows up before the first month label.
    '''
    columns = ['month', 'day'] + list(range(1, 25))

    # the context manager closes the workbook handle once the sheet is read
    with pd.ExcelFile(filename_path) as xls:
        df_raw = pd.read_excel(xls, header=None, usecols=range(0, 25))
    # clean rows with no data (first col empty, must have a day or a month)
    df_raw = df_raw.dropna(axis=0, how='all', subset=[0])

    # Month labels are consumed once, in calendar order; the trailing 'stop'
    # word is a sentinel that ends the parsing if it is ever met.
    expected_labels = ['jan', 'fev', 'mar', 'abr', 'mai', 'jun', 'jul', 'ago', 'set', 'out', 'nov', 'dez', 'stop']
    label_patterns = [re.compile(label, flags=re.IGNORECASE) for label in expected_labels]
    sentinel = len(expected_labels) - 1

    # regex for 1-31
    day_pattern = re.compile(r'(?P<day>([1-9]|[12]\d|3[01]))$')

    next_label = 0           # position in expected_labels of the label awaited
    current_month = None     # month number (1-12) of the rows being read
    records = []             # rows are gathered here and turned into a frame once
    for _, row in df_raw.iterrows():
        label = row.iloc[0]
        text = str(label)
        if label_patterns[next_label].match(text):
            if next_label == sentinel:
                break
            current_month = next_label + 1
            next_label += 1
        if day_pattern.match(text):
            if current_month is None:
                raise ValueError('day row found before the first month label in {}'.format(filename_path))
            records.append([current_month, label] + row.iloc[1:].tolist())

    df_temp = pd.DataFrame(records, columns=columns)
    df_temp['year'] = year
    return df_temp[['year'] + columns]



# def get_all_obs_data_old(dict_files, variable, out_dir = None):
#     '''
#         Returns a df with all temperature series
#         Saves the df to  ..\\data\\observations\\output\\
#     '''
#     # order file paths by year, ascending.
#     files_years = sorted([(path,year[1]) for path,year in dict_files.items()], key = lambda v: v[1])

#     columns = columns = ['year', 'month', 'day', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
#     df_data = pd.DataFrame(columns = columns)

#     for file_path,year in files_years:
#         df_temp = get_data_from_xls(file_path, year)
#         assert (df_temp.shape[0] <= 366) and (df_temp.shape[0] >= 365), '365 or 366 days in file'
#         print('file_path - {}\tyear - {}\tlen: {}'.format(file_path, year, df_temp.shape[0]))
#         df_data = df_data.append(df_temp)

#     # fix dtypes    
#     dict_types = {col:'float' for col in df_data.columns}
#     dict_types['year'], dict_types['month'], dict_types['day']  = 'int', 'int', 'int'
#     df_data = df_data.astype(dict_types)
    
#     # save dataframe to csv
#     filename = '{}_{}_{}.csv'.format(variable, files_years[0][1], files_years[-1][1])
#     save_df2csv(df_data, filename, out_dir)
#     return df_data.reset_index(drop=True)




def get_all_obs_data(dict_files, variable, out_dir = None):
    '''
        Read every yearly workbook and return one df with all the series.

        dict_files: {file_path: (variable, year)} as built by get_xls_files
        variable: prefix of the csv file name that is written
        out_dir: folder handed to save_df2csv for the csv copy
            NOTE(review): the default None is passed on unchanged and
            save_datafile joins it to a path - callers presumably always
            give a folder; confirm.

        Numbers are rounded to 2 decimal places
        Saves the df as '<variable>_<first year>_<last year>.csv'
        Raises ValueError when dict_files is empty.
    '''
    if not dict_files:
        raise ValueError('dict_files is empty: there is no observation file to read')

    # order file paths by year, ascending.
    files_years = sorted(((path, pair[1]) for path, pair in dict_files.items()), key = lambda v: v[1])

    columns = ['year', 'month', 'day'] + list(range(1, 25))

    yearly_frames = []
    for file_path,year in files_years:
        df_temp = get_data_from_xls(file_path, year)
        # sanity check on the parser output: a full (leap) year of days
        assert 365 <= df_temp.shape[0] <= 366, '365 or 366 days in file'
        print('file_path - {}\tyear - {}\tlen: {}'.format(file_path, year, df_temp.shape[0]))
        yearly_frames.append(df_temp)

    # a single concat instead of growing the frame file by file
    df_data = pd.concat(yearly_frames)[columns]

    # fix dtypes
    dict_types = {col:'float' for col in df_data.columns}
    dict_types['year'], dict_types['month'], dict_types['day']  = 'int', 'int', 'int'
    df_data = df_data.astype(dict_types)
    # round to 2 decimal places
    df_data = df_data.round(decimals = {col:2 for col in columns[3:]})

    # save dataframe to csv
    filename = '{}_{}_{}.csv'.format(variable, files_years[0][1], files_years[-1][1])
    save_df2csv(df_data, filename, out_dir)
    return df_data.reset_index(drop=True)



## Read Models

In [6]:
# def make_dates4models(ndata, year_ini = models_start_year,
#                       hours_interval = models_hour_interval):
#     '''
#         ndata: number of data points
#         year_ini = 1971
#         hours_interval = 3
#     '''
#     year, month, day, hour = year_ini, 1, 1, hours_interval          # first hour is 3 (data is every 3 hours)
#     DataV=[]
#     for t in range(0, ndata):
#         DataV.append([year,month,day,hour])
#         hour  += 3
#         if hour > 21:
#             hour = 0
#             day += 1
#             if day > calendar.monthrange(year, month)[1]:
#                 day = 1
#                 month += 1
#                 if month > 12:
#                     month = 1
#                     year += 1
#     DataV = [[year_ini,    1,    1,    0]] + DataV
#     return np.array(DataV)

# #assert make_dates4models(102271).shape == (102272, 4)
# # d = make_dates4models(102271)
# # d[:10], d[-10:]
# # array([[1971,    1,    1,    0],
# #         [1971,    1,    1,    3],
# #         [1971,    1,    1,    6],
# #         [1971,    1,    1,    9],
# #         [1971,    1,    1,   12],
# #         [1971,    1,    1,   15],
# #         [1971,    1,    1,   18],
# #         [1971,    1,    1,   21],
# #         [1971,    1,    2,    0],
# #         [1971,    1,    2,    3]
# #       ...
# #         [2005,   12,   30,   18],
# #         [2005,   12,   30,   21],
# #         [2005,   12,   31,    0],
# #         [2005,   12,   31,    3],
# #         [2005,   12,   31,    6],
# #         [2005,   12,   31,    9],
# #         [2005,   12,   31,   12],
# #         [2005,   12,   31,   15],
# #         [2005,   12,   31,   18],
# #         [2005,   12,   31,   21]])


def select_models_folders(dir_models = models_dataset_path, period = period):
    '''
        List the model folders of one period.

        dir_models: folder (joined to the current working directory) whose
            entries are scanned
        period: text that an entry name must contain to be kept

        Returns (names of the kept entries, how many were kept)
    '''
    models_root = os.path.join(os.getcwd(), dir_models)
    folders = []
    for entry in os.listdir(models_root):
        if period in entry:
            folders.append(entry)
    return folders, len(folders)

In [7]:
def add_season2df(df):
    '''
        Return a copy of df with a 'season' column inserted as second column.

        df: frame with a 'month' column
        The months are grouped as (12, 1, 2), (3, 4, 5), (6, 7, 8) and
        (9, 10, 11) and labelled with the keys of seasons_dict, taken in
        that order. The frame passed in is not modified.
    '''
    month_groups = ([12, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11])
    conditions = [df['month'].isin(group) for group in month_groups]
    # np.select wants a real list of choices; the string default keeps the
    # choices and the fallback on one dtype (newer NumPy releases refuse to
    # mix string choices with the integer default 0).
    seasons = np.select(conditions, list(seasons_dict.keys()), default='0')
    cols = df.columns.insert(1, 'season')
    return df.assign(season=seasons)[cols]

In [8]:
def get_data_models(dir_models = models_dataset_path, 
                    var_filename = models_temp_filename, 
                    start_timestamp = models_start_timestamp, end_timestamp = models_end_timestamp,
                    freq_timestamp = models_freq_timestamp, plot = False):
    '''
        Load the series of every selected model into one table.
        For temperature only!! Conversion to C (273.15 is subtracted)

        dir_models: folder with one sub-folder per model
        var_filename: name of the text file read in every model folder
        start_timestamp, end_timestamp, freq_timestamp: definition of the
            common time axis (passed to pd.date_range)
        plot: when True, one figure per model plus a figure with all models

        Returns a DataFrame indexed by 'date' with one column per model,
        named by the position of the model folder (0, 1, ...).
    '''
    folders, _ = select_models_folders(dir_models, period)
    # One time axis shared by all the models. The end comes from the
    # end_timestamp argument (the module-level default used to be read
    # directly, so the argument had no effect); 'dayfirst' is not a
    # date_range parameter and was dropped.
    time_index = pd.date_range(start = start_timestamp,
                               end = end_timestamp,
                               freq = freq_timestamp)
    color = cycle('rbgkmc')
    res = []                        # to accumulate the dataseries
    for model_id, folder in enumerate(folders):
        values = np.loadtxt(os.path.join(dir_models, folder, var_filename)) - 273.15
        T2m = pd.Series(values, name = model_id, index = time_index)
        if plot:
            T2m.plot(figsize=(18,8), color=next(color))
            plt.title('Model: {}'.format(model_id))
            plt.show()
        res.append(T2m)

    res = pd.concat(res, axis = 1)
    res.index.name = 'date'

    if plot:
        res.plot(figsize=(18,8))
        plt.tight_layout()
        plt.show()
    return res

## Save and load prepared data

In [9]:
def save_datafile(save_fn, filename, output_dir):
    '''
        Run save_fn on '<cwd>/<output_dir>/<filename>', creating the folder
        when it is missing.

        save_fn: callable invoked as save_fn(path_or_buf=<target path>)
        filename: defines what is computed, ex. temp_sea_avg_obs;
            '.csv' is appended when the name does not end with it
        output_dir: target folder, joined to the current working directory
    '''
    out_dir = os.path.join(os.getcwd(), output_dir)
    folder_missing = not os.path.exists(out_dir)
    if folder_missing:
        print('Output folder does not exist. Created a new one: {}'.format(out_dir))
        os.makedirs(out_dir)
    csv_name = filename if filename[-4:] == '.csv' else filename + '.csv'
    target = os.path.join(os.getcwd(), out_dir, csv_name)
    save_fn(path_or_buf = target)
    
# def save_df2csv_old(df, filename, output_dir, index=False):
#     '''
        
#     '''
#     # to get other args when called
#     save_fn = partial(df.to_csv, sep=',', header=True, index=index, date_format='%Y-%m-%d %H')
#     return save_datafile(save_fn, filename, output_dir)

def save_df2csv(df, filename, output_dir, index=False):
    '''
        Write df as a comma separated file through save_datafile.

        df: frame to save
        filename, output_dir: forwarded to save_datafile
        index: whether the index is written too
        Dates are written as '%Y-%m-%d %H:%M:%S'.
    '''
    def write_csv(path_or_buf):
        # save_datafile supplies the final path through this keyword
        return df.to_csv(path_or_buf=path_or_buf, sep=',', header=True,
                         index=index, date_format='%Y-%m-%d %H:%M:%S')

    return save_datafile(write_csv, filename, output_dir)


# def save_nparray2npy(data, filename, output_dir = None):
#     if not output_dir:
#         print('arg: output_dir is necessary.\nTry one of these:\n{}\n{}'.format(temp_output_path, prec_output_path, models_output_path))
#     save_fn = partial(save_nparray, array_data = data)
#     return save_datafile(save_fn, filename, output_dir)




# def save_nparray(array_data = None, path_or_buf = None):
#     path_or_buf = path_or_buf +'.npy'
#     with open(path_or_buf, 'wb') as f:
#         np.save(f, array_data)
#         print('\nNumpy array saved to file: {}'.format(path_or_buf))



def save_plot(plot, filename, output_dir = plots_output_path):
    '''
        Save a figure as '<cwd>/<output_dir>/<filename>', creating the folder
        when it is missing.

        plot: object with a savefig method (the pyplot module is what the
            plotting helpers of this file pass)
        filename: defines what is computed, ex. temp_sea_avg_obs
        output_dir: target folder, joined to the current working directory
    '''
    out_dir = os.path.join(os.getcwd(), output_dir)
    if not os.path.exists(out_dir):
        print('Output folder does not exist. Created a new one: {}'.format(out_dir))
        os.makedirs(out_dir)
    path_or_buf = os.path.join(out_dir, filename)
    # 'papertype' and 'frameon' are no longer accepted by savefig in current
    # matplotlib; both were passed as None, so leaving them out keeps the
    # output the same on older releases.
    plot.savefig(path_or_buf,
                 dpi=None,
                 facecolor='w',
                 edgecolor='w',
                 orientation='portrait',
                 format=None,
                 transparent=False,
                 bbox_inches=None,
                 pad_inches=0.1,
                 metadata=None)
    

In [10]:
# # Load np array
# def load_nparray_file(in_filename_path):
#     with open(in_filename_path, 'rb') as f:
#         data = np.load(f)
#     return data

# def load_nparray_models(in_filename):
#     in_dir = os.path.join(os.getcwd(), models_output_path)
#     return load_nparray_file(os.path.join(in_dir, in_filename))



# TO DELETE!!!!!
def load_csv2df_old(filename, filepath):
    '''
        Legacy csv loader (marked for deletion; load_csv2df supersedes it).

        filename, filepath: the file read is '<cwd>/<filepath>/<filename>'
        Returns the DataFrame; 'year', 'month' and 'day' are read as int32
        and column names that look like integers are converted to int.
    '''
    def conv_to_int(c):    # to guarantee that the hours' column names are type int
        try:
            return int(c)
        except (TypeError, ValueError):
            return c

    date_dtype = {'year':np.int32, 'month':np.int32, 'day':np.int32}
    filename_path = os.path.join(os.getcwd(), filepath, filename)
    # the keyword is 'dtype' and the mapping defined above is the one to use
    # (the undefined name 'hours_dtype' made every call fail)
    df = pd.read_csv(filename_path, sep=',', parse_dates=True, dtype = date_dtype)
    df.columns = [conv_to_int(col) for col in df.columns]
    return df



def load_csv2df(filename, filepath, freq_index=None):
    '''
        Load a csv written by save_df2csv.

        filename, filepath: the file read is '<cwd>/<filepath>/<filename>'
        freq_index: if it is a timeseries (a 'date' column exists) and a
            'freq_index' value is passed, 'date' becomes a DatetimeIndex
            with that frequency

        Returns the DataFrame; 'year', 'month' and 'day' are read as int32
        and column names that look like integers are converted to int.
    '''
    def conv_to_int(c):    # to guarantee that the hours' column names are type int
        try:
            return int(c)
        except (TypeError, ValueError):    # not a number: keep the name as is
            return c

    filename_path = os.path.join(os.getcwd(), filepath, filename)
    date_dtype = {'year':np.int32, 'month':np.int32, 'day':np.int32}
    df = pd.read_csv(filename_path, sep=',', parse_dates=True, dtype = date_dtype)

    df.columns = [conv_to_int(col) for col in df.columns]
    if ('date' in df.columns) and (freq_index):
        df.set_index('date', inplace=True) # if it is a timeseries 'date' is the index
        df.index = pd.DatetimeIndex(df.index.values, freq=freq_index)
        df.index.name = 'date'
    return df



def compare_saved_loaded_df(df_, df):
    '''
        Check that two frames hold the same table: same index, same columns
        and numerically close values (NaN matches NaN).

        Returns True; an AssertionError is raised on the first difference.
    '''
    for left, right in ((df_.index, df.index), (df_.columns, df.columns)):
        assert left.equals(right)
    assert np.allclose(df_.values, df.values, equal_nan=True)
    return True

## Pre-processing

In [11]:
# # OPTIMIZE!

# def obs_avg_3h(df):
#     df_obs = df.iloc[:, 0:3]  # use Dates
#     for c in range(0, 22, 3):
#         df_obs[c] = df.iloc[:, c+3:c+3+3].mean(axis=1)
#     return df_obs.round(decimals=2)                              # decimals=2

In [12]:
# def make_timeseries_format(df, existing_cols, new_col_name, var_name):
#     '''
#         df
#         existing_cols: list
#         new_col_name: string
#         var_name:string
#         Example:header = [year	month	day	0	3	6	9	12	15	18	21]
#                 new_header = [year	month	day	hour	temp_3h]
#     '''
#     df_temp = add_season2df(df_temp)
#     df_temp = pd.melt(df, id_vars=existing_cols, var_name=new_col_name, value_name=var_name)
#     # 'hour' must be int
#     df_temp['hour'] = df_temp['hour'].astype(int)
#     df_temp.reset_index(drop=True, inplace=True)
#     print(df_obs.iloc[327634:327636,:])
#     df_temp['date'] = df_temp.apply(lambda s: pd.datetime(*map(int, (s.year, s.month, s.day, s.hour)))
#                                   , axis=1)
#     df_temp = df_temp.set_index('date')
#     df_temp.drop(['year','month','day','hour'], axis=1, inplace=True)
#     df_temp.sort_index(inplace=True)
#     return df_temp


#### TO DELETE
# def obs_ts_format_old(df, existing_cols, new_col_name, var_name):
#     '''
    
#     '''
#     df_temp = df.copy(deep=True)
#     df_temp = pd.melt(df_temp, id_vars=existing_cols, var_name=new_col_name, value_name=var_name)

#     df_temp = add_season2df(df_temp)                  # add season column
#     df_temp['hour'] = df_temp['hour'].astype(int)     # 'hour' must be int
#     df_temp.reset_index(drop=True, inplace=True)
    
#     df_temp['date'] = df_temp.apply(lambda s: pd.datetime(*map(int, (s.year, s.month, s.day, s.hour)))
#                                   , axis=1)
#     df_temp = df_temp.set_index('date')
#     df_temp.drop(['year','month','day','hour'], axis=1, inplace=True)
#     df_temp.sort_index(inplace=True)
#     return df_temp


def obs_ts_format(df, existing_cols, new_col_name, var_name, freq_index='3H'):
    '''
        Turn a table with one column per hour into a time series.

        df: frame with 'year', 'month', 'day' and one column per hour
        existing_cols: columns kept as identifiers by the melt; they must
            include 'year', 'month' and 'day'
        new_col_name: name given to the melted hour column (ex. 'hour')
        var_name: name of the value column of the result (ex. 'temp_3h')
        freq_index: frequency the result is conformed to with asfreq

        Returns a frame indexed by 'date' that holds the 'season' column and
        the value column (plus any other identifier column).
    '''
    # melt builds a new frame, the caller's df is left untouched
    df_temp = pd.melt(df, id_vars=existing_cols, var_name=new_col_name, value_name=var_name)

    df_temp = add_season2df(df_temp)                                  # add season column
    df_temp[new_col_name] = df_temp[new_col_name].astype(int)         # hour must be int
    df_temp.reset_index(drop=True, inplace=True)

    # Vectorised assembly of the timestamps from their parts; to_datetime
    # expects the parts under the names year/month/day/hour.
    date_parts = df_temp[['year', 'month', 'day', new_col_name]].rename(columns={new_col_name: 'hour'})
    df_temp['date'] = pd.to_datetime(date_parts)

    df_temp = df_temp.set_index('date')
    df_temp.drop(['year', 'month', 'day', new_col_name], axis=1, inplace=True)
    df_temp.sort_index(inplace=True)
    df_temp = df_temp.asfreq(freq_index)
    return df_temp

In [13]:
def filter_df_by_season(df, season = None, month_col='month'):
    '''
        Keep the rows of df whose month belongs to a season.

        df: frame with a month column
        season: key of seasons_dict
        month_col: name of the month column

        Returns a new frame; df itself is not modified.
    '''
    in_season = df[month_col].isin(seasons_dict[season])
    # copy after filtering: only the kept rows are duplicated
    return df.loc[in_season, :].copy()

# def filter_df_by_hour_old(df, hour = None, hour_col='hour'):
#     '''
    
#     '''
#     df = df.copy()
#     return df.loc[df['hour'] == hour, :]


def filter_ts_by_hour(df, hour = None):
    '''
        Keep the rows of a time-indexed frame that fall on a given hour.

        df: frame (or series) whose index exposes .hour
        hour: hour of the day to keep
    '''
    snapshot = df.copy()
    on_hour = snapshot.index.hour == hour
    return snapshot[on_hour]



def filter_ts_by_season(ts, season = None):
    '''
        Keep the rows of a time-indexed frame whose month belongs to a season.

        ts: frame whose index exposes .month
        season: key of seasons_dict
    '''
    snapshot = ts.copy()
    season_months = seasons_dict[season]
    in_season = snapshot.index.month.isin(season_months)
    return snapshot.loc[in_season, :]


def df_all_models(list_df_models):
    '''
        Stack the frames of several models on top of each other.

        list_df_models: list of frames with the same columns
        Returns one frame whose first column, 'model', is the position of
        the source frame in the list, followed by the columns of the first
        frame. The frames passed in are not modified.
    '''
    # a 'model' column already present in the input is replaced, not duplicated
    cols = [col for col in list_df_models[0].columns if col != 'model']
    tagged = [model.assign(model=cmod) for cmod, model in enumerate(list_df_models)]
    df = pd.concat(tagged, axis=0)
    return df[['model', *cols]]




# def join_obs2models(df_obs = None, df_models = None,
#                     obs_file = None, models_file = None, 
#                     obs_path = None, models_path = None):
#     '''
#         Prepare data for computation of differences between observations and models
#         For both dataframes and csv files the data must be "by_hour" kind!
    
#     '''
#     # get observations and model dataframes
#     if not isinstance(df_obs, pd.DataFrame):
#         try:
#             df_obs = load_csv2df(obs_file, obs_path if obs_path else temp_output_path)
#         except:
#             print('You must specify a valid observations filename and path')
#     if not isinstance(df_models, pd.DataFrame):
#         try:
#             df_models = load_csv2df(models_file, models_path if models_path else models_output_path)
#         except:
#             print('You must specify a valid models filename and path')
    
#     # prepare observations df
#     df_obs = add_season2df(df_obs)
#     ts_obs = make_timeseries_format(df_obs, ['year', 'season', 'month', 'day'],
#                            'hour', 17)
#     key = ['year','season', 'month', 'day', 'hour']
#     ts_obs = ts_obs.set_index(key)
    
#     # prepare models dataframe
#     # columns 'mod_obs' will hold the models' numbers and the observation number = 17
#     df_models = df_models.rename(columns = {'model':'mod_obs'})
#     # all variables in 'index' and only temperature as column/time series
#     ts_models = make_timeseries_format(df_models, ['mod_obs', 'year', 'season', 'month', 'day'],
#                            'hour', 'temp_3h')
#     # transform table to put models in column
#     df_by_model = ts_models.pivot_table(values='temp_3h', columns='mod_obs', index=key)
#     # joinning
#     df_by_model = df_by_model.join(ts_obs,  lsuffix='L', rsuffix='R', how='inner')
#     df_by_model.sort_index(inplace=True)
#     df_by_model.reset_index(inplace=True)
#     return df_by_model

## Processing 

In [14]:
def percentiles_table(df, values, by = 'hour', percentiles = [1,5,25,50,75,95,99]):
    '''
        Returns the percentiles of a value by a specific column (by)
        Organizes a timeseries in tabular form for a specific column
        df header: year	month	day	hour	temp_3h

        values: name of the value column, example 'temp_3h'
        by: column whose distinct values become the rows of the result
        percentiles: percentiles to compute, in percent (0-100)

        Returns a frame indexed by 'by' with the mean, the min, the
        requested percentiles and the max of 'values'.
    '''
    index = [col for col in df.columns if col not in [values, by]]
    # pivot the frame that was passed in (a notebook global used to be read here)
    table = df.pivot_table(values = values, columns = by, index = index)
    table = table.reset_index(drop=True)
    df_desc = table.describe(percentiles=np.asarray(percentiles)/100).T
    df_desc.reset_index(inplace=True)
    df_desc.rename(columns={'index': by}, inplace=True)
    df_desc.drop(['count', 'std'], axis=1, inplace=True)
    return df_desc.set_index(by)

## Error metrics

In [15]:
def fn_models_obs(metric, df_data, model_cols, observation_col):
    '''
        Apply metric(model_series, obs=observed_series) to every model column.

        metric: callable taking a column and the keyword 'obs'
        df_data: frame holding the model columns and the observation column
        model_cols: list of model column names
        observation_col: name of the observation column
    '''
    models = df_data[model_cols]
    observed = df_data[observation_col]
    return models.apply(metric, axis=0, obs=observed)

##### Bias 

In [16]:
def diff_(s, obs):
    '''
        Mean of (obs - s): the average amount by which the observations
        exceed the series s.
    '''
    gap = obs - s
    return gap.mean()

  
def bias(df, model_cols, observation_col):
    '''
        Bias of every model column against the observations:
        diff_ (mean of obs - model) applied through fn_models_obs.
    '''
    return fn_models_obs(metric=diff_,
                         df_data=df,
                         model_cols=model_cols,
                         observation_col=observation_col)

##### S score - Perkins Skill score 

In [17]:
def pdf(series, bins = None):
    '''
        pdf for pandas dataSeries, using a numpy array func

        series: the sample
        bins: bin specification handed to np.histogram
        returns the density of each bin (the bin edges are discarded)
    '''
    histogram = np.histogram(series, bins = bins, density=True)
    return histogram[0]


def minimun(s, obs):
    '''
        Element-wise minimum of s and obs (np.minimum).
    '''
    smaller = np.minimum(s, obs)
    return smaller

def perkins_skill_score(df, model_cols, obs_col, bins=None):
    '''
        Perkins skill score of every model column against the observations:
        the part of the two binned distributions that overlaps, in percent.

        df: frame holding the model columns and the observation column
        model_cols: list of model column names
        obs_col: name of the observation column
        bins: sequence of bin edges; range(0, 51) when omitted

        Returns a Series indexed by model column.
    '''
    if bins is None:          # 'not bins' is ambiguous for array-like edges
        bins = range(0, 51)   # 0ºC to 50ºC
    series_cols = list(model_cols) + [obs_col]
    list_pdfs = [pdf(df[col], bins) for col in series_cols]
    # one row per bin, one column per series, labelled with the real column
    # names so that any number of models works
    df_pdf = pd.DataFrame(list_pdfs, index=series_cols, columns=bins[:-1]).T

    # density * bin width = share of the sample in the bin (the width is 1
    # with the default edges, which leaves the default result unchanged)
    bin_widths = np.diff(np.asarray(bins))
    overlap = fn_models_obs(minimun, df_pdf, model_cols, obs_col)
    return overlap.mul(bin_widths, axis=0).sum(axis=0)*100

##### Sigma score - Normalized standard deviation measure

In [18]:
def sigma_(s, obs):
    '''
        Standard deviation of s divided by the standard deviation of obs.
    '''
    spread_series = s.std()
    spread_obs = obs.std()
    return spread_series / spread_obs


def sigma_score(df, model_cols, obs_col):
    '''
        Normalized standard deviation of every model column:
        sigma_ (std of the model / std of the observations) applied through
        fn_models_obs.
    '''
    return fn_models_obs(metric=sigma_,
                         df_data=df,
                         model_cols=model_cols,
                         observation_col=obs_col)

##### YK skewness - Yule-Kendall skewness

In [19]:
def YK_old(df, model_cols, obs_col):
    '''
        Yule-Kendall skewness of every model column and of the observation
        column: ((p95 - p50) - (p50 - p5)) / (p95 - p5).

        Returns a Series indexed by column name.
    '''
    selected = df[model_cols+[obs_col]]
    # rows 4 to 6 of describe() are the three requested percentiles
    quantiles = selected.describe(percentiles=[0.05, 0.5, 0.95]).iloc[4:7, :]
    low = quantiles.loc['5%', :]
    mid = quantiles.loc['50%', :]
    high = quantiles.loc['95%', :]
    return ((high - mid) - (mid - low)) / (high - low)

In [20]:
def YK_(df, model_cols, obs_col):
    '''
        Yule-Kendall skewness, column by column, for the model columns and
        the observation column:
            ((p95 - p50) - (p50 - p5)) / (p95 - p5)

        Returns a Series indexed by column name.
    '''
    stats = df[model_cols+[obs_col]].describe(percentiles=[0.05, 0.5, 0.95]).iloc[4:7, :]
    p05, p50, p95 = stats.loc['5%', :], stats.loc['50%', :], stats.loc['95%', :]
    upper_tail = p95 - p50
    lower_tail = p50 - p05
    return (upper_tail - lower_tail) / (p95 - p05)


def yk_diff(s, obs):
    '''
        Difference s - obs (model value minus observed value).
    '''
    difference = s - obs
    return difference

def YK_skewness_by_hour(df, model_cols, obs_col):
    '''
        Yule-Kendall skewness of every model minus that of the observations,
        hour by hour.

        df: time-indexed frame holding the model and observation columns
        model_cols: list of model column names
        obs_col: name of the observation column

        Returns a frame with one row per model and one column per hour.
    '''
    df_yk = metric_by_hour(YK_, df, model_cols, obs_col)
    # The rows of df_yk are labelled with model_cols + [obs_col], so the
    # function's own arguments select them (notebook-level constants were
    # used here before, which tied the function to one column layout).
    return df_yk.loc[model_cols, :].apply(yk_diff, obs = df_yk.loc[obs_col, :], axis=1)

In [21]:
def metric_by_hour(metric, df, model_cols, obs_col):
    '''
        Compute a metric separately for the hours 0, 3, ..., 21.

        metric: callable taking (frame, model_cols, obs_col)
        df: time-indexed frame, filtered hour by hour with filter_ts_by_hour

        Returns the per-hour results side by side, one column per hour.
    '''
    hours = list(range(0,22,3))
    per_hour = [metric(filter_ts_by_hour(df, hour), model_cols, obs_col) for hour in hours]
    return pd.concat(per_hour, axis=1, keys=hours)

#### pdf, cdf, pmf...

In [22]:
def compute_pdf_cdf(sample, plot=True):
    '''
        Build the histogram-based PDF and ECDF of a sample.
        Visualize the interpolated PDF and the samples when plot is True.

        sample: 1-d array-like of numbers
        plot: draw the histogram and the PDF curve

        Returns (pdf, cdf), two callables of the fitted histogram
        distribution. The histogram uses int(sqrt(len(sample))) bins.
    '''
    # local import: the import cell of the notebook does not provide 'st'
    from scipy import stats as st

    bins = int(np.sqrt(len(sample)))
    hist = np.histogram(sample, bins = bins)
    hist_dist = st.rv_histogram(hist)

    sample_min, sample_max = np.min(sample), np.max(sample)
    density_fn, Ecdf = hist_dist.pdf, hist_dist.cdf
    # sanity checks, with a tolerance because the ECDF is a float computation
    assert np.isclose(Ecdf(sample_min), 0), 'Something wrong with ECDF!'
    assert np.isclose(Ecdf(sample_max), 1), 'Something wrong with ECDF!'

    if plot:
        X = np.linspace(sample_min, sample_max, bins)
        plt.title("PDF")
        _ = plt.hist(sample, density = True, bins = bins)
        _ = plt.plot(X, density_fn(X), label = 'PDF')
        #_ = plt.plot(X, Ecdf(X), label = 'CDF')
        plt.legend()
        plt.margins(0.02)
        plt.show()
    return density_fn, Ecdf


def compute_pdf_KDE(sample, bandwidth=2, kernel='gaussian', plot=True):
    '''
        Fit a kernel density estimate to a sample.

        sample: 1-d numpy array (it is reshaped to one column)
        bandwidth, kernel: passed to KernelDensity
        plot: draw the histogram of the sample and the estimated density

        Returns the score_samples method of the fitted model (the plot
        applies np.exp to its output to get the density).

        NOTE(review): KernelDensity is not imported in the import cell of
        this notebook - presumably sklearn.neighbors.KernelDensity; confirm
        and add the import.
    '''
    n_bins = int(np.sqrt(len(sample)))
    column = sample.reshape((len(sample), 1))
    # fit density
    estimator = KernelDensity(bandwidth=bandwidth, kernel=kernel)
    estimator.fit(column)

    if not plot:
        return estimator.score_samples

    lowest, highest = np.min(column), np.max(column)
    grid = np.linspace(lowest, highest, n_bins)
    grid = grid.reshape((len(grid), 1))
    density = np.exp(estimator.score_samples(grid))
    _ = plt.hist(column, bins=n_bins, density=True)
    _ = plt.plot(grid[:], density)
    plt.title("pdf using KDE")
    plt.show()
    return estimator.score_samples
## Visualizations

In [23]:
# def plot_season_avg_temp(data, hours_interval = models_hour_interval):
#     hours = np.arange(0, 22, 3)

#     fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 10), sharex=True, sharey=True)
#     fig.suptitle('Temperature Season Average')

#     for nsea,sea,ax in zip(range(4), seasons_dict.keys(), axs.flat):
#         for cmod in range(0,17):
#             ax.plot(hours, data[cmod, nsea, :], ':', alpha=0.6)
#             ax.set_title(sea)
#         ax.plot(hours, np.mean(data[:, nsea, :], axis=0) , '-')

#     fig.legend(labels = [i for i in range(17)] + ['avg'], loc='upper right')
#     plt.show()

In [24]:
def plt_metrics_by_hour(df, title,
                        filename = None,
                        output_dir = plots_output_path,
                        sub_folder=None,
                        df_other=None):
    '''
        Plot a metric per model, first one panel per hour, then all the
        hours together in a second figure.

        df: frame with one column per hour (0, 3, ..., 21)
        title: title of the panel figure
        filename: when given, both figures are saved through save_plot
            ('<filename>_per_h' for the panels, '<filename>' for the rest)
        output_dir, sub_folder: target folder and optional sub folder
        df_other: optional second frame drawn over df in every panel
    '''
    has_other = isinstance(df_other, pd.DataFrame)
    model_ticks = list(range(0,17))

    fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(15, 10), sharex=True, sharey=True)  # figsize=(w, h)
    fig.suptitle(title, fontsize=20)

    # Every call goes through the Axes object itself: handing an existing
    # Axes to plt.subplot() is not accepted by current matplotlib.
    for hour,ax in zip(range(0,22,3), axs.flat):
        ax.plot(df[hour], '-o')
        if has_other:
            ax.plot(df_other[hour], '-o')
        ax.set_title('Hour = {}'.format(hour), fontsize=14)
        ax.grid(axis='x', color='gray', linestyle='--')
        ax.set_xticks(model_ticks)
        ax.margins(0.05)
    if has_other:
        # as before, the legend goes on the last panel only
        axs.flat[-1].legend(labels=['bias corrected'], loc='best')
    if filename:
        if sub_folder:
            output_dir = os.path.join(output_dir, sub_folder)
        save_plot(fig, filename+'_per_h', output_dir = output_dir)
    plt.show()

    ax_all = df.plot(figsize=(15, 10), style = '-o')
    # the legend belongs to this second figure, not to the panel figure
    ax_all.legend(loc='upper right', title='hour')
    ax_all.set_xticks(model_ticks)
    ax_all.margins(0.05)
    ax_all.grid(axis='x', color='silver', linestyle='--')
    if filename:
        save_plot(ax_all.get_figure(), filename, output_dir = output_dir)
    print(output_dir)
    plt.show()

## Ranking

In [25]:
def set_ranks(df_metric, metric=None, scale=None, decimals=None): 
    '''
        Turn the values of a metric into dense ranks, column by column.

        df_metric: frame of metric values
        metric: 'bias', 'perkins', 'sigma' or 'yk'
        scale: divisor applied before rounding; when None the default of
            the metric is used (bias 7, perkins 100, sigma 2, yk 1)
        decimals: decimals kept before ranking; 1 when None

        bias and yk rank |value|, sigma ranks |value - 1|, all three in
        descending order (the largest gets rank 1); perkins ranks the value
        itself in ascending order (the smallest gets rank 1).

        Returns the frame of ranks. Raises ValueError for an unknown metric.
    '''
    # metric -> (default scale, transformation applied first, ascending rank)
    settings = {
        'bias':    (7,   lambda values: values.abs(),       False),
        'perkins': (100, lambda values: values,             True),
        'sigma':   (2,   lambda values: (values - 1).abs(), False),
        'yk':      (1,   lambda values: values.abs(),       False),
    }
    if metric not in settings:
        raise ValueError('{} is not a valid metric [bias, sigma, perkins, yk]'.format(metric))

    default_scale, transform, ascending = settings[metric]
    if scale is None:
        scale = default_scale
    if decimals is None:
        decimals = 1

    metric_prep = transform(df_metric).div(scale).round(decimals=decimals)
    metric_rank = metric_prep.rank(method='dense', ascending=ascending)

    print('abs({})\tmin:{}\tmax:{} '.format(metric, df_metric.abs().min().min(), df_metric.abs().max().max()))
    print('transf({})\tmin:{}\tmax:{} '.format(metric, metric_prep.abs().min().min(), metric_prep.abs().max().max()))
    print('{} rank\tmin:{}\tmax:{} '.format(metric, metric_rank.abs().min().min(), metric_rank.abs().max().max()))

    return metric_rank


def plot_bump(metric_rank,
              title,
              filename=None,
              output_dir = plots_output_path,
              sub_folder=None, 
              figsize=(18,18)):
    '''
        Bump chart of the ranks: one line per row of metric_rank, drawn
        against the hour columns, rank 1 at the top.

        metric_rank: frame of ranks (rows: models, columns: hours)
        title: title of the figure
        filename: when given, the figure is saved through save_plot
        output_dir, sub_folder: target folder and optional sub folder
        figsize: size of the figure

        metric_rank itself is not modified.
    '''
    # Shift every row by a small, growing amount so that lines sharing a
    # rank do not hide each other. The shift is applied to a new frame and
    # uses the row position, so calling the function again does not pile
    # the offsets up.
    offsets = np.arange(len(metric_rank)) * 0.025
    shifted = metric_rank.add(offsets, axis=0)

    shifted.T.plot(figsize=figsize, style='-o')
    plt.yticks(list(range(1, int(shifted.max().max())+1)))
    plt.gca().invert_yaxis()
    plt.xticks(list(range(0,22,3)))
    plt.xlabel('hour')
    plt.grid(axis='y')
    plt.margins(0.1)
    plt.title(title)
    plt.legend(loc='lower right')
    if filename:
        if sub_folder:
            output_dir = os.path.join(output_dir, sub_folder)
        save_plot(plt, filename, output_dir = output_dir)