In [104]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from itertools import permutations, repeat, product
from datetime import datetime
%matplotlib inline

## Examples for creating all combinations (the Cartesian product) of the elements of several lists using itertools' *product*, rather than using *zip*, *repeat*, and *permutations*...

In [105]:
import itertools
import collections

# Pair every element of a with every element of b (the Cartesian product of the two lists)
a = ['foo', 'bar', 'baz']
b = ['x', 'y', 'z']
prodlist = list(itertools.product(a, b))

# A namedtuple gives each pairing readable field names: .name and .value
Prod = collections.namedtuple('Prod', 'name, value')

# One Prod per (name, value) pair, in the order itertools.product yielded them
tuplelist = [Prod(*pair) for pair in prodlist]
print(f"tuplelist: {tuplelist}")

tuplelist: [Prod(name='foo', value='x'), Prod(name='foo', value='y'), Prod(name='foo', value='z'), Prod(name='bar', value='x'), Prod(name='bar', value='y'), Prod(name='bar', value='z'), Prod(name='baz', value='x'), Prod(name='baz', value='y'), Prod(name='baz', value='z')]


In [46]:
# The same idea with three lists: one tuple for every (a, b, c) combination
a = ['foo', 'bar', 'baz']
b = ['x', 'y']
c = ['1', '2']
threeprodlist = list(itertools.product(a, b, c))
print(f" permutations from product of three: {threeprodlist}")

 permutations from product of three: [('foo', 'x', '1'), ('foo', 'x', '2'), ('foo', 'y', '1'), ('foo', 'y', '2'), ('bar', 'x', '1'), ('bar', 'x', '2'), ('bar', 'y', '1'), ('bar', 'y', '2'), ('baz', 'x', '1'), ('baz', 'x', '2'), ('baz', 'y', '1'), ('baz', 'y', '2')]


In [10]:
# Pythonic way to collect the unique elements of a sequence while keeping
# first-seen order: dict keys are unique and dicts preserve insertion order,
# so dict.fromkeys() de-duplicates in a single pass.  This avoids the
# quadratic cost of testing "if name not in namelist" for every element.
namelist = list(dict.fromkeys(t.name for t in tuplelist))

namelist

['foo', 'bar', 'baz']

## Handling Input data for Performance Diagram in METviewer using Pandas

In [106]:
# !!! REPLACE THIS PATH TO THE SAMPLE DATA WITH YOUR PATH !!!
sample_data_path = '/Users/minnawin/skunk_performance_diag/plot_20200317_151252.data'

# The file is tab-separated; header='infer' lets pandas pick up the column names from the file
sample_data = pd.read_csv(sample_data_path, sep='\t', header='infer')
sample_data.head(100)

Unnamed: 0,model,vx_mask,fcst_init_beg,fcst_valid_beg,fcst_lead,interp_mthd,fcst_var,stat_name,stat_value,stat_ncl,stat_ncu,stat_bcl,stat_bcu
0,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.70888,0.70700,0.71076,,
1,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.72716,0.72531,0.72900,,
2,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.71065,0.70877,0.71253,,
3,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.75192,0.75012,0.75370,,
4,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.79177,0.79008,0.79344,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-13 12:00:00,360000,NEAREST,APCP_06,FAR,0.65657,0.65460,0.65853,,
96,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-13 12:00:00,360000,NEAREST,APCP_06,FAR,0.70898,0.70710,0.71086,,
97,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-13 12:00:00,360000,NEAREST,APCP_06,FAR,0.76458,0.76282,0.76633,,
98,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-13 12:00:00,360000,NEAREST,APCP_06,FAR,0.80729,0.80566,0.80892,,


In [12]:
# Convert the string representation of the valid time ('fcst_valid_beg') into
# datetimes so that chronological comparisons can be done later on.
# pd.to_datetime parses the whole column in one vectorized call instead of a
# Python-level strptime loop; the explicit format keeps the parsing strict.
fcst_valids = sample_data['fcst_valid_beg']
datetime_objs = pd.to_datetime(fcst_valids, format="%Y-%m-%d %H:%M:%S")

# Adds a new 'Datetime' column next to the original strings (the frame is modified in place)
sample_data['Datetime'] = datetime_objs
sample_data

Unnamed: 0,model,vx_mask,fcst_init_beg,fcst_valid_beg,fcst_lead,interp_mthd,fcst_var,stat_name,stat_value,stat_ncl,stat_ncu,stat_bcl,stat_bcu,Datetime
0,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.70888,0.70700,0.71076,,,2016-08-16 18:00:00
1,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.72716,0.72531,0.72900,,,2016-08-16 18:00:00
2,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.71065,0.70877,0.71253,,,2016-08-16 18:00:00
3,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.75192,0.75012,0.75370,,,2016-08-16 18:00:00
4,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.79177,0.79008,0.79344,,,2016-08-16 18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,GFS_0p25_G193,SH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,PODY,0.56437,0.56232,0.56642,,,2016-08-12 18:00:00
1136,GFS_0p25_G193,SH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,PODY,0.44881,0.44676,0.45087,,,2016-08-12 18:00:00
1137,GFS_0p25_G193,SH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,PODY,0.41751,0.41547,0.41955,,,2016-08-12 18:00:00
1138,GFS_0p25_G193,SH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,PODY,0.35847,0.35649,0.36046,,,2016-08-12 18:00:00


### Subset the original dataframe based on the combination of model, vx_mask, and stat_name

In [13]:
# Values for model, vx_mask, and stat_name (as they would come from the config file)
# that identify the subset of the data to work with
model, vx_mask, stat_name = "GFS_0p25_G193", "NH_CMORPH_G193", "FAR"

In [14]:
# Keep only the rows for the model / stat_name / vx_mask combination of interest.
# The filter uses the model, stat_name and vx_mask variables rather than repeating
# their values as string literals, so changing the selection only requires
# changing those variables.
far_nh = sample_data[
    (sample_data['model'] == model)
    & (sample_data['stat_name'] == stat_name)
    & (sample_data['vx_mask'] == vx_mask)
]
far_nh

Unnamed: 0,model,vx_mask,fcst_init_beg,fcst_valid_beg,fcst_lead,interp_mthd,fcst_var,stat_name,stat_value,stat_ncl,stat_ncu,stat_bcl,stat_bcu,Datetime
0,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.70888,0.70700,0.71076,,,2016-08-16 18:00:00
1,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.72716,0.72531,0.72900,,,2016-08-16 18:00:00
2,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.71065,0.70877,0.71253,,,2016-08-16 18:00:00
3,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.75192,0.75012,0.75370,,,2016-08-16 18:00:00
4,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.79177,0.79008,0.79344,,,2016-08-16 18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,FAR,0.67435,0.67241,0.67629,,,2016-08-12 18:00:00
546,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,FAR,0.70157,0.69968,0.70346,,,2016-08-12 18:00:00
547,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,FAR,0.73819,0.73637,0.74000,,,2016-08-12 18:00:00
548,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-12 18:00:00,180000,NEAREST,APCP_06,FAR,0.77037,0.76862,0.77210,,,2016-08-12 18:00:00


In [15]:
# Sum of the stat_value column of the subset (selecting with a list keeps a
# one-column frame, so the result is a Series labelled 'stat_value')
far_nh_sum = far_nh.loc[:, ['stat_value']].sum()
far_nh_sum

stat_value    139.76594
dtype: float64

In [16]:
# Mean of the stat_value column of the subset (result is a Series labelled 'stat_value')
far_nh_mean = far_nh.loc[:, ['stat_value']].mean()
far_nh_mean

stat_value    0.73561
dtype: float64

In [17]:
# Median of the stat_value column of the subset (result is a Series labelled 'stat_value')
far_nh_median = far_nh.loc[:, ['stat_value']].median()
far_nh_median

stat_value    0.716905
dtype: float64

In [18]:
# df.query('col1 == "some_value1" & col2 == "some_other_value"') is used here instead of
# df[(df.col1 == "some_value1") & (df.col2 == "some_other_value")], which is not only
# messy and hard to read, but also error prone with all the square brackets and parentheses.
# The adjacent string literals below are concatenated into one query expression.
new_sample_data = sample_data.query(
    'model=="GFS_0p25_G193"'
    ' & stat_name=="FAR"'
    ' & vx_mask=="NH_CMORPH_G193"'
    ' & fcst_valid_beg=="2016-08-16 18:00:00"'
    ' & fcst_var=="APCP_06"'
)
new_sample_data

Unnamed: 0,model,vx_mask,fcst_init_beg,fcst_valid_beg,fcst_lead,interp_mthd,fcst_var,stat_name,stat_value,stat_ncl,stat_ncu,stat_bcl,stat_bcu,Datetime
0,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.70888,0.707,0.71076,,,2016-08-16 18:00:00
1,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.72716,0.72531,0.729,,,2016-08-16 18:00:00
2,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.71065,0.70877,0.71253,,,2016-08-16 18:00:00
3,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.75192,0.75012,0.7537,,,2016-08-16 18:00:00
4,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.79177,0.79008,0.79344,,,2016-08-16 18:00:00
5,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.81854,0.81694,0.82013,,,2016-08-16 18:00:00
6,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.85596,0.8545,0.85741,,,2016-08-16 18:00:00
7,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.87975,0.8784,0.88109,,,2016-08-16 18:00:00
8,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.90672,0.90551,0.90791,,,2016-08-16 18:00:00
9,GFS_0p25_G193,NH_CMORPH_G193,2016-08-12 00:00:00,2016-08-16 18:00:00,1140000,NEAREST,APCP_06,FAR,0.98678,0.9863,0.98725,,,2016-08-16 18:00:00


#### Now the new_sample_data dataframe contains the model, vx_mask, fcst_valid_beg, stat_name and fcst_var of interest. Let's calculate the mean, median, and sum of the statistic of interest. In this case it is FAR for the GFS model for the NH at 2016-08-16 18:00:00 for APCP_06:

In [19]:
# Mean of the FAR values in new_sample_data.  That frame was selected with
# fcst_valid_beg == "2016-08-16 18:00:00", so the message reports 18:00:00.
new_sample_mean = new_sample_data['stat_value'].mean()
print(f"The mean for all FAR values for GFS, NH, APCP_06 at 2016-08-16 18:00:00 is: {new_sample_mean}")

The mean for all FAR values for GFS, NH, APCP_06 at 2016-08-16 16:00:00 is: 0.813813


In [20]:
# Median of the FAR values in new_sample_data.  That frame was selected with
# fcst_valid_beg == "2016-08-16 18:00:00", so the message reports 18:00:00.
new_sample_median = new_sample_data['stat_value'].median()
print(f"The median for all FAR values for GFS, NH, APCP_06 at 2016-08-16 18:00:00 is: {new_sample_median}")

The median for all FAR values for GFS, NH, APCP_06 at 2016-08-16 16:00:00 is: 0.805155


In [21]:
# Sum of the FAR values in new_sample_data.  That frame was selected with
# fcst_valid_beg == "2016-08-16 18:00:00", so the message reports 18:00:00.
new_sample_sum = new_sample_data['stat_value'].sum()
print(f"The sum for all FAR values for GFS, NH, APCP_06 at 2016-08-16 18:00:00 is: {new_sample_sum}")

The sum for all FAR values for GFS, NH, APCP_06 at 2016-08-16 16:00:00 is: 8.13813


##### The above needs to be repeated for all the other fcst_valid_beg values (or fcst_init_beg values, whichever was specified as the indy_var value in the YAML configuration file) to cover every value of the independent variable.

### Try doing things without hard-coded column names, use the variable names corresponding to the column names instead...

In [108]:
# Hold the column names in variables so that columns can be looked up
# without hard-coding their names at the point of use
var1, var2 = "model", "vx_mask"
sample_data.loc[:, var2]


0       NH_CMORPH_G193
1       NH_CMORPH_G193
2       NH_CMORPH_G193
3       NH_CMORPH_G193
4       NH_CMORPH_G193
             ...      
1135    SH_CMORPH_G193
1136    SH_CMORPH_G193
1137    SH_CMORPH_G193
1138    SH_CMORPH_G193
1139    SH_CMORPH_G193
Name: vx_mask, Length: 1140, dtype: object

# Misc Python snippets...

## Playing with converting date times

In [22]:
# Parse a date-time string into a Python datetime object, then show the value
# and the type of both the original string and the parsed object
date_format = "%Y-%m-%d %H:%M:%S"
some_date = "2016-08-12 18:00:00"
dateobj = datetime.strptime(some_date, date_format)

print(f"date string: {some_date}, type: {type(some_date)}")
print(f"date object: {dateobj}, type: {type(dateobj)}")

date string: 2016-08-12 18:00:00, type: <class 'str'>
date object: 2016-08-12 18:00:00, type: <class 'datetime.datetime'>


## Playing with empty lists that are attributes within a named tuple (remember, named tuples are immutable)

In [23]:
# Can a named tuple be created first and one of its attributes be filled in later?
# Yes, as long as that attribute is a mutable object such as a list: the tuple
# itself is never changed (its fields cannot be rebound), only the list that the
# 'shots' field refers to is modified.
import collections
Pets = collections.namedtuple('Pets', 'name, species, breed, shots')
daphne = Pets(name="Daphne", species="Dog", breed="labrador retriever", shots=[])

# Fill the initially empty list in place, after the tuple has been created
shots = ['bordatella', 'rabies', 'parvo']
daphne.shots.extend(shots)

In [156]:
# Display the named tuple to confirm that its 'shots' list now holds the appended entries
daphne


Pets(name='Daphne', species='Dog', breed='labrador retriever', shots=['bordatella', 'rabies', 'parvo'])


#### Make handling of fcst_var_val and series_val inner keys more generic:

Let's make the handling of the inner keys for fcst_var_val and series_val more generic, so we don't assume the following:


series_val:

    model: GFS_0p25_G193
    vx_mask:
      - NH_CMORPH_G193
      - SH_CMORPH_G193
      - TROP_CMORPH_G193
      
and instead think in terms of more generic/unpredicted/unknown number of inner keys and inner values:

series_val:

    inner_key1: key1val
    inner_key2:
        - key2_val1
        - key2_val2
        - key2_val3
    inner_keyn:
        - keyn_val1
        - keyn_val2
        


Let's start with something simpler, like for fcst_var_val...

fcst_var_val:

    innerkey:
   
      - value1
      - value2
      
   #we can assume that there are only two values for the inner key, since the inner key represents a field variable like
   #TEMP, RH, APCP, etc. and the values correspond to FAR and PODY (since this is a performance diagram, we are plotting
   #Success ratio (1-FAR) vs PODY).

In [150]:
# Build the simpler of the two dictionaries from the config file:
# a single inner key (the variable of interest) mapped to its list of stat names
fcst_var_val_dict = {
    'fcst_var_val': {
        'var_of_interest': ["statname1", "statname2"],
    },
}

In [151]:
# Display the nested dictionary to check its structure
fcst_var_val_dict

{'fcst_var_val': {'var_of_interest': ['statname1', 'statname2']}}

In [152]:
# Get the inner dictionary stored under the 'fcst_var_val' key
# (it maps the variable of interest to its list of stat names), then display it
fcst_vals = fcst_var_val_dict['fcst_var_val']
fcst_vals

{'var_of_interest': ['statname1', 'statname2']}

In [180]:
# Next, get the inner key of the inner dictionary: iterating a dict yields its
# keys, and the first (here, the only) one is the variable of interest
inner_key = list(fcst_vals)
var_of_interest = inner_key[0]

In [156]:
# Look up the list stored under the variable-of-interest key (the stat names) and print it
stat_names = fcst_vals[var_of_interest]
print(stat_names)

['statname1', 'statname2']


In [157]:
# Stand-in for the series_val section of the config: an arbitrary number of
# inner keys, each mapped to a list holding an arbitrary number of values
parameter = {
    'series_val': {
        'var1': ["var1val1", "var1val2"],
        'var2': ["var2val1", "var2val2", "var2val3"],
        'var3': ["var3val1"],
    },
}
parameter

{'series_val': {'var1': ['var1val1', 'var1val2'],
  'var2': ['var2val1', 'var2val2', 'var2val3'],
  'var3': ['var3val1']}}

In [181]:
# Get the inner dictionary stored under 'series_val' (inner key -> list of values) and display it
var_dict = parameter['series_val']
var_dict

{'var1': ['var1val1', 'var1val2'],
 'var2': ['var2val1', 'var2val2', 'var2val3'],
 'var3': ['var3val1']}

In [182]:
# Collect the value lists of the dictionary into a list of lists (one list per inner key)
all_lists = list(var_dict.values())
all_lists

[['var1val1', 'var1val2'], ['var2val1', 'var2val2', 'var2val3'], ['var3val1']]

In [183]:
# Unpack the list of lists into itertools.product to get every combination that
# takes one value from each list.  These combinations give us a way to subset
# the data frame based on the variable values.
prod = list(itertools.product(*all_lists))
prod

[('var1val1', 'var2val1', 'var3val1'),
 ('var1val1', 'var2val2', 'var3val1'),
 ('var1val1', 'var2val3', 'var3val1'),
 ('var1val2', 'var2val1', 'var3val1'),
 ('var1val2', 'var2val2', 'var3val1'),
 ('var1val2', 'var2val3', 'var3val1')]

### Tatiana's solution using filtering and numpy

all_points - array of tuples that contain all series permutations
all_fields_values.keys() - all fields for filtering. The order of the fields = the order of the values in each tuple
# For every series point (one tuple of filter values, one value per field),
# select the rows of the input data that match all of the point's values.
for point_ind, point in enumerate(all_points):
    # Start with an empty set of filters for each point; without this reset the
    # filters of earlier points would pile up and distort the mask below.
    all_filters = []

    for field_ind, field in enumerate(all_fields_values.keys()):
        # the values in a point are in the same order as the fields
        filter_value = point[field_ind]

        # if the value contains "," or ";" it holds several alternatives - split it
        if "," in filter_value:
            filter_list = filter_value.split(',')
        elif ";" in filter_value:
            filter_list = filter_value.split(';')
        else:
            filter_list = [filter_value]

        # replace integer-like strings by ints so they compare equal to integer columns
        for i, filter_val in enumerate(filter_list):
            if is_string_integer(filter_val):
                filter_list[i] = int(filter_val)

        # one boolean filter per field: True where the row's value is in filter_list
        # NOTE(review): this line reads input_data while the selection below reads
        # self.input_data - presumably the same frame; confirm and unify.
        all_filters.append((input_data[field].isin(filter_list)))

    # use numpy to select the rows where every field filter evaluates to True
    mask = np.array(all_filters).all(axis=0)
    point_data = self.input_data.loc[mask]