In [1]:
import os
import pandas as pd
import numpy as np
import itertools
import json
from datetime import datetime, time

In [29]:
def check_folder(path, params, verbose = False):
    """Expand a date / lead-time selection into file paths and keep those that exist.

    Every combination of ``params['year']``, ``params['month']``, ``params['day']``
    and ``params['leadtime']`` names one file
    ``<year>/<month>/<day>/FundamentalData/<leadtime>`` below ``path``.
    A token equal to ``'.'`` is a wildcard: it is replaced by every sub-directory of
    ``path`` (year), ``01``-``12`` (month), ``01``-``31`` (day) or
    ``FundamentalData_D-1.csv`` ... ``FundamentalData_D-15.csv`` (lead time).

    Parameters
    ----------
    path : str
        Root folder that contains the per-year directories.
    params : dict
        Lists of path tokens under the keys ``'year'``, ``'month'``, ``'day'``
        and ``'leadtime'``.
    verbose : bool, default False
        When True, the candidate paths that do not exist are printed.

    Returns
    -------
    list of str
        Paths, relative to ``path``, of the candidate files that exist, in
        expansion order.
    """
    wildcard = '.'

    # Only list the root when a year wildcard is really requested, and only keep
    # directories: a stray file in `path` is not a year and must not be counted
    # as thousands of "missing" forecasts.
    if wildcard in params['year']:
        years = sorted(entry for entry in os.listdir(path)
                       if os.path.isdir(os.path.join(path, entry)))
    else:
        years = []

    # Replacement tokens for a wildcard at each level of the folder hierarchy.
    substitutes = [
        years,
        ["%02d" % (m,) for m in range(1, 13)],
        ["%02d" % (d,) for d in range(1, 32)],
        ['FundamentalData'],
        [f"FundamentalData_D-{l}.csv" for l in range(1, 16)],
    ]

    requested = itertools.product(params['year'], params['month'], params['day'],
                                  ['FundamentalData'], params['leadtime'])
    trees = []
    for combo in requested:
        # A concrete token stands for itself, a wildcard for all its substitutes.
        options = [substitutes[level] if token == wildcard else [token]
                   for level, token in enumerate(combo)]
        trees.extend(os.path.join(*parts) for parts in itertools.product(*options))

    filtered_trees = [tree for tree in trees if os.path.isfile(os.path.join(path, tree))]
    if len(filtered_trees) < len(trees):
        summary = "Of the dates you have specified only %d are currently available." \
                  % (len(filtered_trees))
        if verbose:
            available = set(filtered_trees)
            print(summary)
            # Keep the expansion order so the report is reproducible.
            print([tree for tree in trees if tree not in available])
        else:
            print(summary + " To obtain which of them are missing, specify verbose = True")
    return filtered_trees

_HOURS_PER_DAY = 24


def _path_tokens(value, fmt):
    """Turn a scalar, a list/tuple or None into the path tokens check_folder expects."""
    if isinstance(value, (list, tuple)):
        return [fmt(v) for v in value]
    # None means "no restriction", which check_folder spells as the '.' wildcard.
    return ['.' if value is None else fmt(value)]


def _matches(column, wanted):
    """Boolean mask of the entries of `column` equal to `wanted` (str) or contained in it."""
    return column == wanted if isinstance(wanted, str) else column.isin(wanted)


def _leadtime_from_name(file):
    """Return N for a path ending in 'FundamentalData_D-N.csv' (N may have several digits)."""
    stem = os.path.splitext(os.path.basename(file))[0]
    return int(stem.rsplit('-', 1)[-1])


def get_forecasts(path = "P:/CH/Weather Data/FUNDAMENTAL_DATA",
                  year = None,
                  month = None,
                  day = None,
                  leadtime = None,
                  model = None,
                  variable = None,
                  country = None,
                  run = None,
                  verbose = False):
    """Load the fundamental-data forecast files matching a query into one hourly frame.

    The date arguments select the files (through ``check_folder``); the remaining
    arguments filter the rows of each file. Every kept row carries a JSON list of
    24 numbers in its ``values`` column, which is expanded into 24 rows indexed by
    ``flowdate`` + hour.

    Parameters
    ----------
    path : str
        Root folder of the forecast archive.
    year, month, day, leadtime : int, list of int or None
        File selection. ``None`` selects every available value.
    model, variable, country : str, list of str or None
        Row filters on the ``model``, ``fund_type`` and ``country`` columns.
    run : int or None
        Row filter on the ``run`` column.
    verbose : bool, default False
        Forwarded to ``check_folder`` to list the requested files that are missing.

    Returns
    -------
    pandas.DataFrame
        Hourly forecasts indexed by ``date``, with a ``leadtime`` column added and
        all-NaN columns removed. An empty frame is returned when no file contains
        a matching row.

    Raises
    ------
    ValueError
        If a ``values`` entry does not hold exactly 24 numbers.
    """
    date_params = {
        "year": _path_tokens(year, str),
        "month": _path_tokens(month, lambda m: "%02d" % (m,)),
        "day": _path_tokens(day, lambda d: "%02d" % (d,)),
        "leadtime": _path_tokens(leadtime, lambda l: f"FundamentalData_D-{str(l)}.csv"),
    }
    files = check_folder(path, date_params, verbose = verbose)
    forecast_df = []
    for file in files:
        df = pd.read_csv(os.path.join(path, file))
        df['flowdate'] = pd.to_datetime(df['flowdate'])
        df = df.drop('forecast_datetime', axis = 1)
        # Parse the whole number after "D-": taking only the last character
        # would turn lead times 10-15 into 0-5.
        df['leadtime'] = _leadtime_from_name(file)

        if model is not None:
            df = df.loc[_matches(df['model'], model)]

        if country is not None:
            df = df.loc[_matches(df['country'], country)]

        if variable is not None:
            # A list of variables must be matched against fund_type as well.
            df = df.loc[_matches(df['fund_type'], variable)]

        if run is not None:
            df = df.loc[df['run'] == int(run)]

        if df.empty:
            print("File %s does not contain any element satisfying the query" %file)
            continue

        # Decode the JSON payload only for the rows that survived the filters.
        values = df['values'].apply(json.loads)
        if (values.apply(len) != _HOURS_PER_DAY).any():
            # Unequal lengths could otherwise cancel out and misalign rows silently.
            raise ValueError("File %s: every 'values' entry must hold %d numbers"
                             % (file, _HOURS_PER_DAY))

        # Hourly timestamps: midnight of the flow date plus 0..23 hours for each row.
        flowdate = df['flowdate']
        if flowdate.dt.tz is not None:
            flowdate = flowdate.dt.tz_localize(None)
        midnight = flowdate.dt.normalize().to_numpy()
        hours = np.tile(np.arange(_HOURS_PER_DAY), len(df)).astype('timedelta64[h]')
        dates = np.repeat(midnight, _HOURS_PER_DAY) + hours

        # Repeat the per-row attributes 24 times and flatten the value lists,
        # keeping the column order of the file.
        columns = {}
        for col in df.columns:
            if col == 'flowdate':
                continue
            if col == 'values':
                columns[col] = np.concatenate(values.tolist())
            else:
                columns[col] = np.repeat(df[col].to_numpy(), _HOURS_PER_DAY)

        forecast_df.append(pd.DataFrame(columns, index = pd.DatetimeIndex(dates, name = 'date')))

    if not forecast_df:
        # pd.concat refuses an empty list; report "nothing found" as an empty frame.
        return pd.DataFrame()
    return pd.concat(forecast_df).dropna(how="all",axis=1)

In [8]:
# Wind forecasts for Germany, run 12, for the 30th of June to September 2021
# (all lead times and all models).
df = get_forecasts(
    year=2021,
    month=[6, 7, 8, 9],
    day=30,
    model=None,
    variable="Wind",
    country="DE",
    run=12,
)


Of the dates you have specified only 48 are currently available.To obtain which of them are missing, specifiy verbose = True
['2021\\09\\30\\FundamentalData\\FundamentalData_D-2.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-15.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-8.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-6.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-11.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-13.csv', '2021\\07\\30\\FundamentalData\\FundamentalData_D-2.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-12.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-9.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-7.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-10.csv', '2021\\06\\30\\FundamentalData\\FundamentalData_D-14.csv']
File 2021\06\30\FundamentalData\FundamentalData_D-1.csv does not contain any element satisfying the query
File 2021\07\30\FundamentalData\FundamentalData_D-1.csv does not c

In [32]:
# Wind forecasts for Germany, run 12, for every day of October 2021,
# restricted to the two listed models.
df = get_forecasts(
    year=2021,
    month=10,
    day=None,
    model=["GFS_ENS", "EC_ENS"],
    variable="Wind",
    country="DE",
    run=12,
)


Of the dates you have specified only 451 are currently available.To obtain which of them are missing, specifiy verbose = True
File 2021\10\01\FundamentalData\FundamentalData_D-1.csv does not contain any element satisfying the query
File 2021\10\01\FundamentalData\FundamentalData_D-4.csv does not contain any element satisfying the query
File 2021\10\01\FundamentalData\FundamentalData_D-14.csv does not contain any element satisfying the query
File 2021\10\02\FundamentalData\FundamentalData_D-5.csv does not contain any element satisfying the query
File 2021\10\02\FundamentalData\FundamentalData_D-15.csv does not contain any element satisfying the query
File 2021\10\03\FundamentalData\FundamentalData_D-6.csv does not contain any element satisfying the query
File 2021\10\04\FundamentalData\FundamentalData_D-7.csv does not contain any element satisfying the query
File 2021\10\05\FundamentalData\FundamentalData_D-8.csv does not contain any element satisfying the query
File 2021\10\06\Fundamen

In [15]:
# Show only the rows without any missing value; `df` itself is left unchanged.
df.dropna(how="any")

Unnamed: 0_level_0,country,fund_type,model,run,unit,values,leadtime,scenario
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [33]:
# Display the current contents of `df`.
df

Unnamed: 0_level_0,country,fund_type,model,run,unit,values,leadtime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-01 00:00:00,DE,Wind,EC_ENS,12.0,MW,22257,2
2021-10-01 01:00:00,DE,Wind,EC_ENS,12.0,MW,22694,2
2021-10-01 02:00:00,DE,Wind,EC_ENS,12.0,MW,23141,2
2021-10-01 03:00:00,DE,Wind,EC_ENS,12.0,MW,23585,2
2021-10-01 04:00:00,DE,Wind,EC_ENS,12.0,MW,23895,2
...,...,...,...,...,...,...,...
2021-10-31 19:00:00,DE,Wind,GFS_ENS,12.0,MW,23472,4
2021-10-31 20:00:00,DE,Wind,GFS_ENS,12.0,MW,23524,4
2021-10-31 21:00:00,DE,Wind,GFS_ENS,12.0,MW,23455,4
2021-10-31 22:00:00,DE,Wind,GFS_ENS,12.0,MW,23331,4


# GFS Retrieval

In [4]:
import getgfs

In [5]:
# Create the getgfs forecast handle used by the cells below.
# NOTE(review): "0p25" presumably selects the 0.25-degree GFS product — confirm in the getgfs docs.
f=getgfs.Forecast("0p25")

In [18]:
# Look up variables by keyword; the result is a list of (name, description, number) tuples.
f.search("geopotential")

[('cape180_0mb',
  '** 180-0 mb above ground convective available potential energy [j/kg] ',
  97),
 ('cape255_0mb',
  '** 255-0 mb above ground convective available potential energy [j/kg] ',
  97),
 ('cape90_0mb',
  '** 90-0 mb above ground convective available potential energy [j/kg] ',
  98),
 ('capesfc', '** surface convective available potential energy [j/kg] ', 103),
 ('pevprsfc', '** surface potential evaporation rate [w/m^2] ', 108),
 ('hgt0c', '** 0c isotherm geopotential height [gpm] ', 120),
 ('potsig995', '** 0.995 sigma level potential temperature [k] ', 127),
 ('hgtsfc', '** surface geopotential height [gpm] ', 133),
 ('hgtprs',
  '** (1000 975 950 925 900.. 10 7 4 2 1) geopotential height [gpm] ',
  133),
 ('hgt2pv', '** pv=2e-06 (km^2/kg/s) surface geopotential height [gpm] ', 133),
 ('hgtneg2pv',
  '** pv=-2e-06 (km^2/kg/s) surface geopotential height [gpm] ',
  133),
 ('hgtmwl', '** max wind geopotential height [gpm] ', 133),
 ('hgttop0c',
  '** highest tropospheric 

In [17]:
# Fetch two pressure variables for a single point and show the returned
# name -> Variable mapping.
f.get(
    variables=['pressfc', 'preslclb'],
    date_time="20211120",
    lat=80,
    lon=30,
).variables

{'preslclb': <getgfs.decode.Variable at 0x224282e4cc0>,
 'pressfc': <getgfs.decode.Variable at 0x224282e4e10>}

In [27]:
import numpy as np
import xarray as xr
import pandas as pd

# File details
# TODO: pass as arguments to the script
dt = '20211115'
res = 25
step = '1hr'
run = '{:02}'.format(0)
lat_toplot = np.arange(-43, -17.25, 0.25) # stop value is exclusive
lon_toplot = np.arange(135, 152.25, 0.25) # stop value is exclusive

# ******************************
# SELECT GFS FILE
# ******************************
# OPeNDAP URL of the requested run
URL = f'http://nomads.ncep.noaa.gov:80/dods/gfs_0p{res}_{step}/gfs{dt}/gfs_0p{res}_{step}_{run}z'

variables = ['ugrd100m']

# Open the remote dataset and keep only the requested variables.
dataset = xr.open_dataset(URL)[variables]
# Deliberately not called `time`: that name is the class imported from `datetime`
# in the first cell and is called by get_forecasts; rebinding it here would break
# that function whenever it is run after this cell.
time_coord = dataset.variables['time']
lat = dataset.variables['lat'][:]
lon = dataset.variables['lon'][:]



In [28]:
# Display the dataset opened in the previous cell.
dataset

In [2]:
# Make the project sources under ../src importable from this notebook.
import sys
sys.path.append('../src/')
# NOTE(review): star imports hide where names such as load_MF_targets come from;
# consider importing the needed names explicitly.
from dashboard.utils.data import *
from dashboard.utils.plotting import *
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Regime labels whose scores are turned into weights in the cells below.
REGIMES = ['AR', 'NAO+', 'NAO-', 'SB']

# Call the undecorated loader through `__wrapped__`
# (presumably to bypass a caching decorator — verify in dashboard.utils.data).
# NOTE(review): the stored output lists June-August dates although
# season = 'Winter' is requested — confirm the loader returns the intended season.
targets = load_MF_targets.__wrapped__(
    season = 'Winter'
)
targets

Unnamed: 0_level_0,Correlation,Correlation,Correlation,Correlation,Correlation,Distance,Distance,Distance,Distance,Distance
Unnamed: 0_level_1,AR,NAO+,NAO-,Prediction,SB,AR,NAO+,NAO-,Prediction,SB
1981-06-01,-0.57612,-0.11510,0.25877,SB,0.35389,701.82,579.65,597.20,SB,411.83
1981-06-02,-0.73879,0.12253,0.23817,NAO-,0.20268,688.56,484.79,584.82,SB,420.61
1981-06-03,-0.72549,0.47476,0.17309,NAO+,-0.20203,667.16,375.42,595.04,NAO+,494.22
1981-06-04,-0.26695,0.63156,0.03101,NAO+,-0.64391,608.58,323.49,611.56,NAO+,622.15
1981-06-05,-0.13715,0.41763,0.11754,NAO+,-0.57682,689.66,474.01,634.11,NAO+,746.68
...,...,...,...,...,...,...,...,...,...,...
2020-08-27,0.37930,-0.40540,0.41800,NAO-,-0.29460,499.80,681.81,515.87,AR,646.16
2020-08-28,0.29970,-0.51150,0.52160,NAO-,-0.20810,539.31,721.43,474.17,NAO-,645.67
2020-08-29,0.08580,-0.54880,0.46300,NAO-,0.09890,586.55,674.40,477.97,NAO-,566.64
2020-08-30,-0.30960,-0.30280,0.19410,SB,0.43740,608.54,548.15,532.94,SB,478.57


In [34]:
# Turn the regime correlations of each day into weights:
#   1. min-max scale each row to [0, 1] (the least correlated regime becomes 0),
#   2. divide each row by its sum so the four weights add up to 1.
correlation = (
    targets
    .xs('Correlation', axis=1, level=0)
    .loc[:, REGIMES]
    .apply(lambda row: (row - row.min()) / (row.max() - row.min()), axis=1)
    .apply(lambda row: row / row.sum(), axis=1)
)

# Restore the top column level so the frame can be joined with the distance weights.
correlation.columns = pd.MultiIndex.from_product(
    [['Correlation'], correlation.columns]
)
correlation

Unnamed: 0_level_0,Correlation,Correlation,Correlation,Correlation
Unnamed: 0_level_1,AR,NAO+,NAO-,SB
1981-06-01,0.000000,0.207114,0.375076,0.417809
1981-06-02,0.000000,0.309855,0.351456,0.338689
1981-06-03,0.000000,0.457711,0.342670,0.199619
1981-06-04,0.161970,0.548035,0.289995,0.000000
1981-06-05,0.206565,0.467211,0.326223,0.000000
...,...,...,...,...
2020-08-27,0.456513,0.000000,0.479027,0.064460
2020-08-28,0.377706,0.000000,0.481026,0.141267
2020-08-29,0.276623,0.000000,0.441044,0.282333
2020-08-30,0.000000,0.005408,0.400557,0.594036


In [35]:
# Turn the regime distances of each day into weights:
#   1. invert each distance (a closer regime gets a larger score),
#   2. divide each row by its sum so the four weights add up to 1.
distance = (
    targets
    .xs('Distance', axis=1, level=0)
    .loc[:, REGIMES]
    .apply(lambda row: 1 / row, axis=1)
    .apply(lambda row: row / row.sum(), axis=1)
)

# Restore the top column level so the frame can be joined with the correlation weights.
distance.columns = pd.MultiIndex.from_product(
    [['Distance'], distance.columns]
)
distance

Unnamed: 0_level_0,Distance,Distance,Distance,Distance
Unnamed: 0_level_1,AR,NAO+,NAO-,SB
1981-06-01,0.196460,0.237867,0.230876,0.334797
1981-06-02,0.191031,0.271326,0.224917,0.312727
1981-06-03,0.190540,0.338610,0.213634,0.257215
1981-06-04,0.205990,0.387527,0.204986,0.201497
1981-06-05,0.223905,0.325770,0.243519,0.206806
...,...,...,...,...
2020-08-27,0.287737,0.210926,0.278774,0.222563
2020-08-28,0.268802,0.200945,0.305730,0.224523
2020-08-29,0.242011,0.210486,0.296989,0.250515
2020-08-30,0.221063,0.245417,0.252421,0.281099


In [36]:
# Write the correlation- and distance-based weights side by side to one CSV file.
(
    pd.concat([correlation, distance], axis=1)
    .to_csv('W:/UK/Research/Private/WEATHER/STAGE_ABALDO/scripts/predictions/predictions_MF_winter.csv')
)