In [1]:
import pandas as pd
import numpy as np
import requests
import io
import os
from urllib.parse import quote 
import urllib

import sys
sys.path.insert(0,'./support')

from ipynb.fs.defs.Coordinates import Coordinates
from ipynb.fs.defs.Sites import wave_columns, Site
from ipynb.fs.defs.Periods import Period
from ipynb.fs.defs.time_index import convert_index, stitch, remove_duplicate_inputs, clean_data

In [2]:
import certifi
import urllib3
# HTTPS connection pool: server certificates are mandatory ('CERT_REQUIRED')
# and are checked against the CA bundle file reported by certifi.
_ca_bundle = certifi.where()
http = urllib3.PoolManager(ca_certs=_ca_bundle, cert_reqs='CERT_REQUIRED')

In [3]:
'''
Erddap Data Server
For data access form, see 'https://erddap.marine.ie/erddap/tabledap/IWaveBNetwork_spectral.html'
'''
# File-name template of the ERDDAP datasets; '%s' is filled with the variable code.
db_name = 'EP_ERD_INT_%s_AL_TS_NRT.csv'

# Period variables in order of preference: the first one available for a
# station is used (Tp first, then Te, finally Tz...).
varlist = ['VTPK', 'VTM10', 'VTZA', 'VTM02']

# Upper plausibility limits used to discard outliers:
# a height cannot exceed 50 m and a period cannot exceed 30 s.
param_max_vals = {'VHM0': 50, **{period_var: 30 for period_var in varlist}}

In [4]:
def get_platforms_erddap(param, timeout=300):
    """
    List the platforms that the EMODnet ERDDAP server offers for one variable.

    The '.subset' page of the dataset 'EP_ERD_INT_<param>_AL_TS_NRT' is
    requested with distinct() and the HTML table it contains is parsed.

    Parameters
    ----------
    param : str
        Variable code inserted into the dataset name (e.g. 'VHM0' or 'VTPK').
    timeout : float or tuple, optional
        Timeout in seconds passed to requests.get, so that a stalled server
        cannot block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per distinct platform id, with the columns 'PlatformID'
        (converted to str), 'PlatformName' and 'PlatformType'.
    """
    dataset = 'EP_ERD_INT_%s_AL_TS_NRT.subset' % param
    query = '?EP_PLATFORM_ID,EP_PLATFORM_CODE,EP_PLATFORM_TYPE&.viewDistinctData=10000&.viewRelatedData=0&distinct()'
    url = 'https://erddap.emodnet-physics.eu/erddap/tabledap/' + dataset + query

    # NOTE(review): verify=False switches off TLS certificate validation. It is
    # kept because the original code relied on it - confirm whether the server
    # certificate validates and drop the flag if it does.
    response = requests.get(url, verify=False, timeout=timeout)

    # Passing literal HTML to read_html is deprecated since pandas 2.1;
    # a file-like wrapper is accepted by old and new versions alike.
    # skiprows=[1] drops the row right below the header
    # (presumably the units row, as in the CSV output - TODO confirm).
    tables = pd.read_html(io.StringIO(response.text), skiprows=[1], header=[0],
                          attrs={"class": "erd commonBGColor nowrap"})

    platforms = tables[0].drop_duplicates(subset='EP_PLATFORM_ID').reset_index(drop=True)
    platforms.columns = ['PlatformID', 'PlatformName', 'PlatformType']
    # ids are compared against str(station_id) by the callers
    platforms['PlatformID'] = platforms['PlatformID'].astype(str)
    return platforms

In [5]:
def remove_outliers(df, param):
    """
    Drop the rows whose value in column `param` is not below its upper limit.

    Each value is formatted to text and parsed back as float before it is
    compared with param_max_vals[param]; rows that do not compare as smaller
    (including NaN) are removed. The result gets a fresh 0..n-1 index and the
    frame passed in is left untouched.
    """
    as_float = df[param].map(format).astype('float')
    upper_limit = param_max_vals[param]
    plausible = as_float < upper_limit
    return df[plausible].reset_index(drop=True)

In [6]:
def merge(df1,df2):
    """
    Join two time-indexed pandas objects (DataFrame or Series), df1 first.

    Both inputs are expected to carry an index with `freq` and `tz`
    attributes (e.g. a DatetimeIndex).

    * If one input is empty, the other one is returned unchanged.
    * If the index frequencies differ, nothing is joined: a message is
      printed and the longer input is returned (df2 when both have the same
      length).
    * If the time zones differ, a copy of df2 is converted to the time zone
      of df1; the object passed in by the caller is not modified.
    * If df2 starts less than one frequency step after the end of df1, the
      two are simply concatenated.
    * Otherwise a regular index from the start of df1 to the end of df2 is
      built, both inputs are written into it and the steps in between stay
      empty (NaN).
    """
    if len(df1)==0:return df2
    if len(df2)==0:return df1
    f1=df1.index.freq
    f2=df2.index.freq
    if f1!=f2:
        print('These 2 dataframe do not have the same frequency and cannot be stitched together!',f1,f2)
        print(df2)
        return df1 if len(df1)>len(df2) else df2
    tz1=df1.index.tz
    tz2=df2.index.tz
    if tz1!=tz2:#Sometimes tz are defined as pytz or pandas.tz
        # convert a copy: assigning to df2.index directly would silently
        # change the time zone of the caller's object
        df2=df2.copy()
        df2.index=df2.index.tz_convert(tz1)
    if (df2.index[0] - df1.index[-1]) < f1:
        # DataFrame.append / Series.append were removed in pandas 2.0
        return pd.concat([df1,df2])
    #some stitching is required to fill the gap between df1 and df2
    newindex = pd.date_range(start=df1.index[0],end=df2.index[-1],freq=f1,tz=df1.index.tz)
    newindex.freq=f1
    newindex.name=df1.index.name
    if isinstance(df1, pd.DataFrame):
        df=pd.DataFrame(index=newindex,columns=df1.columns)
    else:
        df=pd.Series(index=newindex,name=df1.name)
    df.loc[df1.index]=df1
    df.loc[df2.index]=df2
    return df

In [7]:
'''
A function that returns wave data belonging to a specific year and a station taken as parameters.
If the data is not saved in the system, fetches it from ERDDAP server via request URL.
'''

def get_year_data(station_id, year, variable, timeout=300):
    """
    Return one calendar year of a single wave variable for one station.

    The data is read from the local CSV cache when a file for this
    station/year/variable exists. Otherwise it is requested from the EMODnet
    ERDDAP server, reduced to the time and value columns, filtered with
    remove_outliers and written to the cache.

    Parameters
    ----------
    station_id : str or int
        Platform id; it is converted with str() for the file name and query.
    year : int or str
        Calendar year to fetch (must be convertible with int()).
    variable : str
        Variable code, used for the dataset name and the value column
        (e.g. 'VHM0').
    timeout : float or tuple, optional
        Timeout in seconds passed to requests.get, so that a stalled server
        cannot block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        Columns 'time (UTC)' and <variable>. An empty DataFrame is returned
        (after printing a message) when the server does not answer with HTTP
        status 200. If the server answers with an empty table, that empty
        table is returned as is and nothing is cached.
    """
    # the cache lives below 'site_data'; use './' when that already is the working directory
    here = './' if os.path.split(os.getcwd())[1] == 'site_data' else './site_data/'
    path = here + 'data/EMODnet/'
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(path, exist_ok=True)

    filename = path + str(station_id) + '_' + str(year) + '_' + db_name % variable
    if os.path.isfile(filename):
        return pd.read_csv(filename)

    url = 'https://erddap.emodnet-physics.eu/erddap/tabledap/' + db_name % variable + '?'
    variables = 'EP_PLATFORM_ID,EP_PLATFORM_CODE,time,depth,latitude,longitude,%s,area' % variable
    time_start = str(year) + '-01-01T00:00:00Z'
    # exclusive upper bound at the start of the next year, so samples taken
    # during the last minute of the year (after 23:59:00) are not lost
    time_end = str(int(year) + 1) + '-01-01T00:00:00Z'
    url_add = variables
    url_add += '&EP_PLATFORM_ID="' + str(station_id) + '"'
    url_add += '&time>=' + time_start
    url_add += '&time<' + time_end

    #encode query in order to assure correct url format
    url += urllib.parse.quote(url_add , safe='=&-')

    # NOTE(review): verify=False switches off TLS certificate validation. It is
    # kept because the original code relied on it - confirm whether the server
    # certificate validates and drop the flag if it does.
    response = requests.get(url, verify=False, timeout=timeout)

    #If no data has queued (possibly due to no data at a given date etc.)
    #code 200 = ok
    if response.status_code != 200:
        print('station ' + str(station_id) + ' data not available at given times')
        return pd.DataFrame()

    #first row of the data is reserved for variable units
    wave_data = pd.read_csv(io.StringIO(response.content.decode('utf-8')), sep=',', header=0, skiprows=[1])
    if not wave_data.empty:
        wave_data.columns = ['station_id','station_name','time (UTC)','depth','latitude','longitude',variable,'area']
        wave_data = wave_data.drop(['station_id','station_name','depth','latitude','longitude','area'], axis=1)

        #eliminate unusual values
        wave_data = remove_outliers(wave_data, variable)
        wave_data.to_csv(filename, index=False)
    return wave_data

In [8]:
'''
Main function that takes the user inputs as parameters
and returns a pandas DataFrame containing the EMODnet wave height and period data of the desired station within the given time frame.

Inputs are: start time, end time, station id and an optional swell flag.

Station id is assigned according to the given coordinates. 
If that coordinate is not in Ireland stations list, then the closest one within distance limit is calculated and assigned. 

Current distance limit is 200 km.
'''
def get_Emodnet_data(time_start, time_end, station_id, swell=False):
    """
    Collect wave height (VHM0) and wave period data of one EMODnet station.

    Height and period are stored in separate datasets. The period variable is
    the first entry of varlist whose platform list contains the station. For
    every year between time_start and time_end both variables are fetched
    with get_year_data, matched on their 'time (UTC)' column, passed through
    clean_data, cut to the requested time frame and stitched together.

    Parameters
    ----------
    time_start, time_end : str
        Limits of the time frame; the first four characters must be the year.
        They are also used to slice the cleaned data.
    station_id : str or int
        Platform id; compared as str against the platform lists.
    swell : bool, optional
        When true, only a message that no swell data is available is printed.

    Returns
    -------
    pandas.DataFrame
        Two columns named 'Hs(m)_Emodnet(vhm0)_<id>' and
        'T(s)_Emodnet(<period variable>)_<id>', or an empty DataFrame when
        the station is not listed for any period variable or no data was
        found.
    """
    print('Getting data from Emodnet')
    if swell:
        # str() keeps this working for integer ids, like everywhere else in this function
        print('Station ' + str(station_id) + ' has no swell data available')

    start_year = int(time_start[0:4])
    end_year = int(time_end[0:4])
    site_data = pd.DataFrame()

    #height and period has separate databases.
    #only height variable is vhm0 but period differs.
    #find the parameter that station belongs to...
    for var in varlist:
        platforms = get_platforms_erddap(var)
        if not (platforms['PlatformID'] == str(station_id)).any():
            continue
        param = var
        for y in range(start_year, end_year + 1):
            height_data = get_year_data(station_id, y, 'VHM0')
            #if no data returned, go next year
            if height_data.empty: continue
            period_data = get_year_data(station_id, y, param)
            if period_data.empty: continue

            if not height_data['time (UTC)'].equals(period_data['time (UTC)']):
                print('two parameter indexes are not matching')

            # Both frames only carry a positional 0..n-1 index, so copying the
            # period column over would pair values by row number and shift
            # them as soon as one series misses a sample. Match on the
            # timestamp instead; every height row is kept and gets NaN where
            # no period exists. Duplicate period timestamps are dropped first
            # so the merge cannot multiply height rows.
            year_data = height_data.merge(
                period_data.drop_duplicates(subset='time (UTC)'),
                on='time (UTC)', how='left')

            year_data = clean_data(year_data, station_id)[time_start:time_end]
            if site_data.empty: site_data = year_data
            else: site_data = stitch(site_data, year_data)
        if not site_data.empty:
            site_data.columns = ['Hs(m)_Emodnet(vhm0)_'+str(station_id), 'T(s)_Emodnet('+param+')_'+str(station_id)]
        break

    return site_data