In [1]:
#data processing
import requests, copy, math
import numpy as np
import pandas as pd
import scipy.interpolate

#data visualization
import matplotlib.pylab as plt
from matplotlib import ticker

#used for map projections
import cartopy.crs as ccrs
import cartopy.feature as cft
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

import warnings
warnings.filterwarnings('ignore')

In [92]:

# To see what this polygon looks like, visit https://argovis.colorado.edu/ng/home?mapProj=WM&presRange=%5B0,2000%5D&selectionStartDate=2022-03-31T22:30:45Z&selectionEndDate=2022-04-14T22:30:45Z&threeDayEndDate=2022-04-12T22:30:45&shapes=%5B%5B%5B22.105999,-76.289063%5D,%5B26.902477,-72.597656%5D,%5B26.084682,-66.395555%5D,%5B25.005973,-60.292969%5D,%5B10.833306,-65.566406%5D,%5B11.942098,-72.133142%5D,%5B22.105999,-76.289063%5D%5D%5D&includeRealtime=true&onlyBGC=false&onlyDeep=false&threeDayToggle=false
def check_error_message(ans,writeFlag=False):
    """Inspect an API response for an error payload.

    ans: response JSON from an API query
    writeFlag: bool, True == print the code and message of an error payload, if found
    returns: the error code when ans is a dict carrying both 'message' and 'code';
             NaN in every other case (including empty responses such as [] or None).
    raises: Exception('No data') when the code is >= 400 and is not 404.
    """
    is_error_payload = isinstance(ans,dict) and 'message' in ans and 'code' in ans
    if not is_error_payload:
        # Always NaN here: callers test the result with np.isnan(), which fails on
        # the None that used to fall out of this function for empty responses.
        return np.nan

    if writeFlag:
        print(str(ans['code']) + ': ' + ans['message'])
    ##### NOTE: we should include here below all the codes that do not return data as the user expects
    if ans['code'] >= 400 and ans['code'] != 404:
        print('Data were not returned')
        print(ans)
        raise Exception('No data')
    return ans['code']
#####
# check if the object is a list of dictionaries
def check_list_of_dict(lst,writeFlag=False):
    """Tell whether lst is a non-empty list made only of dicts.

    lst: object to check
    writeFlag: bool, True == verbose mode (prints the item count on success,
               or the object itself on failure)
    returns: 1 if lst is a non-empty list of dicts, 0 otherwise
    """
    looks_right = bool(lst) and isinstance(lst,list) and all(isinstance(item, dict) for item in lst)

    if not looks_right:
        if writeFlag:
            print(lst) 
        return 0

    if writeFlag:
        print('Number of items: '+str(len(lst)))
    return 1
######
def get_data_from_url(url,myAPIkey,writeFlag=False):
    """Query url and return the decoded JSON payload.

    url: string url to attempt to query
    myAPIkey: string API key, get yours at https://argovis-apikey-manager-atoc-argovis-dev.apps.containers02.colorado.edu/
              myAPIkey can also be left empty '', yet in this case the user is more likely
              to exceed API request limits (and get HTTP 403 errors).
    writeFlag: bool, True == verbose mode (forwarded to the checks, and prints url on success)
    returns: the response as a list of dicts; empty list if the API answered with a 404 code.
    raises: Exception('No data') if the request, the JSON decoding or the error check fails;
            Exception('Check object type and error code') if the payload is neither a 404
            nor a non-empty list of dicts.
    """
    try:
        d_raw = requests.get(url,headers={"x-argokey": myAPIkey}).json()
        ans = check_error_message(ans=d_raw,writeFlag=writeFlag)
    except Exception as err:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate;
        # the original failure is chained to keep it visible in the traceback.
        print(url)
        raise Exception('No data') from err

    if ans == 404:
        return []

    # np.isnan() raises TypeError on non-numeric values (e.g. None), so only call it on floats.
    no_error_reported = isinstance(ans, float) and np.isnan(ans)
    # check that data are a list of dictionaries as expected
    if no_error_reported and check_list_of_dict(lst=d_raw,writeFlag=writeFlag) == 1:
        if writeFlag:
            print(url)
        return d_raw

    print(ans)
    raise Exception('Check object type and error code')
#####
def create_url(url_prefix, \
               startDate='',endDate='', \
               radius_km=[],center=[], \
               polygon=[],data='',presRange='', \
               source='',platform_id='',woceline='',profile_id=''):
    """Build the query URL for the requested filters (all filters are ANDed together).

    url_prefix: string root of API routes
    startDate [endDate]: string start [end] date to filter documents on, in ISO 8601 UTC datestrings ie 1999-12-31T00:00:00Z
    radius_km: distance to search in proximity search (number or string); must be passed with center
    center: [lon, lat] list (or an already formatted string) of center of proximity search; must be passed with radius_km
    polygon: [[lon0, lat0], [lon1, lat1], ... [lon0, lat0]] list of lists (or its string form) of lon/lat pairs describing
             polygon bounding box for region search; first coord must == last coord
    data: comma delimited string of data variables to search for ANDed together, ie 'pres,temp,doxy'. Admits negation
          ('pres,temp,~doxy'); will return the actual measurements listed and filter for profiles that have them.
          Get metadata only by including 'metadata-only'.
    presRange: comma delimited string indicating min and max pressure to return levels for, ie '0,100' for top 100 dbar
    source: comma delimited string of data sources, ANDed together, such as 'argo_core' or 'cchdo_go-ship'. Accepts negation, ie 'argo_core,~argo_bgc'
    platform_id: string indicating ID of Argo platform to search for
    woceline: string indicating WOCE line to search for
    profile_id: string indicating profile ID to search for
    returns: string URL for performing the desired search.
    """

    def _as_text(value, sep=None):
        # Strings pass through untouched (existing callers); other values are stringified so
        # the documented float / list inputs no longer fail on str concatenation.
        if isinstance(value, str):
            return value
        if sep is not None and isinstance(value, (list, tuple)):
            return sep.join(str(item) for item in value)
        return str(value)

    url = url_prefix

    if startDate:
        url = url + '&startDate=' + startDate
    if endDate:
        url = url + '&endDate=' + endDate

    # regional queries: proximity search takes precedence over polygon search
    if radius_km and center:
        # NOTE(review): a [lon, lat] list is serialised as 'lon,lat' -- confirm against the API.
        url = url + '&radius=' + _as_text(radius_km) + '&center=' + _as_text(center, sep=',')
    elif polygon:
        url = url + '&polygon=' + _as_text(polygon)

    # remaining filters are plain strings, appended in a fixed order when non-empty
    string_filters = (
        ('data', data),                # queries by variable data
        ('presRange', presRange),      # queries by pressure range
        ('source', source),            # queries by source
        ('platform_id', platform_id),  # queries by platform id
        ('woceline', woceline),        # queries by woceline
        ('id', profile_id),            # queries by _id
    )
    for key, value in string_filters:
        if value:
            url = url + '&' + key + '=' + value

    return url
#####
def get_data_for_timeRange(startDate,endDate,url_prefix, \
                     myAPIkey,\
                     radius_km=[],center=[], \
                     polygon=[],data='',presRange='', \
                     source='',platform_id='',woceline='', \
                     dt_tag='d',writeFlag=False):
    """Download the data between startDate and endDate, one request per time window.

    All inputs are as in create_url, except:
    myAPIkey: string API key for Argovis API
    dt_tag: frequency tag as defined at https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases;
            determines how much data is downloaded per request (we suggest 'd' to avoid requesting too much
            data all at once; it should be tuned based e.g. on the size of the region of interest)
    writeFlag: bool, True == verbose mode, forwarded to get_data_from_url
    returns: a dataframe describing the data returned by the specified query string filters
    """
    boundaries = create_list_of_days(startDate,endDate,dt_tag=dt_tag)

    records = []
    # consecutive boundaries delimit one request each
    for window_start, window_end in zip(boundaries[:-1], boundaries[1:]):
        query = create_url(url_prefix=url_prefix,
                           startDate=window_start,
                           endDate=window_end,
                           radius_km=radius_km,center=center,
                           polygon=polygon,data=data,presRange=presRange,
                           source=source,platform_id=platform_id,woceline=woceline)
        records.extend(get_data_from_url(url=query,myAPIkey=myAPIkey,writeFlag=writeFlag))

    return pd.DataFrame(records)

def create_list_of_days(startDate,endDate,dt_tag='d'): 
    """List the datestrings that delimit the request windows between startDate and endDate.

    startDate, endDate: datestrings, ie 1999-12-31T00:00:00Z
    dt_tag: pandas frequency tag, could be '30T', 'd', ...
    returns: list of '%Y-%m-%dT%H:%M:%SZ' strings spaced by dt_tag starting at startDate.
             When the last generated string differs from endDate, one more entry is appended:
             the first 11 characters of endDate followed by '23:59:59Z'.
    """
    stamps = pd.date_range(startDate,endDate,freq=dt_tag)
    # an all-empty frame is used only to reach DataFrame.between_time on this index
    placeholder = pd.DataFrame(columns=['NULL'],index=stamps)
    kept = placeholder.between_time('00:00','23:59').index
    list_of_days = kept.strftime('%Y-%m-%dT%H:%M:%SZ').tolist()

    if list_of_days[-1] == endDate:
        return list_of_days

    list_of_days.append(endDate[0:11]+'23:59:59Z')
    return list_of_days

def get_info_from_df(df,info_to_store):
    """Pack selected per-profile information from a dataframe into a dictionary of lists.

    df: dataframe as returned by ie get_data_for_timeRange
    info_to_store: list of strings indicating variables of interest, among
                   'lon', 'lat', 'date', 'cols_bySource', 'ids', 'woce_line'
    returns: dictionary mapping each requested name to its list of values (one per row of df);
             names for which nothing was collected (ie 'woce_line' when df has no such column,
             or any name when df is empty) are left out.
    raises: TypeError if df is not a DataFrame; NameError if info_to_store holds an unknown name.
    """
    if not isinstance(df,pd.DataFrame):
        # previously this fell through to an unbound local at the return statement
        raise TypeError('df must be a pandas DataFrame, got ' + type(df).__name__)

    # explicit name -> list table; replaces eval() on caller-supplied strings
    collected = {'lon': [], 'lat': [], 'date': [], 'cols_bySource': [], 'ids': [], 'woce_line': []}

    unknown = [name for name in info_to_store if name not in collected]
    if unknown:
        raise NameError('unknown info_to_store entries: ' + str(unknown))

    n_rows = len(df)
    # columns are only touched when there are rows, so an empty frame (no columns) yields {}
    if n_rows:
        # values are read positionally (tolist), so a non 0..n-1 index is fine
        if 'lon' in info_to_store or 'lat' in info_to_store:
            coordinates = [geo['coordinates'] for geo in df['geolocation'].tolist()]
            collected['lon'] = [pair[0] for pair in coordinates]
            collected['lat'] = [pair[1] for pair in coordinates]
        if 'date' in info_to_store:
            # drop the last five characters of the timestamp and close it with 'Z'
            collected['date'] = [stamp[0:-5]+'Z' for stamp in df['timestamp'].tolist()]
        if 'ids' in info_to_store:
            collected['ids'] = df['_id'].tolist()
        if 'cols_bySource' in info_to_store:
            # Every row gets the constant 'y' (presumably a plotting colour -- confirm); the
            # per-row source list that used to be accumulated here was never used.
            collected['cols_bySource'] = ['y'] * n_rows
        if 'woce_line' in info_to_store and 'woce_line' in df.keys():
            collected['woce_line'] = df['woce_line'].tolist()

    dict_info = {}
    for name in info_to_store:
        values = collected[name]
        if not values:
            continue
        if len(values) != n_rows:
            raise Exception('check length')
        dict_info[name] = values
    return dict_info

In [None]:
def expand_lon_lat(df):
    """Return a copy of df where 'geolocation' is replaced by separate 'lon' and 'lat' columns.

    df: dataframe with a 'geolocation' column whose cells hold a two-element 'coordinates' entry
    returns: a new dataframe without 'geolocation' (and without any 'lon_lat' column), with
             'lon' taken from the first coordinate and 'lat' from the second. df is left untouched.
    """
    coordinates = df['geolocation'].apply(lambda cell: cell['coordinates']).to_list()
    lon_lat = pd.DataFrame(coordinates, columns=['lon', 'lat'])

    # drop() returns a new frame, so the caller's dataframe is not modified
    expanded = df.drop(columns=['geolocation', 'lon_lat'], errors='ignore')

    # Assign by position (plain arrays): lon_lat has a fresh 0..n-1 index, and an
    # index-aligned assignment would produce NaN whenever df is indexed differently.
    expanded['lon'] = lon_lat['lon'].to_numpy()
    expanded['lat'] = lon_lat['lat'].to_numpy()

    return expanded

In [None]:
def expand_source_info(df):
    """Return a copy of df where 'source_info' is replaced by a comma-joined 'source' string.

    df: dataframe with a 'source_info' column; each cell is a sequence whose first item
        has a 'source' entry holding a list of strings
    returns: a new dataframe without 'source_info' and with a 'source' column built from the
             first item only of each cell. df is left untouched.
    """
    def get_source(cell):
        # only the first source_info entry is used
        return ','.join(cell[0]['source'])

    source = df['source_info'].apply(get_source)

    # drop()/assign() both return new frames, so the caller's dataframe is not modified
    return df.drop(columns=['source_info']).assign(source=source)

In [None]:
def lmm_parse_df(df):
    """Flatten a dataframe of profiles into one row per entry of each profile's 'data'.

    df: dataframe with a 'data' column (each cell convertible to a DataFrame) plus
        per-profile columns
    returns: dataframe stacking every profile's data rows, each row also carrying the
             profile's other column values; '_id' is renamed 'profile_id' and 'source',
             when present, is the last column. The per-profile row index is kept, so
             index values repeat across profiles.
    A column whose value cannot be assigned to a profile is reported with a printed
    message and skipped for that profile.
    """
    # 'source' is attached last so that it stays the final column
    meta_columns = [name for name in df.columns if name not in ('data', 'source')]
    if 'source' in df.columns:
        meta_columns.append('source')

    profiles = []
    for i in range(df.shape[0]):
        row = df.iloc[i]
        profile = pd.DataFrame(row['data'])

        for name in meta_columns:
            try:
                # Assigned per profile: setting 'source' once on the stacked frame aligned
                # it on the repeated row index and gave rows another profile's source.
                profile[name] = row[name]
            except Exception:
                print(f"error with {name}")

        profiles.append(profile)

    # one concat at the end instead of re-copying the growing frame on every iteration
    master = pd.concat(profiles) if profiles else pd.DataFrame()

    master = master.rename(columns={'_id': 'profile_id'})

    return master

In [136]:
# #cell to change

# atlantic_coords = [[-40.078125,29.840644],[-33.368671,30.338837],[-26.614528,30.492027],
#                     [-19.863281,30.297018],[-20.039063,-30.145127],[-26.724822,-30.384017],
#                     [-33.419918,-30.281826],[-40.078125,-29.840644],[-40.078125,29.840644]]
# presRange ='[0,500]'
# shape = str(atlantic_coords)
# URL_PREFIX = 'https://argovis-api.colorado.edu'
# startDate = '2021-04-20T00:00:00Z'
# endDate   = '2021-05-02T00:00:00Z'

In [137]:
# df = get_data_for_timeRange(startDate=startDate,endDate=endDate, \
#                                 url_prefix=URL_PREFIX+'/profiles?', \
#                                 myAPIkey='', \
#                                 source='argo_core', \
#                                 polygon=shape, data='psal,temp,psal_argoqc,pres_argoqc,temp_argoqc', \
#                                 dt_tag='365d',writeFlag=True)

# #if we need more variables add to data=

Number of items: 309
https://argovis-api.colorado.edu/profiles?&startDate=2021-04-20T00:00:00Z&endDate=2021-05-02T23:59:59Z&polygon=[[-40.078125, 29.840644], [-33.368671, 30.338837], [-26.614528, 30.492027], [-19.863281, 30.297018], [-20.039063, -30.145127], [-26.724822, -30.384017], [-33.419918, -30.281826], [-40.078125, -29.840644], [-40.078125, 29.840644]]&data=psal,temp,psal_argoqc,pres_argoqc,temp_argoqc&source=argo_core


In [138]:
def parse_df(df):
    """Stack the 'data' of every profile in df into one dataframe tagged with the profile id.

    df: dataframe with a 'data' column (each cell convertible to a DataFrame) and an '_id' column
    returns: dataframe with one row per entry of each profile's data plus a 'profile_id'
             column; the per-profile row index is kept, so index values repeat across
             profiles. An empty df gives an empty dataframe.
    """
    profiles = []
    for i in range(df.shape[0]):
        row = df.iloc[i]
        profile = pd.DataFrame(row['data'])
        profile['profile_id'] = row['_id']
        profiles.append(profile)

    if not profiles:
        return pd.DataFrame()

    # one concat at the end instead of re-copying the growing frame on every iteration
    return pd.concat(profiles)

In [139]:
# parse_df(df)

Unnamed: 0,pres,pres_argoqc,psal,psal_argoqc,temp,temp_argoqc,profile_id
0,1.120000,1,37.294998,1.0,26.806999,1.0,3901237_157
1,2.040000,1,37.293999,1.0,26.806,1.0,3901237_157
2,3.000000,1,37.293999,1.0,26.809,1.0,3901237_157
3,3.960000,1,37.293999,1.0,26.808001,1.0,3901237_157
4,4.960000,1,37.293999,1.0,26.808001,1.0,3901237_157
...,...,...,...,...,...,...,...
505,1798.649902,1,34.972088,1.0,3.989,1.0,5905148_130
506,1848.250000,1,34.969093,1.0,3.897,1.0,5905148_130
507,1898.549927,1,34.968094,1.0,3.821,1.0,5905148_130
508,1947.949951,1,34.967094,1.0,3.738,1.0,5905148_130


In [None]:
# time window of the query, as ISO 8601 UTC datestrings
startDate = '2021-04-25T00:00:00Z'
endDate   = '2021-05-01T00:00:00Z'

# pressure range selection
presRange ='[0,500]'

# closed polygon: the last vertex repeats the first one
atlantic_coords = [
    [-40.078125,29.840644],
    [-33.368671,30.338837],
    [-26.614528,30.492027],
    [-19.863281,30.297018],
    [-20.039063,-30.145127],
    [-26.724822,-30.384017],
    [-33.419918,-30.281826],
    [-40.078125,-29.840644],
    [-40.078125,29.840644],
]

# string form of the polygon, as used for the polygon filter
shape = str(atlantic_coords)



def loop_fetch(shape, presRange, date_range):
    """Fetch argo_core profiles for several time windows, save each window to CSV, return them all.

    shape: polygon filter, passed as the polygon argument of get_data_for_timeRange
    presRange: pressure range filter; accepts 'min,max', '[min,max]' or a [min, max] list.
               An empty value disables the filter.
    date_range: iterable of (startDate, endDate) datestring pairs, one query per pair
    returns: dataframe concatenating the results of every pair (fresh 0..n-1 index);
             an empty dataframe when date_range is empty.
    Side effect: writes '2021_argo_core_dataframe_<start date>.csv' in the working
    directory for every pair.
    """
    URL_PREFIX = 'https://argovis-api.colorado.edu'

    # presRange used to be accepted but never sent with the query; it is now forwarded,
    # after being reduced to the comma delimited 'min,max' form.
    if isinstance(presRange, (list, tuple)):
        pres_filter = ','.join(str(bound) for bound in presRange)
    elif presRange:
        pres_filter = str(presRange).replace(' ', '').strip('[]')
    else:
        pres_filter = ''

    df_list = []

    for startDate, endDate in date_range:

        df = get_data_for_timeRange(startDate=startDate,endDate=endDate, \
                                        url_prefix=URL_PREFIX+'/profiles?', \
                                        myAPIkey='', \
                                        source='argo_core', \
                                        polygon=shape, data='psal,temp,psal_argoqc,pres_argoqc,temp_argoqc', \
                                        presRange=pres_filter, \
                                        dt_tag='365d',writeFlag=True)

        # the file is named after the date part of the window start
        start_date = startDate.split('T')[0]

        df.to_csv(f'2021_argo_core_dataframe_{start_date}.csv', index=False)

        df_list.append(df)

    # pd.concat raises on an empty list
    if not df_list:
        return pd.DataFrame()

    df_all = pd.concat(df_list, ignore_index=True)

    return df_all
    
