In [15]:
import xarray as xr
import netCDF4 as nc
import pandas as pd
import urllib.request
from urllib.parse import urlparse
import os, json, requests
from bs4 import BeautifulSoup
import ipywidgets as widgets
from collections import defaultdict
import re

# Query the PO.DAAC CMR endpoint for SPURS collections and record, for each
# dataset (keyed by the SeriesName of its first collection citation), the URL
# of its related link whose Type is 'USE SERVICE API'.
spurs_source='https://podaac.jpl.nasa.gov/api/cmr/dataset?format=umm_json&page_size=38&sortField=score&ids=Projects&values=SPURS&search='
series = {}
with urllib.request.urlopen(spurs_source) as url:
    data = json.loads(url.read().decode())
# The response is fully parsed at this point, so the connection is released
# before walking the items.
for item in data['items']:
    umm = item['umm']
    # A collection without any related links has nothing to record; treat a
    # missing or null RelatedUrls entry as an empty list instead of raising
    # KeyError/TypeError and aborting the whole scan.
    for element in umm.get('RelatedUrls') or []:
        if element['Type'] == 'USE SERVICE API':
            dataset_name = umm['CollectionCitations'][0]['SeriesName']
            series[dataset_name] = element['URL']
print('Read all URLs.')
                
# Load the variable -> mapped-name table from remapped_varialbes.txt.
# The first line of the file is skipped; every following line is expected to
# look like "<variable> : <mapped name>".
#   varz_to_mapped_name: variable name -> mapped name (both stripped)
#   mapped_names:        set of all mapped names seen
#   sorted_mapped_names: the mapped names in sorted order
varz_to_mapped_name = {}
mapped_names = set()
# Same pattern as before (greedy, so the split happens at the LAST " : "),
# compiled once instead of being looked up for every line.
mapping_line = re.compile(r"(.*) : (.*)")
with open("remapped_varialbes.txt") as fp:
    # Skip the first line; the default avoids StopIteration on an empty file.
    next(fp, None)
    for line in fp:
        match = mapping_line.match(line)
        if match is None:
            # Blank or malformed line (e.g. a trailing empty line): previously
            # this raised AttributeError on `None.groups()`.
            continue
        variable, mapped_name = (part.strip() for part in match.groups())
        varz_to_mapped_name[variable] = mapped_name
        mapped_names.add(mapped_name)
sorted_mapped_names = sorted(mapped_names)
print('Finished reading remapped_varialbes.txt - sorted_mapped_names: ' + str(len(sorted_mapped_names)))
                
# Restore the coordinate names recorded by earlier runs, if any.
# all_coords.txt holds one name per line; surrounding whitespace is stripped.
all_coords = set()
if os.path.isfile('all_coords.txt'):
    print('Found all_coords.txt file.')
    with open('all_coords.txt', 'r') as f:
        all_coords.update(map(str.strip, f))
    print(all_coords)

# Restore the names of datasets that earlier runs already finished, if any.
# coords_processed_datasets.txt holds one dataset name per line.
processed_datasets = set()
if os.path.isfile('coords_processed_datasets.txt'):
    print('Found coords_processed_datasets.txt file.')
    with open('coords_processed_datasets.txt', 'r') as f:
        processed_datasets |= {entry.strip() for entry in f}
    print(processed_datasets)

# Dataset names that the processing loop must never touch.
skip_datasets = [
    'SPURS-2 research vessel along track SEA-POL rain radar imaging data for E. Tropical Pacific R/V Revelle-2 cruise',
    'SPURS-2 Rawinsonde meteorological data for the E. Tropical Pacific field campaign R/V Revelle cruises',
]

# For every dataset that is neither already processed nor explicitly skipped:
# fetch its listing page, download each linked '.nc' file into tmp/, and append
# any coordinate name not seen before to all_coords.txt. A dataset is appended
# to coords_processed_datasets.txt once all of its files were handled, so an
# interrupted run can resume where it stopped.
base_dir = 'tmp/'
# urlretrieve does not create missing directories, so make sure the download
# directory exists before the first file is fetched.
os.makedirs(base_dir, exist_ok=True)

for dataset_name, url in series.items():
    if dataset_name in processed_datasets or dataset_name in skip_datasets:
        continue
    print('processing dataset: ' + dataset_name)
    response = requests.get(url)
    if not response.ok:
        # An error page contains no '.nc' links; parsing it would record the
        # dataset as processed with no coordinates. Leave it unrecorded so the
        # next run retries it.
        print('Could not read listing for dataset: ' + dataset_name
              + ' (HTTP ' + str(response.status_code) + ')')
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if not href.endswith('.nc') or 'viewers' in href:
            continue
        netcdf = url + href
        file_name = os.path.basename(urlparse(netcdf).path)
        local_path = base_dir + file_name
        urllib.request.urlretrieve(netcdf, local_path)
        # Open the file as a context manager so its handle is closed before
        # the next download instead of leaking one handle per file.
        with xr.open_dataset(local_path, decode_times=False) as netcdf_data:
            # Keep only variables that carry a standard_name attribute.
            with_standard_name = netcdf_data.filter_by_attrs(
                standard_name=lambda v: v is not None)
            new_coords = [coord for coord in with_standard_name.coords
                          if coord not in all_coords]
        with open('all_coords.txt', 'a') as f:
            for coord in new_coords:
                all_coords.add(coord)
                f.write(coord + '\n')
        print('Finished processing file: ' + file_name)
    print('Finished writing coords for dataset: ' + dataset_name)
    with open('coords_processed_datasets.txt', 'a') as f:
        f.write(dataset_name + '\n')
    processed_datasets.add(dataset_name)


print('Done.')

Read all URLs.
Finished reading remapped_varialbes.txt - sorted_mapped_names: 64
Found all_coords.txt file.
{'HEIGHT_RHAT', 'fast_depth', 'ctd_depth', 'DEPTH', 'lat', 'HEIGHT_SW', 'wind_longitude', 'adcp_depth', 'longitude', 'LATITUDE', 'z', 'trajectory', 'latitude', 'ctd_time', 'wind_time', 'HEIGHT_WND', 'fast_time', 'altitude', 'rain_latitude', 'lon', 'TIME', 'adcp_time', 'depth', 'time', 'HEIGHT_LW', 'float_depth', 'salinity_depth', 'gps_time', 'LONGITUDE', 'rain_longitude', 'T', 'rain_time', 'HEIGHT_RAIN', 'HEIGHT_ATMS', 'wind_latitude', 'temperature_depth'}
Found coords_processed_datasets.txt file.
{'SPURS-2 Drifter data for the E. Tropical Pacific field campaign', 'SPURS-2 Central mooring CTD, surface flux and meterorological data for the E. Tropical Pacific field campaign', 'Tenuse Glider CTD data for the SPURS-1 N. Atlantic field campaign', 'SPURS-2 research vessel CTD profile data for E. Tropical Pacific R/V Revelle cruises', 'Argo float CTD profile data within the scope of th