In [1]:
import requests
import json
from Config import api_key

In [2]:
import pandas as pd
import concurrent.futures
import time

# Get list of all files

In [None]:
#Synchronous solution - MUCH SLOWER
from IPython.display import clear_output

# Probe file ids 1..9999 one request at a time. Every non-empty 'file_path'
# is collected in `file_paths`; ids without a usable path are counted in `empty`.
file_paths = []
empty = 0
for i in range(1, 10000):
    url = f'https://cchdo.ucsd.edu/api/v1/file/{i}'
    # timeout (seconds) keeps a single stalled connection from hanging the loop
    response = requests.get(url, headers={"X-Authentication-Token": api_key}, timeout=50)
    if response.status_code >= 300:
        # An error reply carries no file metadata; count it as "no path"
        # instead of crashing on the 'file_path' lookup below.
        file_path = ''
    else:
        file_path = json.loads(response.text)['file_path']
    if file_path == '':
        empty += 1
    else:
        file_paths.append(file_path)
    # Overwrite the previous progress line: "<current id> - <empty count>"
    clear_output(wait=True)
    print(str(i) + ' - ' +  str(empty))

In [None]:
# Display the file paths collected by the synchronous loop
file_paths

In [3]:
#Get list all file ids
url = 'https://cchdo.ucsd.edu/api/v1/file'
# timeout (seconds) prevents the cell from hanging on a stalled connection
response = requests.get(url, headers={"X-Authentication-Token": api_key}, timeout=50)
# Fail loudly on a 4xx/5xx reply rather than with a confusing KeyError on 'files'
response.raise_for_status()
raw_json = json.loads(response.text)
# The listing is read as {'files': [{'id': ...}, ...]}; keep only the ids
file_ids = [x['id'] for x in raw_json['files']]

In [6]:
# One metadata endpoint URL per file id
file_metadata_urls = list(map('https://cchdo.ucsd.edu/api/v1/file/{}'.format, file_ids))

In [7]:
#Asynchronous solution
#Get all file metadata
CONNECTIONS = 100  # max_workers for the thread pool
TIMEOUT = 50       # passed to load_url as its `timeout` argument

urls = file_metadata_urls
out = list()       # receives one entry per completed request

def load_url(url, timeout):
    """Fetch one URL from the API and return its decoded JSON body.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The parsed JSON document on success, or the list
        ``['ERROR', url, status_code]`` when the status code is >= 300.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires.
        ValueError: if the response body is not valid JSON.
    """
    # Pass the timeout through; it was previously accepted but never applied,
    # so a stalled connection could block a worker thread indefinitely.
    response = requests.get(
        url,
        headers={"X-Authentication-Token": api_key},
        timeout=timeout,
    )
    if response.status_code >= 300:
        return ['ERROR', url, response.status_code]
    return json.loads(response.text)

# Run load_url for every URL on a thread pool and collect results in `out`
# in completion order (not submission order).
with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
    time1 = time.time()
    # Map each future back to its URL so a failure can be attributed.
    future_to_url = {executor.submit(load_url, url, TIMEOUT): url for url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            data = future.result()
        except Exception as exc:
            # Same ['ERROR', url, detail] shape load_url uses for HTTP errors,
            # so the failing URL is not lost.
            data = ['ERROR', future_to_url[future], repr(exc)]
        # Appended after the try block (not in `finally`) so an interrupt can
        # never record a stale or undefined `data`.
        out.append(data)

        # Progress counter, rewritten in place on one line
        print(str(len(out)),end="\r")

    time2 = time.time()

print(f'Took {time2-time1:.2f} s')

Took 326.92 s


In [8]:
#Verify there are no errors
# Any entry of `out` that is not a dict is treated as a failed request.
errors = list(filter(lambda item: not isinstance(item, dict), out))
len(errors)

0

In [9]:
# Save `out` to data.json as JSON text
with open('data.json', 'w') as outfile:
    outfile.write(json.dumps(out))

In [3]:
# Restore `out` from the JSON saved in data.json
with open('data.json', 'r') as f:
    out = json.loads(f.read())

The `https://cchdo.ucsd.edu/api/v1/file/all` endpoint doesn't return file paths. Is this intentional?

In [None]:
# #Get list of all cruises and their ids
# url = 'https://cchdo.ucsd.edu/api/v1/file/all'
# re = requests.get(url, headers = {"X-Authentication-Token": api_key})
# raw_json = json.loads(re.text)

In [None]:
#raw_json

### Roles, data formats, and data types

Is there a specific subset of these that we want to check?
Currently we only pull files where the `data_type` is `bottle` or `ctd`.

In [43]:
# Distinct 'role' values across all metadata entries
{record['role'] for record in out}

{'',
 'ancillary',
 'archive',
 'dataset',
 'hidden',
 'intermediate',
 'merged',
 'product',
 'raw',
 'unprocessed'}

In [44]:
# Distinct 'data_format' values across all metadata entries
{record['data_format'] for record in out}

{'',
 'cf_netcdf',
 'exchange',
 'hrp_netcdf',
 'joa_binary',
 'matlab',
 'ods',
 'pdf',
 'sbe_ascii_cnv',
 'sbe_hex_xmlcon_24hz',
 'text',
 'whp_netcdf',
 'woce'}

In [45]:
# Distinct 'data_type' values across all metadata entries
{record['data_type'] for record in out}

{'',
 'bottle',
 'ctd',
 'documentation',
 'float',
 'hrp',
 'large_volume',
 'summary',
 'trace_metals'}

In [46]:
# Sum of 'file_size' over entries whose data_format is 'exchange'
# (presumably bytes -- TODO confirm against the API docs)
sum(record['file_size'] for record in out if record['data_format'] == 'exchange')

1947976580

In [5]:
# Absolute download URL for every entry whose data_format is 'exchange'
exchange_urls = []
for record in out:
    if record['data_format'] == 'exchange':
        exchange_urls.append('https://cchdo.ucsd.edu' + record['file_path'])
exchange_urls

['https://cchdo.ucsd.edu/data/45/pr16_2004b_hy1.csv',
 'https://cchdo.ucsd.edu/data/4/ar08_c_hy1.csv',
 'https://cchdo.ucsd.edu/data/18/ais01_09AR20010101_hy1.csv',
 'https://cchdo.ucsd.edu/data/11/64TR19900417_hy1.csv',
 'https://cchdo.ucsd.edu/data/9/ar28_74CH19850120_ct1.zip',
 'https://cchdo.ucsd.edu/data/181/ar16_e_hy1.csv',
 'https://cchdo.ucsd.edu/data/190/33H420090412_ars20_ct1.zip',
 'https://cchdo.ucsd.edu/data/90/58JH19980801_hy1.csv',
 'https://cchdo.ucsd.edu/data/42/35MF20091219_hy1.csv',
 'https://cchdo.ucsd.edu/data/56/35LL19910103_hy1.csv',
 'https://cchdo.ucsd.edu/data/121/320620110219_hy1.csv',
 'https://cchdo.ucsd.edu/data/224/33KI133_1_ct1.zip',
 'https://cchdo.ucsd.edu/data/103/pr21_a_ct1.zip',
 'https://cchdo.ucsd.edu/data/88/ir06_a_ct1.zip',
 'https://cchdo.ucsd.edu/data/176/32MW105_1_ct1.zip',
 'https://cchdo.ucsd.edu/data/162/06AQANTXIII4_hy1.csv',
 'https://cchdo.ucsd.edu/data/92/pr02_i_hy1.csv',
 'https://cchdo.ucsd.edu/data/220/pr17_n_ct1.zip',
 'https://cch

In [6]:
#Download files
CONNECTIONS = 100  # max_workers for the thread pool
TIMEOUT = 50       # passed to load_url as its `timeout` argument

urls = exchange_urls
# NOTE: rebinding `out` replaces the metadata list loaded from data.json;
# from here on it holds one download result per URL.
out = list()
errors = list()    # subset of `out` holding the list-shaped error records


def load_url(url, timeout):
    """Download one file and save it under ``data/``.

    NOTE: this redefines the ``load_url`` from the metadata cell earlier in
    the notebook; whichever cell ran last wins.

    Args:
        url: Address of the file; its last path segment becomes the local
            file name.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        ``'Success'`` after the file is written, or the list
        ``['ERROR', url, status_code]`` when the status code is >= 300.

    Raises:
        requests.exceptions.RequestException: on connection failures or
            when the timeout expires.
        OSError: if the target file cannot be opened or written (for
            example when the ``data/`` directory does not exist).
    """
    # Pass the timeout through; it was previously accepted but never applied.
    response = requests.get(
        url,
        allow_redirects=True,
        headers={"X-Authentication-Token": api_key},
        timeout=timeout,
    )
    if response.status_code >= 300:
        return ['ERROR', url, response.status_code]

    #Download file
    file_name = url.split('/')[-1]
    # Context manager closes the handle deterministically, even if the write
    # fails; with up to CONNECTIONS threads, leaked handles add up quickly.
    with open('data/' + file_name, 'wb') as target:
        target.write(response.content)
    return 'Success'

# Run load_url for every URL on a thread pool. Every result goes to `out`;
# failures (HTTP errors and raised exceptions) also go to `errors`.
with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
    time1 = time.time()
    # Map each future back to its URL so a failure can be attributed.
    future_to_url = {executor.submit(load_url, url, TIMEOUT): url for url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            data = future.result()
        except Exception as exc:
            # Use the same list shape as load_url's HTTP errors. The previous
            # plain-string record slipped past the isinstance(list) filter
            # below, so raised exceptions never reached `errors`.
            data = ['ERROR', future_to_url[future], repr(exc)]
        # Done after the try block (not in `finally`) so an interrupt can
        # never record a stale or undefined `data`.
        if isinstance(data, list):
            errors.append(data)
        out.append(data)
        # Progress counter, rewritten in place on one line
        print(str(len(out)),end="\r")

    time2 = time.time()

print(f'Took {time2-time1:.2f} s')

Took 119.35 s


# Select bottle/ctd files

In [None]:
# Keep only the entries whose data_type is 'bottle' or 'ctd'.
# NOTE(review): `raw_json` here presumably is the list returned by the
# /api/v1/file/all request that is commented out earlier -- confirm before running.
to_download = [entry for entry in raw_json if entry['data_type'] in ('bottle', 'ctd')]

In [None]:
#Check total size to download
# Add up 'file_size' over the selection and print it divided by 1e9
total_size = sum(int(item['file_size']) for item in to_download)

print(f"{round(total_size / 1000000000, 2)} GB")

Most files (8683/8798) don't have a path. What should we do about these files?

In [None]:
# Count the selected entries whose 'file_path' is the empty string
empty_path_count = sum(1 for item in to_download if item['file_path'] == '')
empty_path_count

In [None]:
# Number of entries selected for download
len(to_download)

In [None]:
# Download every selected entry that has a non-empty 'file_path' into data/,
# one request at a time.
for entry in to_download:
    if not entry['file_path']:
        continue  # nothing to fetch for entries without a path
    file_name = entry['file_name']
    file_path = entry['file_path']

    url = f'https://cchdo.ucsd.edu{file_path}'
    # timeout (seconds) stops a stalled connection from blocking the loop
    response = requests.get(
        url,
        allow_redirects=True,
        headers={"X-Authentication-Token": api_key},
        timeout=50,
    )
    if response.status_code >= 300:
        # Do not save an HTTP error body under the data file's name.
        print(f'Skipping {url}: HTTP {response.status_code}')
        continue
    # Context manager closes the file handle even if the write fails.
    with open('data/' + file_name, 'wb') as target:
        target.write(response.content)

In [None]:
# Display the selected entries
to_download