In [1]:
import requests
import math
import time
from pprint import pprint

In [2]:
CMR_OPS = 'https://cmr.earthdata.nasa.gov/search'

In [3]:
provider = 'LPCLOUD'

In [4]:
url = f'{CMR_OPS}/{"granules"}'

In [12]:
collections = ['C2021957657-LPCLOUD', 'C2021957295-LPCLOUD']
datetime_range = '2021-10-17T00:00:00Z,2021-10-17T23:59:59Z'
page_size = 2000

---

## CMR API Request: Get information from multiple pages 

CMR Paging Details - <https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html#paging-details>

In [13]:
def get_page_total(collections, datetime_range, page_size):
    hits = requests.get(url, 
                        params={
                            'concept_id': collections,
                            'temporal': datetime_range,
                            'page_size': page_size,
                            },
                        headers={
                            'Accept': 'application/json'
                            }
                       ).headers['CMR-Hits']
    return math.ceil(int(hits)/page_size)

In [14]:
page_numbers = list(range(1, get_page_total(collections, datetime_range, page_size)+1))
page_numbers

[1, 2, 3, 4, 5]

In [15]:
data_urls = []

In [16]:
start = time.time()
for n in page_numbers:
    print(f'Page: {n}')
    response = requests.get(url,
                            params={
                                'concept_id': collections,
                                'temporal': datetime_range,
                                'page_size': page_size,
                                'page_num': n
                            },
                            headers={
                                'Accept': 'application/json'
                            }
                           )
    print(f'Page {n} Resonse Code: {response.status_code}')
    
    granules = response.json()['feed']['entry']
    print(f'Number of Granules: {len(granules)}')
    
    for g in granules:
        data_urls.extend([x['href'] for x in g['links'] if 'https' in x['href'] and '.tif' in x['href']])
end = time.time()
print(f'Total time: {end-start}')

Page: 1
Page 1 Resonse Code: 200
Number of Granules: 2000
Page: 2
Page 2 Resonse Code: 200
Number of Granules: 2000
Page: 3
Page 3 Resonse Code: 200
Number of Granules: 2000
Page: 4
Page 4 Resonse Code: 200
Number of Granules: 2000
Page: 5
Page 5 Resonse Code: 200
Number of Granules: 121
Total time: 9.523819208145142


In [17]:
len(data_urls)

139005

In [18]:
data_urls[:50]

['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.VZA.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.B01.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.B06.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.B02.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.B03.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30.020/HLS.S30.T59ULA.2021289T235621.v2.0/HLS.S30.T59ULA.2021289T235621.v2.0.SAA.tif',
 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSS30

---

## Parallel Request - NOT COMPLETE

In [21]:
import aiohttp
import asyncio

In [44]:
url = 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=20&page_num=1'

In [45]:
async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get('http://httpbin.org/get') as resp:
            print(resp.status)
            print(await resp.text())

await main()

200
{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "Python/3.9 aiohttp/3.8.1", 
    "X-Amzn-Trace-Id": "Root=1-61bb56c6-1506eddc2b38dfe7164f1247"
  }, 
  "origin": "34.211.243.1", 
  "url": "http://httpbin.org/get"
}



In [46]:
async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            print(resp.status)
            print(await resp.json())

await main()

200
{'feed': {'updated': '2021-12-16T15:10:04.618Z', 'id': 'https://cmr.earthdata.nasa.gov:443/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=20&page_num=1', 'title': 'ECHO granule metadata', 'entry': [{'producer_granule_id': 'HLS.S30.T59ULA.2021289T235621', 'time_start': '2021-10-17T00:01:27.616Z', 'updated': '2021-10-19T21:17:09.302Z', 'dataset_id': 'HLS Sentinel-2 Multi-spectral Instrument Surface Reflectance Daily Global 30m v2.0', 'data_center': 'LPCLOUD', 'title': 'HLS.S30.T59ULA.2021289T235621.v2.0', 'coordinate_system': 'GEODETIC', 'day_night_flag': 'DAY', 'time_end': '2021-10-17T00:01:27.616Z', 'id': 'G2146838075-LPCLOUD', 'original_format': 'ECHO10', 'browse_flag': True, 'polygons': [['54.0214307 167.9468576 54.0523119 169.6221725 55.0388029 169.5885203 55.0067836 167.872404 54.0214307 167.9468576']], 'collection_concept_id': 'C2021957295-LPCLOUD', 'online_ac

In [26]:
async def fn():
  print('hello')
  await asyncio.sleep(1)
  print('world')

await fn()

hello
world


In [19]:
urls = ['https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=1', 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=2', 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=3', 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=4', 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=5'] 

In [20]:
urls

['https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=1',
 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=2',
 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=3',
 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=4',
 'https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LP

In [63]:
async def get(url, session):
    try:
        async with session.get(url=url) as response:
            resp = await response.status_code
            print(f'Successfully got url {url} with status code {resp}')
    except Exception as e:
        print("Unable to get url {} due to {}.".format(url, e.__class__))

In [64]:
async def main(urls):
    async with aiohttp.ClientSession() as session:
        ret = await asyncio.gather(*[get(url, session) for url in urls])
    print("Finalized all. Return is a list of len {} outputs.".format(len(ret)))

In [66]:
start = time.time()
#asyncio.run(main(urls))
await main(urls)
end = time.time()

print("Took {} seconds to pull {} websites.".format(end - start, len(urls)))

Unable to get url https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=5 due to <class 'AttributeError'>.
Unable to get url https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=2 due to <class 'AttributeError'>.
Unable to get url https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&page_size=2000&page_num=3 due to <class 'AttributeError'>.
Unable to get url https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id=C2021957295-LPCLOUD&collection_concept_id=C2021957657-LPCLOUD&temporal=2021-10-17T00:00:00Z,2021-10-17T23:59:59Z&pag