# Data Download

This notebook downloads metadata used in the coverage notebooks.

1. Run top cell
2. Navigate to product of interest
3. Ensure credentials are set in text file for the corresponding API. An example credentials file is provided (credentials.txt). This should be placed in a 'credentials' folder.
4. Configure the settings in the corresponding section
5. Run

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import datetime
from datetime import date
import shapely.wkt
import geopandas as gpd
import cartopy.crs as ccrs
import cartopy
import rasterio
from rasterio.features import rasterize
import collections
import json
import os
import pandas as pd
import math
import geojson
import time
from shapely.geometry.polygon import Polygon
import requests
from tqdm import tqdm
import numpy as np

# metadata API's
from sentinelsat import SentinelAPI
from landsatxplore.api import API as landsatAPI
import cartopy.crs as ccrs
from cmr import CollectionQuery, GranuleQuery, ToolQuery, ServiceQuery, VariableQuery
import earthaccess
from eodms_rapi import EODMSRAPI


# Widen pandas' display limits so wide metadata tables print without truncation.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.width', 1000,
    'display.max_colwidth', 200,
    'display.max_columns', 500,
    'display.max_rows', 500,
)

# Global Settings

In [None]:
metadata_folder = 'metadata'  # where to save metadata
credentials_folder = 'credentials'  # where to look for the corresponding credential file

# Create the folders up front. exist_ok=True replaces the check-then-create
# pattern, so re-running the cell (or a concurrent creation) cannot fail.
fs = [metadata_folder, credentials_folder]
for f in fs:
    os.makedirs(f, exist_ok=True)

# Functions

In [None]:
def save_metadata(filepath, results):
    """Write ``results`` to a JSON file, merging into the file if it exists.

    When ``filepath`` does not exist it is created from ``results``. Otherwise
    the existing JSON object is loaded, entries whose key is not yet present
    are added, entries whose key already exists are left untouched, and the
    merged object is written back.

    Args:
        filepath (str): full path of the JSON file.
        results (dict): products indexed by an id that is used to exclude
            duplicates. Non-string ids are compared using the string form
            JSON stores them under.

    Returns:
        tuple: ``(duplicate_count, added_count)``.
    """
    def _json_key(key):
        # JSON object keys are always strings, so keys read back from the file
        # are strings. Convert an incoming key the same way json.dumps does;
        # otherwise e.g. the int 1 never matches the stored "1" and the mixed
        # str/int keys make json.dumps(sort_keys=True) raise TypeError.
        if isinstance(key, str):
            return key
        if key is True:
            return 'true'
        if key is False:
            return 'false'
        if key is None:
            return 'null'
        return str(key)

    duplicate_count, added_count = 0, 0
    print(f'saving data to file {filepath}')
    if not os.path.exists(filepath):
        with open(filepath, "w") as fp:
            fp.write(json.dumps(results, indent=4, sort_keys=True, default=str))
        added_count = len(results)
        print(f'file created, {added_count} products added')
    else:
        with open(filepath, 'r') as f:
            saved_data = json.load(f)
        for k, v in results.items():
            key = _json_key(k)
            if key not in saved_data:
                saved_data[key] = v
                added_count += 1
            else:
                duplicate_count += 1
        print('updating metadata file...')
        print(f'{added_count} products added, {duplicate_count} already exist and are ignored')
        with open(filepath, "w") as fp:
            fp.write(json.dumps(saved_data, indent=4, sort_keys=True, default=str))
    return duplicate_count, added_count

def combine_subfolders(subfolders, outpath):
    """Merge every JSON file found in ``subfolders`` into one JSON file.

    Each file is expected to hold a JSON object. Entries are merged by key and
    the first occurrence of a key wins. Files are read in sorted name order so
    the outcome does not depend on the directory listing order.

    Args:
        subfolders (list): folders whose files are all read as JSON.
        outpath (str): path of the combined JSON file to write.

    Returns:
        None. Nothing is written when ``subfolders`` is empty.
    """
    if not subfolders:
        return

    # Accumulate across ALL subfolders and write once. Resetting the dict and
    # rewriting outpath per subfolder would keep only the last subfolder.
    results = {}
    for sf in subfolders:
        print(sf)
        for file in tqdm(sorted(os.listdir(f'{sf}'))):
            file_name = f'{sf}/{file}'
            with open(file_name, 'r') as f:
                saved_data = json.load(f)
            for k, v in saved_data.items():
                results.setdefault(k, v)

    with open(outpath, "w") as fp:
        fp.write(json.dumps(results, indent=4, sort_keys=True, default=str))
    print(outpath)
            
def search_cmr_data(filters, min_granules=0):
    """Find CMR collections whose title matches every filter and that have granules.

    Reads ``cmr_provider_holdings.xml`` from the working directory, keeps the
    holdings whose ``entry-title`` matches all ``filters``, queries each
    matching provider for its collections, filters those on ``dataset_id`` the
    same way, and finally keeps the collections with at least
    ``min_granules`` granules.

    Args:
        filters (list): regular expressions matched against the upper-cased
            title; a title must match all of them.
        min_granules (int): minimum granule count for a collection to be kept.

    Returns:
        pandas.DataFrame: matching collections with an added ``granules``
        column. An empty frame with the columns ``dataset_id``,
        ``short_name`` and ``granules`` is returned when nothing matches.
    """
    def _matches_all(titles):
        # na=False: a missing title must count as "no match". Without it the
        # NaN propagates through the product and bool(nan) is True.
        masks = [titles.str.upper().str.contains(x, na=False) for x in filters]
        if not masks:
            return np.ones(len(titles), dtype=bool)
        return np.logical_and.reduce(masks)

    df = pd.read_xml('cmr_provider_holdings.xml')
    data = df[_matches_all(df['entry-title'])]

    # The holdings file has no short names, so query each matching provider
    # for its collections to obtain them.
    dfs = []
    for provider in data['provider-id'].unique():
        provider_df = pd.DataFrame(CollectionQuery().provider(provider).get())
        if 'dataset_id' not in provider_df.columns:
            continue
        dfs.append(provider_df[_matches_all(provider_df['dataset_id'])])

    if not dfs:
        # pd.concat([]) raises; return an empty result with the columns the
        # notebook cells select.
        return pd.DataFrame(columns=['dataset_id', 'short_name', 'granules'])
    data = pd.concat(dfs)

    # Count granules once per distinct short name.
    has_granules = {}
    for s in data['short_name'].unique():
        n_found = GranuleQuery().short_name(s).hits()
        if n_found >= min_granules:
            has_granules[s] = n_found

    # .copy() so the new column is added to an independent frame rather than
    # to a slice of `data`.
    datasets = data[data['short_name'].isin(list(has_granules))].copy()
    datasets['granules'] = datasets['short_name'].map(has_granules)
    return datasets


# Sentinel

## Credentials

In [None]:
# Read the Copernicus hub credentials file. The login is taken from the second
# line and the password from the third: everything after the label, minus the
# single separator character that follows it.
with open(f"{credentials_folder}/credentials_cophub.txt", "r") as f:
    txt = str(f.read())
user, password = (
    txt.split('\n')[row].split(label)[-1][1:]
    for row, label in ((1, 'login'), (2, 'password'))
)

In [None]:
# Create the sentinelsat client from the login/password read from the credentials file.
# NOTE: `api` is a notebook-wide name that later sections rebind to other clients.
api = SentinelAPI(user, password)

## Satellite and Product Definitions

In [None]:
# --- product selection: keep exactly one of the groups below active ---

# Sentinel-3 (selected by instrument; no product type filter)
platformname = "Sentinel-3"
producttype = None  # 'OLCI'
instrumentshortname = "OLCI"

# Sentinel-2
# platformname = "Sentinel-2"
# # producttype = 'S2MSI1C'
# producttype = 'S2MSI2Ap'
# instrumentshortname = None

# Sentinel-1
# platformname = "Sentinel-1"
# producttype = 'GRD'  # 'SLC' # 'GRD'
# instrumentshortname = None

# --- search window ---
start_date, end_date = date(2017, 10, 1), date(2018, 8, 1)
chunk_days = 10  # length in days of each sub-search; set to False to search the whole range at once
save_every = 3  # write results to file every `save_every` chunks

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180);
# the names are the reverse of the compass directions but are used consistently.
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)

## ESA using sentinelsat Python API

In [None]:
# Search area as a WKT polygon (used for plotting) and as an ENVELOPE
# expression (passed to the API queries in the following cells).
footprint = f"POLYGON (({east} {north}, {east} {south}, {west} {south}, {west} {north}, {east} {north}))"
envelope = f"ENVELOPE({east}, {west}, {north}, {south})"

# plot the search footprint
plt.rcParams["figure.figsize"] = [10,8]
ax = plt.axes(projection=ccrs.PlateCarree(), title='Search Area')
ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.add_geometries(shapely.wkt.loads(footprint), crs=ccrs.PlateCarree(), alpha=0.5)

import ast
from shapely.wkt import loads
from shapely.geometry import mapping
import geojson

# GeoJSON representation of the footprint. The string is JSON, so parse it with
# the JSON parser: ast.literal_eval only accepts Python literals and fails on
# JSON's true/false/null.
geojson_string = geojson.dumps(mapping(loads(footprint)))
geojson_dict = json.loads(geojson_string)

In [None]:
# Ask the API how many products match before downloading any metadata.
print(f'searching for {platformname} {producttype} from {start_date} to {end_date}')
count = api.count(
    envelope,
    platformname=platformname,
    instrumentshortname=instrumentshortname,
    producttype=producttype,
    date=(start_date, end_date),
)
print(f'{count} products found')

## Download Data

In [None]:
download_metadata = True
save_results = True
if download_metadata:
    # The output file is named after the product type when one is set,
    # otherwise after the instrument short name.
    prod_id = producttype if producttype is not None else instrumentshortname
    filepath = os.path.join(metadata_folder,f'{platformname}_{prod_id}_{north}N_products.json')
    results = {}
    if not chunk_days:
        # search the date range as is
        results = api.query(envelope,
                            date=(start_date,end_date),
                            producttype=producttype,
                            platformname=platformname,
                            instrumentshortname=instrumentshortname)
        # the un-chunked search must be saved too, otherwise it is lost
        if save_results:
            save_metadata(filepath, results)
    else:
        # Chunk boundaries: every `chunk_days` days from start_date, closed by
        # end_date. end_date is only appended when it is not already the last
        # boundary, which avoids a zero-length final chunk.
        end_ts = pd.Timestamp(end_date)
        dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
        if len(dates) < 2 or dates[-1] != end_ts:
            dates.append(end_ts)
        n_chunks = len(dates) - 1
        for d in range(n_chunks):
            print(f'downloding chunk {d+1} of {n_chunks} ({dates[d]} to {dates[d+1]})')
            results_chunk = api.query(envelope,
                                      date=(dates[d],dates[d+1]),
                                      producttype=producttype,
                                      platformname=platformname,
                                      instrumentshortname=instrumentshortname)
            # keep the first occurrence of every product id
            for k in results_chunk:
                results.setdefault(k, results_chunk[k])

            # flush to file every `save_every` chunks and after the last chunk
            if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
                save_metadata(filepath, results)
                results = {}


# Landsat

| Dataset Name | Dataset ID |
|---|---|
| Landsat 5 TM Collection 2 Level 1 | landsat_tm_c2_l1 |
| Landsat 5 TM Collection 2 Level 2 | landsat_tm_c2_l2 |
| Landsat 7 ETM+ Collection 2 Level 1 | landsat_etm_c2_l1 |
| Landsat 7 ETM+ Collection 2 Level 2 | landsat_etm_c2_l2 |
| Landsat 8 Collection 2 Level 1 | landsat_ot_c2_l1 |
| Landsat 8 Collection 2 Level 2 | landsat_ot_c2_l2 |
| Landsat 9 Collection 2 Level 1 | landsat_ot_c2_l1 |
| Landsat 9 Collection 2 Level 2 | landsat_ot_c2_l2 |

## Credentials and API

In [None]:
# Read the USGS EROS credentials file. Login, password and email are taken from
# the second, third and fourth lines: everything after the label, minus the
# single separator character that follows it.
with open("credentials/credentials_usgs_eros.txt", "r") as f:
    txt = str(f.read())
uid, pswd, email = (
    txt.split('\n')[row].split(label)[-1][1:]
    for row, label in ((1, 'login'), (2, 'password'), (3, 'email'))
)

In [None]:
# Initialize a new landsatxplore API instance with the USGS credentials.
# NOTE: this rebinds the notebook-wide name `api` used by the download cell below.
api = landsatAPI(uid, pswd)

# Perform a check request against the "dataset-catalogs" endpoint and show the
# response, to confirm the client can talk to the service.
response = api.request(endpoint="dataset-catalogs")
print(response)

## Settings

In [None]:
# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# --- search window ---
# start_date = '2022-01-01'
# end_date = '2022-12-31'
start_date, end_date = date(2020, 1, 1), date(2023, 12, 31)
chunk_days = 10  # length in days of each sub-search; set to False to search the whole range at once
save_every = 3  # write results to file every `save_every` chunks

# --- dataset (see the table of dataset ids above) ---
# dataset = 'landsat_tm_c2_l2'   # lsat 5
# dataset = 'landsat_etm_c2_l1'  # lsat 7
dataset = 'landsat_ot_c2_l2'  # lsat 8/9

# 50000 is the max per search; use chunk_days to break up larger downloads
max_results = 50_000
folder = 'metadata/'

## Download Data

In [None]:
download_metadata = True
save_results = True
if download_metadata:
    filepath = os.path.join(metadata_folder, f'{dataset}_{north}N_products.json')
    results = {}
    if not chunk_days:
        # Search the date range as is. The dates are passed as ISO strings;
        # start_date/end_date are datetime.date objects, which have no .date().
        results_chunk = api.search(
            dataset=dataset,
            bbox=bbox,
            start_date=str(pd.Timestamp(start_date).date()),
            end_date=str(pd.Timestamp(end_date).date()),
            max_results=max_results,
        )
        # index by product id and save, as the chunked branch does
        results = {r['landsat_product_id']: r for r in results_chunk}
        if save_results:
            save_metadata(filepath, results)

    else:
        # Chunk boundaries: every `chunk_days` days from start_date, closed by
        # end_date so the last chunk does not run past the requested range.
        end_ts = pd.Timestamp(end_date)
        dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
        if len(dates) < 2 or dates[-1] != end_ts:
            dates.append(end_ts)
        n_chunks = len(dates) - 1
        for d in range(n_chunks):
            print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
            time.sleep(5) # sleep for 5 seconds, stop timeout
            results_chunk = api.search(
                dataset=dataset,
                bbox=bbox,
                start_date=str(dates[d].date()),
                end_date=str(dates[d+1].date()),
                max_results=max_results,
            )
            print(f'number of products downloaded: {len(results_chunk)}')
            # keep the first occurrence of every product id
            for r in results_chunk:
                results.setdefault(r['landsat_product_id'], r)

            # flush to file every `save_every` chunks and after the last chunk
            if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
                save_metadata(filepath, results)
                results = {}

In [None]:
# Sample download
# from landsatxplore.earthexplorer import EarthExplorer
# ee = EarthExplorer(uid, pswd)
# #ee.download('LC08_L1GT_008113_20221123_20221205_02_T2', output_dir='data')
# #ee.download('LC08_L1GT_041112_20180204_20201016_02_T2', output_dir='data')
# #ee.download('LC08_L2SR_041112_20180204_20201016_02_T2', output_dir='data')
# #LC08_L2SR_058118_20191028_20201016_02_T2
# #LC08_L1GT_058118_20191028_20201016_02_T2
# ee.download('LC08_L2SR_058118_20191028_20201016_02_T2', output_dir='data')
# ee.logout()

# MODIS and VIIRS

## Settings

In [None]:
# CMR granule query client (the download cell creates a fresh one per chunk)
api = GranuleQuery()

# --- search window, split into `chunk_days`-day requests ---
start_date, end_date = date(2018, 1, 1), date(2021, 1, 1)
chunk_days = 30

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# --- output controls ---
download_metadata, save_results = True, True
save_every = 10  # write results to file every `save_every` chunks
folder = 'metadata/'

## Download

In [None]:
# MODIS platforms to search; add 'TERRA' to include Terra.
modis_sats = ['AQUA']  # , 'TERRA'

# MODIS product suffixes. The download cell prefixes them with MOD (Terra) or
# MYD (Aqua) to form the collection short name. Uncomment a line to include it.
modis_list = [
    # '01',     # Level 1A Scans of raw radiances in counts
    # '02QKM',  # Level 1B Calibrated Radiances - 250m
    # '02HKM',  # Level 1B Calibrated Radiances - 500m
    # '021KM',  # Level 1B Calibrated Radiances - 1km (2010-07-02 start AQUA)
    # '02SSH',  # Level 1B Subsampled Calibrated Radiances 5km
    # '02OBC',  # Level 1B Onboard Calibrator/Engineering Data
    # '03',     # Geolocation - 1km
    '09',  # L2 Surface Reflectance, 5-Min Swath 250m, 500m, and 1km
]

In [None]:
# VIIRS platforms to search; add 'SUOMI_NPP' to include Suomi NPP.
viirs_sats = ['JPSS1']  # 'SUOMI_NPP',

# VIIRS product suffixes. The download cell prefixes them with VNP (Suomi NPP)
# or VJ1 (JPSS1) to form the collection short name. Uncomment a line to include it.
viirs_list = [
    # '02IMG',  # Imagery Resolution 6-Min L1B Swath 375m
    # '02MOD',  # Moderate Resolution 6-Min L1B Swath 750m
    # '02DNB',  # Day/Night Band 6-Min L1B Swath 750m
    # '03IMG',  # Imagery Resolution Terrain-Corrected Geolocation 6-Min L1 Swath 375m
    # '03MOD',  # Moderate Resolution Terrain-Corrected Geolocation 6-Min L1 Swath 750m
    # '03DNB',  # Day/Night Band Moderate Resolution Terrain-Corrected Geolocation 6-Min L1 Swath 750m
    # '03IMG',  # Imagery Resolution Terrain-Corrected Geolocation 6-Min L1 Swath 375m Light
    # '03MOD',  # Moderate Resolution Terrain-Corrected Geolocation 6-Min L1 Swath 750m Light
    '09GA',  # Atmospherically Corrected Surface Reflectance 6-Min L2 Swath IP 375m, 750m NRT
]


In [None]:
# MODIS
# plist = modis_list
# satlist = modis_sats

# VIIRS
plist = viirs_list
satlist = viirs_sats

# Collection short name = platform prefix + product suffix (e.g. MYD09, VJ109GA).
dataset_prefix = {'TERRA': 'MOD', 'AQUA': 'MYD', 'SUOMI_NPP': 'VNP', 'JPSS1': 'VJ1'}

for collection_short_name in plist:
    for sat in satlist:
        if sat not in dataset_prefix:
            # Skip explicitly rather than searching with a stale or undefined
            # `dataset` left over from a previous iteration.
            print(f'Unknown platform {sat}, skipping')
            continue
        dataset = dataset_prefix[sat] + collection_short_name

        print(f'Searching {sat} {dataset} between {start_date} and {end_date}')
        if not download_metadata:
            continue

        filepath = os.path.join(metadata_folder,f'{sat}_{dataset}_{north}N_products.json')
        # Chunk boundaries: every `chunk_days` days from start_date, closed by
        # end_date. end_date is only appended when it is not already the last
        # boundary, which avoids a zero-length final chunk.
        end_ts = pd.Timestamp(end_date)
        dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
        if len(dates) < 2 or dates[-1] != end_ts:
            dates.append(end_ts)
        n_chunks = len(dates) - 1
        results = {}
        for d in range(n_chunks):
            api = GranuleQuery()
            print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
            granules = (api.short_name(dataset)
                        .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                        .bounding_box(*bbox)
                        #.point(179.9, 85)
            )
            n_found = granules.hits()
            print(f'{n_found} granules found')
            results_chunk = granules.get(n_found)
            print(f'number of products downloaded: {len(results_chunk)}')
            # keep the first occurrence of every granule id
            for r in results_chunk:
                results.setdefault(r['id'], r)

            # flush to file every `save_every` chunks and after the last chunk
            if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
                save_metadata(filepath, results)
                results = {}

# IceSat-1 and IceSat-2

## Earth Access

In [None]:
# Read the Earthdata credentials file. Login, password and email are taken from
# the second, third and fourth lines: everything after the label, minus the
# single separator character that follows it.
with open("credentials/credentials_earthaccess.txt", "r") as f:
    txt = str(f.read())
uid, pswd, email = (
    txt.split('\n')[row].split(label)[-1][1:]
    for row, label in ((1, 'login'), (2, 'password'), (3, 'email'))
)

# expose the credentials through environment variables, then log in
os.environ.update({'EARTHDATA_USERNAME': uid, 'EARTHDATA_PASSWORD': pswd})

auth = earthaccess.login()

In [None]:
def get_latest_version(short_name):
    """Return the most recent ``version_id`` CMR lists for a collection.

    Queries the CMR collections endpoint for ``short_name`` and picks the
    highest version among the returned entries. Versions that parse as numbers
    are compared numerically, so '10' ranks above '9' and '034' above '33';
    non-numeric versions rank below every numeric one.

    Args:
        short_name (str): collection short name to look up.

    Returns:
        str: the ``version_id`` exactly as reported by CMR.

    Raises:
        requests.HTTPError: if the CMR request fails.
        ValueError: if CMR returns no collection for ``short_name``.
    """
    def _version_key(version):
        # A plain string max() is lexicographic ('9' > '10'), so rank by
        # numeric value where possible.
        try:
            return (1, float(version))
        except (TypeError, ValueError):
            return (0, str(version))

    # Get json response from CMR collection metadata
    params = {'short_name': short_name}
    cmr_collections_url = 'https://cmr.earthdata.nasa.gov/search/collections.json'
    response = requests.get(cmr_collections_url, params=params)
    response.raise_for_status()
    results = json.loads(response.content)
    # Collect every 'version_id' in the metadata and report the most recent one
    versions = [el['version_id'] for el in results['feed']['entry']]
    if not versions:
        raise ValueError(f'no CMR collection found for short name {short_name!r}')
    latest_version = max(versions, key=_version_key)
    print('The most recent version of ', short_name, ' is ', latest_version)
    return latest_version

In [None]:
# --- search window, split into `chunk_days`-day requests ---
start_date, end_date = date(2018, 1, 1), date(2024, 1, 1)
chunk_days = 50

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# --- output controls ---
download_metadata, save_results = True, True
save_every = 5  # write results to file every `save_every` chunks
folder = 'metadata/'

### IceSat-1

In [None]:
# ICESat-1 (GLAS) product short names to search. Uncomment a line to include it.
icesat1_list = [
    # 'GLAH01',  # GLAS/ICESat L1A Global Altimetry Data (HDF5), Version 33 (GLAH01)
    # 'GLAH02',  # GLAS/ICESat L1A Global Atmosphere Data (HDF5), Version 33 (GLAH02)
    # 'GLAH03',  # GLAS/ICESat L1A Global Engineering Data (HDF5), Version 33 (GLAH03)
    # 'GLAH04',  # GLAS/ICESat L1A Global Laser Pointing Data (HDF5), Version 33 (GLAH04)
    # 'GLAH05',  # GLAS/ICESat L1B Global Waveform-based Range Corrections Data (HDF5), Version 34 (GLAH05)
    # 'GLAH06',  # GLAS/ICESat L1B Global Elevation Data (HDF5), Version 34 (GLAH06)
    # 'GLAH07',  # GLAS/ICESat L1B Global Backscatter Data (HDF5), Version 33 (GLAH07)
    # 'GLAH08',  # AEROSOL PARTICLE PROPERTIES PLANETARY BOUNDARY LAYER HEIGHT
    # 'GLAH09',  # CLOUD HEIGHT CLOUD VERTICAL DISTRIBUTION
    # 'GLAH10',  # AEROSOL BACKSCATTER AEROSOL EXTINCTION CLOUD REFLECTANCE TRANSMITTANCE
    'GLAH11',  # AEROSOL OPTICAL DEPTH/THICKNESS CLOUD OPTICAL DEPTH/THICKNESS
    'GLAH12',  # GLACIER ELEVATION/ICE SHEET ELEVATION GLACIER TOPOGRAPHY/ICE SHEET TOPOGRAPHY ICE SHEETS REFLECTANCE
    'GLAH13',  # ICE ROUGHNESS REFLECTANCE SEA ICE ELEVATION
    'GLAH14',  # REFLECTANCE TERRAIN ELEVATION
    'GLAH15',  # REFLECTANCE SEA SURFACE HEIGHT SEA SURFACE SLOPE
]
# label used in the output file name
sat = 'IceSat1'

### IceSat-2

In [None]:
# Product overview:
# https://read-icesat-2.readthedocs.io/en/latest/getting_started/ICESat-2-Data-Products.html

# ICESat-2 (ATLAS) product short names to search. Uncomment a line to include it.
icesat2_list = [
    # 'ATL01',  # Raw ATLAS/ICESat-2 data are decompressed, ordered in time, and reformatted to HDF5. Lowest level accessible
    # 'ATL02',  # ATLAS/ICESat-2 L1B Converted Telemetry Data, Version 6 (ATL02)
    # 'ATL03',  # Global Geolocated Photon Data (L2)
    'ATL04',  # Normalized Relative Backscatter Profiles (L2)
]
# label used in the output file name
sat = 'IceSat2'

In [None]:

# NOTE(review): `sat` (used in the output file name) is whatever the most
# recently executed product-list cell assigned. Make sure it matches the list
# iterated here.
for short_name in icesat1_list:
    dataset = short_name
    print(f'Searching {sat} {dataset} between {start_date} and {end_date}')
    version = get_latest_version(short_name)
    if not download_metadata:
        continue

    filepath = os.path.join(metadata_folder,f'{sat}_{dataset}_{north}N_products.json')
    # Chunk boundaries: every `chunk_days` days from start_date, closed by
    # end_date. end_date is only appended when it is not already the last
    # boundary, which avoids a zero-length final chunk.
    end_ts = pd.Timestamp(end_date)
    dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
    if len(dates) < 2 or dates[-1] != end_ts:
        dates.append(end_ts)
    n_chunks = len(dates) - 1
    results = {}
    for d in range(n_chunks):
        print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
        time.sleep(5)
        results_chunk = earthaccess.search_data(
            short_name=short_name,
            version=version,
            cloud_hosted=True,
            bounding_box=bbox,
            temporal=(str(dates[d].date()),str(dates[d+1].date())),
        )
        for r in results_chunk:
            # Key on the CMR concept id, which identifies the granule itself.
            # NOTE(review): the `uuid` attribute used previously appears to be
            # generated per result object in earthaccess, so it would not detect
            # a granule returned twice - confirm against the earthaccess docs.
            id_ = r['meta']['concept-id']
            results.setdefault(id_, r['umm'])

        # flush to file every `save_every` chunks and after the last chunk
        if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
            save_metadata(filepath, results)
            results = {}


## CMR 

In [None]:
# Settings for the CMR-based search. The date range, bounding box and save
# options are the ones set in the ICESat settings cell above.
api = GranuleQuery()
# collection short names to search through CMR
shortnames = ['ATL04']

In [None]:
# NOTE(review): `sat` (used in the output file name) is whatever the most
# recently executed product-list cell assigned.
for dataset in shortnames:
    print(f'Searching {dataset} between {start_date} and {end_date}')
    if not download_metadata:
        continue

    filepath = os.path.join(metadata_folder,f'{sat}_{dataset}_{north}N_products.json')
    # Chunk boundaries: every `chunk_days` days from start_date, closed by
    # end_date. end_date is only appended when it is not already the last
    # boundary, which avoids a zero-length final chunk.
    end_ts = pd.Timestamp(end_date)
    dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
    if len(dates) < 2 or dates[-1] != end_ts:
        dates.append(end_ts)
    n_chunks = len(dates) - 1
    results = {}
    for d in range(n_chunks):
        api = GranuleQuery()
        print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
        granules = (api
                    .short_name(dataset)
                    .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                    .bounding_box(*bbox)
        )
        n_found = granules.hits()
        print(f'{n_found} granules found')
        results_chunk = granules.get(n_found)
        print(f'number of products downloaded: {len(results_chunk)}')
        # keep the first occurrence of every granule id
        for r in results_chunk:
            results.setdefault(r['id'], r)

        # flush to file every `save_every` chunks and after the last chunk
        if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
            save_metadata(filepath, results)
            results = {}


# Spot

- Use the theia_download tool (https://github.com/olivierhagolle/theia_download/tree/master)
- It is a command-line tool; the arguments are generated and passed from this notebook
- NOTE: an extra command-line parameter, `metadata_file`, was added to the tool so that the location where the metadata is saved can be specified

## THEIA

In [None]:
# Period : 1986 – 2010
start_date=date(1980, 1, 1)
end_date=date(2010, 1, 1)
chunk_days = 100
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
bounds = east, west, south, north = -180, 180, -90, -50
bbox=(east,south,west,north)
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/SPOTWORLDHERITAGE'

# Make a subfolder to save results page by page and combine them later.
# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# cannot fail.
swh_folder = os.path.join(metadata_folder,'SWH')
os.makedirs(swh_folder, exist_ok=True)

In [None]:
# args = [
# "python theia_download/theia_download.py",
# #"-c SPOTWORLDHERITAGE",
# "-c SWH1",
# #f'-p SPOT5',
# f"-d {start_date}",
# f"-f {end_date}",
# f"--latmin {south}",
# f"--latmax {north}",
# f"--lonmin {east}",
# f"--lonmax {west}",
# "-n",
# "-a theia_download/config_theia.cfg",
# f"--metadata_file {folder}/spot.json"
# ]

# import os
# command_str = " ".join(args)
# print(command_str)
# os.system(command_str)

## Regards

In [None]:
import requests
bounding_box = [-180, -90, 180, -50]
bbox_str = ",".join(map(str, bounding_box))

# The real page count is only known after the first response; start with 1 so
# the first request is made.
total_pages = 1
page = 0

# Pages are requested starting at 0, so the last valid page is total_pages - 1.
# NOTE(review): assumes `totalPages` is a count of 0-indexed pages - confirm
# against the REGARDS API. With `<=` one extra, empty page was requested.
while page < total_pages:
    print(f'Getting results from page {page} of {total_pages}')
    r = requests.get(
        "https://regards.cnes.fr/api/v1/rs-access-project/dataobjects/search?" \
        f"page={page}&" \
        "size=1000&" \
        "q=last:true AND tags:(\"URN:AIP:DATASET:swh:4e26c9fc-2d94-4c12-ad18-74a758e3e2ea:V1\" OR \"URN:AIP:DATASET:swh:4ddf18ab-9a49-4f59-aa23-7a8ec8fdf824:V1\") AND properties.DataDate:[1980-09-01T00:00:00.000Z TO 2016-07-26T00:00:00.000Z]&" \
        "g=POLYGON((-180 -90,180 -90,180 -50,-180 -50,-180 -90))&" \
        "facets=properties.PlatformName&" \
        "facets=properties.InstrumentName&" \
        "facets=properties.SensorCode&" \
        "facets=properties.Station&" \
        "facets=properties.CoupledMode&" \
        "facets=properties.CouplingModes&" \
        "sort=properties.DataID,ASC",
        #headers={"Authorization": "Bearer {access_token}".format(access_token=access_token)}
        headers={"Scope": "swh"}
        )
    # fail loudly on an HTTP error instead of trying to parse an error body
    r.raise_for_status()
    data = r.json()
    results = data['content']
    file_name = os.path.join(swh_folder,f'SPOT_page_{page}.json')
    # index the results by scene name
    results = {item['content']['properties']['SceneName'] : item for item in results}
    print(f"saving {len(results)} results to {file_name}")
    with open(file_name, "w") as fp:
        fp.write(json.dumps(results, indent=4, sort_keys=True, default=str))
    total_pages = int(data['metadata']['totalPages'])
    page += 1
    time.sleep(1)
    

In [None]:
# Combine the per-page JSON files saved in swh_folder into a single metadata file.
combine_subfolders([swh_folder], f'{metadata_folder}/SPOT_L1A.json')

# AMSR

## Level 1

In [None]:
# Search the Common Metadata Repository (CMR) for AMSR level 1 collections.
# Each filter is a regular expression matched against the upper-cased title and
# a collection must match all of them; only collections with at least one
# granule are kept.
datasets = search_cmr_data(filters=[
    'AMSR',
    'L1|LEVEL 1|LEVEL-1|LEVEL1'
], min_granules=1)
# display the identifying columns and the granule counts
datasets[['dataset_id','short_name','granules']]

## Level 2

In [None]:
# Search the Common Metadata Repository (CMR) for AMSR level 2A/2B collections
# that have at least one granule. The broader level 2 pattern is kept below,
# commented out.
datasets = search_cmr_data(filters=[
    'AMSR',
    'L2A|L2B'
    #'L2|LEVEL 2|LEVEL-2|LEVEL2'
], min_granules=1)


In [None]:
# Keep the level 2 collections whose short name mentions Ocean, Rain, Land or
# L2A, dropping the near-real-time (NRT) ones, and show the identifying columns.
AMSR_L2 = datasets.loc[
    datasets['short_name'].str.contains('Ocean|Rain|Land|L2A')
    & ~datasets['short_name'].str.contains('NRT'),
    ['dataset_id', 'short_name', 'granules'],
]
AMSR_L2

## Settings

In [None]:
# CMR granule query client (the download cell creates a fresh one per chunk)
api = GranuleQuery()

# --- search window, split into `chunk_days`-day requests ---
start_date, end_date = date(1999, 1, 1), date(2024, 1, 1)  # start at 2018-01
chunk_days = 150

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# --- output controls ---
download_metadata, save_results = True, True
save_every = 10  # write results to file every `save_every` chunks
folder = 'metadata/'

## Download

In [None]:
L2 = True  # adds the 'L2' tag to the output file names
#for short_name in datasets.short_name.values:
for short_name in AMSR_L2.short_name.values:
    print(f'Searching {short_name} between {start_date} and {end_date}')
    if not download_metadata:
        continue

    str_ = 'L2' if L2 else ''
    filepath = os.path.join(metadata_folder,f'AMSR_{str_}_{short_name}_{north}N_products.json')
    # Chunk boundaries: every `chunk_days` days from start_date, closed by
    # end_date. end_date is only appended when it is not already the last
    # boundary, which avoids a zero-length final chunk.
    end_ts = pd.Timestamp(end_date)
    dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
    if len(dates) < 2 or dates[-1] != end_ts:
        dates.append(end_ts)
    n_chunks = len(dates) - 1
    results = {}
    for d in range(n_chunks):
        api = GranuleQuery()
        print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
        granules = (api
                    .short_name(short_name)
                    .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                    .bounding_box(*bbox)
                    #.point(179.9, 85)
        )
        n_found = granules.hits()
        print(f'{n_found} granules found')
        results_chunk = granules.get(n_found)
        print(f'number of products downloaded: {len(results_chunk)}')
        # keep the first occurrence of every granule id
        for r in results_chunk:
            results.setdefault(r['id'], r)

        # flush to file every `save_every` chunks and after the last chunk
        if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
            save_metadata(filepath, results)
            results = {}

# RADARSAT-1

- https://py-eodms-rapi.readthedocs.io/en/latest/

In [None]:
# Search the Common Metadata Repository (CMR) for RADARSAT level 1 collections.
# Each filter is a regular expression matched against the upper-cased title and
# a collection must match all of them; only collections with at least one
# granule are kept.
datasets = search_cmr_data(filters=[
    'RADARSAT',
    'L1|LEVEL 1|LEVEL-1|LEVEL1'
], min_granules=1)
# display the identifying columns and the granule counts
datasets[['dataset_id','short_name','granules']]

In [None]:
# CMR granule query client (the download cell creates a fresh one per chunk)
api = GranuleQuery()

# --- search window, split into `chunk_days`-day requests ---
start_date, end_date = date(1994, 1, 1), date(2016, 1, 1)
chunk_days = 100

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# --- output controls ---
download_metadata, save_results = True, True
save_every = 10  # write results to file every `save_every` chunks
folder = 'metadata/'

In [None]:
# This collection is searched by its CMR entry title rather than a short name.
for entry_title in ['RADARSAT-1_LEVEL1']:
    print(f'Searching {entry_title} between {start_date} and {end_date}')
    if not download_metadata:
        continue

    filepath = os.path.join(metadata_folder,f'{entry_title}_{north}N_products.json')
    # Chunk boundaries: every `chunk_days` days from start_date, closed by
    # end_date. end_date is only appended when it is not already the last
    # boundary, which avoids a zero-length final chunk.
    end_ts = pd.Timestamp(end_date)
    dates = list(pd.date_range(start=start_date,end=end_date, freq=f'{chunk_days}D'))
    if len(dates) < 2 or dates[-1] != end_ts:
        dates.append(end_ts)
    n_chunks = len(dates) - 1
    results = {}
    for d in range(n_chunks):
        api = GranuleQuery()
        print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
        granules = (api
                    .entry_title(entry_title)
                    .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                    .bounding_box(*bbox)
        )
        n_found = granules.hits()
        print(f'{n_found} granules found')
        results_chunk = granules.get(n_found)
        print(f'number of products downloaded: {len(results_chunk)}')
        # keep the first occurrence of every granule id
        for r in results_chunk:
            results.setdefault(r['id'], r)

        # flush to file every `save_every` chunks and after the last chunk
        if save_results and ((d % save_every == 0) or ((d + 1) == n_chunks)):
            save_metadata(filepath, results)
            results = {}



### NRCAN

- https://www.eodms-sgdot.nrcan-rncan.gc.ca/index-en.html
- https://github.com/eodms-sgdot/py-eodms-rapi

In [None]:
# Read the NRCan (EODMS) credentials file. Login, password and email are taken
# from the second, third and fourth lines: everything after the label, minus
# the single separator character that follows it.
with open("credentials/credentials_nrcan.txt", "r") as f:
    txt = str(f.read())
uid, pswd, email = (
    txt.split('\n')[row].split(label)[-1][1:]
    for row, label in ((1, 'login'), (2, 'password'), (3, 'email'))
)

In [None]:
from eodms_rapi import EODMSRAPI
# Create the EODMSRAPI object with the NRCan credentials read from file
rapi = EODMSRAPI(uid, pswd)
# Print the list of collections returned by the API, to pick the names to
# search in the cells below
print(rapi.get_collections(as_list=True))

In [None]:
# EODMS collections to search; other candidates: 'Radarsat1', 'Radarsat1RawProducts'
datasets = ['Radarsat2RawProducts']

In [None]:
# --- search window, split into `chunk_days`-day requests ---
start_date, end_date = date(2007, 1, 1), date(2023, 1, 1)
chunk_days = 50

# --- output controls ---
download_metadata, save_results = True, True
save_every = 10  # write results to file every `save_every` chunks
folder = 'metadata/'

# --- search area in degrees ---
# NOTE: `east` holds the minimum longitude (-180) and `west` the maximum (180).
east, west, south, north = -180, 180, -90, -50
bounds = (east, west, south, north)
bbox = (east, south, west, north)  # (min lon, min lat, max lon, max lat)

# Create the EODMSRAPI object
rapi = EODMSRAPI(uid, pswd)

# Spatial filter: products intersecting the closed ring of (lon, lat) points
# below, which spells out the same limits as `bounds`.
# A single point can be used instead, e.g.:
# feat = [('intersects', "POINT (-96.47 62.4)")]
search_ring = [
    (-180, -50.00),
    (180, -50.00),
    (180, -90.00),
    (-180, -90.00),
    (-180, -50.00),
]
feat = [('intersects', search_ring)]


In [None]:
# Search every collection in `datasets` chunk by chunk and merge the records
# into a per-collection JSON product file keyed by recordId.
for dataset in datasets:
    print(f'Searching {dataset} between {start_date} and {end_date}')
    # Initialised before the `if` so the summary at the end of this iteration
    # always has a value, even when download_metadata is False.
    new_added = 0
    if download_metadata:
        # search chunks of data
        dates = list(pd.date_range(start=start_date, end=end_date, freq=f'{chunk_days}D'))
        # Close the last chunk at end_date. pd.Timestamp is used so this does
        # not depend on what the name `datetime` is bound to in the notebook.
        dates.append(pd.Timestamp(end_date))
        n_chunks = len(dates) - 1
        results = {}
        for d in range(0, n_chunks):
            print(f'downloding chunk {d+1} of {n_chunks} ({dates[d].date()} to {dates[d+1].date()})')
            # convert dates to the YYYYMMDD_HHMMSS strings passed to the search
            s = dates[d].strftime('%Y%m%d') + '_000000'
            e = dates[d+1].strftime('%Y%m%d') + '_000000'
            # Set a date range for the search
            date_range = [{"start": s, "end": e}]
            # Submit the search to the EODMSRAPI, specifying the Collection
            rapi.search(dataset, features=feat, dates=date_range)
            # Get the results from the search
            results_chunk = rapi.get_results('full')
            print(f'number of products downloaded: {len(results_chunk)}')
            for r in results_chunk:
                # JSON object keys are always strings, so the id is stringified
                # up front; otherwise a non-string id would never match the
                # keys loaded back from the product file.
                id_ = str(r['recordId'])
                if id_ not in results:
                    results[id_] = r

            is_last_chunk = (d + 1) == n_chunks
            if save_results and ((d % save_every == 0) or is_last_chunk):
                # merge the buffered results into the product file,
                # skipping records that are already stored
                file_name = folder + f'{dataset}_{north}N_products.json'
                if os.path.exists(file_name):
                    with open(file_name, 'r') as f:
                        saved_data = json.load(f)
                else:
                    saved_data = {}
                dup = 0
                for k, record in results.items():
                    if k in saved_data:
                        dup += 1
                    else:
                        saved_data[k] = record
                        new_added += 1
                print(f'{dup} already exist and are ignored')
                print(f'Saving data to {file_name}')
                with open(file_name, "w") as fp:
                    fp.write(json.dumps(saved_data, indent=4, sort_keys=True, default=str))
                # drop the buffers so memory does not grow across chunks
                results = {}
                saved_data = None

    print(f'{new_added} new products added')


In [None]:
# Submit an order using results
# NOTE(review): `res` is not assigned in the nearby cells (the search loop
# keeps its API results in `results_chunk`) — confirm where `res` is meant to
# come from before running this cell, since it submits a real order.
order_res = rapi.order(res[0:1])

# Specify a folder location to download the images
dest = "data"

# Download the images from the order
dn_res = rapi.download(order_res, dest)

# AVHRR

In [None]:
# Look up collections matching 'AVHRR' with search_cmr_data and show the
# identifying columns of the result.
avhrr_filters = ['AVHRR']
AVHRR = search_cmr_data(filters=avhrr_filters, min_granules=0)
AVHRR[['dataset_id', 'short_name', 'granules']]

# ERS
- Products from ASF may only be Amplitude/Backscatter (https://asf.alaska.edu/data-sets/sar-data-sets/ers-1/)

In [None]:
# Look up ERS-1/ERS-2 level 1 collections with at least one granule and show
# the identifying columns of the result.
ers_filters = [
    'ERS-1|ERS-2|ERS1|ERS2',
    'L1|LEVEL 1|LEVEL-1|LEVEL1',
]
ERS = search_cmr_data(filters=ers_filters, min_granules=1)
ERS[['dataset_id', 'short_name', 'granules']]

## ASF

In [None]:
# Search configuration for the ERS granule download

# time window and spacing (days) between chunk boundaries
start_date = date(1991, 1, 1)
end_date = date(2013, 1, 1)
chunk_days = 100

# search extent; bbox is ordered (min lon, min lat, max lon, max lat)
east, west, south, north = bounds = (-180, 180, -90, -50)
bbox = (east, south, west, north)

# download / save switches
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/'

api = GranuleQuery()

In [None]:
# Collect ERS-1 / ERS-2 level 1 granule metadata chunk by chunk and hand the
# buffered records to save_metadata.
for entry_title in ('ERS-1_LEVEL1', 'ERS-2_LEVEL1'):
    print(f'Searching {entry_title} between {start_date} and {end_date}')
    if not download_metadata:
        continue
    new_added = 0
    # chunk boundaries: one every `chunk_days` days, closed off with end_date
    dates = list(pd.date_range(start=start_date, end=end_date, freq=f'{chunk_days}D'))
    dates.append(datetime.datetime(end_date.year, end_date.month, end_date.day))
    final_index = len(dates) - 2
    results = {}
    for d in range(len(dates) - 1):
        # a new query object is built for every chunk
        api = GranuleQuery()
        print(f'downloding chunk {d+1} of {len(dates)-1} ({dates[d].date()} to {dates[d+1].date()})')
        granules = api.entry_title(entry_title)
        granules = granules.temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
        granules = granules.bounding_box(*bbox)
        n_found = granules.hits()
        print(f'{n_found} granules found')
        results_chunk = granules.get(n_found)
        print(f'number of products downloaded: {len(results_chunk)}')
        # keep the first record seen for each granule id
        for r in results_chunk:
            results.setdefault(r['id'], r)

        if not save_results:
            continue
        # flush on every `save_every`-th chunk and on the final chunk
        if d % save_every == 0 or d == final_index:
            filepath = os.path.join(metadata_folder, f'{entry_title}_{north}N_products.json')
            save_metadata(filepath, results)
            results = {}

## ESA

In [None]:
# Read the ESA EO account details from the credentials file.
# The values are taken from the 2nd, 3rd and 4th lines of the file: whatever
# follows the words `login`, `password` and `email`, minus one separator
# character.
credentials_path = os.path.join(credentials_folder, 'credentials_esa_eo.txt')
with open(credentials_path, 'r') as f:
    txt = f.read()
credential_lines = txt.split('\n')
uid = credential_lines[1].split('login')[-1][1:]
pswd = credential_lines[2].split('password')[-1][1:]
email = credential_lines[3].split('email')[-1][1:]

In [None]:
# NOTE: the `datetime` *module* is imported here on purpose. Using
# `from datetime import datetime` would rebind the notebook-wide name
# `datetime` to the class and break the `datetime.datetime(...)` calls made
# by the download loops in other cells.
import datetime
from shapely.geometry import Point
from asarapi.catalog import query
from asarapi.download import log_in, log_out, request_download

username = uid
password = pswd
output_dir = 'data'
location = Point(16.84, -0.04)

# Run the asarapi catalogue query for ascending orbits over `location`
# between 1999-01-01 and 2002-01-01.
results = query(
    area=location.wkt,
    start=datetime.datetime(1999, 1, 1),
    stop=datetime.datetime(2002, 1, 1),
    orbit='ascending'
)

# JERS-1

In [None]:
# Search configuration for the JERS-1 granule download

# time window and spacing (days) between chunk boundaries
start_date = date(1991, 1, 1)
end_date = date(1999, 1, 1)
chunk_days = 100

# search extent; bbox is ordered (min lon, min lat, max lon, max lat)
east, west, south, north = bounds = (-180, 180, -90, -50)
bbox = (east, south, west, north)

# download / save switches
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/'

api = GranuleQuery()

In [None]:
# Look up JERS level 1 collections with at least one granule and show the
# identifying columns of the result.
jers_filters = [
    'JERS',
    'L1|LEVEL 1|LEVEL-1|LEVEL1',
]
data = search_cmr_data(filters=jers_filters, min_granules=1)
data[['dataset_id', 'short_name', 'granules']]

In [None]:
# Collect JERS-1 level 1 granule metadata chunk by chunk and hand the
# buffered records to save_metadata.
for entry_title in ['JERS-1_LEVEL1']:
    print(f'Searching {entry_title} between {start_date} and {end_date}')
    if download_metadata:
        new_added = 0
        # search chunks of data
        dates = list(pd.date_range(start=start_date, end=end_date, freq=f'{chunk_days}D'))
        # Close the last chunk at end_date. pd.Timestamp is used so this does
        # not depend on what the name `datetime` is bound to in the notebook.
        dates.append(pd.Timestamp(end_date))
        results = {}
        for d in range(0, len(dates)-1):
            # a new query object is built for every chunk
            api = GranuleQuery()
            print(f'downloding chunk {d+1} of {len(dates)-1} ({dates[d].date()} to {dates[d+1].date()})')
            granules = (api
                        .entry_title(entry_title)
                        .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                        .bounding_box(*bbox)
            )
            n_found = granules.hits()
            print(f'{n_found} granules found')
            results_chunk = granules.get(n_found)
            print(f'number of products downloaded: {len(results_chunk)}')
            for r in results_chunk:
                id_ = r['id']
                if id_ not in results:
                    results[id_] = r

            # The save check belongs inside the chunk loop: flush on every
            # `save_every`-th chunk and on the final chunk, then clear the
            # buffer so memory does not grow across chunks.
            if save_results and ((d%save_every==0) or ((d+1)==(len(dates)-1))):
                filepath = os.path.join(metadata_folder,f'{entry_title}_{north}N_products.json')
                save_metadata(filepath, results)
                results = {}

# Envisat

In [None]:
# Look up collections matching 'ENVISAT' with at least one granule and show
# the identifying columns of the result.
envisat_filters = ['ENVISAT']
data = search_cmr_data(filters=envisat_filters, min_granules=1)
data[['dataset_id', 'short_name', 'granules']]

# ALOS (PALSAR)

In [None]:
# Look up ALOS / PALSAR level 1 collections (no minimum granule count) and
# show the identifying columns of the result.
alos_l1_filters = [
    'ALOS|PALSAR',
    'C1|L1|LEVEL 1|LEVEL-1|LEVEL1',
]
data = search_cmr_data(filters=alos_l1_filters, min_granules=0)
data[['dataset_id', 'short_name', 'granules']]

In [None]:
# Look up ALOS / PALSAR level 2 collections (no minimum granule count) and
# show the identifying columns of the result.
alos_l2_filters = [
    'ALOS|PALSAR',
    'C2|L2|LEVEL 2|LEVEL-2|LEVEL2',
]
data = search_cmr_data(filters=alos_l2_filters, min_granules=0)
data[['dataset_id', 'short_name', 'granules']]

In [None]:
# Search configuration for the ALOS PALSAR granule download

# time window and spacing (days) between chunk boundaries
start_date = date(2005, 1, 1)
end_date = date(2015, 1, 1)
chunk_days = 100

# search extent; bbox is ordered (min lon, min lat, max lon, max lat)
east, west, south, north = bounds = (-180, 180, -90, -50)
bbox = (east, south, west, north)

# download / save switches
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/'

api = GranuleQuery()

In [None]:
# Collect ALOS PALSAR granule metadata chunk by chunk and hand the buffered
# records to save_metadata.
# Alternative set of products:
#for entry_title in ['ALOS_PALSAR_LEVEL1.5','ALOS_PALSAR_LEVEL1.1','ALOS_PALSAR_LEVEL1.0']:
for entry_title in ['ALOS_PALSAR_LEVEL2.2']:
    print(f'Searching {entry_title} between {start_date} and {end_date}')
    if download_metadata:
        new_added = 0
        # search chunks of data
        dates = list(pd.date_range(start=start_date, end=end_date, freq=f'{chunk_days}D'))
        # Close the last chunk at end_date. pd.Timestamp is used so this does
        # not depend on what the name `datetime` is bound to in the notebook.
        dates.append(pd.Timestamp(end_date))
        results = {}
        for d in range(0, len(dates)-1):
            # a new query object is built for every chunk
            api = GranuleQuery()
            print(f'downloding chunk {d+1} of {len(dates)-1} ({dates[d].date()} to {dates[d+1].date()})')
            granules = (api
                        .entry_title(entry_title)
                        .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                        .bounding_box(*bbox)
            )
            n_found = granules.hits()
            print(f'{n_found} granules found')
            results_chunk = granules.get(n_found)
            print(f'number of products downloaded: {len(results_chunk)}')
            for r in results_chunk:
                id_ = r['id']
                if id_ not in results:
                    results[id_] = r

            # flush on every `save_every`-th chunk and on the final chunk
            if save_results and ((d%save_every==0) or ((d+1)==(len(dates)-1))):
                filepath = os.path.join(metadata_folder,f'{entry_title}_{north}N_products.json')
                save_metadata(filepath, results)
                results = {}

# CRYOSAT

In [None]:
# Look up CryoSat level 1 collections with at least one granule and show the
# identifying columns of the result.
cryosat_filters = [
    'CRYOSAT',
    'C1|L1|LEVEL 1|LEVEL-1|LEVEL1',
]
data = search_cmr_data(filters=cryosat_filters, min_granules=1)
data[['dataset_id', 'short_name', 'granules']]

In [None]:
# Search configuration for the CryoSat granule download

# time window and spacing (days) between chunk boundaries
start_date = date(2009, 1, 1)
end_date = date(2024, 1, 1)
chunk_days = 100

# search extent; bbox is ordered (min lon, min lat, max lon, max lat)
east, west, south, north = bounds = (-180, 180, -90, -50)
bbox = (east, south, west, north)

# download / save switches
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/'

api = GranuleQuery()

In [None]:
# Collect granule metadata for every CryoSat collection found by the search
# cell (stored in `data`), chunk by chunk, and hand the buffered records to
# save_metadata.
# `data` is used here rather than `datasets`: in this notebook `datasets` is
# the plain list of EODMS collection names and has no `short_name` column.
for short_name in data.short_name.values:
    print(f'Searching {short_name} between {start_date} and {end_date}')
    if download_metadata:
        new_added = 0
        # search chunks of data
        dates = list(pd.date_range(start=start_date, end=end_date, freq=f'{chunk_days}D'))
        # Close the last chunk at end_date. pd.Timestamp is used so this does
        # not depend on what the name `datetime` is bound to in the notebook.
        dates.append(pd.Timestamp(end_date))
        results = {}
        for d in range(0, len(dates)-1):
            # a new query object is built for every chunk
            api = GranuleQuery()
            print(f'downloding chunk {d+1} of {len(dates)-1} ({dates[d].date()} to {dates[d+1].date()})')
            granules = (api
                        .short_name(short_name)
                        .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                        .bounding_box(*bbox)
            )
            n_found = granules.hits()
            print(f'{n_found} granules found')
            results_chunk = granules.get(n_found)
            print(f'number of products downloaded: {len(results_chunk)}')
            for r in results_chunk:
                id_ = r['id']
                if id_ not in results:
                    results[id_] = r

            # flush on every `save_every`-th chunk and on the final chunk
            if save_results and ((d%save_every==0) or ((d+1)==(len(dates)-1))):
                filepath = os.path.join(metadata_folder,f'{short_name}_{north}N_products.json')
                save_metadata(filepath, results)
                results = {}

# GRACE

In [None]:
# Look up collections matching 'GRACE' (no minimum granule count) and show
# the identifying columns of the result.
grace_filters = ['GRACE']
data = search_cmr_data(filters=grace_filters, min_granules=0)
data[['dataset_id', 'short_name', 'granules']]

## SSMI

In [None]:
# Look up SSM / SMMR level 1 collections with at least one granule, drop the
# ones whose dataset_id contains 'PRECIP_SSMI' (case-insensitive), and show
# the identifying columns of what is left.
ssmi_filters = [
    'SSM|SMMR',
    'C1|L1|LEVEL 1|LEVEL-1|LEVEL1',
]
data = search_cmr_data(filters=ssmi_filters, min_granules=1)

is_precip_product = data['dataset_id'].str.upper().str.contains('PRECIP_SSMI')
data = data[~is_precip_product]
data[['dataset_id', 'short_name', 'granules']]

In [None]:
# Search configuration for the SSM/I granule download

# time window and spacing (days) between chunk boundaries
start_date = date(2000, 1, 1)
end_date = date(2001, 1, 1)
chunk_days = 100

# search extent; bbox is ordered (min lon, min lat, max lon, max lat)
east, west, south, north = bounds = (-180, 180, -90, -50)
bbox = (east, south, west, north)

# download / save switches
download_metadata = True
save_results = True
save_every = 10
folder = 'metadata/'

api = GranuleQuery()

In [None]:
# Search window [start, end] per platform, used to pick the date range for a
# dataset in the download loop.
# NOTE(review): the last key used to be a second 'F08', which silently
# replaced the 1987-1992 entry because duplicate keys in a dict literal keep
# only the last value. It is assumed to be 'F18' — confirm.
dates_by_gen = {
    'NIMBUS7': [date(1978, 1, 1), date(1988, 1, 1)],
    'F08': [date(1987, 1, 1), date(1992, 1, 1)],
    'F10': [date(1992, 1, 1), date(1998, 1, 1)],
    'F11': [date(1991, 1, 1), date(1996, 1, 1)],
    'F13': [date(1995, 1, 1), date(2009, 1, 1)],
    'F15': [date(2000, 1, 1), date(2024, 1, 1)],
    'F16': [date(2005, 1, 1), date(2024, 1, 1)],
    'F17': [date(2006, 1, 1), date(2024, 1, 1)],
    'F18': [date(2010, 1, 1), date(2024, 1, 1)],
}

In [None]:
# Collect granule metadata for every SSM/I collection in `data`, chunk by
# chunk, and hand the buffered records to save_metadata.
for dataset in data.short_name.values:
    # Start from the window in the settings cell, then switch to the platform
    # window whose key appears in the dataset short name. The settings
    # variables themselves are not overwritten, so one dataset's window
    # cannot leak into the next.
    # NOTE(review): assumes the short name contains the platform key
    # (e.g. 'F13') — TODO confirm against the search results.
    search_start, search_end = start_date, end_date
    for gen, (gen_start, gen_end) in dates_by_gen.items():
        if gen.upper() in str(dataset).upper():
            search_start, search_end = gen_start, gen_end
            break
    print(f'Searching {dataset} between {search_start} and {search_end}')
    if download_metadata:
        new_added = 0
        # search chunks of data
        dates = list(pd.date_range(start=search_start, end=search_end, freq=f'{chunk_days}D'))
        # Close the last chunk at the end date. pd.Timestamp is used so this
        # does not depend on what the name `datetime` is bound to in the notebook.
        dates.append(pd.Timestamp(search_end))
        results = {}
        for d in range(0, len(dates)-1):
            # a new query object is built for every chunk
            api = GranuleQuery()
            print(f'downloding chunk {d+1} of {len(dates)-1} ({dates[d].date()} to {dates[d+1].date()})')
            granules = (api
                        .short_name(dataset)
                        .temporal(date_from=dates[d].date(), date_to=dates[d+1].date())
                        .bounding_box(*bbox)
                        #.point(179.9, 85)
            )
            n_found = granules.hits()
            print(f'{n_found} granules found')
            results_chunk = granules.get(n_found)
            print(f'number of products downloaded: {len(results_chunk)}')
            for r in results_chunk:
                id_ = r['id']
                if id_ not in results:
                    results[id_] = r

            # flush on every `save_every`-th chunk and on the final chunk
            if save_results and ((d%save_every==0) or ((d+1)==(len(dates)-1))):
                filepath = os.path.join(metadata_folder,f'SSM_{dataset}_{north}N_products.json')
                save_metadata(filepath, results)
                results = {}
