# Automated ordering, download, indexing of Landsat USGS data for AGDCv2

## Import the required libraries

In [1]:
import os, json, requests, time, getpass
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import gzip
try:
    from urllib.parse import urlparse, urljoin
except ImportError:
    from urlparse import urlparse, urljoin

## Setup the working directory to store the downloaded data

In [48]:
# Directory holding the USGS inventory files and the product ID list read below.
working_dir = os.path.abspath('/g/data/v10/projects/ARD_interoperability')
# Directory the ESPA order downloads are written to.
data_dir = os.path.abspath('/g/data/v10/projects/ARD_interoperability/L2/092084')
#data_dir = os.path.abspath('/g/data/u46/users/tty547/ARD')

## Determine which WRS path/rows intersect the area of interest

### Define the spatial extent (AOI) and temporal extent of interest

In [40]:
# Earlier AOI / date-range definition, kept for reference only.
#ul_lon, ul_lat = 77.83, 17.84 # upper left longitude, latitude
#lr_lon, lr_lat = 78.00, 17.67 # lower right longitude, latitude
#date_start, date_end = "2000-01-01"  , "2007-12-31" # start date and end date for time range selection  
# Inclusive time range as compact YYYYMMDD strings: extract_products compares
# them as strings against the date field of each product ID, and the
# inventory filter parses them with pd.to_datetime.
date_start = '19870101'
date_end = '20021231'

### Determine which WRS2 path/rows cover the AOI

In [33]:
# Disabled: look up the WRS2 path/rows covering the AOI polygon through the
# remotepixel web service and collect them as "path_row" strings.
#polygon_list = [[ul_lon, ul_lat], [lr_lon, ul_lat],[lr_lon, lr_lat],[ul_lon, lr_lat],[ul_lon, ul_lat]]

#wrs_query = 'http://api.remotepixel.ca/wrs2tiles?search=poly:'+str(polygon_list)
#post_query = requests.get(wrs_query)
#wrs_search_result = json.loads(post_query.text)

#path_row = []
#for item in wrs_search_result['results']:
#    path_row.append(str(item['path'])+"_"+str(item['row']))

#path_row = ['109_82']
#path_row = ['144_048']
#path_row
# WRS2 path and row given directly as 3-character, zero-padded strings;
# extract_products concatenates them to search the product ID list.
path = '110'
row = '082'

## Download the latest Level 1 inventory from USGS and find the available scenes 

In [None]:
# File names of the gzipped bulk-metadata inventories to fetch
# (presumably one per sensor: Landsat 8, ETM and TM — confirm with USGS).
landsat_csv_list = ["LANDSAT_8_C1.csv.gz",
                    "LANDSAT_ETM_C1.csv.gz",
                    "LANDSAT_TM_C1.csv.gz"
                   ]

# Base URL the file names are joined onto with urljoin; the trailing slash is
# required so urljoin appends the name instead of replacing the last segment.
metadata_file_url = "https://landsat.usgs.gov/landsat/metadata_service/bulk_metadata_files/"

### Download the inventory data from USGS

In [9]:
def download_file(url, output_dir):
    """Stream the resource at ``url`` into ``output_dir`` and return its path.

    The local file name is the last ``/``-separated component of ``url``.

    Args:
        url: Address of the file to fetch.
        output_dir: Existing directory the file is written into.

    Returns:
        The path of the file that was written.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status, so that an error page is never saved as if it were data.
    """
    local_filename = os.path.join(output_dir, url.split('/')[-1])
    response = requests.get(url, stream=True)
    try:
        # Fail before creating the local file when the request was rejected.
        response.raise_for_status()
        with open(local_filename, 'wb') as f:
            # 1 MiB chunks: the archives are large and 1 KiB chunks make the
            # Python-level loop the bottleneck.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # Release the pooled connection even if the transfer is interrupted.
        response.close()
    return local_filename

In [None]:
# Flip to True to (re)fetch the inventory files into working_dir.
download = False
if download:
    for inventory_name in landsat_csv_list:
        inventory_url = urljoin(metadata_file_url, inventory_name)
        download_file(inventory_url, working_dir)

### Unzip the inventory files and generate the list of scene product IDs of interest

In [None]:
scene_list = []

# Key(s) to select in the inventory's "path_row" column, which is built below
# as str(path) + "_" + str(row).
# NOTE(review): assumes the inventory stores path/row as plain integers, so the
# key carries no zero padding ('110', '082' -> '110_82') — confirm against the CSV.
path_row = ['{}_{}'.format(int(path), int(row))]

for csv in landsat_csv_list:
    # 'with' closes the gzip handle as soon as the table has been loaded.
    with gzip.open(os.path.join(working_dir, csv), 'rb') as collection:
        # limit the columns to only the ones we need
        data_inventory = pd.read_csv(
            collection,
            usecols=['acquisitionDate', "LANDSAT_PRODUCT_ID", "path", "row"])

    data_inventory["path_row"] = data_inventory["path"].map(str) + "_" + data_inventory["row"].map(str)
    # Convert the whole column at once instead of parsing value by value.
    data_inventory['acquisitionDate'] = pd.to_datetime(data_inventory['acquisitionDate'])
    data_inventory = data_inventory.loc[(data_inventory['acquisitionDate'] >= pd.to_datetime(date_start)) &
                                        (data_inventory['acquisitionDate'] <= pd.to_datetime(date_end)) &
                                        data_inventory['path_row'].isin(path_row)]
    scene_list.extend(data_inventory['LANDSAT_PRODUCT_ID'].tolist())

print (len(scene_list))

### Alternatively, use another method (from Wenjun) with a preprocessed product list to derive the scene list faster

In [34]:
def extract_products(fn, path, row, ymd1, ymd2):
    """Return the lines of ``fn`` for one path/row within a date range.

    A line is selected when it contains the 6-character path+row key and the
    fourth ``_``-separated field of the line lies between ``ymd1`` and
    ``ymd2`` (inclusive, compared as strings).

    Args:
        fn: Path of a text file with one entry per line.
        path: WRS path, as a string or an integer; zero-padded to 3 characters.
        row: WRS row, as a string or an integer; zero-padded to 3 characters.
        ymd1: Lower bound of the date field, e.g. ``'19870101'``.
        ymd2: Upper bound of the date field, e.g. ``'20021231'``.

    Returns:
        The matching lines, stripped of surrounding whitespace, in file order.
    """
    # Zero-pad so that 92 / '92' become '092'; space padding would never match.
    pw = '{}{}'.format(str(path).zfill(3), str(row).zfill(3))
    result = []
    # Iterate the file lazily and let 'with' close the handle.
    with open(fn) as product_file:
        for raw_line in product_file:
            if pw not in raw_line:
                continue
            line = raw_line.strip()
            fields = line.split('_')
            # Lines without a fourth field cannot be dated; skip them
            # instead of failing with IndexError.
            if len(fields) > 3 and ymd1 <= fields[3] <= ymd2:
                result.append(line)
    return result

In [41]:
# Build the scene list from the product ID file in working_dir,
# selecting the configured path/row and date range.
product_id_file = os.path.join(working_dir, 'Product_ID.csv')
scene_list = extract_products(product_id_file, path, row, date_start, date_end)
# Number of scenes selected.
print (len(scene_list))

299


## Order from ESPA for the wanted products (sr, bt and pixel_qa) for the scene list

### Register with ESPA and enter your credentials below https://espa.cr.usgs.gov

In [12]:
# ESPA credentials, read without echoing. Each prompt is labelled explicitly:
# getpass's default prompt is 'Password: ' for both calls, which makes it
# impossible to tell which value is being asked for.
username = getpass.getpass('ESPA username: ')
password = getpass.getpass('ESPA password: ')

········
········


### The ESPA-API interacting with the ESPA ordering system

In [13]:
# Base URL of the ESPA API; espa_api appends the endpoint name directly,
# so the trailing slash must be kept.
host = 'https://espa.cr.usgs.gov/api/v1/'

In [14]:
def espa_api(endpoint, verb='get', body=None, uauth=None):
    """Call an ESPA API endpoint and return the decoded JSON response.

    Args:
        endpoint: Path appended to the module-level ``host`` URL.
        verb: Name of the ``requests`` function to use ('get', 'post', ...).
        body: Object sent as the JSON request body, or None.
        uauth: Optional ``(username, password)`` tuple; defaults to the
            module-level ``username`` / ``password``.

    Returns:
        The decoded JSON payload (with any "messages" entry removed from a
        dict payload), or None when the request returned an HTTP error
        status or the body was not valid JSON.
    """
    auth_tup = uauth if uauth else (username, password)
    response = getattr(requests, verb)(host + endpoint, auth=auth_tup, json=body)
    print('{} {}'.format(response.status_code, response.reason))

    # Error responses are not guaranteed to carry JSON; decoding must not
    # mask the HTTP status that is reported below.
    try:
        data = response.json()
    except ValueError:
        print('response body is not valid JSON')
        data = None

    if isinstance(data, dict):
        # Surface server-side messages, then drop them from the payload.
        messages = data.pop("messages", None)
        if messages:
            print(json.dumps(messages, indent=4))

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print (e)
        return None
    return data

### Define the order with product types and scenes

In [42]:
# Products requested for every scene.
prod_types = ['sr', 'bt', 'pixel_qa']

request_data = {
    'inputs': scene_list
}

# Ask ESPA which products are available for the scenes; the response is
# reused below as the skeleton of the order.
order = espa_api('available-products', body=request_data)
if order is None:
    # espa_api returns None on an HTTP error; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError('available-products request failed; cannot build the order')
#print (json.dumps(order, indent=4))
print (order.keys())

# Product ID prefix -> key of the matching sensor section in the order.
# NOTE(review): 'tm4_collection' added so Landsat 4 TM scenes (present in the
# 1987-2002 range) can be resolved — confirm the key against the ESPA API.
collection = {'LT04': 'tm4_collection',
              'LT05': 'tm5_collection',
              'LE07': 'etm7_collection',
              'LC08': 'olitirs8_collection',
              'LO08': 'oli8_collection'}

# delete problem scenes from order
problems = ['date_restricted', 'not_implemented', 'oli8_collection']
problem_scenes = []
for a_problem in problems:
    if a_problem not in order:
        continue
    #print (order[a_problem])
    if a_problem == 'date_restricted':
        # Only the 'sr' restrictions are collected for removal.
        restricted = order[a_problem].get('sr', [])
        problem_scenes.extend(restricted)
        print (a_problem, len(restricted))
    elif a_problem == 'not_implemented':
        print (a_problem, len(order[a_problem]))
    else:
        print (a_problem, len(order[a_problem]['inputs']))
    del order[a_problem]

# Remove each restricted scene from its sensor section. Unknown prefixes and
# scenes that are no longer listed are reported rather than raising.
for a_problem_scene in problem_scenes:
    problem_collection = collection.get(a_problem_scene[:4])
    sensor_inputs = order.get(problem_collection, {}).get('inputs', [])
    if a_problem_scene in sensor_inputs:
        sensor_inputs.remove(a_problem_scene)
    else:
        print ('could not remove problem scene', a_problem_scene)

# Attach the wanted products to every sensor section that still has scenes.
order_no = 0
for sensor, sensor_order in order.items():
    if isinstance(sensor_order, dict) and sensor_order.get('inputs'):
        sensor_order['products'] = prod_types
        print (sensor, len(sensor_order['inputs']))
        order_no += len(sensor_order['inputs'])

print ('total order number', order_no)

order['format'] = 'gtiff'
#print (json.dumps(order, indent=4))

200 OK
dict_keys(['etm7_collection', 'tm5_collection'])
total order number 299


### Submit the order

In [43]:
# Submit the prepared order; espa_api returns the decoded response,
# or None if the request failed.
print ('POST /api/v1/order')
post_resp = espa_api('order', verb='post', body=order)

POST /api/v1/order
201 CREATED


### Check the order status

In [44]:
# Keep the ID of the order just submitted; the status and download cells read it.
# NOTE(review): post_resp is None when the submission failed, which makes this lookup raise.
orderid = post_resp['orderid']
# Query and display the overall status of that order.
order_status_resp = espa_api('order-status/{}'.format(orderid))
print(json.dumps(order_status_resp, indent=4))

200 OK
{
    "orderid": "espa-tina.yang@ga.gov.au-11152017-203805-110",
    "status": "ordered"
}


### Find the complete order and download by individual items

In [49]:
def check_n_download(ordered_items_to_download):
    """Download the named order items as they complete, polling until done.

    Every five minutes the item status of the module-level ``orderid`` is
    queried. Items that are 'complete' are downloaded into ``data_dir``;
    items in a terminal failure status are reported and dropped; everything
    else is checked again on the next round.

    Args:
        ordered_items_to_download: Names of the order items to fetch.
    """
    # Statuses from which an item will never become downloadable; polling
    # them would otherwise never end.
    # NOTE(review): taken from the ESPA item status vocabulary — confirm.
    failed_statuses = ('error', 'unavailable', 'cancelled')

    pending = list(ordered_items_to_download)
    # A loop instead of recursion: long-running orders must not grow the stack.
    while True:
        print (len(pending))
        pending_names = set(pending)  # O(1) membership tests
        items_not_complete = []

        item_status_resp = espa_api('item-status/{0}'.format(orderid))
        if item_status_resp is None:
            # The status request itself failed; retry everything later.
            items_not_complete = pending
        else:
            for item in item_status_resp[orderid]:
                name = item['name']
                if name not in pending_names:
                    continue
                status = item['status']
                if status in failed_statuses:
                    print ('skipping {0}: status {1}'.format(name, status))
                    continue
                dload_url = item.get('product_dload_url')
                if status == 'complete' and dload_url:
                    print ('URL: {0}'.format(dload_url))
                    download_file(dload_url, data_dir)
                else:
                    # Not finished yet (or no download URL published yet).
                    items_not_complete.append(name)

        print (len(items_not_complete))
        if not items_not_complete:
            return
        time.sleep(300)
        print ('check status again after 5 mins')
        pending = items_not_complete

In [None]:
# Use a fixed, previously submitted order; this overwrites any orderid
# set earlier in the session.
orderid = 'espa-tina.yang@ga.gov.au-11132017-155717-343'
# Fetch the status of every item in that order and report how many there are.
item_status_resp = espa_api('item-status/{0}'.format(orderid))
print (len(item_status_resp[orderid]))
#print(json.dumps(item_status_resp, indent=4))
# Download all items of the order, polling until none is left pending.
check_n_download([x['name'] for x in item_status_resp[orderid]])

200 OK
310
310
200 OK
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LT050920842010042401T1-SC20171113174105.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LT050920842009021601T1-SC20171113174056.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LE070920842010051801T1-SC20171113174035.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LT050920842009040501T1-SC20171113173937.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LE070920842003032801T1-SC20171113172406.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LT050920842010122001T1-SC20171113173756.tar.gz
URL: https://edclpdsftp.cr.usgs.gov/orders/espa-tina.yang@ga.gov.au-11132017-155717-343/LE070920842003010701T1-SC20171113173746.tar.gz
URL: https://edclpdsftp.cr.usgs.g

### Alternatively, check the status of the whole order and, if it is complete, bulk-download it

In [47]:
email = username
def check_n_download_whole():
    order_status_resp = espa_api('order-status/{}'.format(orderid))
    if order_status_resp['status'] == 'complete':
        !python bulk-downloader/download_espa_order.py -e $email -o orderid -d $data_dir -u $username -p $password
    else:
        time.sleep(300)
        print ('check status again after 5 minutes')
        check_n_download_whole()
check_n_download_whole()        

200 OK
ERROR: Order ID orderid not found
