In [1]:
import os
import requests
from google.cloud import bigquery
from google.oauth2 import service_account
from datetime import datetime

<b>Landsat</b> is an ongoing mission of Earth observation satellites developed under a joint program of the USGS and NASA. The Landsat mission provides the longest continuous space-based record of Earth's land, dating back to 1972 and the Landsat 1 satellite. Starting with Landsat 4, each of the satellites imaged the Earth's surface at a 30-meter resolution about once every two weeks using multispectral and thermal instruments.
<br>
<br>This <b>Cloud Storage</b> dataset includes the Collection 1 USGS archive from Landsat 4, 5, 7, and 8:
<ul>
    <li>Landsat 4: 1982 - 1993</li>
    <li>Landsat 5: 1984 - 2013</li>
    <li>Landsat 7: 1999 - 2021</li>
    <li>Landsat 8: 2013 - 2021</li>
</ul>

In [199]:
def download_file(url, dst_name, timeout=60):
    """
    Downloads a file from a specified URL and saves it to the local destination.

    The response is streamed to disk in chunks. Nothing is written when the
    server answers with an HTTP error status, so an error page is never saved
    under the destination name.

    Parameters:
    - url (str): URL of the file to be downloaded.
    - dst_name (str): Local path where the file will be saved.
    - timeout (float, optional): Seconds to wait for the server before giving
      up (passed to requests.get). Defaults to 60.

    Returns:
    - bool: True when the file was written completely, False when the request
      or the write failed (a failure message is printed in that case).
    """
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Turn 4xx/5xx answers into an exception before touching the disk.
            response.raise_for_status()
            with open(dst_name, 'wb') as out_file:
                for chunk in response.iter_content(chunk_size=100 * 100):
                    out_file.write(chunk)
    except (requests.RequestException, OSError):
        # Network/HTTP problems and local file errors are reported, not raised,
        # so a batch download can decide what to do with the failure.
        print ('\t ... {f} FAILED!'.format(f=url.split('/')[-1]))
        return False
    return True

In [200]:
def query_landsat(BASE_URL, key_json, project_id, start, end, spacecraft_id, cloud, wrs_row, wrs_path):
    """
    Queries the Landsat dataset in BigQuery to find scenes matching specified criteria.

    The filter values are sent as BigQuery query parameters instead of being
    pasted into the SQL text, so quoting inside a value cannot alter the query.

    Parameters:
    - BASE_URL (str): Base URL for accessing Google Cloud Storage; it replaces
      the "gs://" prefix of each scene's base_url.
    - key_json (str): Path to the JSON file containing service account credentials.
    - project_id (str): Google Cloud project ID.
    - start (str | datetime | date): Start date for filtering scenes; strings
      must be ISO formatted ("YYYY-MM-DD").
    - end (str | datetime | date): End date for filtering scenes; strings must
      be ISO formatted ("YYYY-MM-DD").
    - spacecraft_id (str): Landsat spacecraft identifier (e.g., "LANDSAT_8").
    - cloud (float): Maximum cloud cover percentage allowed for scenes.
    - wrs_row (int): WRS row number for filtering scenes.
    - wrs_path (int): WRS path number for filtering scenes.

    Returns:
    - good_scenes (list): List of URLs for scenes that meet the specified criteria.
    """

    def _as_date(value):
        # Normalise every accepted input to a plain date for the DATE parameter.
        # datetime is tested first because it would otherwise serialise with a
        # time component.
        if isinstance(value, datetime):
            return value.date()
        if isinstance(value, str):
            return datetime.fromisoformat(value).date()
        return value

    credentials = service_account.Credentials.from_service_account_file(key_json)
    client = bigquery.Client(credentials=credentials, project=project_id)

    # Only the three columns used below are requested.
    sql = """
        SELECT product_id, cloud_cover, base_url
        FROM `bigquery-public-data.cloud_storage_geo_index.landsat_index`
        WHERE wrs_path = @wrs_path AND wrs_row = @wrs_row
          AND spacecraft_id = @spacecraft_id
          AND DATE(sensing_time) BETWEEN @start AND @end
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("wrs_path", "INT64", int(wrs_path)),
            bigquery.ScalarQueryParameter("wrs_row", "INT64", int(wrs_row)),
            bigquery.ScalarQueryParameter("spacecraft_id", "STRING", spacecraft_id),
            bigquery.ScalarQueryParameter("start", "DATE", _as_date(start)),
            bigquery.ScalarQueryParameter("end", "DATE", _as_date(end)),
        ]
    )
    df = client.query(sql, job_config=job_config).result().to_dataframe()

    good_scenes = []
    for product_id, cloud_cover, base_url in zip(df['product_id'], df['cloud_cover'], df['base_url']):
        # Scenes without a cloud-cover value cannot be judged against the limit.
        if cloud_cover is None:
            continue
        if float(cloud_cover) <= cloud:
            print (product_id, '; cloud cover:', cloud_cover)
            good_scenes.append(base_url.replace('gs://', BASE_URL))
    return good_scenes

In [201]:
def make_safe_dirs(scene, outpath):
    """
    Prepares the local scene directory and lists the band files to download.

    The scene's MTL (metadata) and ANG (angle coefficients) text files are
    fetched into <outpath>/<scene name>/, replacing any earlier copy, and the
    MTL file is parsed for the band file names.

    Parameters:
    - scene (str): URL of the Landsat scene.
    - outpath (str): Output directory where the scene files will be organized.

    Returns:
    - download_links (list): List of (online_path, local_path) tuples, one per
      band file named in the MTL file, in the order they appear there. Empty
      when the MTL file could not be downloaded.
    """
    scene_name = os.path.basename(scene)
    scene_path = os.path.join(outpath, scene_name)
    # The metadata files are written here, so the directory has to exist first.
    os.makedirs(scene_path, exist_ok=True)

    # Each metadata file is fetched exactly once; a stale copy is removed
    # beforehand so a failed download cannot leave old content behind.
    for suffix in ('_MTL.txt', '_ANG.txt'):
        local_file = os.path.join(scene_path, scene_name + suffix)
        if os.path.exists(local_file):
            os.remove(local_file)
        download_file(scene + '/' + scene_name + suffix, local_file)

    manifest = os.path.join(scene_path, scene_name + '_MTL.txt')
    if not os.path.exists(manifest):
        # download_file reports failures instead of raising; without the
        # manifest there is nothing to list.
        print ('\t ... {f} is missing; no band files listed.'.format(f=scene_name + '_MTL.txt'))
        return []

    with open(manifest, 'r') as f:
        manifest_lines = f.read().split("\n")

    band_keys = {'FILE_NAME_BAND_{n}'.format(n=n) for n in range(1, 12)}
    band_keys.add('FILE_NAME_BAND_QUALITY')

    download_links = []
    for line in manifest_lines:
        # MTL entries look like:    FILE_NAME_BAND_1 = "<file name>"
        key, separator, value = line.partition('=')
        if not separator or key.strip() not in band_keys:
            continue
        file_name = value.strip().strip('"')
        local_path = scene_path + '/' + file_name
        online_path = scene + '/' + file_name
        download_links.append((online_path, local_path))
    return download_links

In [202]:
def download_landsat(scene, dst, outpath=None):
    """
    Downloads Landsat scene files and organizes them into a directory structure.

    All files of the scene are stored in <root>/<scene name>/, where root is
    `outpath` when it is given and `dst` otherwise. The directory is created
    when missing, including parent directories.

    Parameters:
    - scene (str): URL of the Landsat scene.
    - dst (str): Destination directory for the downloaded scene files.
    - outpath (str, optional): Output directory where the scene files will be
      organized. Defaults to `dst`, so the function can be called with just a
      scene and one directory.

    Returns:
    None
    """
    scene_name = os.path.basename(scene)
    # One root is used for both creating the directory and writing the files.
    root = dst if outpath is None else outpath
    scene_path = os.path.join(root, scene_name)
    os.makedirs(scene_path, exist_ok=True)
    print ('Downloading scene {s} ...'.format(s=scene_name))
    download_links = make_safe_dirs(scene, root)

    for online_path, local_path in download_links:
        if local_path.endswith('.TIF'):
            print ('\t ... *{b}'.format(b=local_path.split('_')[-1]))
        # Only an explicit False counts as failure; any other return value
        # (including None) lets the loop continue.
        if download_file(online_path, local_path) is False:
            print ('\t ... {f} failed to download! Download for this scene is cancelled here!'.format(f=online_path))
            return
In [203]:
# --- Configuration: everything a reader may want to change -------------------
# Public HTTPS endpoint that replaces the "gs://" prefix of each scene URL.
BASE_URL = 'https://storage.googleapis.com/'
key_json = ".../path-to-key.json"
# The project id is taken from the key file name: the extension and the last
# dash-separated token are dropped, e.g. "my-project-1a2b3c.json" -> "my-project".
project_id = '-'.join(os.path.basename(key_json).split('.')[0].split('-')[:-1])
# Dates are handed to query_landsat as "YYYY-MM-DD" strings, as its docstring requires.
start = datetime.strptime('2018/01/18', "%Y/%m/%d").strftime("%Y-%m-%d")
end = datetime.strptime('2018/05/18', "%Y/%m/%d").strftime("%Y-%m-%d")
cloud = 20  # maximum cloud cover, in percent
wrs_row = 122
wrs_path = 233
spacecraft_id = 'LANDSAT_8'
outdir = ".../path-to-output" #desired output folder

# --- Find the matching scenes and download each one --------------------------
scene_list = query_landsat(BASE_URL, key_json, project_id, start, end, spacecraft_id, cloud, wrs_row, wrs_path)

for s in scene_list:
    # download_landsat takes a destination and an output directory; the same
    # folder is used for both.
    download_landsat(s, outdir, outdir)

LC08_L1GT_233122_20180120_20180206_01_T2 ; cloud cover: 0.0
Downloading scene LC08_L1GT_233122_20180120_20180206_01_T2 ...
	 ... *B1.TIF
	 ... *B2.TIF
	 ... *B3.TIF
	 ... *B4.TIF
	 ... *B5.TIF
	 ... *B6.TIF
	 ... *B7.TIF
	 ... *B8.TIF
	 ... *B9.TIF
	 ... *B10.TIF
	 ... *B11.TIF
	 ... *BQA.TIF
