In [None]:
# Authenticate the Colab user so the Google Cloud Storage client created
# in a later cell can act with this user's credentials
from google.colab import auth
auth.authenticate_user()

In [None]:
# Initialize Google Cloud Storage client for the given project
from google.cloud import storage
project_id = 'skytruth-tech'
client = storage.Client(project=project_id)

# Access bucket; `bucket` is the handle later cells use both to read the
# tile-ID CSV and to check for / upload the scraped .laz files
bucket_name = 'mountaintop_mining'
bucket = client.get_bucket(bucket_name)

In [None]:
import pandas as pd
import requests
from multiprocessing import Pool
import time

In [None]:
# Set info for the lidar acquisition project to scrape.
# `state` is the folder under lidar_data/ in the bucket where tiles are uploaded
state = 'tn'
# `project_name` names the tile-ID CSV (<project_name>.csv) and prefixes every uploaded file
project_name = 'TN_27_County_B2'
# `file_path` is the project's directory portion of the USGS download URL
file_path = 'TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA/TN_27County_blk2_2015'
# `file_prefix` is the part of each remote .laz file name that precedes the tile ID
file_prefix = 'USGS_LPC_TN_27_County_QL2_LiDAR_Cumberland_Plateau_BAA'

In [None]:
# Build the list of tile IDs to scrape: pull this project's CSV out of the
# bucket into the Colab filesystem, then take the values of its first column
csv_name = f'{project_name}.csv'
local_csv_path = f'/content/{csv_name}'

csv = bucket.blob(f'lidar_data/tile_IDs/{csv_name}')
csv.download_to_filename(local_csv_path)

# First row of the file is treated as the header
df = pd.read_csv(local_csv_path, header=0)
tile_IDs = df.iloc[:, 0].tolist()

In [None]:
# Sanity check: show the tile IDs that were read from the CSV
# (prints the entire list, so the output is long for large projects)
print(tile_IDs)

['2024661NW', '2024669SW', '2024669NW', '2024677SW', '2024677NW', '2024685NW', '2024661NE', '2024669SE', '2024669NE', '2024677SE', '2024677NE', '2024685SE', '2024685NE', '2024693SE', '2024693NE', '2038661NW', '2038669SW', '2038669NW', '2038677SW', '2038677NW', '2038685SW', '2038685NW', '2038693SW', '2038661NE', '2038669SE', '2038669NE', '2038677SE', '2038677NE', '2038685SE', '2038685NE', '2052661NW', '2052669SW', '2052669NW', '2052677SW', '2052677NW', '2052685SW', '2052685NW', '2052661NE', '2052669SE', '2052669NE', '2052677SE', '2052677NE', '2052685SE', '2052685NE', '2066661NW', '2066669SW', '2066669NW', '2066677SW', '2066677NW', '2066685SW', '2066661NE', '2066669SE', '2066669NE', '2066677SE', '2066677NE', '2066685SE', '2080661NW', '2080669SW', '2080669NW', '2080677SW', '2080677NW', '2080685SW', '2080661NE', '2080669SE', '2080669NE', '2080677SE', '2080677NE', '2080685SE', '2080685NE', '2094661NW', '2094669SW', '2094669NW', '2094677SW', '2094677NW', '2094685SW', '2094685NW', '2094661NE'

In [None]:
# Number of tile IDs that will be handed to the scraper
print(len(tile_IDs))

2124


In [None]:
# Define scraper function
def scrape(tile_ID):
  """Stream one LAZ tile from the USGS server directly into the GCS bucket.

  Relies on the notebook globals `file_path`, `file_prefix`, `state`,
  `project_name` and `bucket`. The tile is skipped when its destination blob
  already exists, so the notebook can be re-run to resume an interrupted
  scrape. Otherwise the download is attempted up to three times.

  Args:
    tile_ID: Tile identifier; appended to `file_prefix` to form the remote
      file name and to `project_name` to form the uploaded blob name.

  Returns:
    None. A tile that could not be fetched is reported with a printed
    message rather than an exception, so one bad tile does not abort the
    whole batch. Errors raised while the upload itself is in progress are
    not caught here and propagate to the caller.
  """
  url = f'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects/{file_path}/LAZ/{file_prefix}_{tile_ID}.laz'
  uploaded_file_name = f'lidar_data/{state}/{project_name}_{tile_ID}.laz'
  uploaded_file = bucket.blob(uploaded_file_name)
  if uploaded_file.exists():
    return

  max_retries = 3
  retry_delay = 5      # seconds to wait between attempts
  timeout = (10, 120)  # (connect, read) seconds; without it a stalled server hangs the worker forever
  last_problem = None

  for attempt in range(max_retries):
    try:
      # stream=True hands the raw socket stream to the uploader, which avoids
      # downloading the file locally
      with requests.get(url, stream=True, timeout=timeout) as response:
        status = response.status_code
        if status == 200:
          uploaded_file.upload_from_file(response.raw)
          return
        last_problem = f'HTTP {status}'
        # Client errors such as 404 are permanent; only server-side or
        # throttling responses are worth another attempt
        if status < 500 and status not in (408, 429):
          break
    except requests.exceptions.RequestException as e:
      # Connection-level failure: remember it and try again after a pause
      last_problem = repr(e)
    if attempt < max_retries - 1:
      time.sleep(retry_delay)

  # Surface the failure instead of dropping the tile silently
  print(f'Failed to scrape tile {tile_ID} ({url}): {last_problem}')

In [None]:
# Parallelize for speed: spread the tile IDs over a pool of worker
# processes, each of which runs scrape() on the tiles it is handed
num_processes = 10

with Pool(processes=num_processes) as pool:
  pool.map(scrape, tile_IDs)