In [None]:
# Authenticate this Colab session so the Google Cloud Storage calls in later cells are authorized
from google.colab import auth
auth.authenticate_user()  # NOTE(review): presumably triggers Colab's interactive sign-in prompt — confirm

In [None]:
# Google Cloud Storage setup: configuration first, then the client and bucket handles
from google.cloud import storage

project_id = 'skytruth-tech'
bucket_name = 'mountaintop_mining'

# Client bound to the project; `bucket` is the handle the rest of the notebook reads and writes through
client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)

In [None]:
import pandas as pd
import requests
from multiprocessing import Pool
import time
import math

In [None]:
# Fetch the tile index table (tile IDs with Phase 1 and Phase 2 download links) from the bucket
tile_index_local_path = '/content/KY_tile_index_intersect.csv'
csv = bucket.blob('lidar_data/tile_IDs/KY_tile_index_intersect.csv')
csv.download_to_filename(tile_index_local_path)

# Load the table and flatten it to one dict per row, keyed by column name
df = pd.read_csv(tile_index_local_path)
rows = df.to_dict('records')

In [None]:
# Count the distinct tile IDs present in the index table
n_unique_tiles = df['Tile_ID'].nunique()
print(n_unique_tiles)

15007


In [None]:
# Missing-value check for a table cell (used to decide whether a Phase2 download url is present)
def is_nan(value):
    """Return True if `value` is a float NaN or the string 'nan' in any letter case.

    Any other value, including non-float and non-string objects, yields False.
    """
    if isinstance(value, float):
        return math.isnan(value)
    if isinstance(value, str):
        return value.lower() == 'nan'
    return False

In [None]:
# Define scraper function
def scrape(row, max_retries=3, retry_delay=5, timeout=60):
  """Stream one tile's .laz file from its download url into the GCS bucket.

  The Phase 2 url and year are used when the row has a Phase 2 url; otherwise
  the Phase 1 url and year are used. The HTTP response body is streamed straight
  into the blob `lidar_data/ky/KY_{year}_{tile_ID}.laz`, so nothing is written
  to local disk. Tiles whose blob already exists are skipped.

  Args:
    row: Mapping with the keys 'Tile_ID', 'Phase1_download_url', 'Phase1_year',
      'Phase2_download_url' and 'Phase2_year'.
    max_retries: Maximum number of download attempts for the tile.
    retry_delay: Seconds to wait between two attempts.
    timeout: Timeout in seconds handed to `requests.get`, so a stalled server
      cannot block the worker indefinitely.

  Returns:
    None. Download/upload failures are reported with `print` instead of being
    raised, so a single bad tile does not stop a bulk run over many rows.
  """
  tile_ID = row['Tile_ID']
  if is_nan(row['Phase2_download_url']):
    url = row['Phase1_download_url']
    year = row['Phase1_year']
  else:
    url = row['Phase2_download_url']
    year = row['Phase2_year']

  # Neither phase has a link: a request could only fail, so skip without retrying
  if is_nan(url):
    print(f'Tile {tile_ID}: no download url available, skipping')
    return

  uploaded_file_name = f'lidar_data/ky/KY_{year}_{tile_ID}.laz'
  uploaded_file = bucket.blob(uploaded_file_name)
  # Kept outside the try block so bucket access problems surface immediately
  if uploaded_file.exists():
    return

  last_error = None
  for attempt in range(1, max_retries + 1):
    try:
      with requests.get(url, stream=True, timeout=timeout) as response: # Avoids downloading the file locally
        if response.status_code == 200:
          uploaded_file.upload_from_file(response.raw)
          return
        last_error = f'HTTP {response.status_code}'
        # Client errors other than timeout / rate limiting will not change on retry
        if 400 <= response.status_code < 500 and response.status_code not in (408, 429):
          break
    except Exception as e:
      # Broad on purpose: errors raised while reading `response.raw` mid-stream are
      # not wrapped in requests.exceptions.RequestException
      last_error = repr(e)
    # Wait before the next attempt, but not after the final one
    if attempt < max_retries:
      time.sleep(retry_delay)

  print(f'Tile {tile_ID}: failed to copy {url} to {uploaded_file_name} ({last_error})')

In [None]:
# Spread the per-tile scrape over several worker processes for speed
num_processes = 8

with Pool(processes=num_processes) as pool:
  pool.map(scrape, rows)