In [1]:
# Convenient jupyter setup: load IPython's autoreload extension and switch it to
# mode 2, so edits to imported project modules are picked up without restarting
# the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import os
import re
import warnings
from collections import deque
from urllib.parse import quote

import bs4
import dotenv
import pandas as pd
import requests

# Load variables from a .env file into the process environment (python-dotenv),
# so the credentials read via os.environ below do not have to be hard-coded
dotenv.load_dotenv()


def build_base_url(username, password, host_path):
    """Return the HTTPS URL for ``host_path`` with optional basic-auth userinfo.

    Args:
        username: Account name, or None/empty when no credentials are available.
        password: Account password, or None/empty.
        host_path: Host and path without the scheme, e.g. "example.org/data/".

    Returns:
        "https://<user>:<password>@<host_path>/" with both credential parts
        percent-encoded, or "https://<host_path>/" when a credential is missing.
    """
    if not username or not password:
        warnings.warn(
            "Missing username or password; building the URL without credentials."
        )
        return f"https://{host_path}/"
    # Percent-encode so characters such as '@', ':' or '/' inside the
    # credentials cannot be mistaken for URL delimiters.
    user_info = f"{quote(username, safe='')}:{quote(password, safe='')}"
    return f"https://{user_info}@{host_path}/"


# Credentials come from the environment, never from the notebook itself
username = os.environ.get("PAISAGENSLIDAR_USERNAME")
password = os.environ.get("PAISAGENSLIDAR_PASSWORD")
url = "www.paisagenslidar.cnptia.embrapa.br/dados/SL_download/"

# NOTE: `url` already ends in "/", so base_url ends in "//". This is kept on
# purpose: the folder_structure extraction further down and previously saved
# link tables were built with that double slash.
base_url = build_base_url(username, password, url)

In [3]:
def contains_dot(link_ending):
    """Tell whether ``link_ending`` has at least one "." character in it."""
    return any(character == "." for character in link_ending)

def get_parsed_html(url, timeout=60):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. wrong credentials), instead of parsing the error page.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser-guess warning).
    return bs4.BeautifulSoup(response.content, "html.parser")

# Link texts of interest start with "Mission" or with three upper-case letters
_LINK_TEXT_PATTERN = re.compile(r"^(Mission|[A-Z]{3})\w*")


def find_potential_links(html_page):
    """Return the texts of the listing links that look like missions or plots.

    Args:
        html_page: Parsed page (e.g. ``bs4.BeautifulSoup``) whose directory
            listing is wrapped in a ``<pre>`` element.

    Returns:
        List with the text of every ``<a>`` inside the ``<pre>`` element whose
        text matches ``_LINK_TEXT_PATTERN``, in document order.

    Raises:
        ValueError: If the page contains no ``<pre>`` element, i.e. it is not
            the expected directory listing.
    """
    listing = html_page.find("pre")
    if listing is None:
        raise ValueError("No <pre> directory listing found in the page")
    # `string=` is the current name of the deprecated `text=` filter
    tags = listing.find_all("a", string=_LINK_TEXT_PATTERN)
    return [tag.text for tag in tags]

def get_next_links(url):
    """Fetch the page at ``url`` and return the candidate link names it lists."""
    return find_potential_links(get_parsed_html(url))

In [11]:
# Collected absolute URLs of every file found below base_url
download_links = []

# Seed the work queue with the top-level mission folders
queue = deque(get_next_links(base_url))
print(queue)

# Walk the directory tree until no unvisited node is left.
# NOTE: popleft() combined with extendleft() makes the deque behave like a
# stack, so this is a depth-first traversal (children are visited in reverse
# listing order), not a breadth-first one.
while queue:
    current_node = queue.popleft()
    if contains_dot(current_node):
        # A dot in the path marks a file: record its full download URL
        download_links.append(base_url + current_node)
    else:
        # Otherwise the node is a folder: list it and schedule its children,
        # each expressed as a path relative to base_url
        listing = get_next_links(base_url + current_node)
        queue.extendleft(current_node + link for link in listing)

deque(['Mission_2008/', 'Mission_2011/', 'Mission_2012/', 'Mission_2013/', 'Mission_2014/', 'Mission_2015/', 'Mission_2016/', 'Mission_2017/', 'Mission_2018/'])
0: Visiting Mission_2008/
1: Visiting Mission_2008/TAP_A06_2008_LiDAR/
2: Visiting Mission_2008/TAP_A06_2008_LiDAR/TAP_A06_2008_LiDAR.zip
3: Visiting Mission_2008/TAP_A05_2008_LiDAR/
4: Visiting Mission_2008/TAP_A05_2008_LiDAR/TAP_A05_2008_LiDAR.zip
5: Visiting Mission_2008/TAP_A04_2008_LiDAR/
6: Visiting Mission_2008/TAP_A04_2008_LiDAR/TAP_A04_2008_LiDAR.zip
7: Visiting Mission_2008/TAP_A01_2008_LiDAR/
8: Visiting Mission_2008/TAP_A01_2008_LiDAR/TAP_A01_2008_LiDAR.zip
9: Visiting Mission_2008/PRE_A07_2008_LiDAR/
10: Visiting Mission_2008/PRE_A07_2008_LiDAR/PRE_A07_2008_LiDAR.zip
11: Visiting Mission_2008/PRE_A06_2008_LiDAR/
12: Visiting Mission_2008/PRE_A06_2008_LiDAR/PRE_A06_2008_LiDAR.zip
13: Visiting Mission_2008/PRE_A05_2008_LiDAR/
14: Visiting Mission_2008/PRE_A05_2008_LiDAR/PRE_A05_2008_LiDAR.zip
15: Visiting Mission_200

In [71]:
# Create download overview table from links
download_table = pd.DataFrame(download_links, columns=["link"])
links = download_table["link"]

# Extract mission name and year (expand=False returns a Series per column)
download_table["mission"] = links.str.extract(r"(Mission_\d{4})", expand=False)
download_table["year"] = links.str.extract(r"Mission_(\d{4})", expand=False)

# Extract mission region (three upper-case letters right after the mission folder)
download_table["region"] = links.str.extract(r"Mission_\d{4}/([A-Z]{3})", expand=False)

# Extract mission plot code
download_table["plot_code"] = links.str.extract(
    r"Mission_\d{4}/[A-Z]{3}_([A-Z0-9a-z]+)_", expand=False
)

# Extract mission type. The dots are escaped so that they match a literal "."
# instead of any character; non-capturing groups avoid pandas' match-group warning.
lidar_types = links.str.contains("LiDAR") & links.str.contains(r"\.(?:zip|las|laz)")
inventory_types = links.str.contains("[iI]nventory")
dem_types = links.str.contains(r"\.grd")
metadata_types = links.str.contains(r"\.txt")

# Assign with .loc: chained indexing (df["type"][mask] = ...) may write to a
# temporary copy and does nothing under pandas Copy-on-Write.
# Later assignments deliberately override earlier ones for overlapping masks.
download_table["type"] = None
download_table.loc[lidar_types, "type"] = "lidar"
download_table.loc[inventory_types, "type"] = "inventory"
download_table.loc[dem_types, "type"] = "dem"
download_table.loc[metadata_types, "type"] = "metadata"

# Extract mission saving folder structure; "/+" accepts both the "//" produced
# by base_url and a single "/"
download_table["folder_structure"] = links.str.extract(
    r"/SL_download/+(Mission_[\w/.]*)", expand=False
)

# NOTE(security): every link embeds the username and password from base_url,
# so this CSV contains credentials in clear text — do not commit or share it.
download_table.to_csv(f"./paisagenslidar_download_links_v{datetime.date.today()}.csv")

In [4]:
# Reload a previously saved link table from disk; the first CSV column is used as the index.
# NOTE(review): the file name is pinned to the 2021-05-20 snapshot, while the cell
# above writes a file stamped with the current date — adjust the date when re-running.
download_table = pd.read_csv("./paisagenslidar_download_links_v2021-05-20.csv", index_col=0)

In [6]:
# Show every column except the first one ("link"), which must stay out of anything published to github
download_table.iloc[:, 1:]

Unnamed: 0,mission,year,region,plot_code,type,folder_structure
0,Mission_2008,2008,TAP,A06,lidar,Mission_2008/TAP_A06_2008_LiDAR/TAP_A06_2008_L...
1,Mission_2008,2008,TAP,A05,lidar,Mission_2008/TAP_A05_2008_LiDAR/TAP_A05_2008_L...
2,Mission_2008,2008,TAP,A04,lidar,Mission_2008/TAP_A04_2008_LiDAR/TAP_A04_2008_L...
3,Mission_2008,2008,TAP,A01,lidar,Mission_2008/TAP_A01_2008_LiDAR/TAP_A01_2008_L...
4,Mission_2008,2008,PRE,A07,lidar,Mission_2008/PRE_A07_2008_LiDAR/PRE_A07_2008_L...
...,...,...,...,...,...,...
775,Mission_2018,2018,ANA,A01,dem,Mission_2018/ANA_A01_2018_LiDAR/ANA_A01_2018_D...
776,Mission_2018,2018,ANA,A01,dem,Mission_2018/ANA_A01_2018_LiDAR/ANA_A01_2018_D...
777,Mission_2018,2018,ANA,A01,dem,Mission_2018/ANA_A01_2018_LiDAR/ANA_A01_2018_D...
778,Mission_2018,2018,ANA,A01,dem,Mission_2018/ANA_A01_2018_LiDAR/ANA_A01_2018_D...


## Test download

In [27]:
from src.constants import PAISAGENSLIDAR_PATH
from src.utils.download import download

2021-05-19 23:47:26,372 DEBUG: Logger /home/users/svm/Code/gedi_biomass_mapping/src/utils/download.py already set up. [in get_logger at /home/users/svm/Code/gedi_biomass_mapping/src/utils/logging.py:50]


In [28]:
%%time 
download(url = download_table.link.iloc[0], file_path = PAISAGENSLIDAR_PATH / download_table.folder_structure.iloc[0], overwrite=False)

2021-05-19 23:47:43,042 INFO: File already exists at /gws/nopw/j04/forecol/data/Paisagenslidar/Mission_2008/TAP_A06_2008_LiDAR/TAP_A06_2008_LiDAR.zip [in download at /home/users/svm/Code/gedi_biomass_mapping/src/utils/download.py:18]
CPU times: user 1.13 ms, sys: 1.14 ms, total: 2.26 ms
Wall time: 4.42 ms


True