In [1]:
import time
import requests
from bs4 import BeautifulSoup

In [2]:
# Browser-like User-Agent sent with every request made by get_soup_from_url.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

def get_soup_from_url(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup (lxml).

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection problem, timeout or HTTP error status). The
        failure is printed before ``None`` is returned.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

    return BeautifulSoup(response.content, 'lxml')

In [3]:
# Landing page of the Earth Engine data catalog; it lists every dataset card.
url = "https://developers.google.com/earth-engine/datasets/catalog"

soup = get_soup_from_url(url)

# get_soup_from_url returns None on failure; stop here with a clear message
# rather than letting a later cell fail with an AttributeError on None.
if soup is None:
    raise RuntimeError(f"Could not download the dataset catalog from {url}")

In [4]:
# Site root; the hrefs collected below are site-relative paths.
base_url = "https://developers.google.com"

# One <li> card per dataset on the catalog page.
dataset_cards = soup.find_all("li", class_="ee-sample-image ee-cards devsite-landing-row-item-description")

# Keep the href of the first link in each card; cards without a link are
# skipped instead of raising a TypeError on None.
urls = [
    link['href']
    for link in (card.find("a", href=True) for card in dataset_cards)
    if link is not None
]


In [5]:
# Number of dataset links collected from the catalog page.
len(urls)

600

In [6]:
# Peek at the first ten collected hrefs.
urls[:10]

['/earth-engine/datasets/catalog/AAFC_ACI',
 '/earth-engine/datasets/catalog/ACA_reef_habitat_v2_0',
 '/earth-engine/datasets/catalog/AHN_AHN2_05M_INT',
 '/earth-engine/datasets/catalog/AHN_AHN2_05M_NON',
 '/earth-engine/datasets/catalog/AHN_AHN2_05M_RUW',
 '/earth-engine/datasets/catalog/ASTER_AST_L1T_003',
 '/earth-engine/datasets/catalog/AU_GA_AUSTRALIA_5M_DEM',
 '/earth-engine/datasets/catalog/AU_GA_DEM_1SEC_v10_DEM-H',
 '/earth-engine/datasets/catalog/AU_GA_DEM_1SEC_v10_DEM-S',
 '/earth-engine/datasets/catalog/BIOPAMA_GlobalOilPalm_v1']

In [7]:
# Preview the absolute URLs. The loop variable is deliberately not named `url`
# so that the catalog address stored in `url` is not overwritten.
for dataset_path in urls[:10]:
    print(base_url + dataset_path)

https://developers.google.com/earth-engine/datasets/catalog/AAFC_ACI
https://developers.google.com/earth-engine/datasets/catalog/ACA_reef_habitat_v2_0
https://developers.google.com/earth-engine/datasets/catalog/AHN_AHN2_05M_INT
https://developers.google.com/earth-engine/datasets/catalog/AHN_AHN2_05M_NON
https://developers.google.com/earth-engine/datasets/catalog/AHN_AHN2_05M_RUW
https://developers.google.com/earth-engine/datasets/catalog/ASTER_AST_L1T_003
https://developers.google.com/earth-engine/datasets/catalog/AU_GA_AUSTRALIA_5M_DEM
https://developers.google.com/earth-engine/datasets/catalog/AU_GA_DEM_1SEC_v10_DEM-H
https://developers.google.com/earth-engine/datasets/catalog/AU_GA_DEM_1SEC_v10_DEM-S
https://developers.google.com/earth-engine/datasets/catalog/BIOPAMA_GlobalOilPalm_v1


In [8]:
# Debugging aid: uncomment the line below to print the parsed HTML of the first dataset page

# print(get_soup_from_url(base_url + urls[0]))

In [9]:
def _require_node(node, description):
    """Return ``node`` unchanged, or raise ``ValueError`` naming what is missing."""
    if node is None:
        raise ValueError(f"Dataset page is missing {description}")
    return node


def extract_data(soup):
    """Collect the metadata of one dataset page into a dictionary.

    Args:
        soup: Parsed dataset page (a BeautifulSoup document).

    Returns:
        A dict with the keys ``title``, ``availability``, ``provider_name``,
        ``provider_url``, ``tags`` (list of tag texts) and
        ``first_col_values`` (stripped text of the first cell of every table
        row whose first cell is a ``<td>``).

    Raises:
        ValueError: If an expected element (title heading, info list, one of
            its entries or their values) is not present on the page. The
            message names the missing element.
    """
    title = _require_node(
        soup.find("h1", class_="devsite-page-title"), "the page title heading"
    )
    info_box = _require_node(soup.find("dl"), "the dataset info list (<dl>)")

    def info_term(label):
        # <dt> inside the info list whose string is exactly `label`.
        return _require_node(
            info_box.find("dt", string=label), f"the '{label}' entry"
        )

    availability = _require_node(
        info_term("Dataset Availability").find_next("dd"),
        "the 'Dataset Availability' value",
    )

    # Look the provider entry up once; both the name and the link follow it.
    provider_term = info_term("Dataset Provider")
    provider_name = _require_node(
        provider_term.find_next("span", itemprop="name"), "the provider name"
    )
    provider_link = _require_node(
        provider_term.find_next("a"), "the provider link"
    )

    tag_container = _require_node(
        info_term("Tags").find_next("span", class_="ee-tag-buttons"),
        "the tag list",
    )

    first_col_values = []
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            columns = row.find_all(['th', 'td'])
            # Rows that start with <th> are headers; empty rows have no cells.
            if columns and columns[0].name == 'td':
                first_col_values.append(columns[0].get_text(strip=True))

    return {
        'title': title.text,
        'availability': availability.text,
        'provider_name': provider_name.text,
        'provider_url': provider_link["href"],
        'tags': [tag.text for tag in tag_container.find_all("a", class_="ee-chip")],
        'first_col_values': first_col_values,
    }

In [10]:
# Default folder that receives one text file per scraped dataset.
RESULTS_DIR = "E:/Workspace/Earth-Engine-Data-Scraping/Web Scraping Results"


def write_data_to_file(index, data, output_dir=RESULTS_DIR):
    """Write one dataset's metadata to ``file<index>.txt`` inside ``output_dir``.

    File layout: title, blank line, availability, blank line, provider name,
    provider URL, blank line, one tag per line, blank line, then one
    first-column value per line.

    Args:
        index: Number used to build the file name (``file{index}.txt``).
        data: Dict with the keys ``title``, ``availability``,
            ``provider_name``, ``provider_url``, ``tags`` and
            ``first_col_values``.
        output_dir: Destination folder; created if it does not exist yet.
    """
    import os  # local import so this cell does not depend on the import cell

    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f"file{index}.txt")

    # Explicit UTF-8: the platform default encoding cannot represent every
    # character that may appear in scraped text.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(data['title'] + "\n\n")
        f.write(data['availability'] + "\n\n")
        f.write(data['provider_name'] + "\n")
        f.write(data['provider_url'] + "\n\n")

        f.writelines(tag + "\n" for tag in data['tags'])

        f.write("\n")

        f.writelines(value + "\n" for value in data['first_col_values'])


In [11]:
# Number of catalog entries to scrape; set to None to process every dataset.
MAX_DATASETS = 10
# Pause between consecutive requests so the site is not hit in a tight loop.
REQUEST_DELAY_SECONDS = 5

# Slicing (rather than range(10)) also works when fewer URLs were collected.
for index, dataset_path in enumerate(urls[:MAX_DATASETS]):
    # Wait before every request except the first, including after a failure.
    if index > 0:
        time.sleep(REQUEST_DELAY_SECONDS)

    # Use a dedicated name: `soup` still holds the catalog page parsed earlier.
    dataset_soup = get_soup_from_url(base_url + dataset_path)

    if dataset_soup is None:
        print(f"Skipping index {index} due to failed URL retrieval.")
        continue

    try:
        data = extract_data(dataset_soup)
        write_data_to_file(index, data)
        print(f"Data written to file {index} successfully.")

    except Exception as e:
        # Best effort: report the problem and carry on with the next dataset.
        print(f"An error occurred while processing index {index}: {e}")


Data written to file 0 successfully.
Data written to file 1 successfully.
Data written to file 2 successfully.
Data written to file 3 successfully.
Data written to file 4 successfully.
Data written to file 5 successfully.
Data written to file 6 successfully.
Data written to file 7 successfully.
Data written to file 8 successfully.
Data written to file 9 successfully.
