In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Earth Engine data-catalog page to scrape (AAFC Annual Crop Inventory).
url = "https://developers.google.com/earth-engine/datasets/catalog/AAFC_ACI#bands"

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36'
}

# requests has no default timeout, so without one a stalled connection would
# block this cell forever. raise_for_status() aborts on a 4xx/5xx response
# instead of letting an error page be parsed as if it were the catalog page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Raw response body (bytes); BeautifulSoup detects the encoding itself.
web_page = response.content

# Parsed document used by every following cell.
soup = BeautifulSoup(web_page, 'lxml')

In [3]:
# The page title lives in the <h1> carrying the "devsite-page-title" class.
title_heading = soup.find("h1", attrs={"class": "devsite-page-title"})
title = title_heading.get_text()

title

'Canada AAFC Annual Crop Inventory'

In [4]:
# First <dl> in the document; the following cells read the dataset metadata
# (availability, provider, tags) out of it.
info_box = soup.find(name="dl")

info_box

<dl>
<dt>Dataset Availability</dt>
<dd>2009-01-01T00:00:00Z–2022-01-01T00:00:00</dd>
<dt>Dataset Provider</dt>
<dd>
<span itemprop="provider" itemscope="" itemtype="http://schema.org/Organization">
<a href="https://open.canada.ca/data/en/dataset/ba2645d5-4458-414d-b196-6303ac06c1c9" itemprop="url">
<span itemprop="name">Agriculture and Agri-Food Canada</span>
</a>
</span>
</dd>
<dt>Earth Engine Snippet</dt>
<dd>
<span>
<code class="devsite-click-to-copy prettyprint lang-js" dir="ltr" translate="no">
      ee.ImageCollection("AAFC/ACI")
    </code>
<button class="devsite-dialog-button" data-modal-dialog-id="ee-signup-modal-dialog">
<span aria-hidden="true" class="material-icons" translate="no">open_in_new</span>
</button>
<div class="devsite-dialog ee-signup-modal-dialog" id="ee-signup-modal-dialog">
<div class="devsite-dialog-contents inline open-in-ee">
<h3 data-text="Sign up for Earth Engine" id="sign-up-for-earth-engine">Sign up for Earth Engine</h3>
<p>Earth Engine is free to use f

In [5]:
# Find the "Dataset Availability" label, then take the text of the first <dd>
# that follows it.
availability_label = info_box.find("dt", string="Dataset Availability")
availability_value = availability_label.find_next("dd")
dataset_availability = availability_value.get_text()

dataset_availability

'2009-01-01T00:00:00Z–2022-01-01T00:00:00'

In [6]:
# Locate the "Dataset Provider" label once; both lookups below start from it.
provider_label = info_box.find("dt", string="Dataset Provider")

# Provider name: next <span itemprop="name">; provider link: next <a>.
provider_name = provider_label.find_next("span", itemprop="name").get_text()
provider_url = provider_label.find_next("a")["href"]

(provider_name, provider_url)

('Agriculture and Agri-Food Canada',
 'https://open.canada.ca/data/en/dataset/ba2645d5-4458-414d-b196-6303ac06c1c9')

In [7]:
# Walk from the "Tags" label to the span holding the tag chips, then collect
# the text of every chip link inside it.
tags_label = info_box.find("dt", string="Tags")
tag_container = tags_label.find_next("span", class_="ee-tag-buttons")
tags = [chip.get_text() for chip in tag_container.find_all("a", class_="ee-chip")]

tags

['canada', 'crop', 'landcover']

In [8]:
# Every <table> element on the page, in document order.
table_info = soup.find_all(name='table')

# (Not displayed here: evaluating table_info would dump the full table HTML.)

In [9]:
# Collect the text of the first cell of each row, across all tables on the
# page (data requested by the client). Rows whose first cell is a header
# (<th>) are skipped, so only <td> values are kept.

first_col_values = []

for table in table_info:
    for row in table.find_all('tr'):
        # First <th>/<td> of the row, or None when the row has no cells.
        first_cell = row.find(['th', 'td'])
        if first_cell is None or first_cell.name != 'td':
            continue
        first_col_values.append(first_cell.get_text(strip=True))

print(first_col_values)

['landcover', '10', '20', '30', '34', '35', '50', '80', '85', '110', '120', '122', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '160', '161', '162', '163', '167', '168', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '185', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200', '210', '220', '230', 'landcover_class_names', 'landcover_class_palette', 'landcover_class_values']


In [10]:
# Destination of the plain-text report. A raw string keeps the Windows
# backslashes literal: in a normal string "\W", "\E" and "\s" are unrecognized
# escape sequences, which Python warns about and plans to reject.
# NOTE(review): this is a machine-specific absolute path — consider moving it
# to a configurable output directory.
output_path = r"E:\Workspace\Earth-Engine-Data-Scraping\Web Scraping Results\sample.txt"

# Explicit UTF-8 so non-ASCII scraped text (e.g. the en dash in the
# availability range) is written the same way regardless of the OS locale.
with open(output_path, "w", encoding="utf-8") as f:
    # Header section: title, availability range, provider name and link,
    # separated by blank lines.
    f.write(f"{title}\n\n")
    f.write(f"{dataset_availability}\n\n")
    f.write(f"{provider_name}\n")
    f.write(f"{provider_url}\n\n")

    # One tag per line.
    for tag in tags:
        f.write(f"{tag}\n")

    f.write("\n")

    # One first-column table value per line.
    for value in first_col_values:
        f.write(f"{value}\n")