In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from multiprocessing.dummy import Pool  # Using a thread pool for parallel processing

In [None]:
# Function to extract information from a single URL
def _labelled_text(paragraphs, label, tag):
    """Return the text of the first ``tag`` element inside the first paragraph
    whose text contains ``label``; ``None`` if the paragraph or tag is missing."""
    paragraph = next((p for p in paragraphs if label in p.get_text()), None)
    if paragraph is None:
        return None
    node = paragraph.find(tag)
    # A paragraph can carry the label without the expected child tag.
    return node.text if node is not None else None


def _coordinates(paragraphs):
    """Return ``(latitude, longitude)`` from the paragraph containing 'Latitude:',
    or ``(None, None)`` unless it holds exactly two <strong> elements."""
    paragraph = next((p for p in paragraphs if 'Latitude:' in p.get_text()), None)
    if paragraph is None:
        return None, None
    values = paragraph.find_all('strong')
    if len(values) != 2:
        return None, None
    return values[0].text.strip(), values[1].text.strip()


def extract_info(url, timeout=30):
    """Fetch one page and extract a row of fields for every tree panel on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A list of 9-element lists, one per ``div.panel.panel-default`` found:
        [nome popular, nome cientifico, DAP, altura, data da coleta, latitude,
        longitude, laudo links (comma-joined), image sources (comma-joined)].
        Fields that cannot be located are ``None``. If the request fails or the
        status code is not 200, a single placeholder row of nine '-' strings is
        returned so the caller can tell the URL was skipped.
    """
    print(url, "started")

    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable URL must not abort the whole batch: skip it the
        # same way a non-200 response is skipped.
        print(f"Request failed ({exc}) for URL: {url}. Skipped")
        return [['-'] * 9]

    if response.status_code != 200:
        # No retry is attempted; the URL is reported and skipped.
        print(f"Received a non-200 status code ({response.status_code}) for URL: {url}. Skipped")
        return [['-'] * 9]

    soup = BeautifulSoup(response.content, 'html.parser')

    data = []
    for panel in soup.find_all('div', class_='panel panel-default'):
        p_elements = panel.find_all('p')

        latitude, longitude = _coordinates(p_elements)

        # Links whose visible text mentions a laudo number
        laudo_links = [a['href'] for a in panel.find_all('a', href=True) if "Laudo Nº" in a.get_text()]

        # Only images that carry both alt and src, so the src lookup cannot fail
        image_sources = [img['src'] for img in panel.find_all('img', alt=True, src=True)]

        data.append([
            _labelled_text(p_elements, 'Nome Popular:', 'strong'),
            _labelled_text(p_elements, 'Nome Científico:', 'i'),
            _labelled_text(p_elements, 'DAP (Diâmetro à altura do peito):', 'strong'),
            _labelled_text(p_elements, 'Altura:', 'strong'),
            _labelled_text(p_elements, 'Data da Coleta:', 'strong'),
            latitude,
            longitude,
            ", ".join(laudo_links),
            ", ".join(image_sources),
        ])

    # Reported once per page, after all of its panels have been parsed.
    print(url, "success")

    return data

In [None]:
# Build the list of page URLs to fetch: base_url followed by each ID from 5000 through 9999
base_url = 'https://arvores.sjc.sp.gov.br/'
urls = [f'{base_url}{i}' for i in range(5000, 10000)]

# Set up a thread pool with 32 workers for parallel fetching
pool = Pool(32)

try:
    # map() blocks until every URL has been processed and returns one result per
    # URL, in the same order as `urls`
    results = pool.map(extract_info, urls)
finally:
    # Always release the worker threads, even if a worker raised, so repeated
    # runs of this cell do not accumulate idle threads in the kernel
    pool.close()
    pool.join()

In [None]:
# Create a Pandas DataFrame from all scraped rows, indexed by the ID of the page each row came from
columns = ['Nome Popular', 'Nome Cientifico', 'DAP', 'Altura', 'Data Coleta', 'Latitude', 'Longitude', 'Laudos', 'Image Sources']

# `results` holds one list of rows per entry of `urls`, in the same order, so pairing
# them recovers the page ID for every row. Numbering rows sequentially instead would
# shift every later ID as soon as one page yields zero or several rows.
records = []
for url, rows in zip(urls, results):
    page_id = int(url.rsplit('/', 1)[-1])  # the ID is the last path segment of the URL
    for row in rows:
        records.append([page_id, *row])

# Build the frame in one call rather than concatenating inside the loop, which
# re-copies the accumulated data on every iteration
df = pd.DataFrame(records, columns=['ID', *columns]).set_index('ID')

print(df)

In [None]:
# Write the DataFrame to 'trees2.csv' with ';' as the field separator, including the index column
df.to_csv('trees2.csv',sep=';',index=True)