In [None]:
# Selenium, allows you to control a browser programmatically. Can be used to scrape data.
# Already installed Selenium in virtual environment via terminal. Only imports neccessary here.
%pip install pandas as pd
%pip install --upgrade six
%pip install selenium==3.141.0

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import pandas as pd
import re
import numpy as np

In [46]:
# Show every row when a DataFrame is rendered (None disables truncation).
pd.options.display.max_rows = None

In [47]:
# Scrape the "Countries of the World" sandbox page from scrapethissite.com into
# a DataFrame indexed by country name, with a derived population-density column.

# Path to the ChromeDriver executable on this Windows machine.
# NOTE(review): absolute local path; adjust when running on another computer.
driver_path = r"C:\Windows\chromedriver\chromedriver.exe"

# Column order of the scraped table.
columns = ['Country Name', 'Capital', 'Population', 'Area (km\u00b2)']

# Start Chrome through the driver at `driver_path`.
driver = webdriver.Chrome(executable_path=driver_path)

# One dict per country; the DataFrame is built once after the loop instead of
# being re-created with pd.concat on every iteration.
records = []

# try/finally guarantees the browser is closed even if navigation or parsing
# raises, so no orphaned Chrome/chromedriver processes are left behind.
try:
    # Navigate to the website's landing page.
    driver.get("https://www.scrapethissite.com/")

    # Follow the large landing-page button to reach the list of pages.
    button = driver.find_element_by_css_selector("a.btn.btn-lg.btn-default")
    button.click()
    time.sleep(1)  # Wait for the content to load before moving on.

    # Open the simple single-page country list.
    link = driver.find_element_by_css_selector("h3.page-title a[href='/pages/simple/']")
    link.click()
    time.sleep(1)  # Wait for the content to load before moving on.

    # Find all country elements.
    all_countries = driver.find_elements_by_css_selector("div.col-md-4.country")

    # Read the text fields of each country element.
    for country in all_countries:
        area_text = country.find_element_by_css_selector("span.country-area").text
        records.append({
            'Country Name': country.find_element_by_css_selector("h3.country-name").text,
            'Capital': country.find_element_by_css_selector("span.country-capital").text,
            'Population': int(country.find_element_by_css_selector("span.country-population").text),
            # float() parses both plain decimals and scientific notation
            # (e.g. "1.71E7"), so no separate regex check is needed.
            'Area (km\u00b2)': float(area_text),
        })
finally:
    # Close the browser.
    driver.quit()

# Passing `columns` keeps the expected columns even if nothing was scraped.
data = pd.DataFrame(records, columns=columns)

# Treat a zero area as missing so the density division yields NaN, not inf.
data['Area (km\u00b2)'] = data['Area (km\u00b2)'].replace([0, 0.0], np.nan)
data['Persons per area'] = data['Population'] / data['Area (km\u00b2)']
data = data.set_index('Country Name')

# Print the DataFrame
print(data)

                                                          Capital  Population  \
Country Name                                                                    
Andorra                                          Andorra la Vella       84000   
United Arab Emirates                                    Abu Dhabi     4975593   
Afghanistan                                                 Kabul    29121286   
Antigua and Barbuda                                    St. John's       86754   
Anguilla                                               The Valley       13254   
Albania                                                    Tirana     2986952   
Armenia                                                   Yerevan     2968000   
Angola                                                     Luanda    13068161   
Antarctica                                                   None           0   
Argentina                                            Buenos Aires    41343201   
American Samoa              

In [48]:
# Aggregate statistics over the scraped country table `data`
# (columns 'Population', 'Area (km²)' and 'Persons per area').

# Population: total, mean, smallest and largest value.
tot_pop = data['Population'].sum()
avg_pop = data['Population'].mean()
min_pop = data['Population'].min()
max_pop = data['Population'].max()

# Area: same four statistics (pandas skips NaN areas by default).
tot_area = data['Area (km\u00b2)'].sum()
avg_area = data['Area (km\u00b2)'].mean()
min_area = data['Area (km\u00b2)'].min()
max_area = data['Area (km\u00b2)'].max()

# Density: mean and maximum over all rows.
avg_pop_per_km2 = data['Persons per area'].mean()
max_pop_per_km2 = data['Persons per area'].max()

# Minimum density is taken over strictly positive values only, so rows with a
# density of 0 (and NaN rows, which fail the `> 0` comparison) are excluded.
# This is the single definition of min_pop_per_km2; the earlier plain .min()
# was immediately overwritten and has been removed.
min_pop_per_km2 = data.loc[data['Persons per area'] > 0, 'Persons per area'].min()

# Create a new DataFrame to store the aggregated values.
# No total is computed for 'Persons per area'; that cell is left as NaN.
summary_data = pd.DataFrame({
    'Population': [tot_pop, avg_pop, min_pop, max_pop],
    'Area (km\u00b2)': [tot_area, avg_area, min_area, max_area],
    'Persons per area': [np.nan, avg_pop_per_km2, min_pop_per_km2, max_pop_per_km2]
}, index=['Total', 'Average', 'Minimum', 'Maximum'])

# Display floats with thousands separators and two decimals instead of
# scientific notation. This is a global pandas option, so it also affects
# every DataFrame displayed after this cell.
pd.options.display.float_format = '{:,.2f}'.format

# Print the summary DataFrame
print(summary_data)

              Population     Area (km²)  Persons per area
Total   6,861,418,895.00 149,909,229.69               NaN
Average    27,445,675.58     602,045.10            306.79
Minimum             0.00           0.44              0.01
Maximum 1,330,044,000.00  17,100,000.00         16,905.13
