In [None]:
# Selenium, allows you to control a browser programmatically. Can be used to scrape data.
# Already installed Selenium in virtual environment via terminal. Only imports neccessary here.
%pip install pandas as pd
%pip install --upgrade six
%pip install selenium==3.141.0

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

import time
import pandas as pd
import re
import numpy as np

# Path for the driver saved on my windows computer.
# This is a machine-specific absolute path: every cell that starts a browser passes it to
# webdriver.Chrome(executable_path=...), so adjust it to wherever chromedriver.exe lives locally.
driver_path = r"C:\Windows\chromedriver\chromedriver.exe"

# Unlimited amount of rows displayed (printing a DataFrame shows every row instead of a truncated view).
pd.set_option('display.max_rows', None)

In [70]:
# Country Project
# Scrape name, capital, population and area for every country listed on the
# "simple" page of scrapethissite.com, then derive the population density.
driver = webdriver.Chrome(executable_path=driver_path)
try:
    # Navigate to the website
    driver.get("https://www.scrapethissite.com/")

    # Perform Actions to get to the required data or text.
    button = driver.find_element_by_css_selector("a.btn.btn-lg.btn-default")
    button.click()
    time.sleep(1)  # Wait for the content to load before moving on in this code.

    link = driver.find_element_by_css_selector("h3.page-title a[href='/pages/simple/']")
    link.click()
    time.sleep(1)

    # Find all country elements
    all_countries = driver.find_elements_by_css_selector("div.col-md-4.country")

    # Collect one plain dict per country; the DataFrame is built once afterwards,
    # which avoids re-copying the whole frame on every iteration.
    country_records = []
    for country in all_countries:
        country_name = country.find_element_by_css_selector("h3.country-name").text
        country_capital = country.find_element_by_css_selector("span.country-capital").text
        country_population = int(country.find_element_by_css_selector("span.country-population").text)
        area_text = country.find_element_by_css_selector("span.country-area").text

        # float() parses both plain decimal text and scientific notation (e.g. "1.71E7"),
        # so no separate branch is needed for the two formats.
        country_area = float(area_text)

        country_records.append({
            'Country Name': country_name,
            'Capital': country_capital,
            'Population': country_population,
            'Area (km\u00b2)': country_area,
        })
finally:
    # Close the browser even if one of the lookups above raised.
    driver.quit()

# Build the DataFrame with the columns you want (explicit columns keep the schema even with no rows)
columns = ['Country Name', 'Capital', 'Population', 'Area (km\u00b2)']
data = pd.DataFrame(country_records, columns=columns)

# A zero area would make the density division meaningless, so treat it as missing.
data['Area (km\u00b2)'] = data['Area (km\u00b2)'].replace([0, 0.0], np.nan)
data['Persons per area'] = data['Population'] / data['Area (km\u00b2)']
data = data.set_index('Country Name')
# Print the DataFrame
print(data)

                                                          Capital  Population  \
Country Name                                                                    
Andorra                                          Andorra la Vella       84000   
United Arab Emirates                                    Abu Dhabi     4975593   
Afghanistan                                                 Kabul    29121286   
Antigua and Barbuda                                    St. John's       86754   
Anguilla                                               The Valley       13254   
Albania                                                    Tirana     2986952   
Armenia                                                   Yerevan     2968000   
Angola                                                     Luanda    13068161   
Antarctica                                                   None           0   
Argentina                                            Buenos Aires    41343201   
American Samoa              

In [71]:
# Aggregate statistics over the scraped country table (`data` from the Country Project cell).
tot_pop = data['Population'].sum()
avg_pop = data['Population'].mean()
min_pop = data['Population'].min()
max_pop = data['Population'].max()

tot_area = data['Area (km\u00b2)'].sum()
avg_area = data['Area (km\u00b2)'].mean()
min_area = data['Area (km\u00b2)'].min()
max_area = data['Area (km\u00b2)'].max()

avg_pop_per_km2 = data['Persons per area'].mean()
# Calculate the non-zero minimum value for 'Persons per area': only strictly positive
# densities are considered, so rows with a population of 0 do not pin the minimum at 0.
min_pop_per_km2 = data.loc[data['Persons per area'] > 0, 'Persons per area'].min()
max_pop_per_km2 = data['Persons per area'].max()

# Create a new DataFrame to store the aggregated values.
# A "total" density is not meaningful, so that slot is left as NaN.
summary_data = pd.DataFrame({
    'Population': [tot_pop, avg_pop, min_pop, max_pop],
    'Area (km\u00b2)': [tot_area, avg_area, min_area, max_area],
    'Persons per area': [np.nan, avg_pop_per_km2, min_pop_per_km2, max_pop_per_km2]
}, index=['Total', 'Average', 'Minimum', 'Maximum'])

# Displays output floats in non-scientific notation.
# NOTE: this is a global pandas option, so it also changes how floats are
# rendered by every cell executed after this one.
pd.options.display.float_format = '{:,.2f}'.format

print(summary_data)

              Population     Area (km²)  Persons per area
Total   6,861,418,895.00 149,909,229.69               NaN
Average    27,445,675.58     602,045.10            306.79
Minimum             0.00           0.44              0.01
Maximum 1,330,044,000.00  17,100,000.00         16,905.13


In [72]:
# CMC project
# Start a Chrome session using the chromedriver at `driver_path` and maximize the
# browser window; the session is used below to collect the per-coin market links.
driver = webdriver.Chrome(executable_path=driver_path)
driver.maximize_window()
# Get the url links for the top X number of Crypto's data you want to analyze.
def get_top_X_cryptocurrencies(driver, top_x=5, wait_time=20):
    """Return the link of each of the top `top_x` coins listed on coinmarketcap.com.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session; it is navigated to https://coinmarketcap.com.
    top_x : int, default 5
        Number of top cryptocurrencies (by position in the table) to collect links for.
        Values <= 0 return an empty list without loading the page.
    wait_time : int or float, default 20
        Maximum number of seconds to wait for the currency links to be present
        before Selenium raises a timeout error.

    Returns
    -------
    list of str
        The `href` attribute of every selected anchor, in page order.
    """
    if top_x <= 0:
        return []

    # Of all anchors whose href starts with "/currencies/", keep every 4th one starting
    # at index 1. Presumably each table row contributes four such anchors and the one at
    # offset 1 is the coin's markets link (the recorded output shows ".../markets/" URLs)
    # -- TODO confirm against the live page layout.
    first_index = 1
    stride = 4
    stop_index = top_x * stride  # Adjustment needed to collect all the links necessary.

    wait = WebDriverWait(driver, wait_time)
    driver.get('https://coinmarketcap.com')
    currency_anchors = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'tbody tr td a[href^="/currencies/"]'))
    )
    selected_anchors = currency_anchors[first_index:stop_index:stride]
    return [anchor.get_attribute('href') for anchor in selected_anchors]  # Gets the href (link) for each coin.

# Collect the links, making sure the browser is closed even if the scrape fails
# (for example when the wait inside get_top_X_cryptocurrencies times out).
try:
    top_links = get_top_X_cryptocurrencies(driver)
finally:
    driver.quit()
top_links

['https://coinmarketcap.com/currencies/bitcoin/markets/',
 'https://coinmarketcap.com/currencies/ethereum/markets/',
 'https://coinmarketcap.com/currencies/tether/markets/',
 'https://coinmarketcap.com/currencies/bnb/markets/',
 'https://coinmarketcap.com/currencies/usd-coin/markets/']

In [73]:
# CMC project
# Start a fresh Chrome session (the one from the previous cell was quit) and maximize
# the window; it is used below to visit each link in `top_links`.
driver = webdriver.Chrome(executable_path=driver_path)
driver.maximize_window()
# Gathers the required data from the markets data table at the top_links.
def get_top_market_data_for_each_coin(driver, link, max_retries=3, wait_time=5):
    """Read the first row of the markets table on a coin's markets page.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session; it is navigated to `link` on every attempt.
    link : str
        URL of the coin's markets page.
    max_retries : int, default 3
        Number of attempts before giving up.
    wait_time : int or float, default 5
        Maximum number of seconds each explicit wait may take.

    Returns
    -------
    dict or None
        A dict with the keys 'Exchange', 'Pair', 'Price', 'plus_2_depth',
        'minus_2_depth', 'Volume' and 'Volume_Pct' holding the stripped cell
        texts of the first table row, or None when every attempt failed.
    """
    # Result key for each visible cell, starting at cell index 1 (cell 0 is skipped).
    field_names = ('Exchange', 'Pair', 'Price', 'plus_2_depth', 'minus_2_depth', 'Volume', 'Volume_Pct')

    for _ in range(max_retries):
        try:
            wait = WebDriverWait(driver, wait_time)

            driver.get(link)  # goes to the market page for the current coin

            time.sleep(2)  # Sleep for 2 seconds to let the page load

            table_location = (By.CSS_SELECTOR, 'table.cmc-table')
            first_row_location = (By.CSS_SELECTOR, 'table.cmc-table tbody tr:nth-of-type(1)')

            # Verify the table with all its elements are present and loaded.
            wait.until(EC.presence_of_all_elements_located(table_location))

            # Wait until the first row is visible and use the element the wait returns.
            # Looking the row up with a bare find_element before waiting could raise
            # NoSuchElementException, which the retry handling below does not catch.
            first_row_element = wait.until(EC.visibility_of_element_located(first_row_location))

            td_elements = first_row_element.find_elements_by_tag_name("td")
            # Filter out hidden elements
            visible_cells = [td for td in td_elements if td.get_attribute("style") != "display: none;"]

            # Indexing raises IndexError when the row has fewer cells than expected,
            # which triggers a retry just like a timeout does.
            return {
                name: visible_cells[position].text.strip()
                for position, name in enumerate(field_names, start=1)
            }
        except (TimeoutException, IndexError) as e:
            print(f"Error: {e}. Retrying...")
            time.sleep(2)  # Sleep for 2 seconds before retrying

    # Every attempt failed (a successful attempt returns from inside the loop).
    print(f"Failed to fetch data for {link} after {max_retries} retries.")
    return None

# One dict per coin whose top market row could be read; coins that failed are skipped.
data = []

try:
    for link in top_links:
        top_market_data = get_top_market_data_for_each_coin(driver, link)
        if top_market_data is not None:
            data.append(top_market_data)
        else:
            print(f"Skipping data for {link}")
finally:
    # Close the browser even if an unexpected error interrupts the loop.
    driver.quit()

print(data)

[{'Exchange': 'Binance', 'Pair': 'BTC/USDT', 'Price': '$29,346.81', 'plus_2_depth': '$63,760,546', 'minus_2_depth': '$32,634,164', 'Volume': '$2,220,191,421', 'Volume_Pct': '9.75%'}, {'Exchange': 'Binance', 'Pair': 'ETH/USDT', 'Price': '$1,986.19', 'plus_2_depth': '$33,595,756', 'minus_2_depth': '$14,106,693', 'Volume': '$1,425,752,684', 'Volume_Pct': '11.08%'}, {'Exchange': 'Binance', 'Pair': 'BTC/USDT', 'Price': '$1.0008', 'plus_2_depth': '$63,762,405', 'minus_2_depth': '$32,635,115', 'Volume': '$2,218,302,947', 'Volume_Pct': '5.57%'}, {'Exchange': 'Binance', 'Pair': 'BNB/USDT', 'Price': '$327.95', 'plus_2_depth': '$1,724,888', 'minus_2_depth': '$2,354,128', 'Volume': '$166,414,381', 'Volume_Pct': '20.63%'}, {'Exchange': 'Binance', 'Pair': 'USDC/USDT', 'Price': '$1.0003', 'plus_2_depth': '$23,749,570', 'minus_2_depth': '$14,216,935', 'Volume': '$358,040,471', 'Volume_Pct': '6.31%'}]


In [74]:
# Turn the scraped market rows (list of dicts in `data`) into a table indexed by trading pair.
df = pd.DataFrame(data).set_index('Pair')

# Converting strings to floats: strip the '$' and thousands separators from the money
# columns and the '%' from the percentage column using vectorized string operations.
money_columns = ['Price', 'plus_2_depth', 'minus_2_depth', 'Volume']
df[money_columns] = df[money_columns].apply(
    lambda column: column.str.replace(r'[$,]', '', regex=True).astype(float)
)
df['Volume_Pct'] = df['Volume_Pct'].str.replace('%', '', regex=False).astype(float)

volume_sum = f"{df['Volume'].sum():,.2f}"
volume_pct_avg = round(df['Volume_Pct'].mean(), 2)

# The 'Pair' index is not unique: the second 'BTC/USDT' Pair represents USDT, since that
# is the most traded pair on the most traded exchange for USDT.
print(df)
print()
print("Volume Stats:")
print("Volume Sum:", volume_sum)
print("Volume Pct Average:", volume_pct_avg)

          Exchange     Price  plus_2_depth  minus_2_depth           Volume  \
Pair                                                                         
BTC/USDT   Binance 29,346.81 63,760,546.00  32,634,164.00 2,220,191,421.00   
ETH/USDT   Binance  1,986.19 33,595,756.00  14,106,693.00 1,425,752,684.00   
BTC/USDT   Binance      1.00 63,762,405.00  32,635,115.00 2,218,302,947.00   
BNB/USDT   Binance    327.95  1,724,888.00   2,354,128.00   166,414,381.00   
USDC/USDT  Binance      1.00 23,749,570.00  14,216,935.00   358,040,471.00   

           Volume_Pct  
Pair                   
BTC/USDT         9.75  
ETH/USDT        11.08  
BTC/USDT         5.57  
BNB/USDT        20.63  
USDC/USDT        6.31  

Volume Stats:
Volume Sum: 6,388,701,904.00
Volume Pct Average: 10.67


In [75]:
# Hockey Project
# Open the site's landing page in a new Chrome session.
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.scrapethissite.com/")

# Click the large button on the landing page, then pause briefly so the next page can load.
explore_button = driver.find_element_by_css_selector("a.btn.btn-lg.btn-default")
explore_button.click() 
time.sleep(1)  

# Follow the link to the "/pages/forms/" page, where the team table lives.
link = driver.find_element_by_css_selector("h3.page-title a[href='/pages/forms/']")
link.click()
time.sleep(1)  

# Scroll to the bottom of the page, then pick the '100' option of the per_page
# dropdown so each page holds as many rows as that option allows.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
dropdown = Select(driver.find_element_by_css_selector("select.form-control.col-md-3#per_page"))
dropdown.select_by_value('100')
time.sleep(3)  # fixed pause after changing the page size, before the table is read

def process_page(driver):
    """Collect every team row from the table currently shown by `driver`.

    Returns a list with one inner list per ``tr.team`` row, laid out as
    ``[name, year, wins, losses, ot_losses, pct, gf, ga, diff]``. The name is
    kept as text, ``pct`` is converted to float and the remaining values to
    int; an empty overtime-losses cell counts as 0.
    """
    def cell_text(team_row, selector):
        # Text of the single cell inside `team_row` that matches `selector`.
        return team_row.find_element_by_css_selector(selector).text

    return [
        [
            cell_text(team_row, "td.name"),
            int(cell_text(team_row, "td.year")),
            int(cell_text(team_row, "td.wins")),
            int(cell_text(team_row, "td.losses")),
            int(cell_text(team_row, "td.ot-losses") or 0),
            float(cell_text(team_row, "td.pct")),
            int(cell_text(team_row, "td.gf")),
            int(cell_text(team_row, "td.ga")),
            int(cell_text(team_row, "td.diff")),
        ]
        for team_row in driver.find_elements_by_css_selector("table.table tbody tr.team")
    ]

try:
    # Click the "Next" button once to set the correct starting URL
    next_button = driver.find_element_by_css_selector("ul.pagination li a[aria-label='Next']")
    next_button.click()
    time.sleep(1)

    # Create empty list for all the data
    all_data = []

    # Loop until there are no more pages
    while True:
        # Process the current page
        page_data = process_page(driver)
        all_data.extend(page_data)

        # Try to click the "Next" button to go to the next page
        try:
            next_button = driver.find_element_by_css_selector("ul.pagination li a[aria-label='Next']")
            next_button.click()
            time.sleep(1)
        except NoSuchElementException:
            # No "Next" button found, so this is the last page
            break
finally:
    # Close the browser whether the scrape finished or an error interrupted it.
    driver.quit()

# Create a DataFrame with the extracted data and specified columns
columns = ['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+/-']
data = pd.DataFrame(all_data, columns=columns)
data.set_index('Team Name', inplace=True)
print(data)

                         Year  Wins  Losses  OT Losses  Win %  Goals For (GF)  \
Team Name                                                                       
Boston Bruins            1990    44      24          0   0.55             299   
Buffalo Sabres           1990    31      30          0   0.39             292   
Calgary Flames           1990    46      26          0   0.57             344   
Chicago Blackhawks       1990    49      23          0   0.61             284   
Detroit Red Wings        1990    34      38          0   0.42             273   
Edmonton Oilers          1990    37      37          0   0.46             272   
Hartford Whalers         1990    31      38          0   0.39             238   
Los Angeles Kings        1990    46      24          0   0.57             340   
Minnesota North Stars    1990    27      39          0   0.34             256   
Montreal Canadiens       1990    39      30          0   0.49             273   
New Jersey Devils        199

In [76]:
# Oscar Winners Project
# Open the site's landing page in a new Chrome session.
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.scrapethissite.com/")

# Click the large button on the landing page, then pause briefly so the next page can load.
explore_button = driver.find_element_by_css_selector("a.btn.btn-lg.btn-default")
explore_button.click() 
time.sleep(1)  

# Follow the link to the "/pages/ajax-javascript/" page, which holds the per-year film tables.
link = driver.find_element_by_css_selector("h3.page-title a[href='/pages/ajax-javascript/']")
link.click()
time.sleep(1)  

def process_page(driver, years=range(2010, 2016)):
    """Scrape the film table for each requested year, print it, and quit the driver.

    Parameters
    ----------
    driver : selenium WebDriver
        An open browser session already showing the page with the year links.
        The session is always quit before this function returns or raises.
    years : iterable of int, default range(2010, 2016)
        Years whose link is clicked and whose film rows are collected.

    Returns
    -------
    None
        The resulting table (indexed by 'Film', with the columns 'Year',
        'Nominations', 'Awards' and 'Best Picture') is printed, not returned.
    """
    # One dict per film; the DataFrame is built once at the end instead of being
    # re-copied with pd.concat for every row, which also keeps the numeric dtypes.
    film_records = []
    try:
        for year in years:
            year_link = driver.find_element_by_css_selector(f'a.year-link[id="{year}"]')
            year_link.click()

            # Wait for the loading spinner to disappear, then give the table a moment to render.
            WebDriverWait(driver, 30).until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.spinner')))
            time.sleep(2)

            # Scrape the data from the table
            table_rows = driver.find_elements_by_css_selector('tbody#table-body tr.film')
            for row in table_rows:
                film_title = row.find_element_by_css_selector('td.film-title').text
                film_nominations = int(row.find_element_by_css_selector('td.film-nominations').text)
                film_awards = int(row.find_element_by_css_selector('td.film-awards').text)
                # A flag icon in the best-picture cell marks the Best Picture winner.
                best_picture_flag = len(row.find_elements_by_css_selector('td.film-best-picture i.glyphicon-flag')) > 0

                film_records.append({
                    'Year': year,
                    'Film': film_title,
                    'Nominations': film_nominations,
                    'Awards': film_awards,
                    'Best Picture': '★' if best_picture_flag else '',
                })
    finally:
        # Release the browser even when a lookup or wait above fails.
        driver.quit()

    df = pd.DataFrame(film_records, columns=['Year', 'Film', 'Nominations', 'Awards', 'Best Picture'])
    df = df.set_index('Film')
    print(df)

# Run the Oscar scrape: process_page prints the resulting table and quits the driver itself.
process_page(driver)

                                                   Year Nominations Awards  \
Film                                                                         
The King's Speech                                  2010          12      4   
Inception                                          2010           8      4   
The Social Network                                 2010           8      3   
The Fighter                                        2010           7      2   
Toy Story 3                                        2010           5      2   
Alice in Wonderland                                2010           3      2   
Black Swan                                         2010           5      1   
In a Better World                                  2010           1      1   
The Lost Thing                                     2010           1      1   
God of Love                                        2010           1      1   
The Wolfman                                        2010         