In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

import requests
import time
import random
import pandas as pd

# Requirements
- To better understand how web scraping works, check out some basic tutorials on [HTML (Hyper Text Markup Language)](https://www.youtube.com/watch?v=qz0aGYrrlhU).

# BeautifulSoup

In [2]:
url = "https://quotes.toscrape.com/"
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
soup.find_all()

In [9]:
quotes = []
for quote in soup.find_all(class_="quote"):

    quotes.append({
        "author": quote.find(class_="author").text,
        "quote": quote.find(class_="text").text,
        "url": url
    })

quotes = pd.DataFrame(quotes)

quotes

Unnamed: 0,author,quote,url
0,Albert Einstein,“The world as we have created it is a process ...,https://quotes.toscrape.com/
1,J.K. Rowling,"“It is our choices, Harry, that show what we t...",https://quotes.toscrape.com/
2,Albert Einstein,“There are only two ways to live your life. On...,https://quotes.toscrape.com/
3,Jane Austen,"“The person, be it gentleman or lady, who has ...",https://quotes.toscrape.com/
4,Marilyn Monroe,"“Imperfection is beauty, madness is genius and...",https://quotes.toscrape.com/
5,Albert Einstein,“Try not to become a man of success. Rather be...,https://quotes.toscrape.com/
6,André Gide,“It is better to be hated for what you are tha...,https://quotes.toscrape.com/
7,Thomas A. Edison,"“I have not failed. I've just found 10,000 way...",https://quotes.toscrape.com/
8,Eleanor Roosevelt,“A woman is like a tea bag; you never know how...,https://quotes.toscrape.com/
9,Steve Martin,"“A day without sunshine is like, you know, nig...",https://quotes.toscrape.com/


# Selenium

To better understand `__new__` have a look at the following links:
- Article - [`__new__` vs `__init__` Methods in Python](https://builtin.com/data-science/new-python)

- YouTube video -  [`__new__` vs `__init__` in Python](https://www.youtube.com/watch?v=-zsV0_QrfTw)

In [11]:
def sleep(start: int, stop: int):
    """
    Make the program sleep for a random number of seconds between start and stop

    Parameters:
        - start - the minimum number of seconds to sleep for
        - stop - the maximum number of seconds to sleep for
    """
    seconds = random.randint(start, stop)
    print(f"Sleeping for {seconds} seconds")
    time.sleep(seconds)

In [12]:
class WebDriver:

    def __new__(cls, download_path: str, headless: bool = True) -> webdriver.Chrome:
        """
        Initialize a web scraper for the browser

        Parameters:
            - download_path - the path where the files will be downloaded on your device
            - headless - if True, the browser will not show up when the script is runnning.
                         if False, the browser will show up when the script is running.
        """
        options = Options()
        # A headless system is a computer that operates without a monitor, 
        # graphical user interface (GUI) or peripheral devices, such as keyboard and mouse
        if headless:
            options.add_argument('--headless')

        # Setup custom download path

        options.add_experimental_option("prefs", {
            "download.default_directory": download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        })

        service = ChromeService(ChromeDriverManager().install())
        driver = webdriver.Chrome(options, service)
        return driver

In [16]:
url = "https://uwlsu.native.fm/?allEvents=1"

# Set to False so you can see what is happening step by step
driver = WebDriver(r"C:\Users\Nikolai\Documents\GitHub\tutoring\3. Scraping", False)
driver_wait = WebDriverWait(driver, 10, 1)

## Scraping

In [17]:
driver.get(url)
# Sleep in case the page needs extra time to load and not trigger bot detection
sleep(5, 10)

Sleeping for 10 seconds


In [18]:
data = []

# Loop over all UWL SU events
loop_xpath = "/html/body/div[1]/div[4]/div"

visited = []

size = len(driver.find_element(By.XPATH, loop_xpath).find_elements(By.TAG_NAME, "div"))
for index in range(size):
    
    try:
        current_element = driver.find_element(By.XPATH, loop_xpath).find_elements(By.TAG_NAME, "div")[index]
    except:
        break

    if "View" in current_element.text and current_element.text not in visited:
        visited.append(current_element.text)
    
    # Open page for more information
    pages = current_element.find_elements(By.TAG_NAME, "div")

    # Skip jank links
    try:
        pages[1].click()
    except:
        continue

    # Wait for the Tickets button to show up
    date_xpath = '/html/body/div[1]/div/div[1]/div[2]/div/div[2]/div/div[2]/div[1]/div/div[1]/p[1]'
    element_present = EC.visibility_of_element_located((By.XPATH, date_xpath))
    driver_wait.until(element_present)
    
    name_xpath = '/html/body/div[1]/div/div[1]/div[2]/div/div[2]/div/div[1]/div/div[2]/div/div[1]/h2/span/span[1]/span'
    name = driver.find_element(By.XPATH, name_xpath).text

    price_xpath = '//*[@id="__next"]/div/div[1]/div[2]/div/div[2]/div/div[1]/div/div[2]/div/div[2]/div/p[2]/b'
    amount = driver.find_element(By.XPATH, price_xpath).text
    
    date = driver.find_element(By.XPATH, date_xpath).text

    location_xpath = "/html/body/div[1]/div/div[1]/div[2]/div/div[2]/div/div[1]/div/div[2]/div/div[2]/div/p[1]"
    location = driver.find_element(By.XPATH, location_xpath).text

    event_time_xpath = "/html/body/div[1]/div/div[1]/div[2]/div/div[2]/div/div[2]/div[1]/div/div[1]/p[2]"
    event_time = driver.find_element(By.XPATH, event_time_xpath).text

    current_url = driver.current_url

    # Go back to main page
    driver.back()
    element_present = EC.visibility_of_element_located((By.XPATH, loop_xpath))
    driver_wait.until(element_present)

    uwlsu_event = {
        "date": date,
        "name": name,
        "event_time": event_time,
        "location": location,
        "amount": amount,
        "url": current_url
    }

    # Ignore duplicates
    if uwlsu_event not in data:
        data.append(uwlsu_event)

data = pd.DataFrame(data)
data.to_csv("uwlsu.csv", index=False)

In [19]:
data

Unnamed: 0,date,name,event_time,location,amount,url
0,13 March 2024 - 24 April 2024,SU Active - Table Tennis,13:00 - 14:00,"UWL Sports Centre, London",Free,https://uwlsu.native.fm/event/su-active-table-...
1,14 March 2024 - 29 April 2024,SU Active - Indoor Cricket,15:00 - 17:00,"UWL Sports Centre, London",Sold Out,https://uwlsu.native.fm/event/su-active-indoor...
2,15 March 2024 - 26 April 2024,SU ACTIVE - Salsa,17:45 - 18:45,"UWL Sports Centre, London",Sold Out,https://uwlsu.native.fm/event/su-active-salsa/...
3,12 April 2024,Nepalese New Year - 2081,17:30 - 22:45,"Freddie's, London",From £2.00,https://uwlsu.native.fm/event/nepalese-new-yea...
4,15 April 2024,Cybersecurity Guest Lecture:,11:30 - 12:00,"WK.01.010, UWL, Ealing",Free,https://uwlsu.native.fm/event/cybersecurity-gu...
5,15 April 2024,Music Management Presents:,17:00 - 22:00,"Freddie's, London",Free,https://uwlsu.native.fm/event/music-management...
6,16 April 2024,LCM Sessions,19:00 - 22:00,"Freddie's, London",Free,https://uwlsu.native.fm/event/lcm-sessions-dup...
7,17 April 2024,Freddie's Jazz Night,18:00 - 20:30,"Freddie's, London",Free,https://uwlsu.native.fm/event/freddies-jazz-ni...
8,18 April 2024,BA (Hons) Acting & Theatre,19:00 - 22:00,"Lawrence Hall, London",From £5.00,https://uwlsu.native.fm/event/ba-hons-acting-t...
9,19 April 2024,Bollywood Movie Night,17:00 - 21:00,"Freddie's, London",Sold Out,https://uwlsu.native.fm/event/bollywood-movie-...


In [21]:
def get_amount(value: str) -> float | None:
    """
    Extract the UWLSU amount as a number.

    Parameters:
        - value - the raw text from the website
    
    Output:
        - amount - If "Sold Out" then None will be returned.
                   If "Free" then 0.0 will be returned
                   If "From £" then the scraped amount will be returned. 
                   NOTE: This field indicates the CHEAPEST tickets, not the price for a single ticker.
    """
    amount = 0.0
    
    if value == "Sold Out":
        amount = None

    elif "From £" in value:
        amount = float(value.split("From £")[-1])

    return amount

In [22]:
data = data.assign(
    start_date = pd.to_datetime(data["date"].str.split(" - ").str[0], format="%d %B %Y"),
    end_date = pd.to_datetime(data["date"].str.split(" - ").apply(lambda dates: dates[1] if len(dates) > 1 else None), format="%d %B %Y"),
    start_time = data["event_time"].str.split(" - ").str[0],
    end_time = data["event_time"].str.split(" - ").str[1],
    location = data["location"].str.split(", ").str[0],
    city = data["location"].str.split(", ").str[1],
    amount = data["amount"].apply(get_amount)
).drop(["date", "event_time"], axis=1)

data

Unnamed: 0,name,location,amount,url,start_date,end_date,start_time,end_time,city
0,SU Active - Table Tennis,UWL Sports Centre,0.0,https://uwlsu.native.fm/event/su-active-table-...,2024-03-13,2024-04-24,13:00,14:00,London
1,SU Active - Indoor Cricket,UWL Sports Centre,,https://uwlsu.native.fm/event/su-active-indoor...,2024-03-14,2024-04-29,15:00,17:00,London
2,SU ACTIVE - Salsa,UWL Sports Centre,,https://uwlsu.native.fm/event/su-active-salsa/...,2024-03-15,2024-04-26,17:45,18:45,London
3,Nepalese New Year - 2081,Freddie's,2.0,https://uwlsu.native.fm/event/nepalese-new-yea...,2024-04-12,NaT,17:30,22:45,London
4,Cybersecurity Guest Lecture:,WK.01.010,0.0,https://uwlsu.native.fm/event/cybersecurity-gu...,2024-04-15,NaT,11:30,12:00,UWL
5,Music Management Presents:,Freddie's,0.0,https://uwlsu.native.fm/event/music-management...,2024-04-15,NaT,17:00,22:00,London
6,LCM Sessions,Freddie's,0.0,https://uwlsu.native.fm/event/lcm-sessions-dup...,2024-04-16,NaT,19:00,22:00,London
7,Freddie's Jazz Night,Freddie's,0.0,https://uwlsu.native.fm/event/freddies-jazz-ni...,2024-04-17,NaT,18:00,20:30,London
8,BA (Hons) Acting & Theatre,Lawrence Hall,5.0,https://uwlsu.native.fm/event/ba-hons-acting-t...,2024-04-18,NaT,19:00,22:00,London
9,Bollywood Movie Night,Freddie's,,https://uwlsu.native.fm/event/bollywood-movie-...,2024-04-19,NaT,17:00,21:00,London
