# pipe

> Scrapes team information (rosters, staff, schedules, and news stories) from the Stanford Club Sports website and returns it as plain Python data, so it can be reused in standalone applications such as a team's own website.

In [None]:
#| default_exp pipe

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#| export
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime
import os, re, json

In [None]:
#| export
#| hide
def get_page(url: str, timeout: float = 30):
    """Fetch `url` and return its origin together with the parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A `(base_url, soup)` tuple. `base_url` is the `scheme://netloc` part
        of `url`, used by callers to turn relative links into absolute ones;
        `soup` is the response body parsed with BeautifulSoup's "html.parser".

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Keep only scheme and host so relative hrefs can be prefixed later.
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"

    # An explicit timeout keeps a stalled server from blocking the caller
    # forever; `requests` applies no timeout unless one is given.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return base_url, soup

In [None]:
#| export
def get_roster(roster_url: str):
    """Get the coaches listed on a Club Sports roster page.

    Args:
        roster_url: Address of the team's roster page.

    Returns:
        A list with one dict per coach, with the keys "title", "name" and
        "profile_url". A value is None when the matching element (or the
        link's href) is absent from the page.

    Raises:
        ValueError: If the page has no `section#sidearm-roster-coaches`.
    """
    base_url, soup = get_page(roster_url)

    coaches_section = soup.find("section", id="sidearm-roster-coaches")
    if coaches_section is None:
        raise ValueError("No coaches section found")

    coaches = []
    for li in coaches_section.select("li.sidearm-roster-coach"):
        title = li.select_one(".sidearm-roster-coach-title span")
        name = li.select_one(".sidearm-roster-coach-name p")
        link = li.select_one(".sidearm-roster-coach-link a")

        # `.get` tolerates an anchor without an href; only relative links
        # are prefixed so an absolute href is not prefixed a second time.
        href = link.get("href") if link is not None else None
        if href and not href.startswith("http"):
            href = base_url + href

        coaches.append({
            "title": title.text.strip() if title is not None else None,
            "name": name.text.strip() if name is not None else None,
            "profile_url": href or None,
        })

    # TODO: This page may get updated with players, but as of now for cycling this is not relevant.

    return coaches

Let's try out this function with the Stanford Cycling team.

In [None]:
roster_url = "https://stanfordclubsports.com/sports/cycling/roster/"
get_roster(roster_url)[0]

{'title': 'Head Coach',
 'name': 'Adrian Bennett',
 'profile_url': 'https://stanfordclubsports.com/sports/cycling/roster/coaches/adrian-bennett/255'}

In [None]:
#| export
def get_staff(staff_url: str):
    """Get and format the staff table of a Club Sports team.

    Args:
        staff_url: Address of the team's staff/coaches page.

    Returns:
        A list with one dict per table row whose class list contains
        "sidearm-coaches-coach". The first column is always stored under
        "name", with the linked page under "profile_url" (None when the cell
        has no usable link). Every other column is stored under its header
        text, lower-cased with spaces replaced by underscores. Columns
        missing from a short row are stored as None.

    Raises:
        ValueError: If the page contains no table rows at all.
    """
    base_url, soup = get_page(staff_url)
    rows = soup.select("tr")
    if not rows:
        raise ValueError("No staff rows found")

    # The first row holds the column titles; normalise them into dict keys.
    headers = [th.get_text(strip=True).lower().replace(" ", "_") for th in rows[0].find_all("th")]

    staff = []
    for row in rows[1:]:
        # Skip rows that are not staff entries.
        if "sidearm-coaches-coach" not in row.get("class", []):
            continue

        cells = row.find_all(["th", "td"])
        data = {}
        for i, header in enumerate(headers):
            if i >= len(cells):
                data[header] = None
                continue

            cell = cells[i]
            if i != 0:
                data[header] = cell.get_text(strip=True)
                continue

            # First column: the person's name, usually wrapped in a profile link.
            a = cell.find("a")
            if a is None:
                data["name"] = cell.get_text(strip=True)
                data["profile_url"] = None
                continue

            data["name"] = a.get_text(strip=True)
            href = a.get("href")
            if not href:
                # An anchor without an href points nowhere; do not report
                # the bare site origin as a profile page.
                data["profile_url"] = None
            elif href.startswith("http"):
                data["profile_url"] = href
            else:
                data["profile_url"] = base_url + href

        staff.append(data)

    return staff

Let's try this with the Stanford Cycling Team staff:

In [None]:
staff_url = "https://stanfordclubsports.com/sports/cycling/coaches"
get_staff(staff_url)[3]

{'name': 'George Wehner',
 'profile_url': 'https://stanfordclubsports.com/sports/cycling/roster/staff/george-wehner/18',
 'title': 'President',
 'email_address': 'gtwehner@stanford.edu',
 'phone': ''}

In [None]:
#| export
def get_academic_year(when: datetime = None):
    """Return the academic year containing `when`, formatted like "2024-25".

    An academic year starts in September: September through December belong
    to the year that starts in that calendar year, January through August to
    the one that started the previous calendar year.

    Args:
        when: Date to classify. Defaults to the current local date and time.

    Returns:
        The four-digit starting year, a hyphen, and the last two digits of
        the following year.
    """
    if when is None:
        when = datetime.now()

    start_year = when.year if when.month >= 9 else when.year - 1
    return f"{start_year}-{str(start_year + 1)[-2:]}"

In [None]:
get_academic_year()

'2024-25'

In [None]:
#| export
def get_schedule(schedule_url:str, season:str=None, naive=False):
    """Get and parse a club sport schedule table.

    Args:
        schedule_url: Address of the team's schedule page, without the season.
            A trailing slash is tolerated.
        season: Season segment appended to the URL, e.g. "2022-23". Defaults
            to the current academic year.
        naive: When true, every cell is stored as its text. Otherwise a cell
            containing an `<a href>` is stored as that link's address
            (made absolute), and only link-less cells are stored as text.

    Returns:
        A list with one dict per game row, keyed by the table's header texts
        lower-cased with spaces replaced by underscores. Columns missing from
        a short row are stored as None.

    Raises:
        ValueError: If the page has fewer than two table rows.
    """
    # Default to current season
    if not season:
        season = get_academic_year()

    # Strip a trailing slash so the request URL never contains "//".
    url = f"{schedule_url.rstrip('/')}/{season}?grid=true"
    base_url, soup = get_page(url)

    # Select every table row: the header row is needed too, so game rows are
    # filtered by class inside the loop rather than in the selector.
    rows = soup.select("tr")
    if len(rows) < 2:
        raise ValueError("No schedule rows found")

    # First row contains the headers
    headers = [th.get_text(strip=True).lower().replace(" ", "_") for th in rows[0].find_all("th")]

    schedule = []
    for row in rows[1:]:
        if "sidearm-schedule-game" not in row.get("class", []):
            continue  # Skip non-game rows
        cells = row.find_all("td")
        if not cells:
            continue

        row_data = {}
        for i, h in enumerate(headers):
            # Rows may have fewer cells than there are headers; this guard
            # now protects the naive path as well.
            if i >= len(cells):
                row_data[h] = None
                continue

            cell = cells[i]
            if naive:
                row_data[h] = cell.get_text().strip()
                continue

            link = cell.find("a")
            if link is not None and link.has_attr("href"):
                href = link["href"]
                if not href.startswith("http"):  # make relative URLs absolute
                    href = base_url + href
                row_data[h] = href
            else:
                row_data[h] = cell.get_text(strip=True)
        schedule.append(row_data)

    return schedule

Let's try this out on the cycling schedule for the 2022-23 season. This is the actual table from the Stanford Club Sports website:

![Stanford Cycling Season 2022-23 Schedule](assets/schedule-example.png)

Now we fetch the schedule and display the second row:

In [None]:
schedule = get_schedule("https://stanfordclubsports.com/sports/cycling/schedule", "2022-23")
schedule[1]

{'date': 'October 30, 2022 (Sunday)',
 'time': '',
 'at': 'Away',
 'opponent': 'Surf City Cyclocross',
 'location': 'Santa Cruz, CA',
 'tv': '',
 'radio': '',
 'result': '4 Riders with Top 10 finishes',
 'links': 'https://cyclo-x.com/results'}

Notice the `links` field: instead of the cell's visible text, it holds the URL that the link in the cell points to. If we just want the cell text, we can set `naive=True`.

In [None]:
schedule = get_schedule("https://stanfordclubsports.com/sports/cycling/schedule", "2022-23", naive=True)
schedule[1]["links"]

'Event Results'

In [None]:
#| hide
def get_story_component_id(home_url: str):
    """Get the story component id, which may or may not be useful in the future.

    Args:
        home_url: Address of the team's home page.

    Returns:
        The value captured from `id: '...'` inside the `params` attribute of
        the page's `<stories-component>` element.

    Raises:
        ValueError: If the element, its `params` attribute, or the id inside
            it cannot be found.
    """
    # Fetch the page here instead of relying on a `soup` variable that
    # happens to exist in the notebook's global scope.
    _, soup = get_page(home_url)

    component = soup.find("stories-component")
    params = component.get("params") if component is not None else None
    match = re.search(r"id:\s*'([^']+)'", params) if params else None
    if match is None:
        raise ValueError("No story component id found")
    return match.group(1)

In [None]:
#| export
def get_stories(home_url: str):
    """Get story blurbs from a Club Sports team home page.

    Args:
        home_url: Address of the team's home page.

    Returns:
        A list with one dict per story, with the keys "title", "date",
        "story_url", "image_url" and "teaser". The teaser is the story's
        teaser markup reduced to plain text (empty when the story has none).

    Raises:
        ValueError: If the embedded stories data cannot be located on the page.
    """
    _, soup = get_page(home_url)

    # The stories are embedded as a `var obj = {...};` assignment in a
    # <script> inside the stories container.
    container = soup.find("div", class_="s-stories__inner")
    script = container.find("script") if container is not None else None
    source = script.string if script is not None else None
    match = re.search(r"var obj\s*=\s*(\{.*\});", source, re.DOTALL) if source else None
    if match is None:
        raise ValueError("No stories data found")
    obj = json.loads(match.group(1))

    stories = []
    for s_obj in obj["data"]:
        # The teaser is an HTML fragment; keep only its text. A null teaser
        # is treated as empty rather than handed to the parser.
        teaser = BeautifulSoup(s_obj["teaser"] or "", "html.parser").get_text(strip=True)
        stories.append({
            "title": s_obj["content_title"],
            "date": s_obj["content_date"],
            "story_url": s_obj["content_url"],
            "image_url": s_obj["content_image_url"],
            "teaser": teaser,
        })

    return stories

Let's try with a few different teams, like Stanford Cycling and Women's Rugby.

In [None]:
home_url = "https://stanfordclubsports.com/sports/cycling"
get_stories(home_url)[:2]

[{'title': '2025 USA Cycling Collegiate Road National Championships',
  'date': '2025-07-05T16:01:00',
  'story_url': 'https://stanfordclubsports.com/news/2025/7/5/2025-usa-cycling-collegiate-road-national-championships.aspx',
  'image_url': 'https://stanfordclubsports.com/images/2025/7/5/nationals.jpg',
  'teaser': 'May 2-4, 2025'}]

In [None]:
home_url = "https://stanfordclubsports.com/sports/womens-rugby"
get_stories(home_url)[:2]

[{'title': "Stanford Women's Rugby Crowned CRAA Challenger 7s National Champions",
  'date': '2025-05-03T18:40:00',
  'story_url': 'https://stanfordclubsports.com/news/2025/5/3/stanford-womens-rugby-crowned-craa-challenger-7s-national-champions.aspx',
  'image_url': 'https://stanfordclubsports.com/images/2025/5/4/Natty_7s_SWR.jpeg',
  'teaser': 'May 3, 2025'},
 {'title': "Stanford Women's Rugby Advance to the Semi Finals in the CRAA 7s National Championship",
  'date': '2025-05-02T20:54:00',
  'story_url': 'https://stanfordclubsports.com/news/2025/5/2/stanford-womens-rugby-advance-to-the-semi-finals-in-the-craa-7s-national-championship.aspx',
  'image_url': 'https://stanfordclubsports.com/images/2025/4/5/0J5A4952_copy.jpg',
  'teaser': 'May 2, 2025'}]

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()