## beautiful soup

In [80]:
from bs4 import BeautifulSoup, NavigableString
import requests
import pandas as pd

In [4]:
url = 'https://www.tfrrs.org/results/xc/25186/Panorama_Farms_Invitational'

In [6]:
page = requests.get(url) #send a get request to the url, will return a response object

In [None]:
#204, 400, 404 are all bad request responses

In [8]:
soup = BeautifulSoup(page.text, 'html') #page is sending the request and .text is retrieving the raw HTML that we're going to use

In [11]:
print(soup.prettify()) #adds indentations - makes it look a little better

<!DOCTYPE html>
<html class="no-js js-menubar" lang="en">
 <head>
  <!-- Google tag (gtag.js) -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=G-0Z9Z2N6ZD0">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-0Z9Z2N6ZD0');
  </script>
  <!-- Google Analytics -->
  <script>
   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');

ga('create', 'UA-66287-6', 'auto');  
ga('send', 'pageview');
  </script>
  <!-- End Google Analytics -->
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0, user-scalable=0, minimal-ui" n

## find and find_all

In [28]:
table = soup.find_all('table')[1]

In [51]:
world_titles = table.find_all('th')

In [52]:
world_titles #this is a list, each col name is within the <th> tag

[<th data-tablesaw-priority="persist" scope="col">PL
         </th>,
 <th data-tablesaw-priority="persist" scope="col">NAME
         </th>,
 <th data-tablesaw-priority="2" scope="col">YEAR
         </th>,
 <th data-tablesaw-priority="persist" scope="col">TEAM
         </th>,
 <th data-tablesaw-priority="3" scope="col">Avg. Mile
         </th>,
 <th data-tablesaw-priority="1" scope="col">TIME
         </th>,
 <th data-tablesaw-priority="persist" scope="col">SCORE
         </th>]

In [53]:
world_table_titles = [title.text.strip() for title in world_titles]
print(world_table_titles)

['PL', 'NAME', 'YEAR', 'TEAM', 'Avg. Mile', 'TIME', 'SCORE']


### put it into a pandas dataframe

In [128]:
df = pd.DataFrame(columns = world_table_titles)
df

Unnamed: 0,PL,NAME,YEAR,TEAM,Avg. Mile,TIME,SCORE


### trying to add course name/year to dataframe:

In [138]:
df = pd.DataFrame(columns = world_table_titles + ['COURSE','YEAR'])
df

Unnamed: 0,PL,NAME,YEAR,TEAM,Avg. Mile,TIME,SCORE,COURSE,YEAR.1


### keep going

In [130]:
#actual rows of results (names with times and such)
column_data = table.find_all('tr')

In [131]:
#name of course
course_div = soup.find('div', class_ = 'panel-heading xc-heading')
course_name = course_div.find('h3').text.strip()

In [132]:
#year 
year_div = soup.find('div', class_ = 'panel-heading-normal-text inline-block')
year = year_div.text.strip()

In [140]:
#looping through column_data (stuff with <tr> tag) to find <td> tag
num_columns = len(df.columns)
for row in column_data[1:]: 
    if hasattr(row, 'find_all'):
        row_data = row.find_all('td') #individual (row) data
        individual_row_data = [data.text.strip() for data in row_data] #getting and stripping text for each row
        
        individual_row_data.append(course_name)
        individual_row_data.append(year)
            
        #current data frame:
        length = len(df)
        df.loc[length] = individual_row_data #appending each row of the information into the next position

In [141]:
df

Unnamed: 0,PL,NAME,YEAR,TEAM,Avg. Mile,TIME,SCORE,COURSE,YEAR.1
0,1,Jenny Schilling,JR-3,Virginia,05:24.4,20:09.7,1,Panorama Farms Invitational,"October 19, 2024"
1,2,Sophie Atkinson,SR-4,Virginia,05:29.3,20:27.9,2,Panorama Farms Invitational,"October 19, 2024"
2,3,Jette Beermann,?,Unattached,05:29.5,20:28.7,,Panorama Farms Invitational,"October 19, 2024"
3,4,Tatum David,SO-2,Virginia,05:32.1,20:38.2,3,Panorama Farms Invitational,"October 19, 2024"
4,5,Camryn Menninger,SR-4,Virginia,05:33.8,20:44.5,4,Panorama Farms Invitational,"October 19, 2024"
...,...,...,...,...,...,...,...,...,...
134,0,Charlotte Hudson,FR-1,Richmond,,DNF,,Panorama Farms Invitational,"October 19, 2024"
135,0,Iris Downes,SO-2,Duke,,DNF,,Panorama Farms Invitational,"October 19, 2024"
136,0,Katelyn Porter,SO-2,Coastal Carolina,,DNF,,Panorama Farms Invitational,"October 19, 2024"
137,0,Star Price,JR-3,Virginia Tech,,DNF,,Panorama Farms Invitational,"October 19, 2024"


## making it a function

In [148]:
def get_results(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html')
    
    table = soup.find_all('table')[1]
    
    world_titles = table.find_all('th')
    world_table_titles = [title.text.strip() for title in world_titles]
    
    df = pd.DataFrame(columns = world_table_titles + ['COURSE', 'YEAR'])
    
    #course_div = soup.find('div', class_ = 'panel-heading xc-heading')
    #course_name = course_div.fine('h3').text.strip()
    
    #year_div = soup.find('div', class_ = 'panel-heading-normal-text inline block')
    #year = year_div.text.strip()
    
    column_data = table.find_all('tr')
    
    for row in column_data[1:]:
        if hasattr(row, 'find_all'):
            row_data = row.find_all('td')
            individual_row_data = [data.text.strip() for data in row_data]
            
            individual_row_data.append(course_name)
            individual_row_data.append(year)
            
            length = len(df)
            df.loc[length] = individual_row_data
            
    return df

In [149]:
get_results("https://www.tfrrs.org/results/xc/23362/2024_Pirate_Cross_Country_Invitational_")

Unnamed: 0,PL,NAME,YEAR,TEAM,Avg. Mile,TIME,SCORE,COURSE,YEAR.1
0,1,Peninah Mutisya,FR-1,Hampton,05:36.4,20:54.4,,Panorama Farms Invitational,"October 19, 2024"
1,2,Kyra Holland,SR-4,William & Mary,05:39.8,21:07.1,1,Panorama Farms Invitational,"October 19, 2024"
2,3,Sofia Istnick,JR-3,William & Mary,05:47.5,21:35.8,2,Panorama Farms Invitational,"October 19, 2024"
3,4,Arianna DeBoer,SR-4,William & Mary,05:48.5,21:39.3,3,Panorama Farms Invitational,"October 19, 2024"
4,5,Sarah McCartney,SR-4,Charleston Southern,05:49.3,21:42.4,4,Panorama Farms Invitational,"October 19, 2024"
...,...,...,...,...,...,...,...,...,...
204,205,Abigail Snodgrass,SO-2,George Mason,07:25.8,27:42.2,,Panorama Farms Invitational,"October 19, 2024"
205,206,Harrison Grooms,SR-4,Davidson,07:26.4,27:44.4,,Panorama Farms Invitational,"October 19, 2024"
206,207,Winter Oaster,FR-1,Elon,07:29.6,27:56.2,,Panorama Farms Invitational,"October 19, 2024"
207,208,Caitlin Peck,JR-3,UNCW,07:49.0,29:08.9,138,Panorama Farms Invitational,"October 19, 2024"
