In [1]:
#Importing necessary packages and libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as py
# Requests lets us grab info from a web page
import requests

# BeautifulSoup parses and searches that info
from bs4 import BeautifulSoup

Jacob and Bernie are both athletes on the Men's Soccer Team and one of the biggest challenges faced on any team is the division of minutes across a season that each team and subsequently player plays. For this study, we will examine two perennial powerhouse programs in the Northwest Conference: Whitman College (3rd place in NWC this 2023-2024 season) and Pacific Lutheran University (1st Place in NWC this 2023-2024 season). Scraping various components of both the roster and statistics pages on their respective athletics websites, we will delve into minutes played for each player, breaking things down by position and as above and beyond components attempting to understand elements of Coaching staffs decision and process for playing players within the league, non-conference games and NCAA tournament (if applicable).  

In [2]:
#First let's turn towards scraping the Whitman Men's Soccer Website!

# Create a variable called URL
url = "https://athletics.whitman.edu/sports/mens-soccer/stats/2023"

# Use requests.get to grab the html
my_html = requests.get(url)

#Turn the html into soup using BeautifulSoup
soup = BeautifulSoup(my_html.text, 'html')

In [3]:
#Creation of dataframe for scraped content. 
column_names = {"Player":[], "Minutes": [], "Number": []}
df = pd.DataFrame(column_names)

In [4]:
#Let's now iterate through to strip player name, minutes played and the player's respective number. 
players = soup.find_all('a', class_= 'hide-on-medium-down')
minutes = soup.find_all('td', {"data-label": "MIN"}, class_='text-center')
numbers = soup.find_all('td', class_= 'hide-on-medium-down')
for player, minute, player_number in zip(players, minutes, numbers):
    player_name = player.text.strip()
    minute_count = minute.text.strip()
    number = player_number.text.strip()
    df.loc[len(df.index)] = [player_name, minute_count, number]

In [5]:
df.head()

Unnamed: 0,Player,Minutes,Number
0,"McAllister, Lucas",1112,2
1,"Gomez, Christian",1119,8
2,"Romero, Edwin",1216,9
3,"Valero, Sawyer",1068,7
4,"Burrill, Jacob",909,11


In [6]:
#The dataframe currently continues after we want it to, to combat this lets cap the dataframe at the last player in our roster!
df_wc = df[:29]

In [7]:
df_wc

Unnamed: 0,Player,Minutes,Number
0,"McAllister, Lucas",1112,2
1,"Gomez, Christian",1119,8
2,"Romero, Edwin",1216,9
3,"Valero, Sawyer",1068,7
4,"Burrill, Jacob",909,11
5,"Gonzalez, Jr., Pablo",1498,4
6,"Stonier, Zachary",882,19
7,"Perez-Cuellar, Erick",1258,26
8,"Kobayashi, Riki",580,20
9,"Taylor, Noa",261,12


In [8]:
#We see here that there are duplicate rows for certain players. This is one of the seen inconsistencies within the athletics website. 
#First we must take the parsed string column "Minutes" and convert the values to an integer type to allow us to aggregate the values. 
df_wc["Minutes"] = df_wc["Minutes"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wc["Minutes"] = df_wc["Minutes"].astype(int)


In [9]:
#We now leverage groupby to summate minutes for each player. 
df_wc_minutes = df_wc.groupby('Player', as_index=False).agg({'Minutes': 'sum'})

In [10]:
#We now merge our two dataframes to get the numbers back into our df as they were eliminated in the group by (this is a workaround solution!)
df_wc_merged = pd.merge(df_wc, df_wc_minutes, on="Player", how = 'inner')
#Now we drop the duplicate pairs that exist within the df (Alex and Pablo)
df_wc_merged = df_wc_merged.drop_duplicates(subset = 'Player')
#Lastly we eliminate another duplicate column!
df_wc_merged = df_wc_merged.drop('Minutes_x', axis = 1)
df_wc_merged

Unnamed: 0,Player,Number,Minutes_y
0,"McAllister, Lucas",2,1112
1,"Gomez, Christian",8,1119
2,"Romero, Edwin",9,1216
3,"Valero, Sawyer",7,1068
4,"Burrill, Jacob",11,909
5,"Gonzalez, Jr., Pablo",4,1579
7,"Stonier, Zachary",19,882
8,"Perez-Cuellar, Erick",26,1258
9,"Kobayashi, Riki",20,580
10,"Taylor, Noa",12,261


## Scraping Whitman Positions

It's great that we have the minutes played per player, but we also now want to look at the position of each player and ultimately merge this with our previously created df. Let's now scrape the roster instead of the statistics page! We take the number here as a unique identifier due to uniqiue naming order differences! 

In [11]:
# Create a variable called URL
url_pos = "https://athletics.whitman.edu/sports/mens-soccer/roster"

# Use requests.get to grab the html
my_html_pos = requests.get(url_pos)

#Turn the html into soup using BeautifulSoup
soup_pos = BeautifulSoup(my_html_pos.text, 'html')

In [12]:
#Creation of dataframe for scraped content. 
column_names_pos = {"Player":[], "Position": [], "Number": []}
df_players = pd.DataFrame(column_names_pos)

In [13]:
#Similar to before we now iterate through to strip player name, position and the player's respective number. 
players = soup_pos.find_all('div', class_='sidearm-roster-player-name')
player_positions = soup_pos.find_all('div', class_='sidearm-roster-player-position')
numbers = soup_pos.find_all('span', class_='sidearm-roster-player-jersey-number')
for player, player_position, player_number in zip(players, player_positions, numbers):
    player_name = player.find('h3').text.strip()
    position = player_position.find('span', class_='sidearm-roster-player-position-long-short hide-on-small-down').text.strip()
    number = player_number.text.strip()
    df_players.loc[len(df_players.index)] = [player_name, position, number]

In [14]:
#Here we drop one of our teammates as he medically red-shirted the season and thus did not play!
df_players = df_players[1:]

In [33]:
#We now merge our two dataframes on the shared identifier of number utilizing an inner join. 
df_wc_stats = pd.merge(df_wc_merged, df_players, on = 'Number', how = 'inner')
#Removing duplicate column
df_wc_stats = df_wc_stats.drop('Player_y', axis = 1)
#For later comparison we now add a School "tag" to identify each player who plays for Whitman. We will do a similar action for PLU later on!
df_wc_stats['School']='WC'
#For consistency we now rename a couple columns. 
df_wc_stats = df_wc_stats.rename(columns={'Player_x': 'Player', 'Minutes_y':'Minutes'})
df_wc_stats

Unnamed: 0,Player,Number,Minutes,Position,School
0,"McAllister, Lucas",2,1112,Midfield/Forward,WC
1,"Gomez, Christian",8,1119,Midfielder,WC
2,"Romero, Edwin",9,1216,Forward,WC
3,"Valero, Sawyer",7,1068,Forward,WC
4,"Burrill, Jacob",11,909,Forward,WC
5,"Gonzalez, Jr., Pablo",4,1579,Defender,WC
6,"Stonier, Zachary",19,882,Forward,WC
7,"Perez-Cuellar, Erick",26,1258,Midfield,WC
8,"Kobayashi, Riki",20,580,Midfielder/Forward,WC
9,"Taylor, Noa",12,261,Forward,WC


With our scraping for Whitman now done we turn towards arming ourselves with information from PLU!

## Scraping for PLU soccer statistics

This time things are a little bit more streamlined as some of the inconsistencies of the Whitman Athletics Statistics page aren't here! 

In [16]:
# Create a variable called URL
url_plu = "https://golutes.com/sports/mens-soccer/stats/2023"

# Use requests.get to grab the html
my_html_plu = requests.get(url_plu)

#Turn the html into soup using BeautifulSoup
soup_plu = BeautifulSoup(my_html_plu.text, 'html')

In [17]:
#Creation of dataframe for scraped content. 
column_names_plu = {"Player":[], "Minutes": [], "Number": []}
df_plu = pd.DataFrame(column_names_plu)

In [18]:
#Let's now iterate through to strip player name, minutes played and the player's respective number. 
players = soup_plu.find_all('a', class_= 'hide-on-medium-down')
minutes = soup_plu.find_all('td', {"data-label": "MIN"}, class_='text-center')
numbers = soup_plu.find_all('td', class_= 'hide-on-medium-down')
for player, minute, player_number in zip(players, minutes, numbers):
    player_name = player.text.strip()
    minute_count = minute.text.strip()
    number = player_number.text.strip()
    df_plu.loc[len(df_plu.index)] = [player_name, minute_count, number]

In [21]:
#The dataframe currently continues after we want it to, to combat this lets cap the dataframe at the last player in our roster!
df_plu = df_plu[:32]
df_plu

Unnamed: 0,Player,Minutes,Number
0,"Johnson, Craig",1503,9
1,"Thompson, Trevor",1624,16
2,"Guyer, Robby",1457,42
3,"Bliskis, Owen",372,27
4,"Tafolla, Sammy",1498,10
5,"Helle, Dane",1097,7
6,"O'Brien, Connor",476,12
7,"Kamau, Alvin",1286,99
8,"Kelly, Brandt",443,20
9,"Ross, Jordan",502,15


## Scraping PLU Player Positions

Similar to with Whitman, it's great that we have the minutes played per player, but we also now want to look at the position of each player and ultimately merge this with our previously created df. Let's now scrape the roster instead of the statistics page! We take the number here as a unique identifier due to uniqiue naming order differences! 

In [22]:
# Create a variable called URL
url_plu_pos = "https://golutes.com/sports/mens-soccer/roster"

# Use requests.get to grab the html
my_html_plu_pos = requests.get(url_plu_pos)

#Turn the html into soup using BeautifulSoup
soup_plu_pos = BeautifulSoup(my_html_plu_pos.text, 'html')

In [24]:
#Creation of dataframe for scraped content. 
column_names_plu_pos = {"Player":[], "Position": [], "Number": []}
df_players_plu = pd.DataFrame(column_names_plu_pos)

In [25]:
#Similar to before we now iterate through to strip player name, position and the player's respective number. 
players = soup_plu_pos.find_all('div', class_='sidearm-roster-player-name')
player_positions = soup_plu_pos.find_all('div', class_='sidearm-roster-player-position')
numbers = soup_plu_pos.find_all('span', class_='sidearm-roster-player-jersey-number')
for player, player_position, player_number in zip(players, player_positions, numbers):
    player_name = player.find('h3').text.strip()
    position = player_position.find('span', class_='sidearm-roster-player-position-long-short hide-on-small-down').text.strip()
    number = player_number.text.strip()
    df_players_plu.loc[len(df_players_plu.index)] = [player_name, position, number]

In [26]:
df_players_plu

Unnamed: 0,Player,Position,Number
0,Garrett Sevison,Goalkeeper,0
1,Nicholas Gaston,Goalkeeper,1
2,Elijah Singleton,Midfield/Defense,2
3,Alex White,D,3
4,Alex Coope,M/F,4
5,Sam Erickson,Midfield,5
6,Mattias Anderson,Defense,6
7,Dane Helle,M,7
8,Isaiah Baer,Midfield,8
9,Craig Johnson,Midfield,9


In [34]:
#Merging on the minutes removes any players without statistics. This is different than before where we manually removed a player from our team. 
df_plu_stats = pd.merge(df_plu, df_players_plu, on='Number', how='inner')
#Removal of additional duplicate column
df_plu_stats = df_plu_stats.drop('Player_y', axis = 1)
#Adding our School 'tag' for later comparison. 
df_plu_stats['School']='PLU'
#Renaming column for later comparison and joining of dataframes!
df_plu_stats = df_plu_stats.rename(columns={'Player_x': 'Player'})
df_plu_stats

Unnamed: 0,Player,Minutes,Number,Position,School
0,"Johnson, Craig",1503,9,Midfield,PLU
1,"Thompson, Trevor",1624,16,Forward,PLU
2,"Guyer, Robby",1457,42,F,PLU
3,"Bliskis, Owen",372,27,F,PLU
4,"Tafolla, Sammy",1498,10,Midfield,PLU
5,"Helle, Dane",1097,7,M,PLU
6,"O'Brien, Connor",476,12,M/F,PLU
7,"Kamau, Alvin",1286,99,D,PLU
8,"Kelly, Brandt",443,20,Forward,PLU
9,"Ross, Jordan",502,15,Defense,PLU


We are now armed with information from both schools and can begin to do some analysis!

In [None]:
#Function lookup table and .apply and reapply 
