In [4]:
# Standard imports
import numpy as np
import pandas as pd

# For web scraping
import requests
from bs4 import BeautifulSoup

# For performing regex operations
import re

# For adding delays so that we don't spam requests
import time
import joblib

In [2]:
%pip install html5lib

Note: you may need to restart the kernel to use updated packages.Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
   ---------------------------------------- 0.0/112.2 kB ? eta -:--:--
   ---------- ----------------------------- 30.7/112.2 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 112.2/112.2 kB 1.6 MB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1





# Table of Contents

[UCLA](#ucla)

[Grand Canyon](#gc)

[Long Beach State](#lbs)

[Penn State](#penn)

[UCI](#uci)

[Hawaii](#uh)

[BYU](#byu)

[Stanford](#stan)

[Pepperdine](#pep)

[Loyola Chicago](#loy)

[Saint Francis (PA)](#sfu)

[Ball St.](#ball)

[George Mason](#gm)

[Lindenwood](#lw)

[USC](#usc)

[Purdue Fort Wayne](#pfw)

[Lewis University](#lu)

[Daemen](#dae)

[Belmont abbey college](#bac)

[McKendree](#mc)

[Lincoln Memorial](#lm)

[CSUN](#csun)

[UC San Diego](#ucsd)

[Princeton](#prin)

[UC Santa Barbara](#ucsb)

[North Greenville](#ng)

[LIU](#liu)

[Harvard](#har)

[NJIT](#njit)

[Other schools not included in final dataset](#other)



## Extracting all the roster data from each schools roster using different functions.

### Each function uses soup.findall for html element tags or classes to extract the name, height, and position from each schools mens volleyball roster. I ran the function for each year in each roster starting between 2020 and 2022. I then combined them all into a single dataframe, added a school column and the name of the school they played for and dropped duplicate names. At the end I combined all of the schools together into one data frame.

# UCLA <a class="anchor" id ="ucla"></a>

In [74]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <span> tags with class "s-person-details__bio-stats-item"
    bio_stats_items = soup.find_all('span', class_='s-person-details__bio-stats-item')

    # Loop through each <span> tag to extract height and position
    for item in bio_stats_items:
        # Extract height (assuming it's the first item found)
        if 'Height' in item.text:
            height_text = item.text.strip()  # Get the full text, e.g., "Height 5' 10'' "
            
            # Extract the height value directly without splitting by spaces
            height = height_text.split('Height ')[1].strip()  # Remove "Height" and any leading/trailing spaces
            
            # Append directly to heights list
            heights.append(height)

        # Extract position (assuming it's the second item found)
        if 'Position' in item.text:
            position = item.text.strip().split(' ')[-1]  # Extract last part, e.g., 'L'
            positions.append(position)

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names
    for tag in name_tags:
        if tag.parent.name == 'a':  # Ensure it's within an <a> tag
            name = tag.text.strip()
            names.append(name)

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [98]:
result_la2020 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2020')

In [99]:
result_la2020 #checking to see if it worked

Unnamed: 0,Name,Height,Position
0,Kyle Vom Steeg,6' 7'',Opp
1,J.R. Norris IV,6' 5'',Opp
2,Cole Ketrzynski,6' 8'',OH/Opp
3,Cole Pender,6' 2'',L/OH
4,Sam Kobrine,6' 3'',OH/S
5,Marcus Partain,6' 2'',S
6,Ian Parish,6' 9'',MB
7,Kevin Kobrine,6' 5'',Opp/OH
8,Grant Maleski,6' 9'',MB
9,Cole Johnson,6' 7'',OH


In [100]:
result_la2019 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2019')

In [101]:
result_la2018 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2018')

In [102]:
result_la2017 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2017')

In [103]:
result_la2016 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2016')

In [104]:
result_la2015 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2015')

In [105]:
result_la2014 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2014')

In [106]:
result_la2013 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2013')

In [107]:
result_la2010 = scrape_player_data('https://uclabruins.com/sports/mens-volleyball/roster/2010')

### Combining all the DataFrames together

In [108]:
ucla = pd.concat([result_la2020, result_la2019,result_la2018,result_la2017,result_la2016,result_la2015,result_la2014,result_la2013,result_la2010], axis=0)

In [176]:
ucla= ucla.drop_duplicates() #getting rid of repeat roster entries

In [182]:
ucla['School'] = 'UCLA' #adding UCLA to newly created column School

In [183]:
ucla

Unnamed: 0,Name,Height,Position,School
0,Kyle Vom Steeg,6' 7'',Opp,UCLA
1,J.R. Norris IV,6' 5'',Opp,UCLA
2,Cole Ketrzynski,6' 8'',OH/Opp,UCLA
3,Cole Pender,6' 2'',L/OH,UCLA
4,Sam Kobrine,6' 3'',OH/S,UCLA
...,...,...,...,...
13,Kevin Ker,6' 2'',S,UCLA
14,Nick Vogel,6' 9'',QH/Opp,UCLA
15,Jeremy Casebeer,6' 4'',OH,UCLA
16,Alex Scattareggia,6' 5'',S,UCLA


In [184]:
ucla.to_csv('UCLA_Roster') # converting to a csv file

In [185]:
ucla.info()

<class 'pandas.core.frame.DataFrame'>
Index: 111 entries, 0 to 18
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      111 non-null    object
 1   Height    111 non-null    object
 2   Position  111 non-null    object
 3   School    111 non-null    object
dtypes: object(4)
memory usage: 4.3+ KB


# Grand Canyon <a class="anchor" id ="gc"></a>

In [157]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()  # Get the text inside the <span> tag
        heights.append(height)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()  # Get the text inside the <span> tag
        positions.append(position)

    # Find all <a> tags with href containing "/roster/"
    name_tags = soup.find_all('a', href=lambda href: href and '/roster/' in href)

    # Extract names
    for tag in name_tags:
        name = tag.text.strip()  # Get the text inside the <a> tag
        
        # Filter out rows with blank names, "Print", and "Full Bio"
        if name and name != "Print" and name != "Full Bio":
            names.append(name)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df


In [158]:
result_gc2020 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2020')

In [159]:
result_gc2020 #checking to see if it worked

Unnamed: 0,Name,Height,Position
0,Chibuike Obi,"6'4""",Middle Blocker
1,Trevor Weary,"6'7""",MB
2,Camden Gianni,"6'5""",Outside Hitter
3,Avery Enriques,"5'10""",OH
4,Jack Burton,"6'4""",Outside Hitter
5,Heath Hughes,"6'7""",OH
6,Onur Cukur,"6'8""",Libero
7,Ian McLain,"6'7""",L
8,Christian Janke,"6'5""",Outside Hitter
9,Grayson Browning,"6'5""",OH


In [160]:
result_gc2020

Unnamed: 0,Name,Height,Position
0,Chibuike Obi,"6'4""",Middle Blocker
1,Trevor Weary,"6'7""",MB
2,Camden Gianni,"6'5""",Outside Hitter
3,Avery Enriques,"5'10""",OH
4,Jack Burton,"6'4""",Outside Hitter
5,Heath Hughes,"6'7""",OH
6,Onur Cukur,"6'8""",Libero
7,Ian McLain,"6'7""",L
8,Christian Janke,"6'5""",Outside Hitter
9,Grayson Browning,"6'5""",OH


In [161]:
result_gc2019 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2019')

In [162]:
result_gc2018 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2018')

In [163]:
result_gc2017 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2017')

In [164]:
result_gc2016 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2016')

In [165]:
result_gc2015 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2015')

In [166]:
result_gc2014 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2014')

In [167]:
result_gc2013 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2013')

In [168]:
result_gc2012 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2012')

In [169]:
result_gc2011 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2011')

In [170]:
result_gc2010 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2010')

In [171]:
result_gc2009 = scrape_player_data('https://gculopes.com/sports/mens-volleyball/roster/2009')

In [172]:
gcu =  pd.concat([result_gc2020, result_gc2019,result_gc2018,result_gc2017,result_gc2016,result_gc2015,result_gc2014,result_gc2013,result_gc2012,result_gc2011,result_gc2010,result_gc2009], axis=0)

In [174]:
gcu = gcu.drop_duplicates() #getting rid of repeat roster entries

In [187]:
gcu['School'] = 'GCU'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcu['School'] = 'GCU'


In [188]:
gcu

Unnamed: 0,Name,Height,Position,School
0,Chibuike Obi,"6'4""",Middle Blocker,GCU
1,Trevor Weary,"6'7""",MB,GCU
2,Camden Gianni,"6'5""",Outside Hitter,GCU
3,Avery Enriques,"5'10""",OH,GCU
4,Jack Burton,"6'4""",Outside Hitter,GCU
...,...,...,...,...
1,Trent Bruns,"6'0""",OH,GCU
0,Chris Bradshaw,"6'3""",Setter,GCU
1,Robbie Echols,"5'6""",S,GCU
2,Kyle Norton,"6'3""",Outside Hitter,GCU


In [189]:
gcu.to_csv('GCU_Roster')

# Long Beach State <a class="anchor" id ="lbs"></a>

In [243]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <a> tags with class "hover:underline focus:underline"
    player_tags = soup.find_all('a', class_='hover:underline focus:underline')

    # Extract names from <h3> tags within <a> tags
    for tag in player_tags:
        name_tag = tag.find('h3')  # Find <h3> tag within <a> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <h3> tag
            names.append(name)

    # Find all <span> tags with class "s-person-details__bio-stats-item"
    bio_stats_items = soup.find_all('span', class_='s-person-details__bio-stats-item')

    # Extract heights and positions
    for item in bio_stats_items:
        # Extract height
        height_tag = item.find('span', class_='sr-only')
        if height_tag and 'Height' in height_tag.text:
            height = item.text.split('Height')[1].strip()
            heights.append(height)
        
        # Extract position
        if 'Position' in item.text:
            position = item.text.split('Position')[1].strip()
            positions.append(position)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [244]:
result_lbs2020 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2020')

In [245]:
result_lbs2020

Unnamed: 0,Name,Height,Position
0,Shane Holdaway,6' 8'',Middle Blocker
1,Mason Briggs,6' 0'',Libero
2,Dawson Fugate,6' 5'',Outside Hitter
3,Aidan Knipe,6' 3'',Setter
4,Grant Guinasso,6' 0'',DS/Libero
5,Gary Adams,6' 5'',Middle Blocker
6,Nathan Harlan,6' 4'',Opposite/Outside Hitter
7,Simon Andersen,6' 8'',Middle Blocker
8,Marc Moody,6' 8'',Middle Blocker
9,Ryan Poole,6' 5'',Outside Hitter


In [247]:
result_lbs2019 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2019')

In [248]:
result_lbs2018 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2018')

In [249]:
result_lbs2017 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2017')

In [250]:
result_lbs2016 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2016')

In [251]:
result_lbs2015 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2015')

In [252]:
result_lbs2014 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2014')

In [253]:
result_lbs2013 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2013')

In [254]:
result_lbs2012 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2012')

In [255]:
result_lbs2011 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2011')

In [256]:
result_lbs2010 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2010')

In [257]:
result_lbs2009 = scrape_player_data('https://longbeachstate.com/sports/mens-volleyball/roster/2009')

In [282]:
lbs =  pd.concat([result_lbs2020, result_lbs2019,result_lbs2018,result_lbs2017,result_lbs2016,result_lbs2015,result_lbs2014,result_lbs2013,result_lbs2012,result_lbs2011,result_lbs2010,result_lbs2009], axis=0)

In [283]:
lbs= lbs.drop_duplicates() #getting rid of repeat roster entries

In [284]:
lbs['School'] = 'LBS'

In [285]:
lbs

Unnamed: 0,Name,Height,Position,School
0,Shane Holdaway,6' 8'',Middle Blocker,LBS
1,Mason Briggs,6' 0'',Libero,LBS
2,Dawson Fugate,6' 5'',Outside Hitter,LBS
3,Aidan Knipe,6' 3'',Setter,LBS
4,Grant Guinasso,6' 0'',DS/Libero,LBS
...,...,...,...,...
12,Conor Eaton,6' 7'',S,LBS
13,Dean Bittner,6' 4'',OPP,LBS
14,Brad Hemmerling,6' 7'',OH,LBS
15,Dan Alexander,6' 6'',MB,LBS


In [286]:
lbs.to_csv('LBS_Roster')

# Penn State <a class="anchor" id ="penn"></a>

In [267]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <td> tags with class "sidearm-table-player-name"
    name_tags = soup.find_all('td', class_='sidearm-table-player-name')

    # Extract names
    for tag in name_tags:
        # Extract name from the <a> tag within <td>
        name_tag = tag.find('a')
        if name_tag:
            name = name_tag.text.strip()
            names.append(name)

    # Find all <td> tags with class "rp_position_short"
    position_tags = soup.find_all('td', class_='rp_position_short')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <td> tags with class "height"
    height_tags = soup.find_all('td', class_='height')

    # Extract heights
    for tag in height_tags:
        # Extract height from the <td> tag
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df


In [268]:
result_psu2020 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2020')

In [269]:
result_psu2020

Unnamed: 0,Name,Height,Position
0,Declan Pierce,5-10,L
1,Tim Herget,6-2,OH
2,Jack Shampine,6-5,OH
3,Luke Braswell,6-4,S
4,Nathan Smith,6-5,S
5,Cole Bogner,6-3,S
6,Bobby Wilden,6-8,OH
7,Jason Donorovich,6-9,MB
8,John Kerr,6-6,OH
9,Gabe Hartke,6-4,OH


In [270]:
result_psu2019 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2019')

In [339]:
result_psu2019

Unnamed: 0,Name,Height,Position
0,Declan Pierce,5-10,L
1,Jack Goedken,6-4,OH
2,Frank Melvin,6-5,OPP
3,Luke Braswell,6-4,S
4,Nathan Smith,6-5,S
5,Cole Bogner,6-3,S
6,Bobby Wilden,6-8,OH
7,Jason Donorovich,6-9,MB
8,Lee Smith,6-6,OH
9,Henry Payne,6-4,OH


In [271]:
result_psu2018 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2018')

In [340]:
result_psu2018

Unnamed: 0,Name,Height,Position
0,Declan Pierce,5-10,L
1,Jack Goedken,6-4,OH
2,Frank Melvin,6-5,OH
3,Luke Braswell,6-4,S
4,Nathan Smith,6-5,S
5,Royce Clemens,6-1,L
6,Bobby Wilden,6-8,OH
7,Jason Donorovich,6-9,MB
8,Lee Smith,6-6,OH
9,Henry Payne,6-4,OH


In [272]:
result_psu2017 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2017')

In [273]:
result_psu2016 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2016')

In [274]:
result_psu2015 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2015')

In [275]:
result_psu2014 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2014')

In [276]:
result_psu2013 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2013')

In [277]:
result_psu2012 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2012')

In [278]:
result_psu2011 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2011')

In [279]:
result_psu2010 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2010')

In [280]:
result_psu2009 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2009')

In [281]:
result_psu2008 = scrape_player_data('https://gopsusports.com/sports/mens-volleyball/roster/2008')

In [287]:
psu =  pd.concat([result_psu2020, result_psu2019,result_psu2018,result_psu2017,result_psu2016,result_psu2015,result_psu2014,result_psu2013,result_psu2012,result_psu2011,result_psu2010,result_psu2009,result_psu2008], axis=0)

In [289]:
psu = psu.drop_duplicates() #getting rid of repeat roster entries

In [290]:
psu['School'] = 'PSU'

In [291]:
psu

Unnamed: 0,Name,Height,Position,School
0,Declan Pierce,5-10,L,PSU
1,Tim Herget,6-2,OH,PSU
2,Jack Shampine,6-5,OH,PSU
3,Luke Braswell,6-4,S,PSU
4,Nathan Smith,6-5,S,PSU
...,...,...,...,...
6,Edgardo Gonzalez,6-5,S,PSU
10,Will Price,6-5,OPP,PSU
12,Ryan Sweitzer,6-5,OPP,PSU
13,Matt Anderson,6-10,OH,PSU


In [292]:
psu.to_csv('PSU_Roster')

# UCI <a class="anchor" id ="uci"></a>

In [365]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Extract names
    name_tags = soup.find_all('a', attrs={'aria-label': lambda value: value and value.endswith(' - View Full Bio')})
    for tag in name_tags:
        name = tag.text.strip()
        # Check if 'Full Bio' is present in the name (to avoid including it)
        if 'Full Bio' in name:
            continue
        # Check if name is not empty or just whitespace
        if name and not name.isspace():
            names.append(name)

    # Extract positions
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short')
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Extract heights
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [373]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    names = []
    heights = []
    positions = []

    # Extract names
    name_tags = soup.find_all('td', class_='sidearm-table-player-name')
    for tag in name_tags:
        # Extract name from <a> tag within <td>
        name_tag = tag.find('a')
        if name_tag:
            name = name_tag.text.strip()
            names.append(name)

    # Extract heights
    height_tags = soup.find_all('td', class_='height')
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Extract positions
    position_tags = soup.find_all('td', class_='rp_position_short')
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [352]:
result_uci2020 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2020')

In [353]:
result_uci2020

Unnamed: 0,Name,Height,Position
0,Davis Lau,"5'7""",Libero
1,Akhil Tangutur,"6'2""",L
2,Joel Schneidmiller,"6'6""",Outside Hitter
3,Patrick Vorenkamp,"6'6""",OH
4,Jonny Bowles,"6'7""",Outside Hitter
5,Brian Garcia,"6'5""",OH
6,AJ Faille,"6'0""",Setter
7,Doug Dahm,"6'3""",S
8,Garo Barsemian,"6'6""",Opposite/Outside
9,Alexandre Nsakanda,"6'4""",OPP/OH


In [374]:
result_uci2019 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2019?view=2')

In [375]:
result_uci2019

Unnamed: 0,Name,Height,Position
0,Zevan Williams,6-3,MB
1,Dante Chakravorti,6-4,S
2,Joel Schneidmiller,6-6,OH
3,Patrick Vorenkamp,6-6,MB
4,Jonny Bowles,6-7,OPP
5,Brian Garcia,6-5,S
6,Aaron Koubi,6-5,OH
7,AJ Faille,6-0,L
8,Owen Wahlgren,6-8,MB
9,David Parker,6-1,L


In [376]:
result_uci2018 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2018?view=2')

In [377]:
result_uci2018

Unnamed: 0,Name,Height,Position
0,Zevan Williams,6-3,MB
1,Dante Chakravorti,6-4,Setter
2,Joel Schneidmiller,6-6,Outside Hitter
3,Ben Sebastian,6-7,Middle Blocker
4,Tucker Pikula,6-1,Outside Hitter
5,Sean Farmer,6-4,Setter
6,Aaron Koubi,6-5,Outside Hitter
7,Logan Zotovich,6-2,Setter
8,Grant Friedman,6-5,Outside Hitter
9,Reid Dominguez,6-5,Outside Hitter


In [378]:
result_uci2017 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2017?view=2')

In [379]:
result_uci2016 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2016?view=2')

In [389]:
result_uci2016

Unnamed: 0,Name,Height,Position
0,Andrew Benz,6-5,Middle Blocker
1,Dante Chakravorti,0-0,Setter
2,Tamir Hershko,6-6,Opposite
3,Jason Agopian,6-7,Middle Blocker
4,Michael Saeta,6-5,Opposite
5,Tucker Pikula,6-1,Outside Hitter
6,Aaron Koubi,6-5,Outside Hitter
7,Logan Zotovich,6-2,Setter
8,Phillip Friedman,6-4,Outside Hitter
9,Reid Dominguez,6-5,Outside Hitter


In [380]:
result_uci2015 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2015?view=2')

In [381]:
result_uci2015

Unnamed: 0,Name,Height,Position
0,Andrew Benz,6-5,Middle Blocker
1,Roberto Frazzoni,6-2,Setter
2,Tamir Hershko,6-6,Opposite
3,Travis Woloson,6-4,Outside Hitter
4,Michael Saeta,6-5,Opposite
5,Tucker Pikula,6-1,Outside Hitter
6,Aaron Koubi,6-5,Outside Hitter
7,Michael Brinkley,5-10,Libero
8,Phillip Friedman,6-4,Outside Hitter
9,Reid Dominguez,6-5,Outside Hitter


In [382]:
result_uci2014 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2014?view=2')

In [383]:
result_uci2013 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2013?view=2')

In [384]:
result_uci2012 = scrape_player_data('https://ucirvinesports.com/sports/mens-volleyball/roster/2012?view=2')

In [386]:
uci =  pd.concat([result_uci2020, result_uci2019,result_uci2018,result_uci2017,result_uci2016,result_uci2015,result_uci2014,result_uci2013,result_uci2012], axis=0)

In [387]:
uci = uci.drop_duplicates() #getting rid of repeat roster entries

In [390]:
uci['School'] = 'UCI'

In [393]:
uci.to_csv('UCI_Roster')

# Hawaii <a class="anchor" id ="uh"></a>

In [394]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    names = []
    heights = []
    positions = []

    # Extract names
    name_tags = soup.find_all('td', class_='sidearm-table-player-name')
    for tag in name_tags:
        # Extract name from <a> tag within <td>
        name_tag = tag.find('a')
        if name_tag:
            name = name_tag.text.strip()
            names.append(name)

    # Extract heights
    height_tags = soup.find_all('td', class_='height')
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Extract positions
    position_tags = soup.find_all('td', class_='rp_position_short')
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [397]:
result_uh2020 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2020?view=2')

In [398]:
result_uh2020

Unnamed: 0,Name,Height,Position
0,Chaz Galloway,6-3,OH
1,James Anastassiades,6-5,OH
2,Brett Sheward,6-2,S/L
3,‘Eleu Choy,5-7,L
4,Gage Worsley,6-1,L
5,Cole Hogland,6-4,OH/OPP
6,Garrett Geiger,6-5,OH
7,Devon Johnson,6-6,OH
8,Jakob Thelle,6-6,S
9,Dimitrios Mouchlias,6-6,OPP


In [399]:
result_uh2019 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2019?view=2')

In [400]:
result_uh2018 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2018?view=2')

In [401]:
result_uh2017 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2017?view=2')

In [402]:
result_uh2016 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2016?view=2')

In [403]:
result_uh2015 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2015?view=2')

In [404]:
result_uh2014 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2014?view=2')

In [405]:
result_uh2013 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2013?view=2')

In [406]:
result_uh2012 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2012?view=2')

In [407]:
result_uh2011 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2011?view=2')

In [408]:
result_uh2010 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2010?view=2')

In [409]:
result_uh2009 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2009?view=2')

In [410]:
result_uh2008 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2008?view=2')

In [411]:
result_uh2007 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2007?view=2')

In [412]:
result_uh2006 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2006?view=2')

In [413]:
result_uh2005 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2005?view=2')

In [414]:
result_uh2004 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2004?view=2')

In [415]:
result_uh2003 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2003?view=2')

In [416]:
result_uh2002 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2002?view=2')

In [417]:
result_uh2001 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2001?view=2')

In [418]:
result_uh2000 = scrape_player_data('https://hawaiiathletics.com/sports/mens-volleyball/roster/2000?view=2')

In [419]:
uh =  pd.concat([result_uh2020, result_uh2019,result_uh2018,result_uh2017,result_uh2016,result_uh2015,result_uh2014,result_uh2013,result_uh2012,result_uh2011,result_uh2010,result_uh2009,result_uh2008,result_uh2007,result_uh2006,result_uh2005,result_uh2004,result_uh2003,result_uh2002,result_uh2001,result_uh2000], axis=0)

In [422]:
uh = uh.drop_duplicates()  #getting rid of repeat roster entries

In [423]:
uh['School'] = 'UH'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uh['School'] = 'UH'


In [425]:
uh.to_csv('UH_Roster')

# BYU <a class="anchor" id ="byu"></a>

### Because of how the website was set up I had to use the pandas html table reader

In [1317]:
result_byu2020 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2020?view=table')

In [1318]:
result_byu2020_3d = np.array(result_byu2020)

In [1319]:
result_byu2020_2d = result_byu2020_3d.reshape(-1, result_byu2020_3d.shape[-1])

In [1320]:
byu = pd.DataFrame(result_byu2020_2d)

In [1321]:
result_byu2019 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2019?view=table')

In [1322]:
result_byu2019_3d = np.array(result_byu2019)

In [1323]:
result_byu2019_2d = result_byu2019_3d.reshape(-1, result_byu2019_3d.shape[-1])

In [1324]:
byu1 = pd.DataFrame(result_byu2019_2d)

In [1325]:
result_byu2018 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2018?view=table')

In [1326]:
result_byu2018_3d = np.array(result_byu2018)

In [1327]:
result_byu2018_2d = result_byu2018_3d.reshape(-1, result_byu2018_3d.shape[-1])

In [1328]:
byu2 = pd.DataFrame(result_byu2018_2d)

In [1329]:
result_byu2017 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2017?view=table')

In [1330]:
result_byu2017_3d = np.array(result_byu2017)

In [1331]:
result_byu2017_2d = result_byu2017_3d.reshape(-1, result_byu2017_3d.shape[-1])

In [1332]:
byu3 = pd.DataFrame(result_byu2017_2d)

In [1333]:
result_byu2016 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2016?view=table')

In [1334]:
result_byu2016_3d = np.array(result_byu2016)

In [1335]:
result_byu2016_2d = result_byu2016_3d.reshape(-1, result_byu2016_3d.shape[-1])

In [1336]:
byu4 = pd.DataFrame(result_byu2016_2d)

In [1337]:
result_byu2015 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2015?view=table')

In [1338]:
result_byu2015_3d = np.array(result_byu2015)

In [1339]:
result_byu2015_2d = result_byu2015_3d.reshape(-1, result_byu2015_3d.shape[-1])

In [1340]:
byu5 = pd.DataFrame(result_byu2015_2d)

In [1341]:
result_byu2014 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2014?view=table')

In [1342]:
result_byu2014_3d = np.array(result_byu2014)

In [1343]:
result_byu2014_2d = result_byu2014_3d.reshape(-1, result_byu2014_3d.shape[-1])

In [1344]:
byu6 = pd.DataFrame(result_byu2014_2d)

In [1345]:
result_byu2013 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2013?view=table')

In [1346]:
result_byu2013_3d = np.array(result_byu2013)

In [1347]:
result_byu2013_2d = result_byu2013_3d.reshape(-1, result_byu2013_3d.shape[-1])

In [1348]:
byu7 = pd.DataFrame(result_byu2013_2d)

In [1349]:
result_byu2012 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2012?view=table')

In [1350]:
result_byu2012_3d = np.array(result_byu2012)

In [1351]:
result_byu2012_2d = result_byu2012_3d.reshape(-1, result_byu2012_3d.shape[-1])

In [1352]:
byu8 = pd.DataFrame(result_byu2012_2d)

In [1353]:
result_byu2011 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2011?view=table')

In [1354]:
result_byu2011_3d = np.array(result_byu2011)

In [1355]:
result_byu2011_2d = result_byu2011_3d.reshape(-1, result_byu2011_3d.shape[-1])

In [1356]:
byu9 = pd.DataFrame(result_byu2011_2d)

In [1357]:
result_byu2010 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2010?view=table')

In [1358]:
result_byu2010_3d = np.array(result_byu2010)

In [1359]:
result_byu2010_2d = result_byu2010_3d.reshape(-1, result_byu2010_3d.shape[-1])

In [1360]:
byu10 = pd.DataFrame(result_byu2010_2d)

In [1361]:
result_byu2009 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2009?view=table')

In [1362]:
result_byu2009_3d = np.array(result_byu2009)

In [1363]:
result_byu2009_2d = result_byu2009_3d.reshape(-1, result_byu2009_3d.shape[-1])

In [1364]:
byu11 = pd.DataFrame(result_byu2009_2d)

In [1365]:
result_byu2008 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2008?view=table')

In [1366]:
result_byu2008_3d = np.array(result_byu2008)

In [1367]:
result_byu2008_2d = result_byu2008_3d.reshape(-1, result_byu2008_3d.shape[-1])

In [1368]:
byu12 = pd.DataFrame(result_byu2008_2d)

In [1369]:
result_byu2007 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2007?view=table')

In [1370]:
result_byu2007_3d = np.array(result_byu2007)

In [1371]:
result_byu2007_2d = result_byu2007_3d.reshape(-1, result_byu2007_3d.shape[-1])

In [1372]:
byu13 = pd.DataFrame(result_byu2007_2d)

In [1373]:
result_byu2006 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2006?view=table')

In [1374]:
result_byu2006_3d = np.array(result_byu2006)

In [1375]:
result_byu2006_2d = result_byu2006_3d.reshape(-1, result_byu2006_3d.shape[-1])

In [1376]:
byu14 = pd.DataFrame(result_byu2006_2d)

In [1377]:
result_byu2005 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2005?view=table')

In [1378]:
result_byu2005_3d = np.array(result_byu2005)

In [1379]:
result_byu2005_2d = result_byu2005_3d.reshape(-1,result_byu2005_3d.shape[-1])

In [1380]:
byu15 = pd.DataFrame(result_byu2005_2d)

In [1381]:
result_byu2004 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2004?view=table')

In [1382]:
result_byu2004_3d = np.array(result_byu2004)

In [1383]:
result_byu2004_2d = result_byu2004_3d.reshape(-1, result_byu2004_3d.shape[-1])

In [1384]:
byu16 = pd.DataFrame(result_byu2004_2d)

In [1385]:
result_byu2003 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2003?view=table')

In [1386]:
result_byu2003_3d = np.array(result_byu2003)

In [1387]:
result_byu2003_2d = result_byu2003_3d.reshape(-1, result_byu2003_3d.shape[-1])

In [1388]:
byu17 = pd.DataFrame(result_byu2003_2d)

In [1389]:
result_byu2002 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2002?view=table')

In [1390]:
result_byu2002_3d = np.array(result_byu2002)

In [1391]:
result_byu2002_2d = result_byu2002_3d.reshape(-1, result_byu2002_3d.shape[-1])

In [1392]:
byu18 = pd.DataFrame(result_byu2002_2d)

In [1393]:
result_byu2001 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2001?view=table')

In [1394]:
result_byu2001_3d = np.array(result_byu2001)

In [1395]:
result_byu2001_2d = result_byu2001_3d.reshape(-1, result_byu2001_3d.shape[-1])

In [1396]:
byu19 = pd.DataFrame(result_byu2001_2d)

In [1397]:
result_byu2000 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/2000?view=table')

In [1398]:
result_byu2000_3d = np.array(result_byu2000)

In [1399]:
result_byu2000_2d = result_byu2000_3d.reshape(-1, result_byu2000_3d.shape[-1])

In [1400]:
byu20 = pd.DataFrame(result_byu2000_2d)

In [1401]:
result_byu1999 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1999?view=table')

In [1402]:
result_byu1999_3d = np.array(result_byu1999)

In [1403]:
result_byu1999_2d = result_byu1999_3d.reshape(-1, result_byu1999_3d.shape[-1])

In [1404]:
byu21 = pd.DataFrame(result_byu1999_2d)

In [1405]:
result_byu1998 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1998?view=table')

In [1406]:
result_byu1998_3d = np.array(result_byu1998)

In [1407]:
result_byu1998_2d = result_byu1998_3d.reshape(-1, result_byu1998_3d.shape[-1])

In [1408]:
byu22 = pd.DataFrame(result_byu1998_2d)

In [1409]:
result_byu1997 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1997?view=table')

In [1410]:
result_byu1997_3d = np.array(result_byu1997)

In [1411]:
result_byu1997_2d = result_byu1997_3d.reshape(-1, result_byu1997_3d.shape[-1])

In [1412]:
byu23 = pd.DataFrame(result_byu1997_2d)

In [1413]:
result_byu1996 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1996?view=table')

In [1414]:
result_byu1996_3d = np.array(result_byu1996)

In [1415]:
result_byu1996_2d = result_byu1996_3d.reshape(-1, result_byu1996_3d.shape[-1])

In [1416]:
byu24 = pd.DataFrame(result_byu1996_2d)

In [1417]:
result_byu1995 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1995?view=table')

In [1418]:
result_byu1995_3d = np.array(result_byu1995)

In [1419]:
result_byu1995_2d = result_byu1995_3d.reshape(-1, result_byu1995_3d.shape[-1])

In [1420]:
byu25 = pd.DataFrame(result_byu1995_2d)

In [1421]:
result_byu1994 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1994?view=table')

In [1422]:
result_byu1994_3d = np.array(result_byu1994)

In [1423]:
result_byu1994_2d = result_byu1994_3d.reshape(-1, result_byu1994_3d.shape[-1])

In [1424]:
byu26 = pd.DataFrame(result_byu1994_2d)

In [1425]:
result_byu1993 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1993?view=table')

In [1426]:
result_byu1993_3d = np.array(result_byu1993)

In [1427]:
result_byu1993_2d = result_byu1993_3d.reshape(-1, result_byu1993_3d.shape[-1])

In [1428]:
byu27 = pd.DataFrame(result_byu1993_2d)

In [1429]:
result_byu1992 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1992?view=table')

In [1430]:
result_byu1992_3d = np.array(result_byu1992)

In [1431]:
result_byu1992_2d = result_byu1992_3d.reshape(-1, result_byu1992_3d.shape[-1])

In [1432]:
byu28 = pd.DataFrame(result_byu1992_2d)

In [1433]:
result_byu1991 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1991?view=table')

In [1434]:
result_byu1991_3d = np.array(result_byu1991)

In [1435]:
result_byu1991_2d = result_byu1991_3d.reshape(-1, result_byu1991_3d.shape[-1])

In [1436]:
byu29 = pd.DataFrame(result_byu1991_2d)

In [1437]:
result_byu1990 = pd.read_html('https://byucougars.com/sports/mens-volleyball/roster/season/1990?view=table')

In [1438]:
result_byu1990_3d = np.array(result_byu1990)

In [1439]:
result_byu1990_2d = result_byu1990_3d.reshape(-1, result_byu1990_3d.shape[-1])

In [1440]:
byu30 = pd.DataFrame(result_byu1990_2d)

In [1441]:
byu =  pd.concat([byu,byu1,byu2,byu3,byu4,byu5,byu6,byu7,byu8,byu9,byu10,byu11,byu12,byu13,byu14,byu15,byu16,byu17,byu18,byu19,byu20,byu21,byu22,byu23,byu24,byu25,byu26,byu27,byu28,byu29,byu30], axis=0)

In [1442]:
byu = byu.drop_duplicates()

In [1443]:
byu

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,Davide Gardini,Outside Hitter,6-9,,Sophomore,Ravenna,,Manfredi Azzarita,
1,2,Alex Ah Sue,Opposite Hitter,6-4,,Sophomore,Camarillo,,Rio Mesa High School,
2,3,Wil Stanley,Setter,6-4,,Senior,Honolulu,,Punahou High School,
3,4,Jon Stanley,Libero/Defensive Specialist,6-2,,Freshman,"Honolulu, Hawaii",,Kaiser High School,Opens in a new window Instagram
4,5,Gabi Garcia Fernandez,Opposite Hitter,6-7,,Junior,San Juan,,Saint Francis HS,
...,...,...,...,...,...,...,...,...,...,...
9,11,Shawn Patchell,Middle Blocker,6-6,205 lbs,Freshman,Irvine,,,
10,20,Ross Burningham,Middle Blocker,6-5,,Sophomore,Chula Vista,,,
11,21,Rodney Cortez,Setter,6-0,,Junior,Livermore,,,
12,44,Jeff Arnold,Outside Hitter/Middle Blocker,6-6,,Freshman,Salt Lake City,,,


In [1444]:
byu.columns = ['number','Name','Position','Height','Weight','class','city','idk','highschool','idk2']

In [1445]:
byu = byu.drop(columns = ['number','Weight','class','city','idk','highschool','idk2'])

In [1446]:
byu = byu.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,Name,Position,Height,School
0,0,Davide Gardini,Outside Hitter,6-9,BYU
1,1,Alex Ah Sue,Opposite Hitter,6-4,BYU
2,2,Wil Stanley,Setter,6-4,BYU
3,3,Jon Stanley,Libero/Defensive Specialist,6-2,BYU
4,4,Gabi Garcia Fernandez,Opposite Hitter,6-7,BYU
...,...,...,...,...,...
215,3,David Johnson,Outside Hitter,6-4,BYU
216,4,Gunnar Lindstoel,Outside Hitter,6-6,BYU
217,5,Ross McKeever,Outside Hitter,6-4,BYU
218,7,Skule Vagen,Outside Hitter,6-3,BYU


In [1447]:
byu['School'] = 'BYU'

In [1448]:
byu.to_csv('BYU_Roster')

# Stanford <a class="anchor" id ="stan"></a>

In [660]:
result_stan2020 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2020?view=table')
result_stan2020 = result_stan2020[0].dropna(axis=0, thresh=4)

In [661]:
result_stan2020

Unnamed: 0,#,Name,Position,Height,Weight,Class,Hometown,High School,Previous School,Connect
0,1,Adam Chang,Middle Blocker,6-6,,Freshman,"San Diego, Calif.",Westview,,
1,2,Justin Lui,Libero,5-10,,Redshirt Freshman,"Pickering, Ontario, Canada",Pickering,,
2,3,Paul Bischoff,Setter,6-6,,Senior,"Wheaton, Ill.",Glenbard West,,
3,4,Eric Beatty,Outside Hitter,6-7,,Senior,"Huntington Beach, Calif.",Huntington Beach,,
4,5,JP Reilly,Outside Hitter,6-4,,Junior,"Manhattan Beach, Calif.",Loyola,,
5,8,Leo Henken,Outside Hitter,6-6,,Junior,"Webster Groves, Mo.",Saint Louis University HS,,
6,9,Stephen Moye,Middle Blocker,6-9,,Senior,"El Segundo, Calif.",El Segundo,,
7,10,Kevin Lamp,Outside Hitter,6-6,,Freshman,"Lake Bluff, Ill.",Lake Forest,,
8,12,Nathan Lietzke,Setter,6-6,,Freshman,"Austin, Texas",St. Andrew's Episcopal School,,
9,14,Kyler Presho,Middle Blocker,6-8,,Junior,"San Clemente, Calif.",San Clemente,,


In [662]:
result_stan2019 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2019?view=table')
result_stan2019 = result_stan2019[0].dropna(axis=0, thresh=4)

In [687]:
result_stan2019

Unnamed: 0,#,Name,Position,Height,Weight,Class,Hometown,High School,Previous School,Connect
0,1,Kyle Dagostino,Libero,5-9,,Redshirt Senior,"Tampa, Fla.",Berkeley Preparatory,,
1,2,Justin Lui,Libero,5-10,,Freshman,"Pickering, Ontario, Canada",Pickering,,
2,3,Paul Bischoff,Setter,6-5,,Junior,"Wheaton, Ill.",Glenbard West,,
3,4,Eric Beatty,Outside Hitter,6-7,,Junior,"Huntington Beach, Calif.",Huntington Beach,,
4,5,JP Reilly,Outside Hitter,6-4,,Sophomore,"Manhattan Beach, Calif.",Loyola,,
5,6,Russell Dervay,Setter,6-1,,Senior,"Virginia Beach, Va.",Frank W. Cox,,
6,7,Mason Tufuga,Opposite,6-5,,Junior,"Costa Mesa, Calif.",Costa Mesa,,
7,8,Leo Henken,Outside Hitter,6-5,,Sophomore,"Webster Groves, Mo.",Saint Louis University HS,,
8,9,Stephen Moye,Middle Blocker,6-9,,Junior,"El Segundo, Calif.",El Segundo,,
9,11,Chris Moore,Middle Blocker,6-5,,Senior,"Lake Mary, Fla.",Lake Mary,,


In [663]:
result_stan2018 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2018?view=table')
result_stan2018 = result_stan2018[0].dropna(axis=0, thresh=4)

In [668]:
result_stan2017 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2017?view=table')
result_stan2017 = result_stan2017[0].dropna(axis=0, thresh=4)

In [669]:
result_stan2016 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2016?view=table')
result_stan2016 = result_stan2016[0].dropna(axis=0, thresh=4)

In [670]:
result_stan2015 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2015?view=table')
result_stan2015 = result_stan2015[0].dropna(axis=0, thresh=4)

In [671]:
result_stan2014 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2014?view=table')
result_stan2014 = result_stan2014[0].dropna(axis=0, thresh=4)

In [672]:
result_stan2013 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2013?view=table')
result_stan2013 = result_stan2013[0].dropna(axis=0, thresh=4)

In [673]:
result_stan2012 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2012?view=table')
result_stan2012 = result_stan2012[0].dropna(axis=0, thresh=4)

In [674]:
result_stan2011 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2011?view=table')
result_stan2011 = result_stan2011[0].dropna(axis=0, thresh=4)

In [676]:
result_stan2010 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2010?view=table')
result_stan2010 = result_stan2010[0].dropna(axis=0, thresh=4)

In [677]:
result_stan2009 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2009?view=table')
result_stan2009 = result_stan2009[0].dropna(axis=0, thresh=4)

In [678]:
result_stan2008 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2008?view=table')
result_stan2008 = result_stan2008[0].dropna(axis=0, thresh=4)

In [679]:
result_stan2007 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2007?view=table')
result_stan2007 = result_stan2007[0].dropna(axis=0, thresh=4)

In [680]:
result_stan2006 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2006?view=table')
result_stan2006 = result_stan2006[0].dropna(axis=0, thresh=4)

In [681]:
result_stan2005 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2005?view=table')
result_stan2005 = result_stan2005[0].dropna(axis=0, thresh=4)

In [682]:
result_stan2004 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2004?view=table')
result_stan2004 = result_stan2004[0].dropna(axis=0, thresh=4)

In [683]:
result_stan2003 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2003?view=table')
result_stan2003 = result_stan2003[0].dropna(axis=0, thresh=4)

In [684]:
result_stan2002 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2002?view=table')
result_stan2002 = result_stan2002[0].dropna(axis=0, thresh=4)

In [665]:
result_stan2001 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2001?view=table')
result_stan2001 = result_stan2001[0].dropna(axis=0, thresh=4)

In [666]:
result_stan2000 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/2000?view=table')
result_stan2000 = result_stan2000[0].dropna(axis=0, thresh=4)

In [667]:
result_stan1999 = pd.read_html('https://gostanford.com/sports/mens-volleyball/roster/season/1999?view=table')
result_stan1999 = result_stan1999[0].dropna(axis=0, thresh=4)

In [685]:
stan =  pd.concat([result_stan2020,result_stan2019,result_stan2018,result_stan2017,result_stan2016,result_stan2015,result_stan2014,result_stan2013,result_stan2012,result_stan2011,result_stan2010,result_stan2009,result_stan2008,result_stan2007,result_stan2006,result_stan2005,result_stan2004,result_stan2003,result_stan2002,result_stan2001,result_stan2000,result_stan1999], axis=0)

In [690]:
stan

Unnamed: 0,#,Name,Position,Height,Weight,Class,Hometown,High School,Previous School,Connect,Title
0,1.0,Adam Chang,Middle Blocker,6-6,,Freshman,"San Diego, Calif.",Westview,,,
1,2.0,Justin Lui,Libero,5-10,,Redshirt Freshman,"Pickering, Ontario, Canada",Pickering,,,
2,3.0,Paul Bischoff,Setter,6-6,,Senior,"Wheaton, Ill.",Glenbard West,,,
3,4.0,Eric Beatty,Outside Hitter,6-7,,Senior,"Huntington Beach, Calif.",Huntington Beach,,,
4,5.0,JP Reilly,Outside Hitter,6-4,,Junior,"Manhattan Beach, Calif.",Loyola,,,
...,...,...,...,...,...,...,...,...,...,...,...
16,17.0,Dylan Kordic,Setter,6-3,,Junior,"Manhattan Beach, Calif.",Mira Costa,,,
17,18.0,Chandler Kaaa,Setter,6-4,,Redshirt Freshman,"Hilo, Hawaii",Kamehameha-Hawai'i,,,
18,20.0,Jake Vandermeer,Outside Hitter/Opposite,6-5,,Junior,"Dallas, Texas",St. Mark's School of Texas,,,
19,21.0,Grant Delgado,Libero,5-9,,Freshman,"Rolling Hills Estates, Calif.",Palos Verdes,,,


In [692]:
stan = stan.drop(columns = ['#','Weight','Class','Hometown','High School','Previous School','Connect','Title'])

In [694]:
stan.drop_duplicates()

Unnamed: 0,Name,Position,Height
0,Adam Chang,Middle Blocker,6-6
1,Justin Lui,Libero,5-10
2,Paul Bischoff,Setter,6-6
3,Eric Beatty,Outside Hitter,6-7
4,JP Reilly,Outside Hitter,6-4
...,...,...,...
7,Garrett Dobbs,Outside Hitter/Opposite,6-4
13,Spencer McLachlin,Outside Hitter,6-7
14,Max Halvorson,Middle Blocker,6-7
15,Ian Connolly,Outside Hitter,6-3


In [695]:
stan['School'] = 'Stanford'

In [697]:
stan.to_csv('Stan_Roster')

# Pepperdine <a class="anchor" id ="pep"></a>

In [716]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [717]:
result_pep2020 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2020')

In [718]:
result_pep2020

Unnamed: 0,Name,Height,Position
0,Scott Solan,"6'8""",Opposite
1,Jon Minsberg,"6'1""",Setter
2,Joe Karlous,"6'0""",Setter
3,Ben Weinberg,"6'4""",Outside Hitter
4,Noah Dyer,"6'5""",Outside Hitter
5,Diego Rosal,"6'4""",Outside Hitter
6,Spencer Wickens,"5'11""",Libero
7,Zac Norvid,"6'0""",Libero
8,Diego Perez,"6'0""",Libero
9,Auden McCaw,"6'6""",Middle Blocker


In [719]:
result_pep2019 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2019')

In [730]:
result_pep2019

Unnamed: 0,Name,Height,Position
0,Michael Wexter,"6'6""",Outside Hitter
1,Jon Minsberg,"6'2""",Setter
2,Jack Cole,"6'6""",Middle Blocker
3,Ben Weinberg,"6'4""",Outside Hitter
4,Noah Dyer,"6'5""",Outside Hitter/Setter
5,Brendin Chandler,"6'5""",Setter
6,Spencer Wickens,"6'0""",Libero
7,Zac Norvid,"6'1""",Libero
8,Clay Carr,"6'6""",Opposite
9,Kaleb Denmark,"6'4""",Outside Hitter


In [736]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [737]:
result_pep2018 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2018')

In [738]:
result_pep2018

Unnamed: 0,Name,Height,Position
0,Michael Wexter,"6'6""",OH
1,Jon Minsberg,"6'2""",S
2,Jack Cole,"6'6""",MB
3,Daniel Vaziri,"6'6""",OPP/MB
4,Noah Dyer,"6'5""",OH/S
5,Colby Harriman,"6'4""",OH
6,Spencer Wickens,"6'0""",L
7,Zac Norvid,"6'1""",L
8,Clay Carr,"6'6""",OPP
9,Kaleb Denmark,"6'4""",OH


In [739]:
result_pep2017 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2017')

In [740]:
result_pep2016 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2016')

In [741]:
result_pep2015 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2015')

In [742]:
result_pep2014 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2014')

In [743]:
result_pep2013 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2013')

In [744]:
result_pep2012 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2012')

In [745]:
result_pep2011 = scrape_player_data('https://pepperdinewaves.com/sports/mens-volleyball/roster/2011')

In [746]:
pep =  pd.concat([result_pep2020,result_pep2019,result_pep2018,result_pep2017,result_pep2016,result_pep2015,result_pep2014,result_pep2013,result_pep2012,result_pep2011], axis=0)

In [748]:
pep.drop_duplicates()

Unnamed: 0,Name,Height,Position
0,Scott Solan,"6'8""",Opposite
1,Jon Minsberg,"6'1""",Setter
2,Joe Karlous,"6'0""",Setter
3,Ben Weinberg,"6'4""",Outside Hitter
4,Noah Dyer,"6'5""",Outside Hitter
...,...,...,...
7,Ryan Leung,"6'4""",Outside Hitter
9,Kasey Crider,"6'4""",Setter
14,Sean Pedersen,"6'6""",Outside Hitter
15,Trevor Van Uden,"6'2""",Outside Hitter


In [749]:
pep['School'] = 'Pepperdine'

In [754]:
pep.to_csv('Pepperdine_Roster')

# Loyola Chicago <a class="anchor" id ="loy"></a>

In [755]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [758]:
result_loy2022 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2022')

In [759]:
result_loy2021 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2021')

In [756]:
result_loy2020 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2020')

In [757]:
result_loy2020

Unnamed: 0,Name,Height,Position
0,Cole Schlothauer,"6'5""",OH
1,Jake Freeman,"5'10""",L
2,Kyle Piekarski,"6'8""",MB
3,Jake Maffett,"6'4""",OH
4,Andrew Lyons,"6'7""",OH
5,Devin Joslyn,"6'6""",OH
6,Jack Yentz,"6'6""",OH
7,Luke Denton,"6'6""",OPP
8,Henry Payne,"6'5""",OH
9,Brian Voight,"6'5""",S


In [760]:
result_loy2019 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2019')

In [761]:
result_loy2018 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2018')

In [762]:
result_loy2017 = scrape_player_data('https://loyolaramblers.com/sports/mens-volleyball/roster/2017')

In [763]:
loy =  pd.concat([result_loy2022,result_loy2021,result_loy2020,result_loy2019,result_loy2018,result_loy2017], axis=0)

In [767]:
loy = loy.drop_duplicates()

In [768]:
loy['School']='Loyola Chicago'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loy['School']='Loyola Chicago'


In [769]:
loy

Unnamed: 0,Name,Height,Position,School
0,Cole Schlothauer,"6'5""",OH,Loyola Chicago
1,Justin Ross,"6'8""",MB,Loyola Chicago
2,Ben Montplaisir,"6'6""",MB,Loyola Chicago
3,Andrew Lyons,"6'7""",OH,Loyola Chicago
4,Jack Yentz,"6'6""",OH,Loyola Chicago
5,Henry Payne,"6'5""",OH,Loyola Chicago
6,Brian Voight,"6'5""",S,Loyola Chicago
7,Matt Oakley,"6'1""",L,Loyola Chicago
8,Garrett Zolg,"6'3""",S,Loyola Chicago
9,Parker Van Buren,"6'9""",OH,Loyola Chicago


# Saint Francis (PA) <a class="anchor" id ="sfsu"></a>

In [770]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [847]:
result_sfu2022 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2022')

In [848]:
result_sfu2022

Unnamed: 0,Name,Height,Position
0,Ashton Nahrup,"6'2""",Opposite
1,Nicholas Pozzuto,"6'1""",Libero
2,Michael Mosbacher,"6'6""",Outside Hitter
3,Braden Richard,"6'5""",Outside Hitter
4,Blake Liprando,"6'3""",Outside Hitter
5,Alex Finch,"6'5""",Outside Hitter
6,AJ Schmidt,"6'5""",Setter
7,Nathan Zini,"6'7""",Opposite
8,Nicholas Lynch,"6'7""",Setter
9,Matthew Menosky,"6'0""",Libero


In [849]:
result_sfu2021 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2021')

In [850]:
result_sfu2020 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2020')

In [851]:
result_sfu2019 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2019')

In [852]:
result_sfu2018 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2018')

In [853]:
result_sfu2017 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2017')

In [854]:
result_sfu2016 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2016')

In [855]:
result_sfu2015 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2015')

In [856]:
result_sfu2014 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2014')

In [857]:
result_sfu2013 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2013')

In [858]:
result_sfu2012 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2012')

In [859]:
result_sfu2011 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2011')

In [860]:
result_sfu2010 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2010')

In [861]:
result_sfu2009 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2009')

In [862]:
result_sfu2008 = scrape_player_data('https://sfuathletics.com/sports/mens-volleyball/roster/2008')

In [863]:
sfu =  pd.concat([result_sfu2022,result_sfu2021,result_sfu2020,result_sfu2019,result_sfu2018,result_sfu2017,result_sfu2016,result_sfu2015,result_sfu2014,result_sfu2013,result_sfu2012,result_sfu2011,result_sfu2010,result_sfu2009,result_sfu2008], axis=0)

In [864]:
sfu = sfu.drop_duplicates()

In [865]:
sfu['School'] = 'SFU'

In [866]:
sfu.to_csv('SFU_Roster')

# Ball St. <a class="anchor" id ="ball"></a>

In [796]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [822]:
result_ball2022 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2022')

In [823]:
result_ball2022

Unnamed: 0,Name,Height,Position
0,David Flores,"6'1""",Setter
1,Kaleb Jenness,"6'6""",Outside Attacker
2,Xander Pink,"5'7""",Libero
3,Lukas Pytlak,"6'2""",Libero
4,Quinn Isaacson,"6'2""",Setter
5,Colin Ensalaco,"6'1""",Libero
6,Trevor Phillips,"6'6""",Outside Attacker
7,Vanis Buckholz,"6'6""",Middle Blocker
8,Bryce Behrendt,"6'4""",Outside Attacker
9,Angelos Mandilaris,"6'7""",Opposite


In [824]:
result_ball2021 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2021')

In [825]:
result_ball2020 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2020')

In [826]:
result_ball2019 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2019')

In [827]:
result_ball2018 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2018')

In [828]:
result_ball2017 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2017')

In [829]:
result_ball2016 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2016')

In [830]:
result_ball2015 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2015')

In [831]:
result_ball2014 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2014')

In [832]:
result_ball2013 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2013')

In [833]:
result_ball2012 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2012')

In [834]:
result_ball2011 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2011')

In [835]:
result_ball2010 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2010')

In [836]:
result_ball2009 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2009')

In [837]:
result_ball2008 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2008')

In [838]:
result_ball2007 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2007')

In [839]:
result_ball2006 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2006')

In [840]:
result_ball2005 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2005')

In [841]:
result_ball2004 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2004')

In [842]:
result_ball2003 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2003')

In [843]:
result_ball2002 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2002')

In [844]:
result_ball2001 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2001')

In [845]:
result_ball2000 = scrape_player_data('https://ballstatesports.com/sports/mens-volleyball/roster/2000')

In [867]:
ball =  pd.concat([result_ball2022,result_ball2021,result_ball2020,result_ball2019,result_ball2018,result_ball2017,result_ball2016,result_ball2015,result_ball2014,result_ball2013,result_ball2012,result_ball2011,result_ball2010,result_ball2009,result_ball2008,result_ball2007,result_ball2006,result_ball2005,result_ball2004,result_ball2003,result_ball2002,result_ball2001,result_ball2000], axis=0)

In [870]:
ball = ball.drop_duplicates()

In [871]:
ball['School']= 'Ball State'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ball['School']= 'Ball State'


In [873]:
ball.to_csv('Ball_Roster')

# George Mason <a class="anchor" id ="gm"></a>

In [889]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <div> tags with class "sidearm-roster-list-item-name sidearm-roster-player-name"
    name_tags = soup.find_all('div', class_='sidearm-roster-list-item-name')

    # Extract names
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <div> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-list-item-position"
    position_tags = soup.find_all('span', class_='sidearm-roster-list-item-position')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-list-item-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-list-item-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()  # Get text inside <span> tag
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [890]:
result_ball2022 = scrape_player_data('https://gomason.com/sports/mens-volleyball/roster/2022')

In [891]:
result_ball2022

Unnamed: 0,Name,Height,Position


In [None]:
cant get anything

# Lindenwood <a class="anchor" id ="lw"></a>

In [892]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [893]:
result_lin2022 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2022')

In [894]:
result_lin2022

Unnamed: 0,Name,Height,Position
0,Phil Swartz,"6'7""",Outside Hitter
1,Blase Catanese,"6'0""",Libero
2,Alex Pappas,"6'7""",Middle Blocker
3,Carter Stenmark,"6'3""",Outside Hitter
4,Kyle Deutschmann,"6'2""",Libero
5,Brian Schwob,"6'3""",Setter
6,Jose Vargas,"6'2""",Setter
7,Kadin Warner,"6'8""",Middle Blocker
8,RaShawn Bonner,"6'8""",Opposite Hitter
9,Cole Schuler,"6'7""",Outside Hitter


In [895]:
result_lin2021 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2021')

In [896]:
result_lin2020 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2020')

In [897]:
result_lin2019 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2019')

In [898]:
result_lin2018 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2018')

In [899]:
result_lin2017 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2017')

In [900]:
result_lin2016 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2016')

In [901]:
result_lin2015 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2015')

In [902]:
result_lin2014 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2014')

In [903]:
result_lin2013 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2013')

In [904]:
result_lin2012 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2012')

In [905]:
result_lin2011 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2011')

In [906]:
result_lin2010 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2010')

In [907]:
result_lin2009 = scrape_player_data('https://lindenwoodlions.com/sports/mens-volleyball/roster/2009')

In [908]:
lin =  pd.concat([result_lin2022,result_lin2021,result_lin2020,result_lin2019,result_lin2018,result_lin2017,result_lin2016,result_lin2015,result_lin2014,result_lin2013,result_lin2012,result_lin2011,result_lin2010,result_lin2009,], axis=0)

In [913]:
lin = lin.drop_duplicates()

In [914]:
lin['School']= 'Lindenwood'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lin['School']= 'Lindenwood'


In [916]:
lin.to_csv('Lindenwood_Roster')

# USC <a class="anchor" id ="usc"></a>

In [917]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [918]:
result_usc2022 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2022?view=1')

In [919]:
result_usc2022

Unnamed: 0,Name,Height,Position
0,Brandon Browning,"6'4""",OH
1,Austin Stuard,"6'0""",L
2,Cole Paxson,"5'11""",L
3,Jackson Reed,"6'1""",OH
4,Gus Acord,"6'2""",L
5,Chris Hall,"6'2""",S
6,Sam Kobrine,"6'3""",OH-OPP-S
7,Kyle Paulson,"6'8""",S-OPP
8,George Dyer,"6'3""",OH
9,Jameson McKibbin,"6'3""",S


In [920]:
result_usc2021 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2021?view=1')

In [921]:
result_usc2020 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2020?view=1')

In [922]:
result_usc2019 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2019?view=1')

In [923]:
result_usc2018 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2018?view=1')

In [924]:
result_usc2017 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2017?view=1')

In [925]:
result_usc2016 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2016?view=1')

In [926]:
result_usc2015 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2015?view=1')

In [927]:
result_usc2014 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2014?view=1')

In [928]:
result_usc2013 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2013?view=1')

In [929]:
result_usc2012 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2012?view=1')

In [930]:
result_usc2011 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2011?view=1')

In [931]:
result_usc2010 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2010?view=1')

In [932]:
result_usc2009 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2009?view=1')

In [933]:
result_usc2008 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2008?view=1')

In [934]:
result_usc2007 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2007?view=1')

In [935]:
result_usc2006 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2006?view=1')

In [936]:
result_usc2005 = scrape_player_data('https://usctrojans.com/sports/mens-volleyball/roster/2005?view=1')

In [937]:
usc =  pd.concat([result_usc2022,result_usc2021,result_usc2020,result_usc2019,result_usc2018,result_usc2017,result_usc2016,result_usc2015,result_usc2014,result_usc2013,result_usc2012,result_usc2011,result_usc2010,result_usc2009,result_usc2008,result_usc2007,result_usc2006,result_usc2005], axis=0)

In [941]:
usc= usc.drop_duplicates()

In [942]:
usc['School']= 'USC'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usc['School']= 'USC'


In [944]:
usc.to_csv('USC_Roster')

# Purdue Fort Wayne <a class="anchor" id ="pfw"></a>

In [945]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <a> tags with class "hover:underline focus:underline"
    player_tags = soup.find_all('a', class_='hover:underline focus:underline')

    # Extract names from <h3> tags within <a> tags
    for tag in player_tags:
        name_tag = tag.find('h3')  # Find <h3> tag within <a> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <h3> tag
            names.append(name)

    # Find all <span> tags with class "s-person-details__bio-stats-item"
    bio_stats_items = soup.find_all('span', class_='s-person-details__bio-stats-item')

    # Extract heights and positions
    for item in bio_stats_items:
        # Extract height
        height_tag = item.find('span', class_='sr-only')
        if height_tag and 'Height' in height_tag.text:
            height = item.text.split('Height')[1].strip()
            heights.append(height)
        
        # Extract position
        if 'Position' in item.text:
            position = item.text.split('Position')[1].strip()
            positions.append(position)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [946]:
result_pfw2022 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2022')

In [947]:
result_pfw2022

Unnamed: 0,Name,Height,Position
0,Rico Wardlow,6' 5'',MB
1,Wilmer Hernandez,6' 3'',L/DS
2,Troy Gooch,6' 0'',L
3,Jon Diedrich,6' 6'',OH/OPP
4,Sean Califf,6' 4'',S
5,Kade Bontrager,6' 0'',OH
6,Axel Melendez Watts,6' 9'',OH
7,Mark Frazier,6' 6'',MB
8,Carlos Mercado,6' 5'',OH/OPP
9,Cody Johnson,6' 7'',MB


In [948]:
result_pfw2021 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2021')

In [949]:
result_pfw2020 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2020')

In [950]:
result_pfw2019 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2019')

In [951]:
result_pfw2018 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2018')

In [952]:
result_pfw2017 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2017')

In [953]:
result_pfw2016 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2016')

In [954]:
result_pfw2015 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2015')

In [955]:
result_pfw2014 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2014')

In [956]:
result_pfw2013 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2013')

In [957]:
result_pfw2012 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2012')

In [958]:
result_pfw2011 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2011')

In [959]:
result_pfw2010 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2010')

In [960]:
result_pfw2009 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2009')

In [961]:
result_pfw2008 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2008')

In [962]:
result_pfw2007 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2007')

In [963]:
result_pfw2006 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2006')

In [964]:
result_pfw2005 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2005')

In [965]:
result_pfw2004 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2004')

In [966]:
result_pfw2003 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2003')

In [967]:
result_pfw2002 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2002')

In [968]:
result_pfw2001 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2001')

In [969]:
result_pfw2000 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/2000')

In [970]:
result_pfw1999 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/1999')

In [971]:
result_pfw1998 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/1998')

In [972]:
result_pfw1997 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/1997')

In [973]:
result_pfw1996 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/1996')

In [974]:
result_pfw1995 = scrape_player_data('https://gomastodons.com/sports/mens-volleyball/roster/1995')

In [976]:
pfw =  pd.concat([result_pfw2022,result_pfw2021,result_pfw2020,result_pfw2019,result_pfw2018,result_pfw2017,result_pfw2016,result_pfw2015,result_pfw2014,result_pfw2013,result_pfw2012,result_pfw2011,result_pfw2010,result_pfw2009,result_pfw2008,result_pfw2007,result_pfw2006,result_pfw2005,result_pfw2004,result_pfw2003,result_pfw2002,result_pfw2001,result_pfw2000,result_pfw1999,result_pfw1998,result_pfw1997,result_pfw1996,result_pfw1995], axis=0)

In [979]:
pfw = pfw.drop_duplicates()

In [980]:
pfw['School']=' PFW'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfw['School']=' PFW'


In [982]:
pfw.to_csv('Purdue_Fort_Wayne_Roster')

# Lewis University <a class="anchor" id ="lu"></a>

In [983]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [984]:
result_lu2022 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2022')

In [985]:
result_lu2022

Unnamed: 0,Name,Height,Position
0,Cole Brillhart,"6'2""",OH
1,Tyler Simpson,"6'10""",OPP/MB
2,Tyler Morgan,"6'7""",S
3,Jason Gibbs,"6'4""",UTL
4,Kevin Kauling,"6'8""",S
5,Joe Kenzinger,"6'6""",MB\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
6,Isaac Benka,"6'9""",MB
7,Tyler Mitchem,"6'11""",MB\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
8,Max Roquet,"6'8""",OH
9,Alec Lehnert,"6'8""",OH


In [986]:
result_lu2021 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2021')

In [987]:
result_lu2020 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2020')

In [988]:
result_lu2019 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2019')

In [989]:
result_lu2018 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2018')

In [990]:
result_lu2017 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2017')

In [991]:
result_lu2016 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2016')

In [992]:
result_lu2015 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2015')

In [993]:
result_lu2014 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2014')

In [994]:
result_lu2013 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2013')

In [995]:
result_lu2012 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2012')

In [996]:
result_lu2011 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2011')

In [997]:
result_lu2010 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2010')

In [998]:
result_lu2009 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2009')

In [999]:
result_lu2008 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2008')

In [1000]:
result_lu2007 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2007')

In [1001]:
result_lu2006 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2006')

In [1002]:
result_lu2005 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2005')

In [1003]:
result_lu2004 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2004')

In [1004]:
result_lu2003 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2003')

In [1005]:
result_lu2002 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2002')

In [1006]:
result_lu2001 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2001')

In [1007]:
result_lu2000 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/2000')

In [1008]:
result_lu1999 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1999')

In [1009]:
result_lu1998 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1998')

In [1010]:
result_lu1997 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1997')

In [1011]:
result_lu1996 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1996')

In [1012]:
result_lu1995 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1995')

In [1013]:
result_lu1994 = scrape_player_data('https://lewisflyers.com/sports/mens-volleyball/roster/1994')

In [1014]:
lu =  pd.concat([result_lu2022,result_lu2021,result_lu2020,result_lu2019,result_lu2018,result_lu2017,result_lu2016,result_lu2015,result_lu2014,result_lu2013,result_lu2012,result_lu2011,result_lu2010,result_lu2009,result_lu2008,result_lu2007,result_lu2006,result_lu2005,result_lu2004,result_lu2003,result_lu2002,result_lu2001,result_lu2000,result_lu1999,result_lu1998,result_lu1997,result_lu1996,result_lu1995,result_lu1994], axis=0)

In [1016]:
lu = lu.drop_duplicates()

In [1017]:
lu['School'] = 'LU'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lu['School'] = 'LU'


In [1019]:
lu.to_csv('Lewis_University_Roster')

#  Daemen <a class="anchor" id ="dae"></a>

In [1020]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1021]:
result_daem2022 = scrape_player_data('https://daemenwildcats.com/sports/mens-volleyball/roster/2022')

In [1022]:
result_daem2022

Unnamed: 0,Name,Height,Position
0,Michael Krueger,"6'2""",Setter
1,Cameron Milligan,"6'4""",Opposite/Outside Hitter
2,Jake Basinski,"6'2""",Outside Hitter
3,Jaden Gillette,"5'10""",Defensive Specialist
4,Ryan Parker,"6'4""",Outside Hitter
5,Jake Couzens,"6'5""",Opposite/Middle Hitter
6,Robert Patzer,"6'6""",Opposite Hitter
7,Henry Moffitt,"6'6""",Middle Hitter
8,Zach Schneider,"6'6""",Outside Hitter
9,Billy Wieberg,"6'4""",Outside Hitter


In [1023]:
result_daem2021 = scrape_player_data('https://daemenwildcats.com/sports/mens-volleyball/roster/2021')

In [1024]:
result_daem2020 = scrape_player_data('https://daemenwildcats.com/sports/mens-volleyball/roster/2020')

In [1025]:
result_daem2019 = scrape_player_data('https://daemenwildcats.com/sports/mens-volleyball/roster/2019')

In [1026]:
daem =  pd.concat([result_daem2022,result_daem2021,result_daem2020,result_daem2019], axis=0)

In [1029]:
daem = daem.drop_duplicates()

In [1030]:
daem['School']='Daemen_University'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daem['School']='Daemen_University'


In [1032]:
daem.to_csv('Daemen_University_Roster')

# Belmont abbey college <a class="anchor" id ="bac"></a>

In [1033]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1034]:
result_bac2022 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2022')

In [1035]:
result_bac2022

Unnamed: 0,Name,Height,Position
0,Conrad Hill,"5'6""",L
1,Daniel Cerqua,"5'9""",L/DS
2,Gage Giller,"6'7""",OH
3,Mark Timmons,"6'2""",OH
4,Nolan Schmidt,"6'7""",MB
5,Riley Mulkey,"6'5""",MB
6,Jake Somers,"6'4""",OH
7,Matthew Staskunas,"6'5""",OH
8,Kyle Ferguson,"6'5""",MB
9,Clayton Zimmerman,"6'2""",S


In [1036]:
result_bac2021 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2021')

In [1037]:
result_bac2020 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2020')

In [1038]:
result_bac2019 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2019')

In [1039]:
result_bac2018 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2018')

In [1040]:
result_bac2017 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2017')

In [1041]:
result_bac2016 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2016')

In [1042]:
result_bac2015 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2015')

In [1043]:
result_bac2014 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2014')

In [1044]:
result_bac2013 = scrape_player_data('https://abbeyathletics.com/sports/mens-volleyball/roster/2013')

In [1045]:
bac =  pd.concat([result_bac2022,result_bac2021,result_bac2020,result_bac2019,result_bac2018,result_bac2017,result_bac2016,result_bac2015,result_bac2014,result_bac2013], axis=0)

In [1050]:
bac = bac.drop_duplicates()

In [1051]:
bac['School'] = 'BAC'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bac['School'] = 'BAC'


In [1052]:
bac

Unnamed: 0,Name,Height,Position,School
0,Conrad Hill,"5'6""",L,BAC
1,Daniel Cerqua,"5'9""",L/DS,BAC
2,Gage Giller,"6'7""",OH,BAC
3,Mark Timmons,"6'2""",OH,BAC
4,Nolan Schmidt,"6'7""",MB,BAC
...,...,...,...,...
6,John Lowenhagen,"6'5""",OH,BAC
7,Justin Snyder,"6'4""",OH,BAC
8,Dustin Buell,"6'7""",MB,BAC
9,Dane Hunsicker,"6'6""",RH,BAC


# McKendree <a class="anchor" id ="mc"></a>

In [1054]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1055]:
result_mck2022 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2022')

In [1056]:
result_mck2022

Unnamed: 0,Name,Height,Position
0,Francisco Comas,"5'6""",Libero
1,Daniel Duggan,"6'7""",Middle Blocker
2,Tyler Poulsen,"6'4""",Setter
3,Brendon Dunn,"6'4""",Outside Hitter
4,Tyler Tripp,"6'2""",Libero
5,Tommy McGrath,"6'7""",Middle Blocker
6,Chris Kissling,"5'10""",Libero
7,Patrick Ross,"6'5""",Outside Hitter
8,Ryan Serrano,"6'4""",Setter
9,Tredall Blanchard-Davis,"6'3""",Outside Hitter


In [1057]:
result_mck2021 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2021')

In [1058]:
result_mck2020 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2020')

In [1059]:
result_mck2019 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2019')

In [1060]:
result_mck2018 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2018')

In [1061]:
result_mck2017 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2017')

In [1062]:
result_mck2016 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2016')

In [1063]:
result_mck2015 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2015')

In [1064]:
result_mck2014 = scrape_player_data('https://mckbearcats.com/sports/mens-volleyball/roster/2014')

In [1065]:
mck =  pd.concat([result_mck2022,result_mck2021,result_mck2020,result_mck2019,result_mck2018,result_mck2017,result_mck2016,result_mck2015,result_mck2014], axis=0)

In [1068]:
mck = mck.drop_duplicates()

In [1070]:
mck['School'] = 'McKendree'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mck['School'] = 'McKendree'


In [1072]:
mck.to_csv('McKendree_Roster')

# Lincoln Memorial <a class="anchor" id ="lm"></a>

In [1073]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1074]:
result_lmu2022 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2022?view=1')

In [1075]:
result_lmu2022

Unnamed: 0,Name,Height,Position
0,Dawson Walker,"6'7""",Middle Hitter/Right Side
1,Johansen Negron,"6'2""",Outside Hitter
2,Luigi Pacini,"6'4""",Setter
3,Matthew Gentry,"6'7""",Middle Hitter
4,Ryan Foy,"6'5""",Setter
5,Attia Soliman,"5'8""",Defensive Specialist/Libero
6,Matt Friddle,"6'4""",Setter/Right Side
7,Will Eiken,"6'9""",Outside Hitter/Middle Hitter
8,Diego Clark Keith,"6'4""",Outside Hitter
9,Jacob Titus,"6'4""",Outside Hitter


In [1076]:
result_lmu2021 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2021?view=1')

In [1077]:
result_lmu2020 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2020?view=1')

In [1078]:
result_lmu2019 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2019?view=1')

In [1079]:
result_lmu2018 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2018?view=1')

In [1080]:
result_lmu2017 = scrape_player_data('https://lmurailsplitters.com/sports/mens-volleyball/roster/2017?view=1')

In [1081]:
lmu =  pd.concat([result_lmu2022,result_lmu2021,result_lmu2020,result_lmu2019,result_lmu2018,result_lmu2017], axis=0)

In [1083]:
lmu = lmu.drop_duplicates()

In [1084]:
lmu['School']= 'LMU'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lmu['School']= 'LMU'


In [1086]:
lmu.to_csv('LMU_Roster')

# CSUN <a class="anchor" id ="csun"></a>

In [1087]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1088]:
result_csun2022 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2022')

In [1089]:
result_csun2022

Unnamed: 0,Name,Height,Position
0,Joe Picone,"6'0""",Libero
1,Lorenzo Bertozzi,"6'4""",Outside Hitter
2,Luke Krzmarzick,"6'4""",Opposite
3,Ryan DeWeese,"6'5""",Opposite
4,Kyle Hobus,"6'7""",Outside Hitter
5,Inaki Bustamante,"6'7""",Outside Hitter
6,Lance Krenik,"6'4""",Setter
7,Daniel Wetter,"6'5""",Middle Blocker
8,Taylor Ittner,"6'4""",Setter
9,Griffin Walters,"6'6""",Outside Hitter


In [1090]:
result_csun2021 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2021')

In [1091]:
result_csun2020 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2020')

In [1092]:
result_csun2019 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2019')

In [1093]:
result_csun2018 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2018')

In [1094]:
result_csun2017 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2017')

In [1109]:
result_csun2017

Unnamed: 0,Name,Height,Position
0,CJ Suarez,"5'10""",Libero
1,Kelsey Yogi,"5'7""",Libero
2,Sam Porter,"6'3""",Setter
3,Domanik Stratford,"6'5""",Outside Hitter
4,Mitch Theisen,"6'1""",Libero
5,Lucas Timm,"6'6""",Outside Hitter
6,Dimitar Kalchev,"6'4""",Outside Hitter
7,Schylar Lillethorup,"6'4""",Setter
8,Arvis Greene,"6'7""",Opposite
9,Parker Maki,"6'5""",Outside Hitter


In [1095]:
result_csun2016 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2016')

In [1096]:
result_csun2015 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2015')

In [1112]:
result_csun2015

Unnamed: 0,Name,Height,Position
0,Mitch Theisen,"6'0""",Libero
1,Nick Alegrado,"5'8""",Libero
2,Bradley Sakaida,"6'2""",Outside Hitter
3,Damani Lenore,"6'4""",Opposite
4,Jakub Ciesla,"6'6""",Opposite
5,CJ Suarez,"5'10""",Libero
6,Dan Starkey,"6'3""",Opposite
7,Kyle Stevenson,"6'5""",Outside Hitter
8,Jakob Karlsson,"6'3""",Outside Hitter
9,John Peachey,"6'6""",Setter


In [1113]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1114]:
result_csun2014 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2014')

In [1115]:
result_csun2014

Unnamed: 0,Name,Height,Position
0,Charlie Condron,"5'11""",L
1,Nick Alegrado,"5'8""",L
2,Bradley Sakaida,"6'2""",L
3,Damani Lenore,"6'3""",OPP/RS
4,Jakub Ciesla,"6'6""",OPP/RS
5,Justin Beskeen,"6'6""",MB
6,Leni Ma'ia'i,"6'9""",MB
7,Kyle Stevenson,"6'5""",OH
8,John Peachey,"6'6""",S
9,Sam Holt,"6'7""",OH


In [1116]:
result_csun2013 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2013')

In [1117]:
result_csun2013

Unnamed: 0,Name,Height,Position
0,Charlie Condron,"5'11""",L
1,Nick Alegrado,"5'8""",L
2,Chance Earnest,"6'2""",OH
3,Alex Jones,"6'5""",S
4,Kyle Stevenson,"6'5""",OH
5,Brandon Lebrock,"6'1""",OH
6,Jared Moore,"6'8""",MB
7,Sam Holt,"6'7""",OH
8,Vaughn Wellenreiter,"6'7""",MB
9,Drew Staker,"6'8""",MB


In [1118]:
result_csun2012 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2012')

In [1119]:
result_csun2011 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2011')

In [1120]:
result_csun2010 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2010')

In [1121]:
result_csun2009 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2009')

In [1122]:
result_csun2008 = scrape_player_data('https://gomatadors.com/sports/mens-volleyball/roster/2008')

In [1123]:
csun =  pd.concat([result_csun2022,result_csun2021,result_csun2020,result_csun2019,result_csun2018,result_csun2017,result_csun2016,result_csun2015,result_csun2014,result_csun2013,result_csun2012,result_csun2011,result_csun2010,result_csun2009,result_csun2008], axis=0)

In [1125]:
csun = csun.drop_duplicates()

In [1126]:
csun['School'] = 'CSUN'

In [1128]:
csun.to_csv('CSUN_Roster')

# UC San Diego <a class="anchor" id ="ucsd"></a>

In [1129]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1130]:
result_ucsd2022 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2022')

In [1131]:
result_ucsd2022

Unnamed: 0,Name,Height,Position
0,Ben Blakely,"6'5""",Opposite/Outside Hitter
1,Logan Clark,"6'7""",Middle Blocker
2,Gabriel Dyer,"6'5""",Setter/Opposite
3,Matt Palma,"6'0""",Libero
4,Matthew Lim,"6'6""",Outside Hitter
5,Wyatt Harrison,"6'6""",Outside Hitter
6,Ryan Ka,"6'3""",Outside Hitter
7,Josh Schellinger,"6'5""",Outside Hitter
8,Brett Pursley,"6'4""",Outside Hitter
9,Nick Rigo,"6'6""",Middle Blocker


In [1132]:
result_ucsd2021 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2021')

In [1133]:
result_ucsd2020 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2020')

In [1134]:
result_ucsd2019 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2019')

In [1135]:
result_ucsd2018 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2018')

In [1136]:
result_ucsd2017 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2017')

In [1137]:
result_ucsd2016 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2016')

In [1138]:
result_ucsd2015 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2015')

In [1139]:
result_ucsd2014 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2014')

In [1140]:
result_ucsd2013 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2013')

In [1141]:
result_ucsd2012 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2012')

In [1142]:
result_ucsd2011 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2011')

In [1143]:
result_ucsd2010 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2010')

In [1144]:
result_ucsd2009 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2009')

In [1145]:
result_ucsd2008 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2008')

In [1146]:
result_ucsd2007 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2007')

In [1147]:
result_ucsd2006 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2006')

In [None]:
result_ucsd2022 = scrape_player_data('https://ucsdtritons.com/sports/mens-volleyball/roster/2022')

In [1148]:
ucsd =  pd.concat([result_ucsd2022,result_ucsd2021,result_ucsd2020,result_ucsd2019,result_ucsd2018,result_ucsd2017,result_ucsd2016,result_ucsd2015,result_ucsd2014,result_ucsd2013,result_ucsd2012,result_ucsd2011,result_ucsd2010,result_ucsd2009,result_ucsd2008,result_ucsd2007,result_ucsd2006], axis=0)

In [1150]:
ucsd = ucsd.drop_duplicates()

In [1151]:
ucsd['School'] = 'UCSD'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ucsd['School'] = 'UCSD'


In [1153]:
ucsd.to_csv('UCSD_Roster')

# Princeton <a class="anchor" id ="prin"></a>

In [1154]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <a> tags with class "hover:underline focus:underline"
    player_tags = soup.find_all('a', class_='hover:underline focus:underline')

    # Extract names from <h3> tags within <a> tags
    for tag in player_tags:
        name_tag = tag.find('h3')  # Find <h3> tag within <a> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <h3> tag
            names.append(name)

    # Find all <span> tags with class "s-person-details__bio-stats-item"
    bio_stats_items = soup.find_all('span', class_='s-person-details__bio-stats-item')

    # Extract heights and positions
    for item in bio_stats_items:
        # Extract height
        height_tag = item.find('span', class_='sr-only')
        if height_tag and 'Height' in height_tag.text:
            height = item.text.split('Height')[1].strip()
            heights.append(height)
        
        # Extract position
        if 'Position' in item.text:
            position = item.text.split('Position')[1].strip()
            positions.append(position)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1155]:
result_prince2022 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2022')

In [1156]:
result_prince2022

Unnamed: 0,Name,Height,Position
0,Brady Wedbush,6' 7'',OH/RS
1,Gavin Leising,6' 9'',MB
2,Attila Delingat,6' 9'',MB
3,Alexander Mrkalj,6' 5'',OH
4,Danny Sun,6' 0'',S
5,Aiden Benson,6' 6'',MB
6,Nate Thompson,6' 7'',OH
7,Ben Harrington,6' 4'',OH
8,James Hartley,6' 7'',OH/OPP
9,Henry Wedbush,6' 4'',S


In [1157]:
result_prince2021 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2021')

In [1158]:
result_prince2020 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2020')

In [1159]:
result_prince2019 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2019')

In [1160]:
result_prince2018 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2018')

In [1178]:
result_prince2017

Unnamed: 0,Name,Height,Position
0,Billy Andrew,6' 7'',MB
1,George Huhmann,6' 11'',MB
2,Matthew Nicholas,6' 6'',OPP
3,Kendall Ratter,6' 4'',OH
4,Trey Sickler,6' 6'',MB
5,Greg Luck,6' 5'',OH
6,Shane Gooding,6' 4'',S
7,Mike Fuerst,6' 6'',OH/OPP
8,Jonah May,6' 2'',S
9,Parker Dixon,6' 6'',OH


In [1161]:
result_prince2017 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2017')

In [1162]:
result_prince2016 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2016')

In [1163]:
result_prince2015 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2015')

In [1164]:
result_prince2014 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2014')

In [1179]:
result_prince2013

Unnamed: 0,Name,Height,Position
0,Zach Shaw,6' 4'',OH
1,Michael Bagnell,6' 1'',OH
2,Cody Kessel,6' 5'',OH
3,Brad Howard,6' 5'',MB
4,Bar Shabtai,5' 8'',L
5,Jeff Stapleton,6' 2'',OH
6,Daniel Tien,6' 0'',OH
7,Ryan Poladian,6' 7'',MB
8,Will Siroky,6' 5'',MB/OH
9,Conor Dube,5' 10'',S


In [1165]:
result_prince2013 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2013')

In [1166]:
result_prince2012 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2012')

In [1167]:
result_prince2011 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2011')

In [1176]:
result_prince2011

Unnamed: 0,Name,Height,Position
0,Scott Liljestrom,6' 2'',S
1,Dexter Scobee,5' 9'',L
2,John Morris,6' 3'',S
3,Keenan McCarthy,6' 5'',MB
4,Bar Shabtai,5' 8'',L
5,Jeff Stapleton,6' 2'',OH
6,Daniel Tien,6' 0'',OH
7,Ryan Poladian,6' 7'',MB
8,Brad Howard,6' 5'',MB
9,Davis Waddell,6' 5'',S


In [1168]:
result_prince2010 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2010')

In [1169]:
result_prince2009 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2009')

In [1170]:
result_prince2008 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2008')

In [1171]:
result_prince2007 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2007')

In [1177]:
result_prince2007

Unnamed: 0,Name,Height,Position
0,Aaron Linsky,6' 0'',L
1,Phillip Rosenberg,6' 1'',OH
2,Brandon Denham,6' 3'',S
3,Harsha Dante,6' 3'',OH
4,Ka'ohu Berg-Hee,6' 3'',OH
5,Cameron Heggi,6' 4'',MB
6,Reid Joseph,6' 5'',OH
7,Peter Eichler,6' 5'',OH
8,R.J. Liljestrom,6' 6'',MH
9,Mike Vincent,6' 9'',MB


In [1172]:
result_prince2006 = scrape_player_data('https://goprincetontigers.com/sports/mens-volleyball/roster/2006')

In [1173]:
prince =  pd.concat([result_prince2022,result_prince2021,result_prince2020,result_prince2019,result_prince2018,result_prince2017,result_prince2016,result_prince2015,result_prince2014,result_prince2013,result_prince2012,result_prince2011,result_prince2010,result_prince2009,result_prince2008,result_prince2007,result_prince2006], axis=0)

In [1180]:
prince = prince.drop_duplicates()

In [1181]:
prince['School'] = 'Princeton'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prince['School'] = 'Princeton'


In [1183]:
prince.to_csv('Princeton_Roster')

# UC Santa Barbara <a class="anchor" id ="ucsb"></a>

In [1184]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1185]:
result_ucsb2022 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2022')

In [1186]:
result_ucsb2022

Unnamed: 0,Name,Height,Position
0,Jaden Glenn,"6'0""",Libero/Setter
1,Geste Bianchi,"6'6""",Opposite
2,Owen Birg,"6'8""",Outside Hitter/Opposite
3,Rees Barnett,"6'5""",Opposite
4,Ben Coordt,"6'4""",Outside Hitter
5,Donovan Todorov,"6'6""",Middle Blocker
6,Haotian Xia,"6'5""",Opposite
7,Ryan Pecsok,"5'10""",Libero
8,Ryan Wilcox,"6'2""",Outside Hitter
9,Brandon Hicks,"6'4""",Middle Blocker


In [1187]:
result_ucsb2021 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2021')

In [1188]:
result_ucsb2020 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2020')

In [1189]:
result_ucsb2019 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2019')

In [1190]:
result_ucsb2018 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2018')

In [1191]:
result_ucsb2017 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2017')

In [1192]:
result_ucsb2016 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2016')

In [1193]:
result_ucsb2015 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2015')

In [1194]:
result_ucsb2014 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2014')

In [1195]:
result_ucsb2013 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2013')

In [1196]:
result_ucsb2012 = scrape_player_data('https://ucsbgauchos.com/sports/mens-volleyball/roster/2012')

In [1197]:
ucsb =  pd.concat([result_ucsb2022,result_ucsb2021,result_ucsb2020,result_ucsb2019,result_ucsb2018,result_ucsb2017,result_ucsb2016,result_ucsb2015,result_ucsb2014,result_ucsb2013,result_ucsb2012], axis=0)

In [1200]:
ucsb = ucsb.drop_duplicates()

In [1202]:
ucsb['School'] = 'UCSB'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ucsb['School'] = 'UCSB'


In [1204]:
ucsb.to_csv('UCSB_Roster')

# North Greenville <a class="anchor" id ="ng"></a>

In [1205]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1206]:
result_ngu2022 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2022')

In [1207]:
result_ngu2022

Unnamed: 0,Name,Height,Position
0,Christian Phung,"6'1""",L
1,Sergio Carrillo,"6'6""",S
2,Blake Petteway,"6'1""",L/DS
3,Luke Densmore,"6'1""",OH
4,Duncan Henderson,"6'1""",L/DS
5,Andrew Davidson,"6'2""",L/DS
6,Tom Curry,"6'2""",OH
7,Diego Rosich,"6'5""",OH
8,Brandon Baker,"6'6""",OH
9,Zec Johnson,"6'2""",OH


In [1208]:
result_ngu2021 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2021')

In [1209]:
result_ngu2020 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2020')

In [1210]:
result_ngu2019 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2019')

In [1211]:
result_ngu2018 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2018')

In [1212]:
result_ngu2017 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2017')

In [1213]:
result_ngu2016 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2016')

In [1214]:
result_ngu2015 = scrape_player_data('https://www.nguathletics.com/sports/mvball/roster/2015')

In [1215]:
ngu =  pd.concat([result_ngu2022,result_ngu2021,result_ngu2020,result_ngu2019,result_ngu2018,result_ngu2017,result_ngu2016,result_ngu2015], axis=0)

In [1218]:
ngu = ngu.drop_duplicates()

In [1219]:
ngu['School']= 'NGU'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ngu['School']= 'NGU'


In [1221]:
ngu.to_csv('NGU_Roster')

# LIU <a class="anchor" id ="liu"></a>

In [1222]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1223]:
result_liu2022 = scrape_player_data('https://www.liuathletics.com/sports/mens-volleyball/roster/2022')

In [1225]:
liu = result_liu2022

In [1226]:
liu['School']= 'LIU'

In [1228]:
liu.to_csv('LIU_Roster')

# Harvard <a class="anchor" id ="harv"></a>

In [1229]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1230]:
result_harv2022 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2022')

In [1231]:
result_harv2022

Unnamed: 0,Name,Height,Position
0,Andrew Lobo,"6'2""",Outside Hitter
1,Ethan Smith,"6'7""",Middle Blocker
2,Ryan Hong,"6'2""",Middle Blocker/Outside Hitter
3,Kade McGovern,"6'6""",Opposite Hitter
4,Azim Raheem,"6'6""",Opposite Hitter
5,Alessio Pignatelli,"5'11""",Libero/Defensive Specialist
6,Eric Li,"6'6""",Outside Hitter
7,Ethan McCrary,"6'9""",Middle Blocker
8,Logan Shepherd,"6'4""",Outside Hitter
9,Will Polster,"6'1""",Outside Hitter


In [1232]:
result_harv2021 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2021')

In [1233]:
result_harv2020 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2020')

In [1234]:
result_harv2019 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2019')

In [1235]:
result_harv2018 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2018')

In [1236]:
result_harv2017 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2017')

In [1237]:
result_harv2016 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2016')

In [1238]:
result_harv2015 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2015')

In [1239]:
result_harv2014 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2014')

In [1240]:
result_harv2013 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2013')

In [1241]:
result_harv2012 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2012')

In [1242]:
result_harv2011 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2011')

In [1243]:
result_harv2010 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2010')

In [1244]:
result_harv2009 = scrape_player_data('https://gocrimson.com/sports/mens-volleyball/roster/2009')

In [1245]:
harv =  pd.concat([result_harv2022,result_harv2021,result_harv2020,result_harv2019,result_harv2018,result_harv2017,result_harv2016,result_harv2015,result_harv2014,result_harv2013,result_harv2012,result_harv2011,result_harv2010,result_harv2009], axis=0)

In [1247]:
harv.drop_duplicates()

Unnamed: 0,Name,Height,Position
0,Andrew Lobo,"6'2""",Outside Hitter
1,Ethan Smith,"6'7""",Middle Blocker
2,Ryan Hong,"6'2""",Middle Blocker/Outside Hitter
3,Kade McGovern,"6'6""",Opposite Hitter
4,Azim Raheem,"6'6""",Opposite Hitter
...,...,...,...
1,Gil Weintraub,"6'3""",Setter
2,Jeff Nathan,"6'3""",Outside Hitter
5,Lubomir Malo,"6'1""",Outside Hitter
7,Dan Schreff,"6'1""",Outside Hitter


In [1307]:
harv['School']= 'Harvard'

In [1308]:
harv.to_csv('Harvard_Roster')

# NJIT <a class="anchor" id ="njit"></a>

In [1250]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "sidearm-roster-player-position-long-short hide-on-small-down"
    position_tags = soup.find_all('span', class_='sidearm-roster-player-position-long-short hide-on-small-down')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height"
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [1251]:
result_njit2022 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2022')

In [1252]:
result_njit2022

Unnamed: 0,Name,Height,Position
0,Griffin Fieseler,"6'7""",S
1,Will Andrews,"6'0""",OH
2,Josh Gregg,"6'5""",OH
3,Roque Nido,"6'4""",S
4,Antonio Feliciano,"6'4""",OH
5,Mason Matos,"6'1""",S
6,Derek Evans,"6'6""",M/OPP
7,Nolan Wollmer,"6'5""",OPP
8,Julian Meissner,"6'7""",OPP
9,Martin de Chavarria,"6'7""",MB


In [1253]:
result_njit2021 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2021')

In [1254]:
result_njit2020 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2020')

In [1255]:
result_njit2019 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2019')

In [1256]:
result_njit2018 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2018')

In [1257]:
result_njit2017 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2017')

In [1258]:
result_njit2016 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2016')

In [1259]:
result_njit2015 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2015')

In [1260]:
result_njit2014 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2014')

In [1261]:
result_njit2013 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2013')

In [1262]:
result_njit2012 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2012')

In [1263]:
result_njit2011 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2011')

In [1264]:
result_njit2010 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2010')

In [1265]:
result_njit2009 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2009')

In [1266]:
result_njit2008 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2008')

In [1267]:
result_njit2007 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2007')

In [1268]:
result_njit2006 = scrape_player_data('https://njithighlanders.com/sports/mens-volleyball/roster/2006')

In [1269]:
njit =  pd.concat([result_njit2022,result_njit2021,result_njit2020,result_njit2019,result_njit2018,result_njit2017,result_njit2016,result_njit2015,result_njit2014,result_njit2013,result_njit2012,result_njit2011,result_njit2010,result_njit2009,result_njit2008,result_njit2007,result_njit2006], axis=0)

In [1272]:
njit = njit.drop_duplicates()

In [1273]:
njit['School']='NJIT'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  njit['School']='NJIT'


In [1275]:
njit.to_csv('NJIT_Roster')

# Below schools were not included in final dataframe due to time contraints

# Mount Olive <a class="anchor" id ="other"></a>

In [5]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [6]:
result_mo2022 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2022')

In [7]:
result_mo2022

Unnamed: 0,Name,Height,Position
0,Tristan Schraudner,"6'0""",L
1,Chance Gallardo,"6'2""",OH
2,Luke Visgitis,"6'5""",RS/OPP/MB
3,Eric Visgitis,"6'5""",MB
4,Dominic Hagerty,"6'3""",S
5,Louie Hadfield,"6'3""",RS/OH
6,Jarrod Ferguson,"6'6""",OH/S
7,Blake Hosic,"6'3""",OH
8,Tobi Azeez,"6'4""",OH
9,Tyran Gillespie,"6'1""",OH


In [8]:
result_mo2021 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2021')

In [9]:
result_mo2020 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2020')

In [10]:
result_mo2019 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2019')

In [11]:
result_mo2018 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2018')

In [12]:
result_mo2017 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2017')

In [13]:
result_mo2016 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2016')

In [14]:
result_mo2015 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2015')

In [15]:
result_mo2014 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2014')

In [16]:
result_mo2013 = scrape_player_data('https://umotrojans.com/sports/mens-volleyball/roster/2013')

In [17]:
mo =  pd.concat([result_mo2022,result_mo2021,result_mo2020,result_mo2019,result_mo2018,result_mo2017,result_mo2016,result_mo2015,result_mo2014,result_mo2013], axis=0)

In [18]:
mo = mo.drop_duplicates()

In [19]:
mo['School']='MO'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mo['School']='MO'


In [33]:
mo

Unnamed: 0,Name,Height,Position,School
0,Tristan Schraudner,"6'0""",L,MO
1,Chance Gallardo,"6'2""",OH,MO
2,Luke Visgitis,"6'5""",RS/OPP/MB,MO
3,Eric Visgitis,"6'5""",MB,MO
4,Dominic Hagerty,"6'3""",S,MO
...,...,...,...,...
13,Andreas Lengler,"6'7""",MB,MO
14,Tim Ebbecke,"6'5""",MB,MO
15,Ben Boncella,"6'1""",OH,MO
16,Ben Casado,"5'10""",S,MO


In [20]:
mo.to_csv('MO_Roster')

# FDU

In [21]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [22]:
result_cui2022 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2022')

In [23]:
result_cui2022

Unnamed: 0,Name,Height,Position
0,Maxwell McCullough,"6'8""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...
1,Gil Herold,"6'6""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...
2,Devyn Zavala,"6'3""",Outside Hitter/Opposite Hitter\r\n\t\t\t\t\t\t...
3,Danny Smithers,"6'0""",Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
4,Christian Oviedo,"5'9""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
5,McLain Mott,"6'2""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
6,Makai Lipson,"6'4""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
7,Kawika Simon,"5'11""",Libero/Defensive Specialist\r\n\t\t\t\t\t\t\t\...
8,Owen Chun,"6'5""",Middle Blocker/Opposite\r\n\t\t\t\t\t\t\t\t\t\...
9,Kobe Kiley,"6'3""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...


In [24]:
result_cui2021 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2021')

In [25]:
result_cui2020 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2020')

In [26]:
result_cui2019 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2019')

In [27]:
result_cui2018 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2018')

In [28]:
result_cui2017 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2017')

In [29]:
result_cui2016 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2016')

In [30]:
result_cui2015 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2015')

In [31]:
result_cui2014 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2014')

In [32]:
result_cui2013 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2013')

In [34]:
cui =  pd.concat([result_cui2022,result_cui2021,result_cui2020,result_cui2019,result_cui2018,result_cui2017,result_cui2016,result_cui2015,result_cui2014,result_cui2013], axis=0)

In [35]:
cui = cui.drop_duplicates()

In [52]:
cui['School']='CUI'

In [53]:
cui

Unnamed: 0,Name,Height,Position,School
0,Maxwell McCullough,"6'8""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,CUI
1,Gil Herold,"6'6""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,CUI
2,Devyn Zavala,"6'3""",Outside Hitter/Opposite Hitter\r\n\t\t\t\t\t\t...,CUI
3,Danny Smithers,"6'0""",Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,CUI
4,Christian Oviedo,"5'9""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,CUI
...,...,...,...,...
20,Connor Dell,"6'5""",Opposite\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,CUI
21,Clement Osahon Jr.,"6'5""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,CUI
22,John Rzepniewski,"6'7""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,CUI
23,Scott Montez,"6'5""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,CUI


In [54]:
cui.to_csv('CUI_Roster')

# Charleston WV

In [38]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [39]:
result_uc2022 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2022')

In [40]:
result_uc2022

Unnamed: 0,Name,Height,Position
0,Maxwell McCullough,"6'8""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...
1,Gil Herold,"6'6""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...
2,Devyn Zavala,"6'3""",Outside Hitter/Opposite Hitter\r\n\t\t\t\t\t\t...
3,Danny Smithers,"6'0""",Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
4,Christian Oviedo,"5'9""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
5,McLain Mott,"6'2""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
6,Makai Lipson,"6'4""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
7,Kawika Simon,"5'11""",Libero/Defensive Specialist\r\n\t\t\t\t\t\t\t\...
8,Owen Chun,"6'5""",Middle Blocker/Opposite\r\n\t\t\t\t\t\t\t\t\t\...
9,Kobe Kiley,"6'3""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...


In [41]:
result_uc2021 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2021')

In [42]:
result_uc2020 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2020')

In [43]:
result_uc2019 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2019')

In [44]:
result_uc2018 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2018')

In [45]:
result_uc2017 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2017')

In [46]:
result_uc2016 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2016')

In [47]:
result_uc2015 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2015')

In [48]:
result_uc2014 = scrape_player_data('https://cuigoldeneagles.com/sports/mens-volleyball/roster/2014')

In [49]:
uc =  pd.concat([result_uc2022,result_uc2021,result_uc2020,result_uc2019,result_uc2018,result_uc2017,result_uc2016,result_uc2015,result_uc2014], axis=0)

In [50]:
uc = uc.drop_duplicates()

In [55]:
uc['School']='UC'

In [56]:
uc

Unnamed: 0,Name,Height,Position,School
0,Maxwell McCullough,"6'8""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,UC
1,Gil Herold,"6'6""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,UC
2,Devyn Zavala,"6'3""",Outside Hitter/Opposite Hitter\r\n\t\t\t\t\t\t...,UC
3,Danny Smithers,"6'0""",Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,UC
4,Christian Oviedo,"5'9""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,UC
...,...,...,...,...
20,Connor Dell,"6'5""",Opposite\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...,UC
21,Clement Osahon Jr.,"6'5""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,UC
22,John Rzepniewski,"6'7""",Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,UC
23,Scott Montez,"6'5""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...,UC


In [74]:
uc.to_csv('UC_Roster')

# Sacred Heart University

In [57]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [58]:
result_shu2022 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2022')

In [59]:
result_shu2022

Unnamed: 0,Name,Height,Position
0,Gregory DeGeorge,"6'2""",Libero/DS
1,Tyler Kwinta,"5'11""",Setter/Libero
2,Ardian Kodzodziku,"5'11""",Libero
3,Jeremiah Bernardo,"5'7""",Libero/DS
4,Asa LaBreche,"6'3""",Outside Hitter
5,Angus Henricks,"5'10""",Libero
6,Cole Younger,"6'1""",Libero/DS
7,Carlos Terrassa,"6'1""",Outside Hitter
8,Brody Hoelperl,"6'5""",Outside Hitter
9,Thomas Tustison,"6'3""",Outside Hitter


In [60]:
result_shu2021 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2021')

In [61]:
result_shu2020 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2020')

In [62]:
result_shu2019 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2019')

In [63]:
result_shu2018 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2018')

In [64]:
result_shu2017 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2017')

In [65]:
result_shu2016 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2016')

In [66]:
result_shu2015 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2015')

In [67]:
result_shu2014 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2014')

In [68]:
result_shu2013 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2013')

In [69]:
result_shu2012 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2012')

In [70]:
result_shu2011 = scrape_player_data('https://sacredheartpioneers.com/sports/mens-volleyball/roster/2011')

In [71]:
shu =  pd.concat([result_shu2022,result_shu2021,result_shu2020,result_shu2019,result_shu2018,result_shu2017,result_shu2016,result_shu2015,result_shu2014,result_shu2013,result_shu2012,result_shu2011], axis=0)

In [72]:
shu = shu.drop_duplicates()

In [73]:
shu['School']='SHU'

In [75]:
shu.to_csv('SHU_Roster')

# Queens

In [76]:
def scrape_player_data(url):
    # Fetch the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize lists to store data
    heights = []
    positions = []
    names = []

    # Find all <h3> tags within <a> tags
    name_tags = soup.find_all('h3')

    # Extract names from <a> tags within <h3> tags
    for tag in name_tags:
        name_tag = tag.find('a')  # Find <a> tag within <h3> tag
        if name_tag:
            name = name_tag.text.strip()  # Get text inside <a> tag
            names.append(name)

    # Find all <span> tags with class "text-bold" (for positions)
    position_tags = soup.find_all('span', class_='text-bold')

    # Extract positions
    for tag in position_tags:
        position = tag.text.strip()
        positions.append(position)

    # Find all <span> tags with class "sidearm-roster-player-height" (for heights)
    height_tags = soup.find_all('span', class_='sidearm-roster-player-height')

    # Extract heights
    for tag in height_tags:
        height = tag.text.strip()
        heights.append(height)

    # Ensure all lists have the same length
    min_length = min(len(names), len(heights), len(positions))
    names = names[:min_length]
    heights = heights[:min_length]
    positions = positions[:min_length]

    # Create a DataFrame
    df = pd.DataFrame({
        'Name': names,
        'Height': heights,
        'Position': positions
    })

    return df

In [77]:
result_qu2022 = scrape_player_data('https://queensathletics.com/sports/mens-volleyball/roster/2022')

In [78]:
result_qu2022

Unnamed: 0,Name,Height,Position
0,Logan McDonald,"5'7""",Defensive Specialist\r\n\t\t\t\t\t\t\t\t\t\t\t...
1,Ananias Hayes,"5'10""",Outside Hitter/Opposite\r\n\t\t\t\t\t\t\t\t\t\...
2,Jack Pflugner,"6'6""",Middle Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\...
3,Brian Camacho,"6'3""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
4,Stirling Sims,"6'5""",Middle Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\...
5,Brandon Grabow,"5'10""",Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ...
6,Guillermo Jordan,"6'0""",Defensive Specialist\r\n\t\t\t\t\t\t\t\t\t\t\t...
7,Jackson Maples,"6'2""",Outside Hitter/Defensive Specialist\r\n\t\t\t\...
8,Ryan Cymbor,"6'4""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...
9,Drew Steele,"6'4""",Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t...


In [79]:
result_qu2021 = scrape_player_data('https://queensathletics.com/sports/mens-volleyball/roster/2021')

In [80]:
result_qu2020 = scrape_player_data('https://queensathletics.com/sports/mens-volleyball/roster/2020')

In [81]:
result_qu2019 = scrape_player_data('https://queensathletics.com/sports/mens-volleyball/roster/2019')

In [82]:
result_qu2018 = scrape_player_data('https://queensathletics.com/sports/mens-volleyball/roster/2018')

In [83]:
qu =  pd.concat([result_qu2022,result_qu2021,result_qu2020,result_qu2019,result_qu2018], axis=0)

In [84]:
qu = qu.drop_duplicates()

In [85]:
qu['School']='QU'

In [86]:
qu.to_csv('QU_Roster')

# Combining all the individual roster data into a full dataframe

In [1276]:
ucla = pd.read_csv('UCLA_Roster')

In [1277]:
gcu = pd.read_csv('GCU_Roster')

In [1278]:
lbs = pd.read_csv('LBS_Roster')

In [1279]:
psu = pd.read_csv('PSU_Roster')

In [1280]:
uci = pd.read_csv('UCI_Roster')

In [1281]:
uh = pd.read_csv('UH_Roster')

In [1449]:
byu = pd.read_csv('BYU_Roster')

In [1283]:
stan = pd.read_csv('stan_Roster')

In [1284]:
pep = pd.read_csv('pepperdine_Roster')

In [1285]:
sfu = pd.read_csv('SFU_Roster')

In [1286]:
ball = pd.read_csv('ball_Roster')

In [1287]:
lwd = pd.read_csv('lindenwood_Roster')

In [1288]:
usc = pd.read_csv('USC_Roster')

In [1289]:
pur = pd.read_csv('Purdue_Fort_Wayne_Roster')

In [1290]:
lu = pd.read_csv('Lewis_University_Roster')

In [1292]:
daem = pd.read_csv('Daemen_University_Roster')

In [1293]:
mck = pd.read_csv('McKendree_Roster')

In [1294]:
lmu = pd.read_csv('LMU_Roster')

In [1295]:
csun = pd.read_csv('CSUN_Roster')

In [1296]:
ucsd = pd.read_csv('UCSD_Roster')

In [1297]:
prince = pd.read_csv('princeton_Roster')

In [1298]:
ucsb = pd.read_csv('UCSB_Roster')

In [1299]:
ngu = pd.read_csv('NGU_Roster')

In [1300]:
liu = pd.read_csv('LIU_Roster')

In [1309]:
harv = pd.read_csv('Harvard_Roster')

In [1302]:
njit = pd.read_csv('NJIT_Roster')

In [1450]:
college_rosters =  pd.concat([ucla,gcu,lbs,psu,uci,uh,byu,stan,pep,sfu,ball,lwd,usc,pur,lu,daem,mck,lmu,csun,ucsd,prince,ucsb,ngu,liu,harv,njit], axis=0)

In [1454]:
college_rosters

Unnamed: 0.2,Unnamed: 0,Name,Height,Position,School,Unnamed: 0.1,Schools
0,0,Kyle Vom Steeg,6' 7'',Opp,UCLA,,
1,1,J.R. Norris IV,6' 5'',Opp,UCLA,,
2,2,Cole Ketrzynski,6' 8'',OH/Opp,UCLA,,
3,3,Cole Pender,6' 2'',L/OH,UCLA,,
4,4,Sam Kobrine,6' 3'',OH/S,UCLA,,
...,...,...,...,...,...,...,...
85,5,Glen DeMagalhaes,"5'10""",L/OH,NJIT,,
86,6,Charles Bell,"6'2""",OH,NJIT,,
87,7,Nick Jensen,"6'8""",MB,NJIT,,
88,8,Ricky Duran,"6'2""",OH,NJIT,,


In [1456]:
college_rosters = college_rosters.drop(columns = ['Schools','Unnamed: 0.1','Unnamed: 0'])

In [1459]:
college_rosters.to_csv('College_Rosters_half')

half way done, going to use this for sprint 2

In [1478]:
college_rosters= pd.read_csv('college_roster_almost_clean')

###  Mapping all the various names for the positions into Outside Hitter, Middle-Blocker, Opposite, Setter, Libero

In [1470]:
position_mapping = {'Outside Hitter': 'Outside Hitter',
'OH': 'Outside Hitter',
'Middle Blocker': 'Middle-Blocker',
'MB': 'Middle-Blocker',
'Setter': 'Setter',
'S': 'Setter',
'Libero': 'Libero',
'L': 'Libero',
'Opposite': 'Opposite',
 'OPP': 'Opposite',
 'Opposite Hitter': 'Opposite',
'MH': 'Middle-Blocker',
 'Outside Attacker': 'Outside Hitter',
 'OH\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            OH': 'Outside Hitter',
 'Middle Hitter': 'Middle-Blocker',
 'OH/OPP': 'Outside Hitter',
 'MB\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            MB': 'Middle-Blocker',
 'Middle Attacker': 'Middle-Blocker',
 'Middle': 'Middle-Blocker',
 'Libero/Outside Hitter': 'Libero',
 'Opp': 'Opposite',
'\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t S': 'Setter',
'Defensive Specialist': 'Libero',
'L/DS': 'Libero',
'Outside Hitter/Opposite Hitter': 'Outside Hitter',
'Middle Blocker/Opposite': 'Middle-Blocker',
'Middle Blocker/Outside Hitter': 'Middle-Blocker',
'Opp\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Opp': 'Opposite',
'RS': 'Opposite',
'OH/L': 'Outside Hitter',
'MB/OPP': 'Middle-Blocker',
'DS': 'Libero',
'OPP/OH': 'Opposite',
'M': 'Middle-Blocker',
'Outside Hitter/Libero': 'Outside Hitter',
'Setter/Libero': 'Setter',
'SW': 'Setter',
'OP': 'Opposite',
'OPP/MB': 'Opposite',
'S/OPP': 'Setter',
'Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH': 'Outside Hitter',
'OH/S': 'Outside Hitter',
'L/OH': 'Libero',
'Setter/Opposite': 'Setter',
'OH-OP': 'Outside Hitter',
'Outside hitter': 'Outside Hitter',
'Outside Hitter/Middle Blocker': 'Outside Hitter',
'Opposite Hitter/Middle Blocker': 'Opposite',
'Setter/Outside Hitter': 'Setter',
'Opp.': 'Opposite',
'Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Outside Hitter': 'Outside Hitter',
'Fr.': 'Outside Hitter',
'OH/MB': 'Outside Hitter',
'Libero/Setter': 'Libero',
'Middle/Opposite': 'Middle-Blocker',
'MB-OP': 'Middle-Blocker',
'S/Opp': 'Setter',
'OH/RS': 'Outside Hitter',
'OPP/RS': 'Opposite',
'S/OH': 'Setter',
'Libero/Defensive Specialist': 'Libero',
'Right-Side Hitter': 'Opposite',
'OPP\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OPP': 'Opposite',
'MB/OH': 'Middle-Blocker',
'Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Middle Blocker': 'Middle-Blocker',
'Opposite-Side Hitter': 'Opposite',
'Opposite Side Hitter': 'Opposite',
'Jr.': 'Outside Hitter',
'QH': 'Outside Hitter',
'So.': 'Outside Hitter',
'Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Libero': 'Libero',
'MB/RS': 'Middle-Blocker',
'OH/Opp': 'Outside Hitter',
'Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Setter': 'Setter',
'Right Side/Middle Hitter': 'Opposite',
'OH-OP': 'Outside Hitter',
'Right Side': 'Opposite',
'DS-L': 'Libero',
'L-OH': 'Libero',
'Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t L': 'Libero',
'Outside Hitter/Right Side': 'Outside Hitter',
'OH/DS\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH/DS': 'Outside Hitter',
'D/S': 'Libero',
'OH/S\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH/S': 'Outside Hitter',
'L/S': 'Libero',
'Opposite Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OPP': 'Opposite',
'Setter/Opposite Hitter': 'Setter',
'S-OPP': 'Setter',
'Outside Hitter/Setter': 'Outside Hitter',
'RS Jr.': 'Opposite',
'RS So.': 'Opposite',
'Sr.': 'Outside Hitter',
'Opposite/Outside': 'Opposite',
'S/L': 'Setter',
'Defensive Specialist/Setter': 'Libero',
'Lib': 'Libero',
'Opposite/Middle': 'Opposite',
'O H/Opp/M B': 'Outside Hitter',
'S/DS': 'Setter',
'Opposite/Middle Blocker': 'Opposite',
'LB': 'Libero',
'Outside Attacker/Setter': 'Outside Hitter',
'Outside Attacker/Libero': 'Outside Hitter',
'Setter/Right Side': 'Setter',
'S/RS': 'Setter',
'Middle Hitter/Right Side': 'Middle-Blocker',
'Oppsite': 'Opposite'}

In [1481]:
# Sample data frame
df = pd.DataFrame(college_rosters)

# Mapping dictionary
position_mapping = {'Outside Hitter': 'Outside Hitter',
'OH': 'Outside Hitter',
'Middle Blocker': 'Middle-Blocker',
'MB': 'Middle-Blocker',
'Setter': 'Setter',
'S': 'Setter',
'Libero': 'Libero',
'L': 'Libero',
'Opposite': 'Opposite',
 'OPP': 'Opposite',
 'Opposite Hitter': 'Opposite',
'MH': 'Middle-Blocker',
 'Outside Attacker': 'Outside Hitter',
 'OH\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            OH': 'Outside Hitter',
 'Middle Hitter': 'Middle-Blocker',
 'OH/OPP': 'Outside Hitter',
 'MB\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            MB': 'Middle-Blocker',
 'Middle Attacker': 'Middle-Blocker',
 'Middle': 'Middle-Blocker',
 'Libero/Outside Hitter': 'Libero',
 'Opp': 'Opposite',
'\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t S': 'Setter',
'Defensive Specialist': 'Libero',
'L/DS': 'Libero',
'Outside Hitter/Opposite Hitter': 'Outside Hitter',
'Middle Blocker/Opposite': 'Middle-Blocker',
'Middle Blocker/Outside Hitter': 'Middle-Blocker',
'Opp\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Opp': 'Opposite',
'RS': 'Opposite',
'OH/L': 'Outside Hitter',
'MB/OPP': 'Middle-Blocker',
'DS': 'Libero',
'OPP/OH': 'Opposite',
'M': 'Middle-Blocker',
'Outside Hitter/Libero': 'Outside Hitter',
'Setter/Libero': 'Setter',
'SW': 'Setter',
'OP': 'Opposite',
'OPP/MB': 'Opposite',
'S/OPP': 'Setter',
'Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH': 'Outside Hitter',
'OH/S': 'Outside Hitter',
'L/OH': 'Libero',
'Setter/Opposite': 'Setter',
'OH-OP': 'Outside Hitter',
'Outside hitter': 'Outside Hitter',
'Outside Hitter/Middle Blocker': 'Outside Hitter',
'Opposite Hitter/Middle Blocker': 'Opposite',
'Setter/Outside Hitter': 'Setter',
'Opp.': 'Opposite',
'Outside Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Outside Hitter': 'Outside Hitter',
'Fr.': 'Outside Hitter',
'OH/MB': 'Outside Hitter',
'Libero/Setter': 'Libero',
'Middle/Opposite': 'Middle-Blocker',
'MB-OP': 'Middle-Blocker',
'S/Opp': 'Setter',
'OH/RS': 'Outside Hitter',
'OPP/RS': 'Opposite',
'S/OH': 'Setter',
'Libero/Defensive Specialist': 'Libero',
'Right-Side Hitter': 'Opposite',
'OPP\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OPP': 'Opposite',
'MB/OH': 'Middle-Blocker',
'Middle Blocker\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Middle Blocker': 'Middle-Blocker',
'Opposite-Side Hitter': 'Opposite',
'Opposite Side Hitter': 'Opposite',
'Jr.': 'Outside Hitter',
'QH': 'Outside Hitter',
'So.': 'Outside Hitter',
'Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Libero': 'Libero',
'MB/RS': 'Middle-Blocker',
'OH/Opp': 'Outside Hitter',
'Setter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t Setter': 'Setter',
'Right Side/Middle Hitter': 'Opposite',
'OH-OP': 'Outside Hitter',
'Right Side': 'Opposite',
'DS-L': 'Libero',
'L-OH': 'Libero',
'Libero\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t L': 'Libero',
'Outside Hitter/Right Side': 'Outside Hitter',
'OH/DS\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH/DS': 'Outside Hitter',
'D/S': 'Libero',
'OH/S\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OH/S': 'Outside Hitter',
'L/S': 'Libero',
'Opposite Hitter\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t OPP': 'Opposite',
'Setter/Opposite Hitter': 'Setter',
'S-OPP': 'Setter',
'Outside Hitter/Setter': 'Outside Hitter',
'RS Jr.': 'Opposite',
'RS So.': 'Opposite',
'Sr.': 'Outside Hitter',
'Opposite/Outside': 'Opposite',
'S/L': 'Setter',
'Defensive Specialist/Setter': 'Libero',
'Lib': 'Libero',
'Opposite/Middle': 'Opposite',
'O H/Opp/M B': 'Outside Hitter',
'S/DS': 'Setter',
'Opposite/Middle Blocker': 'Opposite',
'LB': 'Libero',
'Outside Attacker/Setter': 'Outside Hitter',
'Outside Attacker/Libero': 'Outside Hitter',
'Setter/Right Side': 'Setter',
'S/RS': 'Setter',
'Middle Hitter/Right Side': 'Middle-Blocker',
'Oppsite': 'Opposite'}

def map_positions(position):
    for key, values in position_mapping.items():
        if position in values:
            return key
    return position  # Return original if no match found

# Apply mapping to 'Position' column
df['Position'] = df['Position'].apply(map_positions)

In [1484]:
df = pd.DataFrame(df)

# Mapping dictionary
position_mapping = {
    'Outside Hitter': ['Outside Hitter', 'OH'],
    'Middle-Blocker': ['Middle Blocker', 'MB'],
    'Setter': ['Setter', 'S'],
    'Libero': ['Libero', 'L'],
    'Opposite': ['Opposite', 'OPP']
}

# Function to apply mapping
def map_positions(position):
    for key, values in position_mapping.items():
        if position in values:
            return key
    return position  # Return original if no match found

# Apply mapping to 'Position' column
df['Position'] = df['Position'].apply(map_positions)

In [1485]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA
1,1,J.R. Norris IV,195.58,Opposite,UCLA
2,2,Cole Ketrzynski,203.20,OH/Opp,UCLA
3,3,Cole Pender,187.96,L/OH,UCLA
4,4,Sam Kobrine,190.50,OH/S,UCLA
...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,L/OH,NJIT
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT
3621,3621,Nick Jensen,203.20,Middle-Blocker,NJIT
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT


In [1486]:
df['Position'].value_counts()

Position
Outside Hitter                      1008
Middle-Blocker                       668
Setter                               464
Libero                               382
Opposite                             275
                                    ... 
OH-OPP-S                               1
Outside Hitter`                        1
Middle Attacker/Outside Attacker       1
Outside hitter/Opposite                1
N                                      1
Name: count, Length: 159, dtype: int64

### An improvement but still have to map more.

In [1487]:
df = pd.DataFrame(df)

# Function to clean position column
def clean_position(position):
    return position.split('/')[0].strip()  # Split by '/' and take the first part

# Apply the function to 'Position' column
df['Position'] = df['Position'].apply(clean_position)

In [1488]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA
1,1,J.R. Norris IV,195.58,Opposite,UCLA
2,2,Cole Ketrzynski,203.20,OH,UCLA
3,3,Cole Pender,187.96,L,UCLA
4,4,Sam Kobrine,190.50,OH,UCLA
...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,L,NJIT
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT
3621,3621,Nick Jensen,203.20,Middle-Blocker,NJIT
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT


In [1489]:
df = pd.DataFrame(df)

# Mapping dictionary
position_mapping = {
    'Outside Hitter': ['Outside Hitter', 'OH'],
    'Middle-Blocker': ['Middle Blocker', 'MB'],
    'Setter': ['Setter', 'S'],
    'Libero': ['Libero', 'L'],
    'Opposite': ['Opposite', 'OPP']
}

# Function to apply mapping
def map_positions(position):
    for key, values in position_mapping.items():
        if position in values:
            return key
    return position  # Return original if no match found

# Apply mapping to 'Position' column
df['Position'] = df['Position'].apply(map_positions)

In [1495]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)

In [1490]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA
1,1,J.R. Norris IV,195.58,Opposite,UCLA
2,2,Cole Ketrzynski,203.20,Outside Hitter,UCLA
3,3,Cole Pender,187.96,Libero,UCLA
4,4,Sam Kobrine,190.50,Outside Hitter,UCLA
...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,Libero,NJIT
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT
3621,3621,Nick Jensen,203.20,Middle-Blocker,NJIT
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT


In [1496]:
df['Position'].value_counts()

Position
Outside Hitter                                                                                                                                                  1124
Middle                                                                                                                                                           714
Setter                                                                                                                                                           511
Libero                                                                                                                                                           429
Opposite                                                                                                                                                         324
Opposite Hitter                                                                                                                                                   59
O

In [1492]:
df = pd.DataFrame(df)

# Function to clean position column
def clean_position(position):
    return position.split('-')[0].strip()  # Split by '-' and take the first part

# Apply the function to 'Position' column
df['Position'] = df['Position'].apply(clean_position)

In [1494]:
df['Position'].value_counts()

Position
Outside Hitter     1124
Middle              714
Setter              511
Libero              429
Opposite            324
                   ... 
MIDDLE HITTER         1
Middle Blcoker        1
OPP                   1
Outside Hitter`       1
N                     1
Name: count, Length: 64, dtype: int64

In [1497]:
position_patterns = {
    'OH': r'\bOH\b|\bOutside Hitter\b',
    'MB': r'\bMB\b|\bMiddle Blocker\b',
    'L': r'\bL\b|\bLibero\b|\bDS\b|\bDefensive Specialist\b',
    'DS': r'\bDS\b|\bDefensive Specialist\b',
    'S': r'\bS\b|\bSetter\b',
    'OPP': r'\bOPP\b|\bOpposite\b|\bOpposite Hitter\b'
}

# Function to extract positions
def extract_positions(position_text):
    positions_found = []
    for key, pattern in position_patterns.items():
        if re.search(pattern, position_text, flags=re.IGNORECASE):
            positions_found.append(key)
    return positions_found if positions_found else None

# Apply the function to 'Position' column
df['Extracted_Positions'] = df['Position'].apply(lambda x: extract_positions(x))

In [1498]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School,Extracted_Positions
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA,[OPP]
1,1,J.R. Norris IV,195.58,Opposite,UCLA,[OPP]
2,2,Cole Ketrzynski,203.20,Outside Hitter,UCLA,[OH]
3,3,Cole Pender,187.96,Libero,UCLA,[L]
4,4,Sam Kobrine,190.50,Outside Hitter,UCLA,[OH]
...,...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,Libero,NJIT,[L]
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT,[OH]
3621,3621,Nick Jensen,203.20,Middle,NJIT,
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT,[OH]


In [1505]:
df = pd.DataFrame(df)

# Define regex patterns for positions
position_patterns = {
    'OH': r'\bOH\b|\bOutside Hitter\b',
    'MB': r'\bMB\b|\bMiddle Blocker\b|\bMiddle\b',
    'L': r'\bL\b|\bLibero\b|\bDS\b|\bDefensive Specialist\b',
    'DS': r'\bDS\b|\bDefensive Specialist\b',
    'S': r'\bS\b|\bSetter\b',
    'OPP': r'\bOPP\b|\bOpposite\b|\bOpposite Hitter\b'
}

# Reverse mapping for extracted positions to standardized positions
reverse_position_mapping = {
    'OH': 'Outside Hitter',
    'MB': 'Middle Blocker',
    'L': 'Libero',
    'DS': 'Libero',  # Mapping DS to Libero based on your previous request
    'S': 'Setter',
    'OPP': 'Opposite'
}

# Function to extract positions
def extract_positions(position_text):
    positions_found = []
    for key, pattern in position_patterns.items():
        if re.search(pattern, position_text, flags=re.IGNORECASE):
            positions_found.append(key)
    return positions_found if positions_found else None

# Function to convert extracted positions back to standardized position name
def convert_to_standardized_position(extracted_positions):
    if extracted_positions:
        standardized_positions = [reverse_position_mapping[pos] for pos in extracted_positions]
        return ', '.join(standardized_positions)
    else:
        return None

# Apply the function to 'Position' column
df['Extracted_Positions'] = df['Position'].apply(lambda x: extract_positions(x))

# Apply conversion to standardized position names
df['Standardized_Position'] = df['Extracted_Positions'].apply(lambda x: convert_to_standardized_position(x))

In [1506]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School,Extracted_Positions,Standardized_Position
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA,[OPP],Opposite
1,1,J.R. Norris IV,195.58,Opposite,UCLA,[OPP],Opposite
2,2,Cole Ketrzynski,203.20,Outside Hitter,UCLA,[OH],Outside Hitter
3,3,Cole Pender,187.96,Libero,UCLA,[L],Libero
4,4,Sam Kobrine,190.50,Outside Hitter,UCLA,[OH],Outside Hitter
...,...,...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,Libero,NJIT,[L],Libero
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT,[OH],Outside Hitter
3621,3621,Nick Jensen,203.20,Middle,NJIT,[MB],Middle Blocker
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT,[OH],Outside Hitter


In [1508]:
df['Standardized_Position'].value_counts().sum()

3453

In [1504]:
df['Position'].value_counts()

Position
Outside Hitter                                                                                                                                                  1124
Middle                                                                                                                                                           714
Setter                                                                                                                                                           511
Libero                                                                                                                                                           429
Opposite                                                                                                                                                         324
Opposite Hitter                                                                                                                                                   59
O

In [1509]:
df = pd.DataFrame(df)

# Merge Standardized_Position and Position
df['Merged_Position'] = df['Standardized_Position'].fillna(df['Position'])

# Drop the Standardized_Position column if no longer needed
df.drop(columns=['Standardized_Position'], inplace=True)

In [1510]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School,Extracted_Positions,Merged_Position
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA,[OPP],Opposite
1,1,J.R. Norris IV,195.58,Opposite,UCLA,[OPP],Opposite
2,2,Cole Ketrzynski,203.20,Outside Hitter,UCLA,[OH],Outside Hitter
3,3,Cole Pender,187.96,Libero,UCLA,[L],Libero
4,4,Sam Kobrine,190.50,Outside Hitter,UCLA,[OH],Outside Hitter
...,...,...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,Libero,NJIT,[L],Libero
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT,[OH],Outside Hitter
3621,3621,Nick Jensen,203.20,Middle,NJIT,[MB],Middle Blocker
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT,[OH],Outside Hitter


In [1511]:
df['Merged_Position'].value_counts()

Merged_Position
Outside Hitter                                                                                                                            1190
Middle Blocker                                                                                                                             828
Setter                                                                                                                                     536
Libero                                                                                                                                     451
Opposite                                                                                                                                   415
MH                                                                                                                                          46
Outside Attacker                                                                                                              

In [1513]:
df = pd.DataFrame(df)

# Define the position mapping dictionary
position_mapping = {
    'Middle- blocker': 'Middle Blocker',
    'Outside Hitter': 'Outside Hitter',
    'Setter': 'Setter',
    'Libero': 'Libero',
    'Opposite': 'Opposite',
    'MH': 'Middle Blocker',
    'Outside Attacker': 'Outside Hitter',
    'RS': 'Opposite',
    'OP': 'Opposite',
    'SW': 'Setter',
    'Right Side': 'Opposite',
    'Fr.': None,
    'Right': 'Opposite',
    'Jr.': None,
    'QH': None,
    'So.': None,
    'LIB': 'Libero',
    'D': 'Libero',
    'LB': 'Libero',
    'Oppsite': 'Opposite',
    'O H': 'Outside Hitter',
    'Sr.': None,
    'RS So.': None,
    'RS Jr.': None,
    'UTL': None,
    'UTL\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t    remove this entry                        UTL': None,
    'MH\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t     Middle- blocker': 'Middle Blocker',
    ' MH': 'Middle Blocker',
    'M': 'Middle Blocker',
    'N': None,
    'libero, libero': 'Libero'

# Function to apply position mapping
def map_positions(df, mapping):
    # Apply the mapping to the Merged_Position column
    df['Standardized_Position'] = df['Merged_Position'].map(mapping)
    return df

# Apply the position mapping function
df_mapped = map_positions(df, position_mapping)

In [1516]:
df

Unnamed: 0.1,Unnamed: 0,Name,Height,Position,School,Extracted_Positions,Merged_Position,Standardized_Position
0,0,Kyle Vom Steeg,200.66,Opposite,UCLA,[OPP],Opposite,Opposite
1,1,J.R. Norris IV,195.58,Opposite,UCLA,[OPP],Opposite,Opposite
2,2,Cole Ketrzynski,203.20,Outside Hitter,UCLA,[OH],Outside Hitter,Outside Hitter
3,3,Cole Pender,187.96,Libero,UCLA,[L],Libero,Libero
4,4,Sam Kobrine,190.50,Outside Hitter,UCLA,[OH],Outside Hitter,Outside Hitter
...,...,...,...,...,...,...,...,...
3619,3619,Glen DeMagalhaes,177.80,Libero,NJIT,[L],Libero,Libero
3620,3620,Charles Bell,187.96,Outside Hitter,NJIT,[OH],Outside Hitter,Outside Hitter
3621,3621,Nick Jensen,203.20,Middle,NJIT,[MB],Middle Blocker,
3622,3622,Ricky Duran,187.96,Outside Hitter,NJIT,[OH],Outside Hitter,Outside Hitter


In [1517]:
df['Standardized_Position'].value_counts()

Standardized_Position
Outside Hitter    1238
Setter             545
Libero             458
Opposite           449
Middle Blocker      47
Name: count, dtype: int64

In [1519]:
df = pd.DataFrame(df)

# Merge Standardized_Position and Position
df['Merged_Position'] = df['Standardized_Position'].fillna(df['Merged_Position'])



In [1521]:
df['Merged_Position'].value_counts()

Merged_Position
Outside Hitter                                                                                                                            1238
Middle Blocker                                                                                                                             875
Setter                                                                                                                                     545
Libero                                                                                                                                     458
Opposite                                                                                                                                   449
Libero, Libero                                                                                                                              33
Fr.                                                                                                                           

In [1527]:
df = pd.DataFrame(df)

# Define the position mapping dictionary
position_mapping = {
    'Middle- blocker': 'Middle Blocker',
    'Outside Hitter': 'Outside Hitter',
    'Setter': 'Setter',
    'Libero': 'Libero',
    'Opposite': 'Opposite',
    'MH': 'Middle Blocker',
    'Outside Attacker': 'Outside Hitter',
    'RS': 'Opposite',
    'OP': 'Opposite',
    'SW': 'Setter',
    'Right Side': 'Opposite',
    'Fr.': None,
    'Right': 'Opposite',
    'Jr.': None,
    'QH': None,
    'So.': None,
    'LIB': 'Libero',
    'D': 'Libero',
    'LB': 'Libero',
    'Oppsite': 'Opposite',
    'O H': 'Outside Hitter',
    'Sr.': None,
    'RS So.': None,
    'RS Jr.': None,
    'UTL': None,
    'UTL\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                           UTL': None,
    'MH\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t                            \n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t     Middle- blocker': 'Middle Blocker',
    ' MH': 'Middle Blocker',
    'M': 'Middle Blocker',
    'N': None,
    'Libero, Libero': 'Libero'}

# Function to apply position mapping

def map_positions(df, mapping):
    # Apply the mapping to the Merged_Position column
    df['Standardized_Position'] = df['Merged_Position'].map(mapping)
    return df

# Apply the position mapping function
df_mapped = map_positions(df, position_mapping)

In [1530]:
df_mapped['Standardized_Position'].value_counts()

Standardized_Position
Outside Hitter    1238
Setter             545
Libero             458
Opposite           449
Name: count, dtype: int64

In [1532]:
df_mapped['Merged_Position'].value_counts()

Merged_Position
Outside Hitter                                                                                                                            1238
Middle Blocker                                                                                                                             875
Setter                                                                                                                                     545
Libero                                                                                                                                     458
Opposite                                                                                                                                   449
Libero, Libero                                                                                                                              33
Fr.                                                                                                                           

In [1533]:
df = pd.DataFrame(df)

# Merge Standardized_Position and Position
df['Merged_Position'] = df_mapped['Standardized_Position'].fillna(df_mapped['Merged_Position'])



In [1535]:
df['Merged_Position'].value_counts()

Merged_Position
Outside Hitter                                                                                                                            1238
Middle Blocker                                                                                                                             875
Setter                                                                                                                                     545
Libero                                                                                                                                     458
Opposite                                                                                                                                   449
Libero, Libero                                                                                                                              33
Fr.                                                                                                                           

In [1536]:
df = pd.DataFrame(df)

# Define the position mapping dictionary
position_mapping = {
    'Middle Blocker': 'Middle-blocker',
    'Libero, Libero': 'Libero'  # Updated mapping for "Libero, Libero"
}

# Function to map positions based on the mapping dictionary
def map_positions(position):
    if pd.isnull(position):
        return position
    for key, value in position_mapping.items():
        if key.lower() in position.lower():
            return value
    return position  # Return original if no match found

# Apply the mapping function to the dataframe
df['Standardized_Position'] = df['Merged_Position'].apply(map_positions)

In [1538]:
df['Standardized_Position'].value_counts()

Standardized_Position
Outside Hitter                                                                                                                            1238
Middle-blocker                                                                                                                             875
Setter                                                                                                                                     545
Libero                                                                                                                                     491
Opposite                                                                                                                                   449
Fr.                                                                                                                                          5
QH                                                                                                                      

# Close enough to be usable 

In [1539]:
df = pd.DataFrame(df)

# Count occurrences of each value in Merged_Position
position_counts = df['Merged_Position'].value_counts()

# Filter out values occurring less than 6 times
valid_positions = position_counts[position_counts >= 6].index.tolist()

# Define the position mapping dictionary based on valid_positions
position_mapping = {
    'Middle Blocker': 'Middle-blocker',
    'Libero, Libero': 'Libero'  # Updated mapping for "Libero, Libero"
}

# Function to map positions based on the mapping dictionary
def map_positions(position):
    if pd.isnull(position):
        return position
    for key, value in position_mapping.items():
        if key.lower() in position.lower():
            return value
    return position  # Return original if no match found

# Apply the mapping function to the dataframe
df['Standardized_Position'] = df['Merged_Position'].apply(map_positions)

# Display the updated dataframe with filtered values
filtered_df = df[df['Merged_Position'].isin(valid_positions)]

In [1545]:
df = filtered_df.drop(columns= ['Extracted_Positions','Merged_Position'])

In [1547]:
df['Position'] =df['Standardized_Position']

In [1550]:
df = df.drop(columns = ['Standardized_Position'])


In [1551]:
df.to_csv('college_roster_clean')

# DataFrame almost fully cleaned but ready to be added to the Volleybox data set to complete the final dataframe that will be used for modeling.