<h1> VNL Rosters + Total Stats 2021-2023</h1>

The below code creates a dataframe with the following columns:
- Jersey Number
- Player Name
- Position
- Player_ID
- Year
- Country_Name
- Nationality
- Age
- Height
- Total Points
- Avg. By Match
- Attack Points
- Efficiency
- Attack Avg. Points
- Block Points
- Block Success
- Block Avg. Points
- Serve Points
- Serve Success
- Serve Avg. Points

<h2> Importing Libraries </h2>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

<h2> This code creates a df with year, jersey number, player name and player_id </h2>

In [2]:
# Seasons covered by this notebook
years = ['2021', '2022', '2023']

# The same sixteen teams appear in every season; only the numeric team IDs
# used in the website URLs change from one season to the next.
_team_names = (
    "Belgium", "Brazil", "Bulgaria", "Canada",
    "China", "Dominican Republic", "Germany", "Italy",
    "Japan", "Korea", "Netherlands", "Poland",
    "Serbia", "Thailand", "Turkey", "USA",
)

# Team IDs per season, listed in the same order as _team_names.
# 2021 and 2023 are consecutive runs; 2022 is not (Bulgaria is 5201 and
# 5131 is not used), so it is spelled out explicitly.
_team_ids_by_year = {
    '2021': range(4674, 4690),
    '2022': (5120, 5121, 5201, 5122, 5123, 5124, 5125, 5126,
             5127, 5128, 5129, 5130, 5132, 5133, 5134, 5135),
    '2023': range(5834, 5850),
}

# season -> {team ID: country name}
country_names_by_year = {
    season: dict(zip(team_ids, _team_names))
    for season, team_ids in _team_ids_by_year.items()
}

# One roster dataframe per (season, team) is collected here
df_list = []

def remove_duplicates_keep_order(sequence):
    """Return the items of ``sequence`` as a list with repeats dropped.

    Only the first occurrence of each item is kept, and the kept items stay
    in the order in which they first appeared. Items must be hashable.
    """
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(sequence))

# Please note, this portion of the code attempts to scrape web content,
# which might be against the terms of service of the website.
# Ensure you have permission to scrape the site and comply with its robots.txt file.
# The code snippet provided is for educational purposes only.

# Seconds to wait for the server before giving up on a request. Without a
# timeout, requests.get() can block indefinitely and hang the notebook.
ROSTER_REQUEST_TIMEOUT = 30

# Walk every season and, within it, every team ID registered for that season
for year, countries in country_names_by_year.items():
    for country_id, country_name in countries.items():
        url = f'https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/teams/women/{country_id}/players/'
        response = requests.get(url, timeout=ROSTER_REQUEST_TIMEOUT)
        if not response.ok:
            # Report the gap instead of silently dropping the team
            print(f'Skipping {country_name} {year}: HTTP {response.status_code} for {url}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            print(f'Skipping {country_name} {year}: no roster table found at {url}')
            continue

        # Parse only the first <table> of the page into a dataframe
        df = pd.read_html(StringIO(str(table)))[0]

        # Player profile links end in a numeric ID; collect each distinct link once,
        # in page order.
        # NOTE(review): IDs are matched to table rows purely by position, which
        # assumes the links appear in the same order as the rows - confirm.
        player_links = remove_duplicates_keep_order(
            [a['href'] for a in soup.find_all('a', href=True) if 'players/' in a['href']]
        )
        player_ids = [
            link.split('/')[-1]
            for link in player_links
            if link.split('/')[-1].isnumeric()
        ]

        # Rows without a matching link get 'N/A'; surplus IDs are cut off so the
        # column length always equals the number of table rows.
        player_ids += ['N/A'] * (len(df) - len(player_ids))
        df['Player_ID'] = player_ids[:len(df)]
        df['Year'] = year
        df['Country_Name'] = country_name  # Use the name instead of ID

        df_list.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the likely cause instead.
if not df_list:
    raise ValueError('No roster tables were scraped; check the URLs, team IDs and site layout.')

# Concatenate all dataframes into one
df_rosters = pd.concat(df_list, ignore_index=True)


In [3]:
# Spot check: Team USA's roster for the 2022 season
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name
665,1.0,Hancock Micha,S,132835.0,2022,USA
666,2.0,Poulter Jordyn,S,143517.0,2022,USA
667,3.0,Plummer Kathryn,OH,146722.0,2022,USA
668,4.0,Wong-Orantes Justine,L,135564.0,2022,USA
669,5.0,Hentz Morgan,L,152187.0,2022,USA
670,6.0,Dixon Tetori,MB,145702.0,2022,USA
671,7.0,Carlini Lauren,S,132500.0,2022,USA
672,8.0,Tapp Hannah,MB,160909.0,2022,USA
673,9.0,Rishel Madison Kingdon,OH,135983.0,2022,USA
674,10.0,Butler Brionne,MB,152194.0,2022,USA


<h2> This appends the following data to the above dataframe: Nationality, Age, Height </h2>

In [4]:
# Function to scrape a player's profile fields (nationality, age, height)
def scrape_player_details(url, timeout=30):
    """Fetch a player page and read its Nationality, Age and Height fields.

    Parameters
    ----------
    url : str
        Address of the player's page.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Without it a stalled connection would hang the notebook.

    Returns
    -------
    dict
        Keys 'Nationality', 'Age' and 'Height'. Each value is the stripped
        text of the element that follows the first text node containing the
        key, or None when no such text node or following element is found.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Initialize a dictionary to hold the scraped values
    details = {'Nationality': None, 'Age': None, 'Height': None}

    for detail_key in details:
        # `string=` is the current name of the deprecated `text=` argument.
        # The match is a substring test, so the first text node that merely
        # contains the key is taken as the label.
        detail_label = soup.find(string=lambda text, key=detail_key: text and key in text)
        value_node = detail_label.find_next() if detail_label else None
        if value_node:
            details[detail_key] = value_node.text.strip()

    return details

# Lists to hold the scraped details: one entry per roster row, in row order
nationalities = []
ages = []
heights = []

# Most recent year on website for age calculation
# NOTE(review): this presumes the age shown on the site is the player's age in
# 2023; if the site shows the age at scrape time the offset drifts - confirm.
current_year = 2023

# Loop through each row in the df_rosters dataframe
for _, row in df_rosters.iterrows():
    player_id = row['Player_ID']
    year = row['Year']

    # Coaches and rows without a player ID have no player page to scrape;
    # keep the three lists aligned with the dataframe by appending None.
    if row['Position'] == 'COACH' or player_id == 'N/A':
        nationalities.append(None)
        ages.append(None)
        heights.append(None)
        continue

    # Construct the URL using the Year and Player_ID and scrape the page
    url = f"https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/players/{player_id}"
    player_details = scrape_player_details(url)

    # Shift the scraped age back to the season of this roster row
    season_offset = current_year - int(year)
    try:
        ages.append(int(player_details['Age']) - season_offset)
    except (TypeError, ValueError):
        # Age missing (None) or not a plain integer: record it as unknown
        # rather than aborting the whole scrape.
        ages.append(None)

    nationalities.append(player_details.get('Nationality'))
    heights.append(player_details.get('Height'))

# Add the details as new columns to the df_rosters dataframe
df_rosters['Nationality'] = nationalities
df_rosters['Age'] = ages
df_rosters['Height'] = heights

In [5]:
# Spot check: Team USA's 2022 roster, now including Nationality, Age and Height
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name,Nationality,Age,Height
665,1.0,Hancock Micha,S,132835.0,2022,USA,United States,30.0,180cm
666,2.0,Poulter Jordyn,S,143517.0,2022,USA,United States,25.0,188cm
667,3.0,Plummer Kathryn,OH,146722.0,2022,USA,United States,24.0,198cm
668,4.0,Wong-Orantes Justine,L,135564.0,2022,USA,United States,27.0,168cm
669,5.0,Hentz Morgan,L,152187.0,2022,USA,United States,24.0,175cm
670,6.0,Dixon Tetori,MB,145702.0,2022,USA,United States,30.0,191cm
671,7.0,Carlini Lauren,S,132500.0,2022,USA,United States,28.0,185cm
672,8.0,Tapp Hannah,MB,160909.0,2022,USA,United States,27.0,191cm
673,9.0,Rishel Madison Kingdon,OH,135983.0,2022,USA,United States,29.0,183cm
674,10.0,Butler Brionne,MB,152194.0,2022,USA,United States,24.0,194cm


<h2> This code continues to add onto the same dataframe and places total, attacking, blocking and serving points per year per player </h2>

In [6]:
# Function to scrape a player's scoring statistics.
# NOTE: this re-uses the name of the earlier profile scraper and replaces it
# from this cell onwards; re-running the earlier cell afterwards needs its own
# definition to be executed again first.
def scrape_player_details(url, timeout=30):
    """Fetch a player page and read the total, attack, block and serve stats.

    Parameters
    ----------
    url : str
        Address of the player's page.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Without it a stalled connection would hang the notebook.

    Returns
    -------
    dict
        One entry per statistic (see ``details`` below). Each value is the
        stripped text of the 'vbw-player-stats-text' div that follows the
        statistic's 'vbw-player-stats-head' label, or None when the label or
        its value div is not present on the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def stat_text(label):
        """Return the value text next to ``label``, or None if either is missing."""
        if label is None:
            return None
        value = label.find_next_sibling('div', class_='vbw-player-stats-text')
        return value.text.strip() if value is not None else None

    # Initialize a dictionary to hold the scraped values
    details = {
        'Total Points': None,
        'Average by Match': None,
        'Attack Points': None,
        'Efficiency': None,
        'Attack Avg Points': None,
        'Block Points': None,
        'Block Success': None,
        'Block Avg Points': None,
        'Serve Points': None,
        'Serve Success': None,
        'Serve Avg Points': None
    }

    # The labels 'Avg Points' and 'Success' occur more than once on the page,
    # so those statistics are picked by the position of the label among its
    # namesakes rather than by label text.
    # (`string=` is the current name of the deprecated `text=` argument.)
    avg_points_labels = soup.find_all('div', class_='vbw-player-stats-head', string='Avg Points')
    success_labels = soup.find_all('div', class_='vbw-player-stats-head', string='Success')

    # statistic -> (list of same-named labels, index of the one to use)
    positional_labels = {
        'Attack Avg Points': (avg_points_labels, 0),
        'Block Avg Points': (avg_points_labels, 1),
        'Serve Avg Points': (avg_points_labels, 2),
        'Block Success': (success_labels, 0),
        'Serve Success': (success_labels, 1),
    }

    for detail_key in details:
        labels, position = positional_labels.get(detail_key, ((), 0))
        if position < len(labels):
            label = labels[position]
        else:
            # Uniquely named statistic, or not enough repeated labels on the
            # page: look the label up by its exact text instead.
            label = soup.find('div', class_='vbw-player-stats-head', string=detail_key)
        # stat_text tolerates a missing label or value div, so an incomplete
        # page yields None instead of raising AttributeError.
        details[detail_key] = stat_text(label)

    return details

# One empty list per statistic; each list grows by one entry per roster row
details_columns = {
    stat_name: []
    for stat_name in (
        'Total Points',
        'Average by Match',
        'Attack Points',
        'Efficiency',
        'Attack Avg Points',
        'Block Points',
        'Block Success',
        'Block Avg Points',
        'Serve Points',
        'Serve Success',
        'Serve Avg Points',
    )
}

# Visit the roster rows in order and scrape the stats of every real player
for _, roster_row in df_rosters.iterrows():
    player_id = roster_row['Player_ID']
    year = roster_row['Year']

    if roster_row['Position'] != 'COACH' and player_id != 'N/A':
        url = f"https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/players/{player_id}"
        player_stats = scrape_player_details(url)
    else:
        # Coaches and rows without a player ID have no stats page: fill with None
        player_stats = dict.fromkeys(details_columns)

    # Keep every column list aligned with the dataframe rows
    for stat_name, collected in details_columns.items():
        collected.append(player_stats[stat_name])

# Attach each statistic to df_rosters as a new column
for stat_name, collected in details_columns.items():
    df_rosters[stat_name] = collected

# Second name for the same dataframe object (not a copy)
df_rosters_21_23 = df_rosters

In [7]:
# Spot check: Team USA's 2022 roster, now including the scoring statistics
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name,Nationality,Age,Height,Total Points,Average by Match,Attack Points,Efficiency,Attack Avg Points,Block Points,Block Success,Block Avg Points,Serve Points,Serve Success,Serve Avg Points
665,1.0,Hancock Micha,S,132835.0,2022,USA,United States,30.0,180cm,10,1.25,4,50.00%,0.50,1,4.35%,0.12,5,11.63%,0.62
666,2.0,Poulter Jordyn,S,143517.0,2022,USA,United States,25.0,188cm,11,2.20,1,33.33%,0.20,7,33.33%,1.40,3,5.66%,0.60
667,3.0,Plummer Kathryn,OH,146722.0,2022,USA,United States,24.0,198cm,62,6.89,53,49.53%,5.89,6,28.57%,0.67,3,6.25%,0.33
668,4.0,Wong-Orantes Justine,L,135564.0,2022,USA,United States,27.0,168cm,0,0.00,0,-,-,0,-,-,0,-,-
669,5.0,Hentz Morgan,L,152187.0,2022,USA,United States,24.0,175cm,0,0.00,0,-,-,0,-,-,0,-,-
670,6.0,Dixon Tetori,MB,145702.0,2022,USA,United States,30.0,191cm,21,2.62,16,42.11%,2.00,3,8.57%,0.38,2,3.28%,0.25
671,7.0,Carlini Lauren,S,132500.0,2022,USA,United States,28.0,185cm,17,1.89,10,50.00%,1.11,2,6.25%,0.22,5,6.25%,0.56
672,8.0,Tapp Hannah,MB,160909.0,2022,USA,United States,27.0,191cm,30,3.33,19,63.33%,2.11,9,23.08%,1.00,2,3.39%,0.22
673,9.0,Rishel Madison Kingdon,OH,135983.0,2022,USA,United States,29.0,183cm,36,4.50,32,39.02%,4.00,3,33.33%,0.38,1,3.03%,0.12
674,10.0,Butler Brionne,MB,152194.0,2022,USA,United States,24.0,194cm,9,2.25,6,75.00%,1.50,3,37.50%,0.75,0,0.00%,0.00


In [9]:
# Write the combined dataframe to a CSV file in the working directory;
# index=False leaves the row index out of the file.
df_rosters_21_23.to_csv('df_womens_rosters_21_23.csv',index=False)