<h1> VNL Rosters + Total Stats 2021-2023</h1>

The below code creates a dataframe with the following columns:
- Jersey Number
- Player Name
- Position
- Player_ID
- Year
- Country_Name
- Nationality
- Age
- Height
- Total Points
- Avg. By Match
- Attack Points
- Efficiency
- Attack Avg. Points
- Block Points
- Block Success
- Block Avg. Points
- Serve Points
- Serve Success
- Serve Avg. Points

<h2> Importing Libraries </h2>

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO
import warnings
warnings.filterwarnings('ignore')

<h2> This code creates a df with jersey number, player name, position, player_id, year and country name </h2>

In [2]:
# Seasons covered by this notebook
years = ['2021', '2022', '2023']

# Season -> {team ID used in the roster URL: country name}.
# Team IDs differ from season to season, so each year carries its own table.
country_names_by_year = {
    '2021': {
        4658: "Argentina",
        4659: "Australia",
        4660: "Brazil",
        4661: "Bulgaria",
        4662: "Canada",
        4664: "France",
        4665: "Germany",
        4666: "Iran",
        4667: "Italy",
        4668: "Japan",
        4756: "Netherlands",
        4669: "Poland",
        4670: "Russia",
        4671: "Serbia",
        4672: "Slovenia",
        4673: "USA",
    },
    '2022': {
        5136: "Argentina",
        5137: "Australia",
        5138: "Brazil",
        5139: "Bulgaria",
        5140: "Canada",
        5218: "China",
        5141: "France",
        5142: "Germany",
        5143: "Iran",
        5144: "Italy",
        5145: "Japan",
        5146: "Netherlands",
        5147: "Poland",
        5149: "Serbia",
        5150: "Slovenia",
        5151: "USA",
    },
    # The 2023 IDs are consecutive, starting at 5818
    '2023': dict(enumerate(
        [
            "Argentina", "Brazil", "Bulgaria", "Canada",
            "China", "Cuba", "France", "Germany",
            "Iran", "Italy", "Japan", "Netherlands",
            "Poland", "Serbia", "Slovenia", "USA",
        ],
        start=5818,
    )),
}

# Per-team roster dataframes are collected here before being concatenated
df_list = []

# Order-preserving de-duplication helper
def remove_duplicates_keep_order(sequence):
    """Return the items of ``sequence`` as a list with repeats dropped.

    Only the first occurrence of each item is kept, and the surviving items
    stay in the order in which they first appeared. Items must be hashable.
    """
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(sequence))

# Seconds to wait for a roster page before giving up. Without a timeout,
# requests.get() can block indefinitely and stall the whole notebook.
ROSTER_REQUEST_TIMEOUT = 30

# Iterate over every season and, within it, every team ID / country name
for year, countries in country_names_by_year.items():
    for country_id, country_name in countries.items():
        url = f'https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/teams/men/{country_id}/players/'
        response = requests.get(url, timeout=ROSTER_REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')

        # Pages without any <table> contribute no rows
        if not table:
            continue

        # read_html wants a file-like object, so wrap the table's HTML
        df = pd.read_html(StringIO(str(table)))[0]

        # Collect every link on the page that points at a player profile,
        # keeping only the first occurrence of each
        player_links = remove_duplicates_keep_order(
            [a['href'] for a in soup.find_all('a', href=True) if 'players/' in a['href']]
        )

        # The player ID is the last path segment; non-numeric endings are ignored
        last_segments = (link.split('/')[-1] for link in player_links)
        player_ids = [segment for segment in last_segments if segment.isnumeric()]

        # Rows without a matching ID (i.e. coaches) are filled with 'N/A'.
        # NOTE(review): IDs are matched to table rows purely by position, which
        # assumes the links appear in the same order as the rows and that any
        # row without a link comes last - confirm against the live page.
        player_ids += ['N/A'] * (len(df) - len(player_ids))

        df['Player_ID'] = player_ids[:len(df)]
        df['Year'] = year
        df['Country_Name'] = country_name  # Use the name instead of ID

        df_list.append(df)

# Concatenate all dataframes into one
df_rosters = pd.concat(df_list, ignore_index=True)

In [3]:
# Spot check: Team USA's roster for 2022
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name
675,2.0,Russell Aaron,OH,132624.0,2022,USA
676,3.0,Shaw James,S,136078.0,2022,USA
677,4.0,Jendryk II Jeffrey,MB,147721.0,2022,USA
678,5.0,Ensing Kyle,O,147417.0,2022,USA
679,6.0,Stahl Mitchell,MB,140185.0,2022,USA
680,7.0,Pasteur Jacob,OH,183140.0,2022,USA
681,8.0,Defalco Torey,OH,139254.0,2022,USA
682,9.0,Hanes Jake,O,170104.0,2022,USA
683,10.0,Dagostino Kyle,L,142445.0,2022,USA
684,11.0,Christenson Micah,S,124879.0,2022,USA


<h2> This appends the following data to the above dataframe: Nationality, Age (adjusted to the roster year), Height </h2>

In [4]:
# Function to scrape a player's nationality, age and height.
# NOTE: a later cell defines another function with this same name; once that
# cell has run, it replaces this one.
def scrape_player_details(url, timeout=30):
    """Scrape nationality, age and height from a player page.

    For each of the labels 'Nationality', 'Age' and 'Height', the first text
    node on the page containing that label is located, and the stripped text
    of the next element after it is taken as the value.

    Parameters
    ----------
    url : str
        Address of the player page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict
        Keys 'Nationality', 'Age' and 'Height'. Values are strings, or None
        when the label (or an element following it) is not found.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Initialize a dictionary to hold the scraped values
    details = {'Nationality': None, 'Age': None, 'Height': None}

    for detail_key in details:
        # `string=` is the current name of the keyword formerly called `text=`
        detail_label = soup.find(string=lambda text: text and detail_key in text)
        if not detail_label:
            continue
        value_node = detail_label.find_next()
        if value_node:
            details[detail_key] = value_node.text.strip()

    return details

# Lists to hold the scraped details (exactly one entry per roster row)
nationalities = []
ages = []
heights = []

# Most recent year on website for age calculation
# NOTE(review): this assumes the age shown on a player page is the age as of
# 2023. If the site reports the age as of the day the page is fetched, this
# constant has to match the year in which the notebook is run - confirm.
current_year = 2023

# Loop through each row in the df_rosters dataframe
for index, row in df_rosters.iterrows():
    # Defaults used for coaches, rows without a player ID and missing values
    nationality = None
    age = None
    height = None

    player_id = row['Player_ID']
    year = row['Year']

    # Only rows that are not coaches and carry a real player ID are scraped
    if row['Position'] != 'COACH' and player_id != 'N/A':
        url = f"https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/players/{player_id}"
        player_details = scrape_player_details(url)

        nationality = player_details.get('Nationality')
        height = player_details.get('Height')

        # Shift the scraped age back to the season the roster row belongs to
        raw_age = player_details['Age']
        if raw_age is not None:
            try:
                age = int(raw_age) - (current_year - int(year))
            except ValueError:
                # Non-numeric age text: record it as missing instead of
                # aborting the whole scrape part-way through
                age = None

    # Single append site keeps the three lists aligned with the dataframe rows
    nationalities.append(nationality)
    ages.append(age)
    heights.append(height)

# Add the details as new columns to the df_rosters dataframe
df_rosters['Nationality'] = nationalities
df_rosters['Age'] = ages
df_rosters['Height'] = heights

In [5]:
# Spot check: Team USA's roster for 2022, now with Nationality, Age and Height
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name,Nationality,Age,Height
675,2.0,Russell Aaron,OH,132624.0,2022,USA,United States,29.0,205cm
676,3.0,Shaw James,S,136078.0,2022,USA,United States,29.0,203cm
677,4.0,Jendryk II Jeffrey,MB,147721.0,2022,USA,United States,27.0,208cm
678,5.0,Ensing Kyle,O,147417.0,2022,USA,United States,26.0,201cm
679,6.0,Stahl Mitchell,MB,140185.0,2022,USA,United States,28.0,203cm
680,7.0,Pasteur Jacob,OH,183140.0,2022,USA,United States,20.0,193cm
681,8.0,Defalco Torey,OH,139254.0,2022,USA,United States,25.0,198cm
682,9.0,Hanes Jake,O,170104.0,2022,USA,United States,24.0,210cm
683,10.0,Dagostino Kyle,L,142445.0,2022,USA,United States,27.0,175cm
684,11.0,Christenson Micah,S,124879.0,2022,USA,United States,29.0,198cm


<h2> This code continues to add onto the same dataframe: total, attack, block and serve statistics for each player in each year </h2>

In [6]:
# Function to scrape a player's scoring statistics.
# NOTE: this reuses the name of the function defined in an earlier cell and
# replaces it from this cell onwards.
def scrape_player_details(url, timeout=30):
    """Scrape total, attack, block and serve statistics from a player page.

    Every statistic is read from a ``div.vbw-player-stats-head`` label and the
    ``div.vbw-player-stats-text`` sibling that follows it.

    Parameters
    ----------
    url : str
        Address of the player page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error. Without a timeout a stalled connection would block forever.

    Returns
    -------
    dict
        One entry per statistic, in a fixed order. Values are the stripped
        text of the value div, or None when the label or its value div is
        not present on the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def find_labels(caption):
        """Return all stat-head divs whose text is exactly ``caption``."""
        return soup.find_all('div', class_='vbw-player-stats-head', string=caption)

    def read_value(label):
        """Return the text of the value div after ``label``, or None if absent."""
        if label is None:
            return None
        value_div = label.find_next_sibling('div', class_='vbw-player-stats-text')
        if value_div is None:
            return None
        return value_div.text.strip()

    # This portion of code is to handle duplicates.
    # The page shows the captions 'Avg Points' and 'Success' more than once,
    # so those metrics are picked by their position among the repeated labels:
    # 'Avg Points' -> Attack, Block, Serve; 'Success' -> Block, Serve.
    avg_points_labels = find_labels('Avg Points')
    success_labels = find_labels('Success')
    repeated_label_lookup = {
        'Attack Avg Points': (avg_points_labels, 0),
        'Block Avg Points': (avg_points_labels, 1),
        'Serve Avg Points': (avg_points_labels, 2),
        'Block Success': (success_labels, 0),
        'Serve Success': (success_labels, 1),
    }

    # Output keys, in the order they become dataframe columns
    detail_keys = (
        'Total Points',
        'Average by Match',
        'Attack Points',
        'Efficiency',
        'Attack Avg Points',
        'Block Points',
        'Block Success',
        'Block Avg Points',
        'Serve Points',
        'Serve Success',
        'Serve Avg Points',
    )

    details = {}
    for detail_key in detail_keys:
        labels, position = repeated_label_lookup.get(detail_key, ((), 0))
        if position < len(labels):
            label = labels[position]
        else:
            # Metrics with a unique caption, and the fallback when a repeated
            # caption appears fewer times than expected
            label = soup.find('div', class_='vbw-player-stats-head', string=detail_key)
        details[detail_key] = read_value(label)

    return details

# One empty list per statistic; each list becomes a dataframe column below
details_columns = {
    stat_name: []
    for stat_name in (
        'Total Points',
        'Average by Match',
        'Attack Points',
        'Efficiency',
        'Attack Avg Points',
        'Block Points',
        'Block Success',
        'Block Avg Points',
        'Serve Points',
        'Serve Success',
        'Serve Avg Points',
    )
}

# Visit every roster row and collect that player's statistics
for row_label, roster_row in df_rosters.iterrows():
    is_coach = roster_row['Position'] == 'COACH'
    player_id = roster_row['Player_ID']
    year = roster_row['Year']

    if not is_coach and player_id != 'N/A':
        # A real player: scrape the statistics from the player page
        url = f"https://en.volleyballworld.com/volleyball/competitions/volleyball-nations-league/{year}/players/{player_id}"
        player_details = scrape_player_details(url)
    else:
        # Coaches and rows without a player ID get None for every statistic
        player_details = dict.fromkeys(details_columns)

    # Exactly one value is appended to every list per roster row
    for stat_name, collected in details_columns.items():
        collected.append(player_details[stat_name])

# Attach each collected list to df_rosters as a new column
for stat_name, collected in details_columns.items():
    df_rosters[stat_name] = collected

# Second name for the same dataframe object (an alias, not a copy)
df_rosters_21_23 = df_rosters

In [7]:
# Spot check: Team USA's roster for 2022, now with the scoring statistics
df_rosters.loc[df_rosters['Country_Name'].eq('USA') & df_rosters['Year'].eq('2022')]

Unnamed: 0,No.,Player Name,Position,Player_ID,Year,Country_Name,Nationality,Age,Height,Total Points,Average by Match,Attack Points,Efficiency,Attack Avg Points,Block Points,Block Success,Block Avg Points,Serve Points,Serve Success,Serve Avg Points
675,2.0,Russell Aaron,OH,132624.0,2022,USA,United States,29.0,205cm,197,13.13,162,50.00%,10.80,14,18.42%,0.93,21,10.88%,1.40
676,3.0,Shaw James,S,136078.0,2022,USA,United States,29.0,203cm,4,0.50,2,40.00%,0.25,2,40.00%,0.25,0,0.00%,0.00
677,4.0,Jendryk II Jeffrey,MB,147721.0,2022,USA,United States,27.0,208cm,107,7.13,79,65.83%,5.27,21,13.73%,1.40,7,4.09%,0.47
678,5.0,Ensing Kyle,O,147417.0,2022,USA,United States,26.0,201cm,161,10.73,143,53.16%,9.53,8,9.30%,0.53,10,6.37%,0.67
679,6.0,Stahl Mitchell,MB,140185.0,2022,USA,United States,28.0,203cm,23,2.09,15,55.56%,1.36,5,17.24%,0.45,3,8.57%,0.27
680,7.0,Pasteur Jacob,OH,183140.0,2022,USA,United States,20.0,193cm,0,-,0,-,-,0,-,-,0,-,-
681,8.0,Defalco Torey,OH,139254.0,2022,USA,United States,25.0,198cm,180,16.36,157,55.09%,14.27,9,13.04%,0.82,14,7.41%,1.27
682,9.0,Hanes Jake,O,170104.0,2022,USA,United States,24.0,210cm,36,4.50,33,47.83%,4.12,2,12.50%,0.25,1,2.63%,0.12
683,10.0,Dagostino Kyle,L,142445.0,2022,USA,United States,27.0,175cm,0,0.00,0,-,-,0,-,-,0,-,-
684,11.0,Christenson Micah,S,124879.0,2022,USA,United States,29.0,198cm,25,3.57,9,56.25%,1.29,11,25.00%,1.57,5,4.46%,0.71


In [8]:
# Write the combined roster + statistics dataframe to a CSV file in the
# working directory; index=False keeps the row index out of the file.
df_rosters_21_23.to_csv('df_rosters_21_23.csv',index=False)