Getting results directly from HTML for each season

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np


In [2]:
url_2021_2022 = "https://resultater.volleyball.dk/tms/Turneringer-og-resultater/Pulje-Komplet-Kampprogram.aspx?PuljeId=2978"


# Send a GET request to fetch the content of the page
response = requests.get(url_2021_2022)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table element containing the match data
    table = soup.find('table', {'class': 'srDefault srProgramNormal'})
    
    # Find all rows in the table body
    rows = table.find_all('tr')[0:]  # Skip the header row
    
    match_data = []
    
    # Loop through each row to extract match details
    for row in rows:
        columns = row.find_all('td')
        
        # Extract relevant details from each column
        match_number = columns[0].text.strip()
        date = columns[1].text.strip()
        home_team = columns[2].text.strip()
        away_team = columns[3].text.strip()
        stadium = columns[4].text.strip()
        result = columns[5].text.strip()
        
        # Append the extracted data to the match_data list
        match_data.append({
            'match_number': match_number,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'stadium': stadium,
            'result': result
        })
    
    # Convert the list of dictionaries into a pandas DataFrame
    df_2021_2022 = pd.DataFrame(match_data)
    
    # Print the DataFrame
    #print(df_2023_2024)

else:
    print(f'Failed to retrieve the page. Status code: {response.status_code}')

df_2021_2022["season"] = "2021-2022"


df = df_2021_2022

df['stadium'] = df['stadium'].str.replace(r'[\r\n\t]*1\t*', '', regex=True)
df['stadium'] = df['stadium'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'


df['date'] = df['date'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'
df['date'] = df['date'].str.strip()  # Remove leading/trailing spaces


# Step 2: Check if ' kl. ' with any whitespace before or after is in the 'date' column
if df['date'].str.contains(r'\s*kl\.\s*').any():  # Regex allows for any amount of whitespace
    # Split the 'date' column into date and time
    df[['date', 'time']] = df['date'].str.split(r'\s*kl\.\s*', expand=True)
else:
    print("No matching pattern found in some rows, check the data.")

# Step 1: Strip whitespace and split the result into home and away scores
df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace

# Handle cases where the result is an empty string (not a played game)
df['result'] = df['result'].replace('', np.nan)  # Replace empty strings with NaN

# Split the result into two columns and handle rows with missing/invalid data
split_results = df['result'].str.split(' - ', expand=True)

df['result'] = df['result'].str.split(' - ', expand=True)
df = df.dropna(subset=['result'])


# Step 1: Strip whitespace and split the result into home and away scores
#df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace
df.loc[:, 'home_result'] = df['result'].str.extract(r'(\d)', expand=False)
df.loc[:, 'away_result'] = df['result'].str.extract(r'(\d)$', expand=False)

# Create a new 'winner' column based on the highest score between home and away results
df['winner'] = df.apply(
    lambda row: row['home_team'] if row['home_result'] > row['away_result'] else (row['away_team'] if row['away_result'] > row['home_result'] else None),
    axis=1
)



df['home_result'] = pd.to_numeric(df['home_result'])
df['away_result'] = pd.to_numeric(df['away_result'])

# Count the number of sets won by each team in each game
# For the home team, if the home score >= 3, they won a set. For the away team, if the away score >= 3, they won a set.
df['home_sets_won'] = df['home_result']
df['away_sets_won'] = df['away_result']
df['number_of_sets'] = df['home_sets_won'] + df['away_sets_won']

# Count the number of games won by each team
# If the home team won 3 sets, they won the game. Similarly for the away team.
df['home_game_won'] = df['home_result'].apply(lambda x: 1 if x == 3 else 0)
df['away_game_won'] = df['away_result'].apply(lambda x: 1 if x == 3 else 0)


df['home_sets_lost'] = df['number_of_sets'] - df['home_sets_won']
df['away_sets_lost'] = df['number_of_sets'] - df['away_sets_won']

# Count the number of home games played by each team
home_games_played = df.groupby('home_team').size()

# Count the number of away games played by each team
away_games_played = df.groupby('away_team').size()

# Combine home and away games to get the total number of games played by each team
total_games_played = pd.concat([home_games_played, away_games_played], axis=0).groupby(level=0).sum()


# Create a summary dataframe with home, away, and total games played, sets won, and games won
summary_2021_2022 = pd.DataFrame({
    'home_games_played': home_games_played,
    'away_games_played': away_games_played,
    'total_games_played': total_games_played,
    'sets_won': pd.concat([
        df.groupby('home_team')['home_sets_won'].sum(),
        df.groupby('away_team')['away_sets_won'].sum()
    ], axis=0).groupby(level=0).sum(),
    'sets_lost': pd.concat([
        df.groupby('home_team')['home_sets_lost'].sum(),
        df.groupby('away_team')['away_sets_lost'].sum()
    ], axis=0).groupby(level=0).sum(),    'games_won': pd.concat([
        df.groupby('home_team')['home_game_won'].sum(),
        df.groupby('away_team')['away_game_won'].sum()
    ], axis=0).groupby(level=0).sum()
}).fillna(0)  # Handle teams that have no games played

summary_2021_2022["season"] = "2021-2022"


df_2021_2022 = df

In [3]:
url_2022_2023 = "https://resultater.volleyball.dk/tms/Turneringer-og-resultater/Pulje-Komplet-Kampprogram.aspx?PuljeId=3482"


# Send a GET request to fetch the content of the page
response = requests.get(url_2022_2023)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table element containing the match data
    table = soup.find('table', {'class': 'srDefault srProgramNormal'})
    
    # Find all rows in the table body
    rows = table.find_all('tr')[0:]  # Skip the header row
    
    match_data = []
    
    # Loop through each row to extract match details
    for row in rows:
        columns = row.find_all('td')
        
        # Extract relevant details from each column
        match_number = columns[0].text.strip()
        date = columns[1].text.strip()
        home_team = columns[2].text.strip()
        away_team = columns[3].text.strip()
        stadium = columns[4].text.strip()
        result = columns[5].text.strip()
        
        # Append the extracted data to the match_data list
        match_data.append({
            'match_number': match_number,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'stadium': stadium,
            'result': result
        })
    
    # Convert the list of dictionaries into a pandas DataFrame
    df_2022_2023 = pd.DataFrame(match_data)
    
    # Print the DataFrame
    #print(df_2023_2024)

else:
    print(f'Failed to retrieve the page. Status code: {response.status_code}')

df_2022_2023["season"] = "2022-2023"


df = df_2022_2023

df['stadium'] = df['stadium'].str.replace(r'[\r\n\t]*1\t*', '', regex=True)
df['stadium'] = df['stadium'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'


df['date'] = df['date'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'
df['date'] = df['date'].str.strip()  # Remove leading/trailing spaces


# Step 2: Check if ' kl. ' with any whitespace before or after is in the 'date' column
if df['date'].str.contains(r'\s*kl\.\s*').any():  # Regex allows for any amount of whitespace
    # Split the 'date' column into date and time
    df[['date', 'time']] = df['date'].str.split(r'\s*kl\.\s*', expand=True)
else:
    print("No matching pattern found in some rows, check the data.")

# Step 1: Strip whitespace and split the result into home and away scores
df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace

# Handle cases where the result is an empty string (not a played game)
df['result'] = df['result'].replace('', np.nan)  # Replace empty strings with NaN

# Split the result into two columns and handle rows with missing/invalid data
split_results = df['result'].str.split(' - ', expand=True)

df['result'] = df['result'].str.split(' - ', expand=True)
df = df.dropna(subset=['result'])


# Step 1: Strip whitespace and split the result into home and away scores
#df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace
df.loc[:, 'home_result'] = df['result'].str.extract(r'(\d)', expand=False)
df.loc[:, 'away_result'] = df['result'].str.extract(r'(\d)$', expand=False)

# Create a new 'winner' column based on the highest score between home and away results
df['winner'] = df.apply(
    lambda row: row['home_team'] if row['home_result'] > row['away_result'] else (row['away_team'] if row['away_result'] > row['home_result'] else None),
    axis=1
)



df['home_result'] = pd.to_numeric(df['home_result'])
df['away_result'] = pd.to_numeric(df['away_result'])

# Count the number of sets won by each team in each game
# For the home team, if the home score >= 3, they won a set. For the away team, if the away score >= 3, they won a set.
df['home_sets_won'] = df['home_result']
df['away_sets_won'] = df['away_result']
df['number_of_sets'] = df['home_sets_won'] + df['away_sets_won']

# Count the number of games won by each team
# If the home team won 3 sets, they won the game. Similarly for the away team.
df['home_game_won'] = df['home_result'].apply(lambda x: 1 if x == 3 else 0)
df['away_game_won'] = df['away_result'].apply(lambda x: 1 if x == 3 else 0)


df['home_sets_lost'] = df['number_of_sets'] - df['home_sets_won']
df['away_sets_lost'] = df['number_of_sets'] - df['away_sets_won']

# Count the number of home games played by each team
home_games_played = df.groupby('home_team').size()

# Count the number of away games played by each team
away_games_played = df.groupby('away_team').size()

# Combine home and away games to get the total number of games played by each team
total_games_played = pd.concat([home_games_played, away_games_played], axis=0).groupby(level=0).sum()


# Create a summary dataframe with home, away, and total games played, sets won, and games won
summary_2022_2023 = pd.DataFrame({
    'home_games_played': home_games_played,
    'away_games_played': away_games_played,
    'total_games_played': total_games_played,
    'sets_won': pd.concat([
        df.groupby('home_team')['home_sets_won'].sum(),
        df.groupby('away_team')['away_sets_won'].sum()
    ], axis=0).groupby(level=0).sum(),
    'sets_lost': pd.concat([
        df.groupby('home_team')['home_sets_lost'].sum(),
        df.groupby('away_team')['away_sets_lost'].sum()
    ], axis=0).groupby(level=0).sum(),    'games_won': pd.concat([
        df.groupby('home_team')['home_game_won'].sum(),
        df.groupby('away_team')['away_game_won'].sum()
    ], axis=0).groupby(level=0).sum()
}).fillna(0)  # Handle teams that have no games played

summary_2022_2023["season"] = "2022-2023"


df_2022_2023 = df

In [4]:


# URL of the page you want to scrape
url_2023_2024 = 'https://resultater.volleyball.dk/tms/Turneringer-og-resultater/Pulje-Komplet-Kampprogram.aspx?PuljeId=3576'  # Replace this with the actual URL

# Send a GET request to fetch the content of the page
response = requests.get(url_2023_2024)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table element containing the match data
    table = soup.find('table', {'class': 'srDefault srProgramNormal'})
    
    # Find all rows in the table body
    rows = table.find_all('tr')[0:]  # Skip the header row
    
    match_data = []
    
    # Loop through each row to extract match details
    for row in rows:
        columns = row.find_all('td')
        
        # Extract relevant details from each column
        match_number = columns[0].text.strip()
        date = columns[1].text.strip()
        home_team = columns[2].text.strip()
        away_team = columns[3].text.strip()
        stadium = columns[4].text.strip()
        result = columns[5].text.strip()
        
        # Append the extracted data to the match_data list
        match_data.append({
            'match_number': match_number,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'stadium': stadium,
            'result': result
        })
    
    # Convert the list of dictionaries into a pandas DataFrame
    df_2023_2024 = pd.DataFrame(match_data)
    
    # Print the DataFrame
    #print(df_2023_2024)

else:
    print(f'Failed to retrieve the page. Status code: {response.status_code}')

df_2023_2024["season"] = "2023-2024"

df = df_2023_2024

df['stadium'] = df['stadium'].str.replace(r'[\r\n\t]*1\t*', '', regex=True)
df['stadium'] = df['stadium'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'


df['date'] = df['date'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'
df['date'] = df['date'].str.strip()  # Remove leading/trailing spaces


# Step 2: Check if ' kl. ' with any whitespace before or after is in the 'date' column
if df['date'].str.contains(r'\s*kl\.\s*').any():  # Regex allows for any amount of whitespace
    # Split the 'date' column into date and time
    df[['date', 'time']] = df['date'].str.split(r'\s*kl\.\s*', expand=True)
else:
    print("No matching pattern found in some rows, check the data.")

# Step 1: Strip whitespace and split the result into home and away scores
df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace

# Handle cases where the result is an empty string (not a played game)
df['result'] = df['result'].replace('', np.nan)  # Replace empty strings with NaN

# Split the result into two columns and handle rows with missing/invalid data
split_results = df['result'].str.split(' - ', expand=True)

df['result'] = df['result'].str.split(' - ', expand=True)
df = df.dropna(subset=['result'])


# Step 1: Strip whitespace and split the result into home and away scores
#df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace
df.loc[:, 'home_result'] = df['result'].str.extract(r'(\d)', expand=False)
df.loc[:, 'away_result'] = df['result'].str.extract(r'(\d)$', expand=False)

# Create a new 'winner' column based on the highest score between home and away results
df['winner'] = df.apply(
    lambda row: row['home_team'] if row['home_result'] > row['away_result'] else (row['away_team'] if row['away_result'] > row['home_result'] else None),
    axis=1
)



df['home_result'] = pd.to_numeric(df['home_result'])
df['away_result'] = pd.to_numeric(df['away_result'])

# Count the number of sets won by each team in each game
# For the home team, if the home score >= 3, they won a set. For the away team, if the away score >= 3, they won a set.
df['home_sets_won'] = df['home_result']
df['away_sets_won'] = df['away_result']
df['number_of_sets'] = df['home_sets_won'] + df['away_sets_won']

# Count the number of games won by each team
# If the home team won 3 sets, they won the game. Similarly for the away team.
df['home_game_won'] = df['home_result'].apply(lambda x: 1 if x == 3 else 0)
df['away_game_won'] = df['away_result'].apply(lambda x: 1 if x == 3 else 0)


df['home_sets_lost'] = df['number_of_sets'] - df['home_sets_won']
df['away_sets_lost'] = df['number_of_sets'] - df['away_sets_won']

# Count the number of home games played by each team
home_games_played = df.groupby('home_team').size()

# Count the number of away games played by each team
away_games_played = df.groupby('away_team').size()

# Combine home and away games to get the total number of games played by each team
total_games_played = pd.concat([home_games_played, away_games_played], axis=0).groupby(level=0).sum()


# Create a summary dataframe with home, away, and total games played, sets won, and games won
summary_2023_2024 = pd.DataFrame({
    'home_games_played': home_games_played,
    'away_games_played': away_games_played,
    'total_games_played': total_games_played,
    'sets_won': pd.concat([
        df.groupby('home_team')['home_sets_won'].sum(),
        df.groupby('away_team')['away_sets_won'].sum()
    ], axis=0).groupby(level=0).sum(),
    'sets_lost': pd.concat([
        df.groupby('home_team')['home_sets_lost'].sum(),
        df.groupby('away_team')['away_sets_lost'].sum()
    ], axis=0).groupby(level=0).sum(),    'games_won': pd.concat([
        df.groupby('home_team')['home_game_won'].sum(),
        df.groupby('away_team')['away_game_won'].sum()
    ], axis=0).groupby(level=0).sum()
}).fillna(0)  # Handle teams that have no games played

summary_2023_2024["season"] = "2023-2024"

df_2023_2024 = df

In [5]:

# URL of the page you want to scrape
url_2024_2025 = "https://resultater.volleyball.dk/tms/Turneringer-og-resultater/Pulje-Komplet-Kampprogram.aspx?PuljeId=3752"

# Send a GET request to fetch the content of the page
response = requests.get(url_2024_2025)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table element containing the match data
    table = soup.find('table', {'class': 'srDefault srProgramNormal'})
    
    # Find all rows in the table body
    rows = table.find_all('tr')[0:]  # Skip the header row
    
    match_data = []
    
    # Loop through each row to extract match details
    for row in rows:
        columns = row.find_all('td')
        
        # Extract relevant details from each column
        match_number = columns[0].text.strip()
        date = columns[1].text.strip()
        home_team = columns[2].text.strip()
        away_team = columns[3].text.strip()
        stadium = columns[4].text.strip()
        result = columns[5].text.strip()
        
        # Append the extracted data to the match_data list
        match_data.append({
            'match_number': match_number,
            'date': date,
            'home_team': home_team,
            'away_team': away_team,
            'stadium': stadium,
            'result': result
        })
    
    # Convert the list of dictionaries into a pandas DataFrame
    df_2024_2025 = pd.DataFrame(match_data)
    
    # Print the DataFrame
    #print(df_2024_2025)

else:
    print(f'Failed to retrieve the page. Status code: {response.status_code}')

df_2024_2025["season"] = "2024-2025"

df = df_2024_2025

df['stadium'] = df['stadium'].str.replace(r'[\r\n\t]*1\t*', '', regex=True)
df['stadium'] = df['stadium'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'


df['date'] = df['date'].str.replace(r'[\r\n]', '', regex=True)  # Remove '\r\n'
df['date'] = df['date'].str.strip()  # Remove leading/trailing spaces


# Step 2: Check if ' kl. ' with any whitespace before or after is in the 'date' column
if df['date'].str.contains(r'\s*kl\.\s*').any():  # Regex allows for any amount of whitespace
    # Split the 'date' column into date and time
    df[['date', 'time']] = df['date'].str.split(r'\s*kl\.\s*', expand=True)
else:
    print("No matching pattern found in some rows, check the data.")

# Step 1: Strip whitespace and split the result into home and away scores
df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace

# Handle cases where the result is an empty string (not a played game)
df['result'] = df['result'].replace('', np.nan)  # Replace empty strings with NaN

# Split the result into two columns and handle rows with missing/invalid data
split_results = df['result'].str.split(' - ', expand=True)

df['result'] = df['result'].str.split(' - ', expand=True)
df = df.dropna(subset=['result'])


# Step 1: Strip whitespace and split the result into home and away scores
#df['result'] = df['result'].str.strip()  # Strip any leading/trailing whitespace
df.loc[:, 'home_result'] = df['result'].str.extract(r'(\d)', expand=False)
df.loc[:, 'away_result'] = df['result'].str.extract(r'(\d)$', expand=False)

# Create a new 'winner' column based on the highest score between home and away results
df['winner'] = df.apply(
    lambda row: row['home_team'] if row['home_result'] > row['away_result'] else (row['away_team'] if row['away_result'] > row['home_result'] else None),
    axis=1
)



df['home_result'] = pd.to_numeric(df['home_result'])
df['away_result'] = pd.to_numeric(df['away_result'])

# Count the number of sets won by each team in each game
# For the home team, if the home score >= 3, they won a set. For the away team, if the away score >= 3, they won a set.
df['home_sets_won'] = df['home_result']
df['away_sets_won'] = df['away_result']
df['number_of_sets'] = df['home_sets_won'] + df['away_sets_won']

# Count the number of games won by each team
# If the home team won 3 sets, they won the game. Similarly for the away team.
df['home_game_won'] = df['home_result'].apply(lambda x: 1 if x == 3 else 0)
df['away_game_won'] = df['away_result'].apply(lambda x: 1 if x == 3 else 0)


df['home_sets_lost'] = df['number_of_sets'] - df['home_sets_won']
df['away_sets_lost'] = df['number_of_sets'] - df['away_sets_won']


# Count the number of home games played by each team
home_games_played = df.groupby('home_team').size()

# Count the number of away games played by each team
away_games_played = df.groupby('away_team').size()

# Combine home and away games to get the total number of games played by each team
total_games_played = pd.concat([home_games_played, away_games_played], axis=0).groupby(level=0).sum()


# Create a summary dataframe with home, away, and total games played, sets won, and games won
summary_2024_2025 = pd.DataFrame({
    'home_games_played': home_games_played,
    'away_games_played': away_games_played,
    'total_games_played': total_games_played,
    'sets_won': pd.concat([
        df.groupby('home_team')['home_sets_won'].sum(),
        df.groupby('away_team')['away_sets_won'].sum()
    ], axis=0).groupby(level=0).sum(),
    'sets_lost': pd.concat([
        df.groupby('home_team')['home_sets_lost'].sum(),
        df.groupby('away_team')['away_sets_lost'].sum()
    ], axis=0).groupby(level=0).sum(),    'games_won': pd.concat([
        df.groupby('home_team')['home_game_won'].sum(),
        df.groupby('away_team')['away_game_won'].sum()
    ], axis=0).groupby(level=0).sum()
}).fillna(0)  # Handle teams that have no games played

summary_2024_2025["season"] = "2024-2025"

df_2024_2025 = df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'home_result'] = df['result'].str.extract(r'(\d)', expand=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'away_result'] = df['result'].str.extract(r'(\d)$', expand=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['winner'] = df.apply(
A value is trying to be set o

In [None]:
dfs = [df_2021_2022, df_2022_2023, df_2023_2024, df_2024_2025]
overall_df = pd.concat(dfs, ignore_index=True)
overall_df

Unnamed: 0,match_number,date,home_team,away_team,stadium,result,season,time,home_result,away_result,winner,home_sets_won,away_sets_won,number_of_sets,home_game_won,away_game_won,home_sets_lost,away_sets_lost
0,129818,02-10-21,DTU Volley,Team Køge,Engelsborghallen,3 - 0,2021-2022,14:30,3,0,DTU Volley,3,0,3,1,0,0,3
1,129819,02-10-21,Hvidovre VK.2,Farum-Holte,Frihedens Idrætscenter,0 - 3,2021-2022,15:00,0,3,Farum-Holte,0,3,3,0,1,3,0
2,129820,03-10-21,Gentofte Volley.2,KV 61,Kildeskovshal,3 - 0,2021-2022,12:00,3,0,Gentofte Volley.2,3,0,3,1,0,0,3
3,129822,03-10-21,VLI,Frederiksberg Volley,Kedelhallen,2 - 3,2021-2022,13:30,2,3,Frederiksberg Volley,2,3,5,0,1,3,2
4,129817,03-10-21,Ishøj Volley,Grøndal EV,Strandgårdskolens sportshal,3 - 1,2021-2022,16:00,3,1,Ishøj Volley,3,1,4,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334,141117,11-01-25,Farum-Holte,KV 61,Ny Holtehal,3 - 0,2024-2025,11:00,3,0,Farum-Holte,3,0,3,1,0,0,3
335,141121,11-01-25,Team Køge,Grøndal EV,Spar Nord Arena,2 - 3,2024-2025,13:30,2,3,Grøndal EV,2,3,5,0,1,3,2
336,141120,11-01-25,DTU Volley,Hvidovre VK.2,Engelsborghallen,2 - 3,2024-2025,14:30,2,3,Hvidovre VK.2,2,3,5,0,1,3,2
337,141118,12-01-25,VLI,Gentofte Volley.2,Kedelhallen,3 - 1,2024-2025,11:30,3,1,VLI,3,1,4,1,0,1,3


In [7]:
summaries = [summary_2021_2022, summary_2022_2023, summary_2023_2024, summary_2024_2025]
overall_summary = pd.concat(summaries, ignore_index=False)
overall_summary

Unnamed: 0,home_games_played,away_games_played,total_games_played,sets_won,sets_lost,games_won,season
Amager VK.2,10,10,20,44,26,13,2021-2022
DTU Volley,10,10,20,39,40,11,2021-2022
Farum-Holte,10,10,20,34,43,10,2021-2022
Frederiksberg Volley,10,10,20,54,13,18,2021-2022
Gentofte Volley.2,10,10,20,58,13,19,2021-2022
Grøndal EV,10,10,20,30,48,5,2021-2022
Hvidovre VK.2,10,10,20,28,43,7,2021-2022
Ishøj Volley,11,9,20,42,33,11,2021-2022
KV 61,9,11,20,19,56,2,2021-2022
Team Køge,10,10,20,23,55,4,2021-2022


In [8]:
import json

# Convert DataFrame to JSON
data = overall_df.to_dict(orient="records")  # Convert DataFrame rows to list of dictionaries
with open("../all_seasons.json", "w") as f:
    json.dump(data, f, indent=4)

overall_df.to_csv("../all_seasons.csv", index=False)


In [9]:
frb = overall_df[(overall_df['home_team'] == 'Frederiksberg Volley') | (overall_df['away_team'] == 'Frederiksberg Volley')]

frb['game_type'] = frb.apply(lambda row: 'home' if row['home_team'] == 'Frederiksberg Volley' else ('away' if row['away_team'] == 'Frederiksberg Volley' else 'neutral'), axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frb['game_type'] = frb.apply(lambda row: 'home' if row['home_team'] == 'Frederiksberg Volley' else ('away' if row['away_team'] == 'Frederiksberg Volley' else 'neutral'), axis=1)


In [None]:
# Convert DataFrame to JSON
data = frb.to_dict(orient="records")  # Convert DataFrame rows to list of dictionaries
with open("../all_seasons_frb.json", "w") as f:
    json.dump(data, f, indent=4)

frb.to_csv("../all_seasons_frb.csv", index=False)