In [None]:
def getURL(year):
    import pandas as pd
    dfs = pd.read_html(f'https://www.sports-reference.com/cbb/schools/wisconsin/men/{year}.html')
    per_game_player_stats = dfs[5]
    about_players = dfs[0]
    per_game_team_stats = dfs[1]
    for col in per_game_team_stats.columns:
        if "Unnamed" in col:
            per_game_team_stats = per_game_team_stats.rename(columns={col: "About"})
    each_game = pd.read_html(f'https://www.sports-reference.com/cbb/schools/wisconsin/men/{year}-gamelogs.html')[0]
    each_game.columns = pd.MultiIndex.from_tuples(
        [('About' if 'Unnamed' in level_0 else level_0, level_1) 
            for level_0, level_1 in each_game.columns],
                names=each_game.columns.names
        )
    # level0_cols = ['About', 'Score', 'Opponent']
    keep_cols = ['Gtm', 'Date', 'Opp', 'Type','Rslt','Tm','Opp','OT','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%','FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF']
    each_game.columns = each_game.columns.droplevel(0)
    each_game = each_game[keep_cols]
    return [about_players, per_game_team_stats, per_game_player_stats, each_game]


In [3]:
import os
import pandas as pd

# Define the base directory
base_dir = "data"
os.makedirs(base_dir, exist_ok=True)  # Ensure base directory exists

# Define the range of years
years = range(2025, 2026)  # 2020 to 2025 inclusive

# Process each year
for year in years:
    df1, df2, df3, df4 = getURL(year)

    # Define CSV file paths with year in filename
    file_names = [
        f"about_players_{year}.csv",
        f"per_game_team_stats_{year}.csv",
        f"per_game_player_stats{year}.csv",
        f"each_game_stats_{year}.csv"
    ]
    
    dataframes = [df1, df2, df3, df4]

    # Save each dataframe to CSV
    for df, file_name in zip(dataframes, file_names):
        csv_path = os.path.join(base_dir, file_name)
        df.to_csv(csv_path, index=False)
        print(f"Data saved to {csv_path}")


https://www.sports-reference.com/cbb/schools/wisconsin/men/2025.html
https://www.sports-reference.com/cbb/schools/wisconsin/men/2025-gamelogs.html
Data saved to data/about_players_2025.csv
Data saved to data/per_game_team_stats_2025.csv
Data saved to data/per_game_player_stats2025.csv
Data saved to data/each_game_stats_2025.csv


In [None]:
# Going to add year column to each csv
import os
import pandas as pd

def combine_data(folder_path):
    # Loop through each file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            # Extract the year from the file name
            try:
                
                year_str = file_name.split('-')[0][-8:-4]
                season_year = str(int(year_str))

                # Load the CSV file
                file_path = os.path.join(folder_path, file_name)
                df = pd.read_csv(file_path)

                # Add the seasonYear column
                df['seasonYear'] = season_year

                # Save the updated CSV
                df.to_csv(file_path, index=False)
                print(f"Updated {file_name} with seasonYear {season_year}")
            except ValueError:
                print(f"Skipping file {file_name}, invalid format.")

In [1]:
# add_season_year_to_csvs('./data/about_players')
# add_season_year_to_csvs('./data/each_game_stats')
# add_season_year_to_csvs('./data/per_game_player_stats')
# add_season_year_to_csvs('./data/per_game_team_stats')

In [None]:
# Combine year data into one csv
import os
import pandas as pd

def concat_season_csvs(folder_path, output_file_name):
    all_data = []  # List to hold DataFrames

    # Loop through each CSV file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            try:
                # Load the CSV file
                file_path = os.path.join(folder_path, file_name)
                df = pd.read_csv(file_path)

                # Extract the season year from the file name
                year_str = file_name.split('-')[0][-8:-4]
                season_year = str(int(year_str))
                df['seasonYear'] = season_year

                # Add DataFrame to the list
                all_data.append(df)
                print(f"Processed {file_name}")

            except ValueError:
                print(f"Skipping {file_name}, invalid format.")

    # Concatenate all DataFrames
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)

        # Remove year from output file name
        output_file_path = os.path.join(folder_path, output_file_name)

        # Save the combined DataFrame
        combined_df.to_csv(output_file_path, index=False)
        print(f"Successfully saved to {output_file_path}")
    else:
        print("No valid CSV files found to concatenate.")

# Example usage
folder_path = "./data/per_game_team_stats"
output_file_name = "per_game_team_stats.csv"  # Exclude the year
# concat_season_csvs(folder_path, output_file_name)


Processed per_game_team_stats_2023.csv
Processed per_game_team_stats_2022.csv
Processed per_game_team_stats_2020.csv
Processed per_game_team_stats_2021.csv
Processed per_game_team_stats_2025.csv
Processed per_game_team_stats_2024.csv
Successfully saved to ./data/per_game_team_stats/per_game_team_stats.csv
