<a href="https://colab.research.google.com/github/arielba2002/Deep-Picker-Project/blob/16-preprocessed-datasets-are-ready-for-model-training-and-testing/DeepPicker_PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The main goal of this notebook is to organize the scraped data into a tensor structure ready for train/test splitting.


# INSTALL/IMPORT PACKAGES


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
import os

# Import Raw Data

In [2]:
# Mount Google Drive so the shared project folder is reachable from the Colab VM
from google.colab import drive

drive.mount('/content/drive')

# Candidate parent directories in which the shared folder may appear
search_paths = [
    "/content/drive/My Drive",            # My Drive
    "/content/drive/My Drive/Shortcuts"  # Shortcuts in My Drive
]

# Name of the shared project folder
shared_folder_name = "Deep-Picker-Project"

# Use the first candidate location that actually contains the shared folder
data_drive_dir = None
for path in search_paths:
    possible_path = os.path.join(path, shared_folder_name)
    if os.path.exists(possible_path):
        data_drive_dir = possible_path
        break

# Fail early with a descriptive error instead of a confusing open() failure below
if not data_drive_dir:
    raise FileNotFoundError(
        f"Could not find the shared folder '{shared_folder_name}' in 'My Drive' or its shortcuts."
    )

# Define paths to raw data and metadata
data_path = f"{data_drive_dir}/Model Training/Data/raw_data.json"
metadata_path = f"{data_drive_dir}/Model Training/Data/metadata.json"

# Import Data Json.
# The encoding is given explicitly so that decoding does not depend on the
# locale of the machine running the notebook.
with open(data_path, 'r', encoding='utf-8') as json_data:
    data = json.load(json_data)

# Import Metadata Json
with open(metadata_path, 'r', encoding='utf-8') as json_metadata:
    metadata = json.load(json_metadata)

# Show one raw player record as a sanity check of the loaded structure
print(json.dumps(data['LAL_2020']['players'][8], indent=4))

Mounted at /content/drive
{
    "id": 2603,
    "playerName": "JaVale McGee",
    "position": "C",
    "age": 32,
    "games": 68,
    "gamesStarted": 68,
    "minutesPg": 1130.0,
    "fieldGoals": 195,
    "fieldAttempts": 306,
    "fieldPercent": 0.637,
    "threeFg": 3,
    "threeAttempts": 6,
    "threePercent": 0.5,
    "twoFg": 192,
    "twoAttempts": 300,
    "twoPercent": 0.64,
    "effectFgPercent": 0.642,
    "ft": 53,
    "ftAttempts": 82,
    "ftPercent": 0.646,
    "offensiveRb": 125,
    "defensiveRb": 265,
    "totalRb": 390,
    "assists": 37,
    "steals": 36,
    "blocks": 94,
    "turnovers": 55,
    "personalFouls": 159,
    "points": 446,
    "team": "LAL",
    "season": 2020,
    "playerId": "mcgeeja01"
}


# Data Preprocessing

In [3]:
# ==============================
# DEFINE GLOBAL CONSTANTS
# ==============================
# Columns dropped from every player record before any numeric processing.
unwanted_stats = ["id", "playerName", "team", "season", "playerId", "gamesStarted"]

# Counting stats that are divided by the number of games played.
per_game_stats = ["points", "assists", "steals", "blocks", "turnovers",
                  "personalFouls", "offensiveRb", "defensiveRb", "totalRb",
                  "fieldGoals", "fieldAttempts", "threeFg", "threeAttempts",
                  "twoFg", "twoAttempts", "ft", "ftAttempts"]

# Same columns, further divided by the minutes column.
# This is an independent copy rather than an alias, so editing one list in a
# later cell can never silently change the other.
per_minute_stats = list(per_game_stats)

# Specify teams and ranges to remove.
# Each value is either None (drop every season of that team) or an inclusive
# (first_season, last_season) pair, as interpreted by preprocess_multiple_teams.
team_year_filters = {
    "TOT": None,          # Remove all seasons of TOT
}

In [4]:
# ==============================
# 1. CREATE PLAYERS DF
# ==============================
def create_players_df(team_data, team_key):
    """
    Converts the nested dictionary structure for a single team_season
    into a pandas DataFrame of player statistics, along with a labels list.

    The "team" and "season" columns are always (over)written from `team_key`,
    even if the player records already contain such fields.

    :param team_data: Dictionary containing a single team's season data.
                      Its "players" entry (list of per-player dicts) and
                      "labels" entry are read; both default to an empty list.
    :param team_key: The team-season key (e.g., "PHO_2024"). Everything before
                     the LAST underscore is the team name; the remainder must
                     parse as an integer season.
    :return: (df, labels) -> DataFrame of player stats, and a labels list.
    :raises ValueError: If `team_key` contains no underscore or the season
                        part is not an integer.
    """
    labels = team_data.get("labels", [])
    player_list = team_data.get("players", [])
    df = pd.DataFrame(player_list)

    # Split on the last underscore only, so a team name that itself contains
    # an underscore does not break the two-way unpacking.
    team_name, season = team_key.rsplit("_", 1)
    df["team"] = team_name
    df["season"] = int(season)

    return df, labels

# ==============================
# 2. STAT REMOVAL
# ==============================
def remove_unwanted_stats(df, stats_to_remove):
    """
    Return a new DataFrame without the listed statistic columns.

    Names in `stats_to_remove` that are not present in `df` are skipped
    silently; the input DataFrame itself is left untouched.

    :param df: Original player statistics DataFrame.
    :param stats_to_remove: List of column names to remove.
    :return: DataFrame with specified columns removed.
    """
    return df.drop(columns=stats_to_remove, errors='ignore')

# ==============================
# 3. PER-GAME NORMALIZATION
# ==============================
def per_game_normalize(df, per_game_cols, games_col="games"):
    """
    Performs per-game normalization on specified columns by dividing
    their values by the 'games' column.

    Rows whose games value is exactly 0 get 0 for every normalized column
    (instead of inf/NaN). The computation is vectorised, so it also works on
    an empty DataFrame. The input DataFrame is not modified.

    :param df: Player statistics DataFrame.
    :param per_game_cols: List of columns to convert to per-game values.
    :param games_col: Column name for the number of games.
    :return: DataFrame with specified columns converted to per-game stats.
    :raises KeyError: If `games_col` or any of `per_game_cols` is missing.
    """
    df = df.copy()

    # Snapshot the divisor once, so every column is divided by the same
    # original values even if `games_col` itself is listed in `per_game_cols`.
    games = df[games_col]
    has_games = games != 0
    # Replace zeros by 1 only to keep the division well-defined; those rows
    # are overwritten with 0 right after.
    safe_games = games.where(has_games, 1)

    for col in per_game_cols:
        df[col] = (df[col] / safe_games).where(has_games, 0)
    return df

# ==============================
# NORMALIZE PLAYER STATS TO PER-MINUTES BASIS
# ==============================
def per_minute_normalize(df, per_minute_cols, minutes_col="minutesPg"):
    """
    Normalizes specified stats to a per-minute basis by dividing them by the
    minutes column.

    Rows whose minutes value is exactly 0 get 0 for every normalized column
    (instead of inf/NaN). The computation is vectorised, so it also works on
    an empty DataFrame. The input DataFrame is not modified.

    NOTE(review): the sample record printed in this notebook has
    minutesPg = 1130.0 for 68 games, which looks like a season total rather
    than minutes per game -- confirm the unit of this column against the
    scraper before relying on the "per-minute" interpretation.

    :param df: Player statistics DataFrame.
    :param per_minute_cols: List of columns (expected to be already per-game) to divide by minutes.
    :param minutes_col: Column name for the minutes value used as divisor.
    :return: DataFrame with specified columns divided by the minutes column.
    :raises KeyError: If `minutes_col` or any of `per_minute_cols` is missing.
    """
    df = df.copy()

    # Snapshot the divisor once, so every column is divided by the same
    # original values even if `minutes_col` itself is listed in `per_minute_cols`.
    minutes = df[minutes_col]
    has_minutes = minutes != 0
    # Replace zeros by 1 only to keep the division well-defined; those rows
    # are overwritten with 0 right after.
    safe_minutes = minutes.where(has_minutes, 1)

    for col in per_minute_cols:
        df[col] = (df[col] / safe_minutes).where(has_minutes, 0)
    return df

# ==============================
# 4. MIN-MAX SCALING
# ==============================
def minmax_scale_df(df):
    """
    Rescale every numeric column of a copy of `df` with a freshly fitted
    MinMaxScaler; columns of any other dtype are passed through unchanged.

    The scaler is fitted on the given DataFrame only, so each call produces
    its own independent scaling.

    :param df: DataFrame of stats.
    :return: (scaled_df, scaler) -> scaled DataFrame and the fitted MinMaxScaler object.
    """
    scaled_df = df.copy()
    scaler = MinMaxScaler()
    numeric_cols = list(scaled_df.select_dtypes(include=[np.number]).columns)
    scaled_values = scaler.fit_transform(scaled_df[numeric_cols])
    scaled_df[numeric_cols] = scaled_values
    return scaled_df, scaler

# ==============================
# 5. PREPARE FINAL X, Y (SINGLE TEAM)
# ==============================
def prepare_team_input_and_labels(df, labels):
    """
    Turn one team's processed DataFrame and its labels into numpy arrays.

    The DataFrame is converted as-is: any non-numeric column still present
    (e.g. a text column) is kept, which makes X an object-dtype array.

    :param df: Processed DataFrame (scaled, etc.).
    :param labels: Labels for the team; an empty or missing value yields an empty array.
    :return: (X, Y) where X is a numpy array, Y is a numpy array of labels.
    """
    features = df.to_numpy()
    if labels:
        targets = np.array(labels)
    else:
        # No labels available yet -> empty placeholder array
        targets = np.array([])
    return features, targets

# ==============================
# 6. END-TO-END PIPELINE (SINGLE TEAM)
# ==============================
def preprocess_team_season_data(team_season_data,
                                team_key,
                                stats_to_remove,
                                per_game_stats,
                                per_minute_stats,
                                games_col='games',
                                minutes_col='minutesPg'):
    """
    Run the full single-team pipeline on one team_season entry.

    Steps, in order:
      1. Build the player DataFrame (and fetch the labels).
      2. Drop the unwanted stat columns.
      3. Divide the per-game columns by the games column.
      4. Divide the per-minute columns by the minutes column.
      5. Min-max scale the numeric columns (scaler fitted on this team only).
      6. Convert to numpy arrays.

    :param team_season_data: Dictionary holding one team's season data.
    :param team_key: The team-season key (e.g., "PHO_2024").
    :param stats_to_remove: List of columns to remove.
    :param per_game_stats: List of columns to normalize per game.
    :param per_minute_stats: List of columns to normalize by minutes afterwards.
    :param games_col: Name of the games column. (Default: 'games')
    :param minutes_col: Name of the minutes column. (Default: 'minutesPg')
    :return: (X, Y, scaler) -> feature array, label array, fitted MinMaxScaler.
    """
    players_df, labels = create_players_df(team_season_data, team_key)

    # Column removal and both normalizations each take and return a DataFrame,
    # so they are chained in pipeline order.
    normalized_df = (
        players_df
        .pipe(remove_unwanted_stats, stats_to_remove)
        .pipe(per_game_normalize, per_game_stats, games_col)
        .pipe(per_minute_normalize, per_minute_stats, minutes_col)
    )

    scaled_df, scaler = minmax_scale_df(normalized_df)

    X, Y = prepare_team_input_and_labels(scaled_df, labels)
    return X, Y, scaler

In [5]:
# ==============================
# EXAMPLE USAGE FOR A SINGLE TEAM
# ==============================
# Team-season entry used to smoke-test the single-team pipeline
team_key = "LAL_2020"

# NOTE: the `_pho` suffix is only a variable name; the values below belong to
# whichever entry `team_key` selects (here LAL_2020).
X_pho, Y_pho, scaler_pho = preprocess_team_season_data(
    data[team_key],
    team_key=team_key,
    stats_to_remove=unwanted_stats,
    per_game_stats=per_game_stats,
    per_minute_stats=per_minute_stats,
)

# X has one row per player and one column per remaining feature;
# Y is empty when the entry carries no labels.
print("Single team X shape:", X_pho.shape)
print("Single team Y:", Y_pho)

Single team X shape: (20, 26)
Single team Y: []


In [6]:
def preprocess_multiple_teams(all_teams_data,
                              stats_to_remove,
                              per_game_stats,
                              per_minute_stats,
                              teams_to_remove=None,
                              games_col='games',
                              minutes_col='minutesPg',
                              max_players_per_team=None):
    """
    Preprocesses multiple team_season entries at once and produces a 3D array (X) and labels (Y).

    Every team is run through the single-team pipeline independently, which
    means each team is min-max scaled with its own scaler.

    :param all_teams_data: Dict of many team_season entries, e.g. {"PHO_2024": {...}, "LAL_2024": {...}, ...}.
    :param stats_to_remove: List of columns to remove.
    :param per_game_stats: List of columns to normalize per game.
    :param per_minute_stats: List of columns already normalized to per-game, to further normalize to per-minute.
    :param teams_to_remove: Dict mapping a team name to None (skip all its seasons) or to an
                            inclusive (first_season, last_season) pair to skip. (Default: None)
    :param games_col: Name of the column containing the number of games. (Default: 'games')
    :param minutes_col: Name of the column containing minutes per game. (Default: 'minutesPg')
    :param max_players_per_team: Maximum number of players to include per team (pads with zeros if fewer,
                                 truncates if more). If None, use the largest number of players in the dataset.
    :return: (X, Y, scaler) where:
             X: 3D numpy array with shape (n_teams, max_players_per_team, n_features).
             Y: numpy array built from the per-team label arrays (shape (n_teams, 0) when no
                team has labels).
             scaler: The MinMaxScaler fitted on the FIRST processed team only, kept for
                     reference; it does not describe the scaling of the other teams.
    :raises ValueError: If no team-season entry is left to preprocess (empty input or
                        everything filtered out), or if a team key is malformed.
    """
    all_teams_X = []
    all_labels = []
    feature_scaler = None

    # 1. Preprocess each team using the single-team pipeline
    for team_key, team_data in all_teams_data.items():
        # Skip teams in `teams_to_remove`
        if teams_to_remove:
            # Split on the last underscore only, so team names containing
            # an underscore are still parsed correctly.
            team_name, season = team_key.rsplit("_", 1)
            season = int(season)

            # If the team is in the filter, check the year range
            if team_name in teams_to_remove:
                year_range = teams_to_remove[team_name]
                if year_range is None or (year_range[0] <= season <= year_range[1]):
                    continue

        # Use the single-team preprocessing pipeline
        X_team, Y_team, scaler = preprocess_team_season_data(
            team_season_data=team_data,
            team_key=team_key,
            stats_to_remove=stats_to_remove,
            per_game_stats=per_game_stats,
            per_minute_stats=per_minute_stats,
            games_col=games_col,
            minutes_col=minutes_col
        )

        # Save the first team's scaler for reference
        if feature_scaler is None:
            feature_scaler = scaler

        # Add team X and Y to the respective lists
        all_teams_X.append(X_team)
        all_labels.append(Y_team)

    # Fail with an explicit message instead of the opaque errors that max()
    # and np.stack() raise on an empty list.
    if not all_teams_X:
        raise ValueError("No team-season entries left to preprocess after filtering.")

    # Determine max players per team for consistent shape
    if max_players_per_team is None:
        max_players_per_team = max(team.shape[0] for team in all_teams_X)

    # 2. Pad teams to have consistent player counts
    padded_teams_X = []
    for X_team in all_teams_X:
        missing_rows = max_players_per_team - X_team.shape[0]
        if missing_rows > 0:
            # Pad with zeros if fewer players
            padding = np.zeros((missing_rows, X_team.shape[1]))
            padded_team = np.vstack([X_team, padding])
        else:
            # Truncate if more players than allowed
            padded_team = X_team[:max_players_per_team]
        padded_teams_X.append(padded_team)

    # Convert to a 3D numpy array
    X = np.stack(padded_teams_X)  # Shape: (n_teams, max_players_per_team, n_features)

    # Stack the per-team label arrays (team-level labels, if applicable)
    Y = np.array(all_labels)

    return X, Y, feature_scaler

In [7]:
# ==============================
# 8. EXAMPLE WITH MULTIPLE TEAMS
# ==============================
# Smoke-test the multi-team pipeline on a small subset before running it on
# the full dataset. The function expects a dict keyed by "<TEAM>_<SEASON>":
#
#     {
#         "PHO_2024": {...},
#         "LAL_2024": {...},
#         "MIL_2024": {...},
#         ... etc. ...
#     }
#
# Two distinct entries from the loaded raw data are used here, so the
# resulting X should have a leading dimension of 2.

# Small subset of the real data used as a quick check:
mock_all_teams_data = {
    "PHO_2024": data["PHO_2024"],
    "LAL_2020": data["LAL_2020"]
}

X_all, Y_all, scaler_all = preprocess_multiple_teams(
    all_teams_data=mock_all_teams_data, # Specified to run on mock data
    stats_to_remove=unwanted_stats,
    per_game_stats=per_game_stats,
    per_minute_stats=per_minute_stats,
    teams_to_remove=team_year_filters,
)

# Shapes plus the first player row of the first team as a sanity check
print("\n*** All Teams Combined ***")
print("Combined X shape:", X_all.shape)
print("Combined Y shape:", Y_all.shape)
print("Combined X:", X_all[0][0])


*** All Teams Combined ***
Combined X shape: (2, 20, 26)
Combined Y shape: (2, 0)
Combined X: ['PF' 0.15384615384615374 0.5416666666666666 0.16414686825053995
 0.1256283244070518 0.05064497414447084 1.0000000000000002
 0.06218079040015867 0.041255516437744826 0.423 0.18307135419249268
 0.04825902137810148 1.0 1.0 0.012892348886795258 0.01633030858994066
 0.6836581709145427 0.03385068015404704 0.16660881638320027
 0.08695393464570833 0.03203252838796053 0.01586750632220955
 0.5516437744830663 0.01633030858994066 0.033321763276640055
 0.07333994872722222]


In [8]:
# ==============================
# 9. FINAL ALL DATA PREPROCESSING
# ==============================

# Run the multi-team pipeline on every loaded team-season entry; entries
# matching `team_year_filters` are skipped. This overwrites the X_all / Y_all /
# scaler_all produced by the small example run above.
X_all, Y_all, scaler_all = preprocess_multiple_teams(
    all_teams_data=data, # specified to run on all data
    stats_to_remove=unwanted_stats,
    per_game_stats=per_game_stats,
    per_minute_stats=per_minute_stats,
    teams_to_remove=team_year_filters,
)

# Shapes plus the first player row of the first team as a sanity check
print("\n*** All Teams Combined ***")
print("Combined X shape:", X_all.shape)
print("Combined Y shape:", Y_all.shape)
print("Combined X:", X_all[0][0])


*** All Teams Combined ***
Combined X shape: (861, 20, 26)
Combined Y shape: (861, 0)
Combined X: ['PF' 0.08333333333333326 0.9620253164556962 0.4688021354688022
 0.025745548165629694 0.032330925647810434 0.30909090909090864 0.0
 0.0010995494529071016 0.0 0.025745548165629694 0.03612086998498176
 0.33532934131736514 0.23999999999999977 0.026594793678037618
 0.033522849173997 0.4849094567404427 0.09617392548094116
 0.026088822141171426 0.040310853655629454 0.0 0.009833369091039119
 0.09445142571081211 0.007291219695344347 0.012803046556532933
 0.02594152789926229]
