<a href="https://colab.research.google.com/github/arielba2002/Deep-Picker-Project/blob/model%2Fpreprocess-mini-fix/model/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Pre-Processing
--------------
This Notebook main goal is to orginize the scraped data to a tensor structure ready for train/test splitting.

# Import Packges


In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
import os

# Import Raw Data

## Download Raw Data

In [43]:
!gdown --id 1tD_yAa_R3eb-ssKWOu-oqxlRH0lVvQJ_ -O previous_year_data.json

Downloading...
From: https://drive.google.com/uc?id=1tD_yAa_R3eb-ssKWOu-oqxlRH0lVvQJ_
To: /content/previous_year_data.json
100% 8.20M/8.20M [00:00<00:00, 35.8MB/s]


## Load Raw Data

In [44]:
data_path = '/content/previous_year_data.json'

with open(data_path, 'r') as json_data:
    data = json.load(json_data)

print("Sample player data:", json.dumps(data['LAL_2020']["players"][0], indent=4))


Sample player data: {
    "id": 2517,
    "playerName": "LeBron James",
    "position": "PG",
    "age": 35,
    "games": 67,
    "gamesStarted": 67,
    "minutesPg": 2316.0,
    "fieldGoals": 643,
    "fieldAttempts": 1303,
    "fieldPercent": 0.493,
    "threeFg": 148,
    "threeAttempts": 425,
    "threePercent": 0.348,
    "twoFg": 495,
    "twoAttempts": 878,
    "twoPercent": 0.564,
    "effectFgPercent": 0.55,
    "ft": 264,
    "ftAttempts": 381,
    "ftPercent": 0.693,
    "offensiveRb": 66,
    "defensiveRb": 459,
    "totalRb": 525,
    "assists": 684,
    "steals": 78,
    "blocks": 36,
    "turnovers": 261,
    "personalFouls": 118,
    "points": 1698,
    "team": "LAL",
    "season": 2020,
    "playerId": "jamesle01"
}


# Data Preproccessing

## Define Global Constants

In [45]:
unwanted_stats = ["id", "playerName", "team", "season", "playerId", "gamesStarted"]
per_game_stats = [
    "points", "assists", "steals", "blocks", "turnovers", "personalFouls",
    "offensiveRb", "defensiveRb", "totalRb", "fieldGoals", "fieldAttempts",
    "threeFg", "threeAttempts", "twoFg", "twoAttempts", "ft", "ftAttempts"
]
years_to_remove = [1996, 2023, 2024]
per_minute_stats = per_game_stats
team_year_filters = {"TOT": None}  # Remove all seasons of TOT


## Helper Functions (Utils)

In [46]:
def create_players_df(team_data, team_key):
    """
    Converts nested dictionary structure for a single team_season
    into a pandas DataFrame of player statistics and labels.
    """
    labels = team_data.get("labels", [])
    player_list = team_data.get("players", [])
    df = pd.DataFrame(player_list)

    # Extract team and season information from the team_key
    team_name, season = team_key.split("_")
    df["team"] = team_name
    df["season"] = int(season)

    return df, labels

def remove_year_data(json_data, years):
    # Build the suffix for the specific year (e.g., '1996')
    for year in years:
      year_suffix = f"_{year}"

      # Iterate over a copy of the keys to avoid modifying the dictionary while iterating
      for key in list(json_data.keys()):
          if key.endswith(year_suffix):
              del json_data[key]  # Remove the key and its value

    return json_data

def remove_unwanted_stats(df, stats_to_remove):
    """Removes unwanted statistic columns from the DataFrame."""
    return df.drop(columns=stats_to_remove, errors="ignore")

def normalize_columns(df, cols, divisor_col):
    """Normalizes specified columns by dividing them by a divisor column."""
    df = df.copy()
    for col in cols:
        if col in df.columns and divisor_col in df.columns:
            df[col] = df.apply(lambda row: row[col] / row[divisor_col] if row[divisor_col] != 0 else 0, axis=1)
    return df

def one_hot_encode_positions(df, position_col="position"):
    """
    Converts the player position column into one-hot encoded columns,
    ensuring consistent output for the 5 standard basketball positions.

    :param df: Player statistics DataFrame.
    :param position_col: Name of the column containing player positions.
    :return: DataFrame with one-hot encoded position columns.
    """
    if position_col in df.columns:
        standard_positions = ["PG", "SG", "SF", "PF", "C"]

        # Initialize zero columns
        for pos in standard_positions:
            df[f"{position_col}_{pos}"] = 0

        # Iterate through rows to set appropriate columns to 1
        for idx, value in df[position_col].dropna().items():
            # Normalize and split multi-position strings
            positions = [p.strip().upper() for p in value.replace("/", "-").split("-")]
            for pos in positions:
                col_name = f"{position_col}_{pos}"
                if col_name in df.columns:
                    df.at[idx, col_name] = 1

        # Drop the original column
        df = df.drop(columns=[position_col])

    return df


def prepare_team_input_and_labels(df, labels):
    """Prepares the final input (X) and labels (Y) for a single team."""
    X = df.to_numpy()
    Y = np.array(labels) if labels else np.array([])
    return X, Y

def should_remove_team(team_key, teams_to_remove):
    """Determines if a team should be removed based on filters."""
    if not teams_to_remove:
        return False

    team_name, season = team_key.split("_")
    season = int(season)

    if team_name in teams_to_remove:
        year_range = teams_to_remove[team_name]
        if year_range is None or (year_range[0] <= season <= year_range[1]):
            return True
    return False

def pad_teams_to_max_players(all_teams_X, max_players):
    """Pads or truncates all teams to have consistent player counts."""
    padded_teams = []
    for X_team in all_teams_X:
        if X_team.shape[0] < max_players:
            padding = np.zeros((max_players - X_team.shape[0], X_team.shape[1]))
            padded_team = np.vstack([X_team, padding])
        else:
            padded_team = X_team[:max_players]
        padded_teams.append(padded_team)
    return padded_teams


In [47]:
data = remove_year_data(data, years_to_remove)

## Single Team Preprocess Pipeline

In [48]:
def preprocess_team_season_data(team_season_data, team_key, stats_to_remove, per_game_stats, per_minute_stats,
                                games_col="games", minutes_col="minutesPg", position_col="position"):
    """End-to-end pipeline to preprocess a single team_season entry."""
    df, labels = create_players_df(team_season_data, team_key)
    df = remove_unwanted_stats(df, stats_to_remove)
    df = normalize_columns(df, per_minute_stats, minutes_col)
    df = one_hot_encode_positions(df, position_col)
    X, Y = prepare_team_input_and_labels(df, labels)

    return X, Y


##Multiple Team Preprocess Pipeline

In [49]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def minmax_scale_last_axis(X):
    """
    MinMax scales values along the last axis, treating each index in that axis
    independently. For each position i in the last axis, it gathers all values
    across the first two axes and scales them to the [0, 1] range.

    Parameters:
    - X: np.ndarray of shape (A, B, C), or more generally (..., C)

    Returns:
    - X_scaled: same shape as X, with values scaled per index in last axis
    """
    original_shape = X.shape
    last_dim = original_shape[-1]

    # Flatten everything but the last dimension
    reshaped = X.reshape(-1, last_dim)  # shape: (A*B*..., C)

    # Scale each column (i.e., each index in the last axis) independently
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(reshaped)

    # Reshape back to original
    X_scaled = scaled.reshape(original_shape)

    return X_scaled, scaler

In [50]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def minmax_scale_2d(X):
    """
    MinMax scales each column of a 2D matrix independently to the [0, 1] range.

    Parameters:
    - X: np.ndarray of shape (n_rows, n_cols)

    Returns:
    - X_scaled: np.ndarray of same shape, scaled per column
    - scaler: fitted MinMaxScaler object
    """
    if X.ndim != 2:
        raise ValueError("Input must be a 2D numpy array")

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, scaler

In [51]:
def preprocess_multiple_teams(all_teams_data, stats_to_remove, per_game_stats, per_minute_stats,
                              teams_to_remove=None, games_col="games", minutes_col="minutesPg",
                              max_players_per_team=None, position_col="position"):
    """Preprocesses multiple team_season entries at once."""
    all_teams_X = []
    all_labels = []

    for team_key, team_data in all_teams_data.items():
        print(team_key)
        if should_remove_team(team_key, teams_to_remove):
            continue

        X_team, Y_team = preprocess_team_season_data(
            team_season_data=team_data,
            team_key=team_key,
            stats_to_remove=stats_to_remove,
            per_game_stats=per_game_stats,
            per_minute_stats=per_minute_stats,
            games_col=games_col,
            minutes_col=minutes_col,
            position_col=position_col
        )
        all_teams_X.append(X_team)
        all_labels.append(Y_team)

    if max_players_per_team is None:
        max_players_per_team = max(team.shape[0] for team in all_teams_X)

    padded_teams_X = pad_teams_to_max_players(all_teams_X, max_players_per_team)
    X = np.stack(padded_teams_X)
    Y = np.array(all_labels)

    X, x_scaler = minmax_scale_last_axis(X)
    Y, y_scaler = minmax_scale_2d(Y)

    return X, Y, x_scaler, y_scaler


## Small Examples (single team  & multiple teams - 2 teams)

In [52]:
def run_examples():
    """Run example usage for single and multiple teams."""
    team_key = "LAL_2020"
    X_lal, Y_lal = preprocess_team_season_data(
        team_season_data=data[team_key],
        team_key=team_key,
        stats_to_remove=unwanted_stats,
        per_game_stats=per_game_stats,
        per_minute_stats=per_minute_stats
    )

    mock_all_teams_data = {
        "LAL_2020": data["LAL_2020"],
        "PHO_1997": data["PHO_1997"],
        "PHO_1998": data["PHO_1998"],
        "LAL_2022": data["LAL_2022"]
    }

    X_all, Y_all, feature_scaler, label_scaler = preprocess_multiple_teams(
        all_teams_data=mock_all_teams_data,
        stats_to_remove=unwanted_stats,
        per_game_stats=per_game_stats,
        per_minute_stats=per_minute_stats,
        teams_to_remove=team_year_filters
    )

    print("\n*** All Teams Combined ***")
    print("Combined X shape:", X_all.shape)
    print("Combined Y shape:", Y_all.shape)
    print("Sample Combined X:", X_all[0][0])
    print("Sample Combined Y:", Y_all[0])

# Uncomment to run examples
run_examples()


LAL_2020
PHO_1997
PHO_1998
LAL_2022

*** All Teams Combined ***
Combined X shape: (4, 8, 30)
Combined Y shape: (4, 17)
Sample Combined X: [0.86666667 0.63414634 0.6688687  0.86076467 0.94088279 0.52040816
 0.52613702 0.56924324 0.78911565 0.76173446 0.81118291 0.63779528
 0.61290323 0.48591383 0.6210376  0.42599278 0.153731   0.51975692
 0.39188256 1.         0.36046935 0.21482745 1.         0.07915354
 0.85686931 1.         0.         0.         0.         0.        ]
Sample Combined Y: [0.64351852 0.         0.72222222 1.         0.5        0.37106918
 1.         0.67142857 0.6741573  1.         1.         0.
 0.         0.         0.84931507 0.33333333 1.        ]


## Final All Data Preprocessing

In [53]:
# Process training data and get scalers
X_train, Y_train, feature_scaler, label_scaler = preprocess_multiple_teams(
    all_teams_data=data,
    stats_to_remove=unwanted_stats,
    per_game_stats=per_game_stats,
    per_minute_stats=per_minute_stats,
    teams_to_remove=team_year_filters,
    max_players_per_team=8  # Fixed number of players per team
)

print("\n*** All Teams Combined ***")
print("Combined X shape:", X_train.shape)
print("Combined Y shape:", Y_train.shape)
print("Combined X:", X_train[0][0])
print("Combined Y:", Y_train[0])


ATL_1997
BOS_1997
CHH_1997
CHI_1997
CLE_1997
DAL_1997
DEN_1997
DET_1997
GSW_1997
HOU_1997
IND_1997
LAC_1997
LAL_1997
MIA_1997
MIL_1997
MIN_1997
NJN_1997
NYK_1997
ORL_1997
PHI_1997
PHO_1997
POR_1997
SAC_1997
SAS_1997
SEA_1997
TOR_1997
TOT_1997
UTA_1997
VAN_1997
ATL_1998
BOS_1998
CHH_1998
CHI_1998
CLE_1998
DAL_1998
DEN_1998
DET_1998
GSW_1998
HOU_1998
IND_1998
LAC_1998
LAL_1998
MIA_1998
MIL_1998
MIN_1998
NJN_1998
NYK_1998
ORL_1998
PHI_1998
PHO_1998
POR_1998
SAC_1998
SAS_1998
SEA_1998
TOR_1998
TOT_1998
UTA_1998
VAN_1998
ATL_1999
BOS_1999
CHH_1999
CHI_1999
CLE_1999
DAL_1999
DEN_1999
DET_1999
GSW_1999
HOU_1999
IND_1999
LAC_1999
LAL_1999
MIA_1999
MIL_1999
MIN_1999
NJN_1999
NYK_1999
ORL_1999
PHI_1999
PHO_1999
POR_1999
SAC_1999
SAS_1999
SEA_1999
TOR_1999
TOT_1999
UTA_1999
VAN_1999
WAS_1999
ATL_2000
BOS_2000
CHH_2000
CHI_2000
CLE_2000
DAL_2000
DEN_2000
DET_2000
GSW_2000
HOU_2000
IND_2000
LAC_2000
LAL_2000
MIA_2000
MIL_2000
MIN_2000
NJN_2000
NYK_2000
ORL_2000
PHI_2000
PHO_2000
POR_2000
SAC_2000
S