# Dataset generation pipeline

## Step 1: data gather

In [8]:
import numpy as np
import pandas as pd
from nba_api.stats.static import teams

In [85]:
nba_teams = pd.DataFrame(teams.get_teams())
nba_teams[0:5]

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [3]:
from nba_api.stats.endpoints import leaguegamefinder

In [4]:
# Season year:
# Season type: https://github.com/swar/nba_api/blob/master/docs/nba_api/stats/library/parameters.md#seasontype
gamefinder = leaguegamefinder.LeagueGameFinder(
    season_nullable="2023-24"
)

In [5]:
games_df = gamefinder.get_data_frames()[0]

In [6]:
games_df[0:5]

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42023,1610612754,IND,Indiana Pacers,42300126,2024-05-02,IND vs. MIL,W,240,120,...,0.882,7,37,44,33,7,3,10,26,22.0
1,42023,1610612749,MIL,Milwaukee Bucks,42300126,2024-05-02,MIL @ IND,L,242,98,...,0.656,15,25,40,19,4,0,12,18,-22.0
2,42023,1610612752,NYK,New York Knicks,42300116,2024-05-02,NYK @ PHI,W,240,118,...,0.75,20,28,48,28,6,8,9,17,3.0
3,42023,1610612755,PHI,Philadelphia 76ers,42300116,2024-05-02,PHI vs. NYK,L,240,115,...,0.957,16,30,46,20,6,5,11,22,-3.0
4,42023,1610612742,DAL,Dallas Mavericks,42300175,2024-05-01,DAL @ LAC,W,241,123,...,0.75,9,31,40,27,5,6,8,22,30.0


In [7]:
games_df.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

* SEASON_ID: The unique identifier for the NBA season.
* TEAM_ID: The unique identifier for the team.
* TEAM_ABBREVIATION: The abbreviation of the team's name.
* TEAM_NAME: The full name of the team.
* GAME_ID: The unique identifier for the game.
* GAME_DATE: The date of the game.
* MATCHUP: The matchup of the game (e.g., "LAL vs. BOS" indicates Los Angeles Lakers vs. Boston Celtics).
* WL: The result of the game (W for win, L for loss).
* MIN: The total minutes played in the game.
* PTS: The total points scored by the team.
* FGM: The total field goals made by the team.
* FGA: The total field goals attempted by the team.
* FG_PCT: The field goal percentage (FGM/FGA) of the team.
* FG3M: The total three-point field goals made by the team.
* FG3A: The total three-point field goals attempted by the team.
* FG3_PCT: The three-point field goal percentage (FG3M/FG3A) of the team.
* FTM: The total free throws made by the team.
* FTA: The total free throws attempted by the team.
  FT_PCT: The free throw percentage (FTM/FTA) of the team.
* OREB: The total offensive rebounds grabbed by the team.
* DREB: The total defensive rebounds grabbed by the team.
* REB: The total rebounds grabbed by the team.
* AST: The total assists made by the team.
* STL: The total steals made by the team.
* BLK: The total blocks made by the team.
* TOV: The total turnovers committed by the team.
* PF: The total personal fouls committed by the team.
* PLUS_MINUS: The plus-minus statistic, indicating the point differential when the player is on the court.

In [21]:
len(games)

4263

In [28]:
games[0:1].to_dict()

{'SEASON_ID': {0: '42023'},
 'TEAM_ID': {0: 1610612756},
 'TEAM_ABBREVIATION': {0: 'PHX'},
 'TEAM_NAME': {0: 'Phoenix Suns'},
 'GAME_ID': {0: '0042300163'},
 'GAME_DATE': {0: '2024-04-26'},
 'MATCHUP': {0: 'PHX vs. MIN'},
 'WL': {0: 'L'},
 'MIN': {0: 238},
 'PTS': {0: 109},
 'FGM': {0: 35},
 'FGA': {0: 76},
 'FG_PCT': {0: 0.461},
 'FG3M': {0: 11},
 'FG3A': {0: 28},
 'FG3_PCT': {0: 0.393},
 'FTM': {0: 28},
 'FTA': {0: 32},
 'FT_PCT': {0: 0.875},
 'OREB': {0: 5},
 'DREB': {0: 23},
 'REB': {0: 28},
 'AST': {0: 24},
 'STL': {0: 8},
 'BLK': {0: 3},
 'TOV': {0: 9},
 'PF': {0: 26},
 'PLUS_MINUS': {0: -17.0}}

In [53]:

def home_teams_win_percentage(df):
    home_games = df[df['MATCHUP'].str.contains('vs.')]

    win_percentage = home_games.groupby('TEAM_ID')['WL'].apply(lambda x: (x == 'W').sum() / len(x)).reset_index()
    win_percentage.rename(columns={'WL': 'home_team_win_percentage'}, inplace=True)

    return win_percentage

def away_teams_win_percentage(df):
    away_games = df[df['MATCHUP'].str.contains('@')]

    win_percentage = away_games.groupby('TEAM_ID')['WL'].apply(lambda x: (x == 'W').sum() / len(x)).reset_index()
    win_percentage.rename(columns={'WL': 'away_team_win_percentage'}, inplace=True)

    return win_percentage


def home_teams_average_points_scored(df):
    home_games = df[df['MATCHUP'].str.contains('vs.')]

    avg_points_scored = home_games.groupby('TEAM_ID')['PTS'].mean().reset_index()
    avg_points_scored.rename(columns={'PTS': 'home_team_average_points_scored'}, inplace=True)

    return avg_points_scored

def away_teams_average_points_scored(df):
    away_games = df[df['MATCHUP'].str.contains('@')]

    avg_points_scored = away_games.groupby('TEAM_ID')['PTS'].mean().reset_index()
    avg_points_scored.rename(columns={'PTS': 'away_team_average_points_scored'}, inplace=True)

    return avg_points_scored


def add_team_names(df, nba_teams):
    team_id_to_name = {team['id']: team['full_name'] for team in nba_teams.to_dict(orient="records")}
    df['TEAM_NAME'] = df['TEAM_ID'].map(team_id_to_name)   
    return df

def get_home_winner(winner_str):
    if winner_str == "W":
        return 1
    return 0

In [73]:
home_win_percentage_df

Unnamed: 0,TEAM_ID,home_team_win_percentage,TEAM_NAME
0,12315,1.000000,
1,1610612737,0.545455,Atlanta Hawks
2,1610612738,0.891304,Boston Celtics
3,1610612739,0.652174,Cleveland Cavaliers
4,1610612740,0.478261,New Orleans Pelicans
...,...,...,...
64,1612709931,0.583333,
65,1612709932,0.520000,
66,1612709933,0.520000,
67,1612709971,0.000000,


In [54]:
home_win_percentage_df = add_team_names(home_teams_win_percentage(games), nba_teams)
away_win_percentage_df = add_team_names(away_teams_win_percentage(games), nba_teams)
home_avg_points_df = add_team_names(home_teams_average_points_scored(games), nba_teams)
away_avg_points_df = add_team_names(away_teams_average_points_scored(games), nba_teams)
#import numpy as np

def get_value_for_team(df, team_id, column):
    values = df[ df["TEAM_ID"] == team_id][column].values
    if len(values) > 0:
        return values[0]
    else:
        return np.nan
        
def get_team_ids_from_matchup_str(matchup_str: str, teams_df: pd.DataFrame):
    if "vs." in matchup_str:
        teams = matchup_str.split("vs.")
        home_t = teams[0].strip()
        away_t = teams[1].strip()
    elif "@" in matchup_str:
        teams = matchup_str.split("@")
        home_t = teams[1].strip()
        away_t = teams[0].strip()

    return teams_df[teams_df["abbreviation"] == home_t]["id"].values[0], teams_df[teams_df["abbreviation"] == away_t]["id"].values[0]
    

In [62]:
home_win_percentage_df[ home_win_percentage_df["TEAM_ID"] == 1610612738]

Unnamed: 0,TEAM_ID,home_team_win_percentage,TEAM_NAME
2,1610612738,0.891304,Boston Celtics


In [63]:
away_win_percentage_df[ away_win_percentage_df["TEAM_ID"] == 1610612738]

Unnamed: 0,TEAM_ID,away_team_win_percentage,TEAM_NAME
6,1610612738,0.673913,Boston Celtics


In [64]:
home_avg_points_df[ home_win_percentage_df["TEAM_ID"] == 1610612738]

Unnamed: 0,TEAM_ID,home_team_average_points_scored,TEAM_NAME
2,1610612738,122.152174,Boston Celtics


In [65]:
away_avg_points_df[ away_win_percentage_df["TEAM_ID"] == 1610612738]

Unnamed: 0,TEAM_ID,away_team_average_points_scored,TEAM_NAME
6,1610612738,117.173913,Boston Celtics


The goal is to have a dataset like the following. Each row is the information for a match and the target is W if the home_team won, L if the home_team loose.

TARGET, DURATION, IS_TEAM_1_HOME, TEAM_1_HOME_WIN_PERC, TEAM_1_AWAY_WIN_PERC, TEAM_2_HOME_WIN_PERC, TEAM_2_AWAY_WIN_PERC

Upgrade:
- Hot encode team names

Upgrade 2:
- add rolling averages (points scored last month)

In [66]:
home_avg_points_df[home_avg_points_df["TEAM_ID"]==1612709973]

Unnamed: 0,TEAM_ID,home_team_average_points_scored,TEAM_NAME
68,1612709973,26.5,


In [67]:
away_avg_points_df[away_avg_points_df["TEAM_ID"]==1612709973]

Unnamed: 0,TEAM_ID,away_team_average_points_scored,TEAM_NAME


In [68]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [69]:
teams_ids = set()
for index, row in games.iterrows():
    teams_ids.add(row["TEAM_ID"])

In [72]:
teams_stats = {}
for team_id in teams_ids:
    teams_stats[team_id] = {}
    teams_stats[team_id]["home_win_percentage"] = get_value_for_team(home_win_percentage_df, team_id, "home_team_win_percentage")
    teams_stats[team_id]["home_avg_points"] = get_value_for_team(home_avg_points_df, team_id, "home_team_average_points_scored")

In [236]:
dataset = []

for index, row in games.iterrows():

    team_id = row["TEAM_ID"]
    home_won = get_home_winner(row["WL"])
    minutes = row["MIN"]

    home_win_percentage = get_value_for_team(home_win_percentage_df, team_id, "home_team_win_percentage")
    home_avg_points = get_value_for_team(home_avg_points_df, team_id, "home_team_average_points_scored")
    
    
    game_ids = games[games["GAME_ID"] == row["GAME_ID"]]
    away_team_id = game_ids[game_ids["TEAM_ID"] != team_id]["TEAM_ID"].values[0]

    away_win_percentage = get_value_for_team(away_win_percentage_df, away_team_id, "away_team_win_percentage")
    away_avg_points = get_value_for_team(away_avg_points_df, away_team_id, "away_team_average_points_scored")

    dataset.append(
        (home_won, minutes, home_win_percentage, home_avg_points, away_win_percentage, away_avg_points)
    )

    

In [238]:
columns = ["home_won", "minutes", "home_win_percentage", "home_avg_points", "away_win_percentage", "away_avg_points"]

In [239]:
dataset = pd.DataFrame(dataset, columns=columns)

In [240]:
dataset

Unnamed: 0,home_won,minutes,home_win_percentage,home_avg_points,away_win_percentage,away_avg_points
0,0,238,0.590909,115.500000,0.666667,114.177778
1,1,237,0.755556,112.555556,0.586957,116.217391
2,1,266,0.659091,124.840909,0.431818,115.818182
3,1,240,0.613636,117.659091,0.619048,115.309524
4,0,240,0.595745,113.510638,0.577778,115.844444
...,...,...,...,...,...,...
4258,1,239,0.755556,112.555556,0.577778,115.844444
4259,0,240,0.222222,104.296296,0.500000,116.000000
4260,1,238,,,0.040000,101.840000
4261,0,239,,,0.040000,101.840000


## Step 2: preprocessing 

In [246]:
dataset

Unnamed: 0,home_won,minutes,home_win_percentage,home_avg_points,away_win_percentage,away_avg_points
0,0,238,0.590909,115.500000,0.666667,114.177778
1,1,237,0.755556,112.555556,0.586957,116.217391
2,1,266,0.659091,124.840909,0.431818,115.818182
3,1,240,0.613636,117.659091,0.619048,115.309524
4,0,240,0.595745,113.510638,0.577778,115.844444
...,...,...,...,...,...,...
4256,1,239,0.755556,112.555556,0.577778,115.844444
4257,0,240,0.613636,117.659091,0.666667,114.177778
4258,1,239,0.755556,112.555556,0.577778,115.844444
4259,0,240,0.222222,104.296296,0.500000,116.000000


In [243]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4263 entries, 0 to 4262
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   home_won             4263 non-null   int64  
 1   minutes              4263 non-null   int64  
 2   home_win_percentage  4246 non-null   float64
 3   home_avg_points      4246 non-null   float64
 4   away_win_percentage  4255 non-null   float64
 5   away_avg_points      4255 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 200.0 KB


==> Some columns contain null values

In [244]:
dataset.dropna(inplace=True)

In [245]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4244 entries, 0 to 4262
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   home_won             4244 non-null   int64  
 1   minutes              4244 non-null   int64  
 2   home_win_percentage  4244 non-null   float64
 3   home_avg_points      4244 non-null   float64
 4   away_win_percentage  4244 non-null   float64
 5   away_avg_points      4244 non-null   float64
dtypes: float64(4), int64(2)
memory usage: 232.1 KB


Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting scipy>=1.6.0
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.4.0-py3-none-any.whl (17 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.0-py3-none-any.whl (301 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.2/301.2 KB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m[31m6.7 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: threadpoolctl, scipy, joblib, s

      home_won   minutes  home_win_percentage  home_avg_points  \
0            0 -0.294653             0.265087         0.232087   
1            1 -0.397415             1.282494        -0.271192   
2            1  2.582677             0.686406         1.828683   
3            1 -0.089130             0.405527         0.601130   
4            0 -0.089130             0.294968        -0.107944   
...        ...       ...                  ...              ...   
4256         1 -0.191891             1.282494        -0.271192   
4257         0 -0.089130             0.405527         0.601130   
4258         1 -0.191891             1.282494        -0.271192   
4259         0 -0.089130            -2.013156        -1.682907   
4262         1 -0.089130            -2.013156        -1.682907   

      away_win_percentage  away_avg_points  
0                1.445983         0.307031  
1                0.907302         0.671263  
2               -0.141122         0.599973  
3                1.124173  

## TEAMS

In [30]:
def get_win_percentage_for_team(games_df, team_id, home_game: bool):
    loc = "vs." if home_game else "@"
    col_name = "win_percentage_at_home" if home_game else "win_percentage_away"
    games = games_df[(games_df['TEAM_ID'] == team_id) & games_df['MATCHUP'].str.contains(loc)  ]
    win_percentage = games.groupby('TEAM_ID')['WL'].apply(lambda x: (x == 'W').sum() / len(x)).reset_index()
    win_percentage.rename(columns={'WL': col_name}, inplace=True)
    return win_percentage

In [10]:
def get_average_points_scored_for_team(games_df, team_id, home_game: bool):
    loc = "vs." if home_game else "@"
    col_name = 'average_points_scored_at_home' if home_game else "average_points_scored_away"
    
    games = games_df[(games_df['TEAM_ID'] == team_id) & games_df['MATCHUP'].str.contains(loc)  ]
    avg_points_scored = games.groupby('TEAM_ID')['PTS'].mean().reset_index()
    avg_points_scored.rename(columns={'PTS': col_name}, inplace=True)
    return avg_points_scored

In [14]:
def get_stats_for_team(team_id: int, games_df):
    df_wp_home = get_win_percentage_for_team(games_df, team_id, False)
    df_wp_away = get_win_percentage_for_team(games_df, team_id, True)
    merged_wp_df = pd.merge(df_wp_home, df_wp_away, on='TEAM_ID')

    df_ap_home = get_average_points_scored_for_team(games_df, team_id, False)
    df_ap_away = get_average_points_scored_for_team(games_df, team_id, True)
    merged_ap_df = pd.merge(df_ap_home, df_ap_away, on='TEAM_ID')
    
    merged_stats = pd.merge(merged_wp_df, merged_ap_df, on='TEAM_ID')
    return merged_stats

In [26]:
def get_nba_team_ids(games_df):
    teams_ids = set()
    for i, game in games_df.iterrows():
        teams_ids.add(game["TEAM_ID"])    
    return teams_ids

In [111]:
teams_ids = get_nba_team_ids(games_df)

In [125]:
def get_teams_stats(teams_ids, games_df):
    teams_stats = []
    for team_id in teams_ids:
        teams_stats.append(get_stats_for_team(team_id, games_df))
    teams_stats = pd.concat(teams_stat).reset_index().drop("index", axis=1)
    return teams_stats

In [126]:
teams_stats = teams_stats(teams_ids, games_df)

  teams_stats = pd.concat(teams_stat).reset_index().drop("index", axis=1)


In [127]:
len(teams_ids) # Some teams will be dropped

79

In [128]:
teams_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62 entries, 0 to 61
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   TEAM_ID                        62 non-null     int64  
 1   win_percentage_away            62 non-null     float64
 2   win_percentage_at_home         62 non-null     float64
 3   average_points_scored_away     62 non-null     float64
 4   average_points_scored_at_home  62 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 2.5 KB


In [129]:
teams_stats.head() 

Unnamed: 0,TEAM_ID,win_percentage_away,win_percentage_at_home,average_points_scored_away,average_points_scored_at_home
0,1610612737,0.340909,0.545455,114.522727,120.886364
1,1610612738,0.673913,0.891304,117.173913,122.152174
2,1610612739,0.488889,0.652174,109.444444,113.391304
3,1610612740,0.630435,0.478261,113.782609,112.434783
4,1610612741,0.431818,0.488889,112.0,112.088889


In [41]:
def get_home_winner(winner_str):
    if winner_str == "W":
        return 1
    return 0

In [53]:
teams_stats[teams_stats["TEAM_ID"] == 1610612737].drop()


Unnamed: 0,TEAM_ID,win_percentage_away,win_percentage_at_home,average_points_scored_away,average_points_scored_at_home
0,1610612737,0.340909,0.545455,114.522727,120.886364


In [56]:
home_team_stats = teams_stats[teams_stats["TEAM_ID"] == 1610612737].drop("TEAM_ID", axis=1)
away_team_stats = teams_stats[teams_stats["TEAM_ID"] == 1610612739].drop("TEAM_ID", axis=1)
pd.concat([home_team_stats, away_team_stats], axis=1)



Unnamed: 0,win_percentage_away,win_percentage_at_home,average_points_scored_away,average_points_scored_at_home,win_percentage_away.1,win_percentage_at_home.1,average_points_scored_away.1,average_points_scored_at_home.1
0,0.340909,0.545455,114.522727,120.886364,0.488889,0.652174,109.444444,113.391304


In [106]:
def get_dataset():
    dataset = []
    
    for i, game in games_df.iterrows():
        # get home and away teams
        home_team_id = game["TEAM_ID"]
        game_ids = games_df[games_df["GAME_ID"] == game["GAME_ID"]]
        away_team_id = game_ids[game_ids["TEAM_ID"] != team_id]["TEAM_ID"].values[0]
    
        # get home and away teams stats
        home_team_stats = teams_stats[teams_stats["TEAM_ID"] == home_team_id].drop("TEAM_ID", axis=1).rename(columns = {
            'win_percentage_away': 'home_team_win_percentage_away', 
            'win_percentage_at_home': 'home_team_win_percentage_at_home',
            'average_points_scored_away': 'home_team_average_points_scored_away',
            'average_points_scored_at_home': 'home_team_average_points_scored_at_home'})
    
        away_team_stats = teams_stats[teams_stats["TEAM_ID"] == away_team_id].drop("TEAM_ID", axis=1).rename(columns = {
            'win_percentage_away': 'away_team_win_percentage_away', 
            'win_percentage_at_home': 'away_team_win_percentage_at_home',
            'average_points_scored_away': 'away_team_average_points_scored_away',
            'average_points_scored_at_home': 'away_team_average_points_scored_at_home'})
        
        merged_stats = pd.concat([home_team_stats, away_team_stats], axis=1)
        
        
        # add other fields
        home_won = get_home_winner(game["WL"])
        merged_stats.insert(0, "home_team_won", home_won, True)

        dataset.append(
            merged_stats
        )
    
    return pd.concat(dataset).reset_index().drop("index", axis=1).dropna()
    

In [107]:
dataset = get_dataset()

In [109]:
dataset.head(10)

Unnamed: 0,home_team_won,home_team_win_percentage_away,home_team_win_percentage_at_home,home_team_average_points_scored_away,home_team_average_points_scored_at_home,away_team_win_percentage_away,away_team_win_percentage_at_home,away_team_average_points_scored_away,away_team_average_points_scored_at_home
0,1,0.468085,0.673913,119.319149,124.76087,0.468085,0.673913,119.319149,124.76087
1,0,0.413043,0.744681,115.369565,120.212766,0.468085,0.673913,119.319149,124.76087
2,1,0.555556,0.625,113.977778,108.583333,0.555556,0.625,113.977778,108.583333
3,0,0.521739,0.595745,113.326087,114.723404,0.555556,0.625,113.977778,108.583333
4,1,0.586957,0.6,116.0,117.511111,0.586957,0.6,116.0,117.511111
5,1,0.673913,0.891304,117.173913,122.152174,0.673913,0.891304,117.173913,122.152174
6,0,0.627907,0.583333,115.325581,113.083333,0.586957,0.6,116.0,117.511111
7,0,0.531915,0.531915,107.12766,111.297872,0.673913,0.891304,117.173913,122.152174
8,0,0.555556,0.625,113.977778,108.583333,0.555556,0.625,113.977778,108.583333
9,1,0.521739,0.595745,113.326087,114.723404,0.555556,0.625,113.977778,108.583333


In [104]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4272 entries, 0 to 4282
Data columns (total 8 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   home_team_win_percentage_away            4272 non-null   float64
 1   home_team_win_percentage_at_home         4272 non-null   float64
 2   home_team_average_points_scored_away     4272 non-null   float64
 3   home_team_average_points_scored_at_home  4272 non-null   float64
 4   away_team_win_percentage_away            4272 non-null   float64
 5   away_team_win_percentage_at_home         4272 non-null   float64
 6   away_team_average_points_scored_away     4272 non-null   float64
 7   away_team_average_points_scored_at_home  4272 non-null   float64
dtypes: float64(8)
memory usage: 300.4 KB


In [76]:
nba_teams.head()

Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


In [91]:
def get_stats_of_teams(home_team: str, away_team: str):
    home_team_id = nba_teams[nba_teams["full_name"]==home_team]["id"].item()
    away_team_id = nba_teams[nba_teams["full_name"]==away_team]["id"].item()
    
    # get home and away teams stats
    home_team_stats = teams_stats[teams_stats["TEAM_ID"] == home_team_id].drop("TEAM_ID", axis=1).rename(columns = {
        'win_percentage_away': 'home_team_win_percentage_away', 
        'win_percentage_at_home': 'home_team_win_percentage_at_home',
        'average_points_scored_away': 'home_team_average_points_scored_away',
        'average_points_scored_at_home': 'home_team_average_points_scored_at_home'})

    away_team_stats = teams_stats[teams_stats["TEAM_ID"] == away_team_id].drop("TEAM_ID", axis=1).rename(columns = {
        'win_percentage_away': 'away_team_win_percentage_away', 
        'win_percentage_at_home': 'away_team_win_percentage_at_home',
        'average_points_scored_away': 'away_team_average_points_scored_away',
        'average_points_scored_at_home': 'away_team_average_points_scored_at_home'})
    
    return pd.concat([home_team_stats, away_team_stats], axis=1)
    
    

In [97]:
get_stats_of_teams("Atlanta Hawks","Boston Celtics")

Unnamed: 0,home_team_win_percentage_away,home_team_win_percentage_at_home,home_team_average_points_scored_away,home_team_average_points_scored_at_home,away_team_win_percentage_away,away_team_win_percentage_at_home,away_team_average_points_scored_away,away_team_average_points_scored_at_home
0,0.340909,0.545455,114.522727,120.886364,0.673913,0.891304,117.173913,122.152174


In [None]:
! pip install scikit-learn

In [101]:
from sklearn.preprocessing import StandardScaler

features_to_standardize = [
    "home_team_win_percentage_away", 
    "home_team_win_percentage_at_home", 
    "home_team_average_points_scored_away", 
    "home_team_average_points_scored_at_home",
    "away_team_win_percentage_away", 
    "away_team_win_percentage_at_home", 
    "away_team_average_points_scored_away", 
    "away_team_average_points_scored_at_home"
]

scaler = StandardScaler()

dataset[features_to_standardize] = scaler.fit_transform(dataset[features_to_standardize])

print(dataset)


      home_team_win_percentage_away  home_team_win_percentage_at_home  \
0                          0.095167                          0.780598   
1                         -0.278001                          1.219021   
2                          0.688194                          0.477570   
3                          0.458928                          0.296327   
4                          0.901084                          0.322690   
...                             ...                               ...   
4278                       0.901084                          0.322690   
4279                       1.490627                          1.286393   
4280                       0.901084                          0.322690   
4281                       1.490627                          1.286393   
4282                      -2.807138                         -2.017732   

      home_team_average_points_scored_away  \
0                                 1.477201   
1                              