In [57]:
import pandas as pd
from pathlib import Path
import io, requests

All columns from the kaggle dataset:

`Index(['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home',
       'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away',
       'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away',
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away',
       'season_type'],
      dtype='object')`

In [128]:
# ---------------------------------------------------------------------
# 1.  Load Kaggle game metadata (date, season, id)
# ---------------------------------------------------------------------
CSV_DIR   = Path("data/csv")
DATA_DIR  = Path("data")

game_df = pd.read_csv(CSV_DIR / "game.csv",
                      usecols=["game_id", "game_date", "season_id", "team_id_home", "wl_home", 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'plus_minus_home', 'team_id_away',
       'team_name_away', 'matchup_away', 
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away'])
game_df["date_game"] = pd.to_datetime(game_df["game_date"]).dt.date

In [129]:
# ---------------------------------------------------------------------
# 2.  Load Kaggle line scores (already 1 row per TEAM per GAME)
# ---------------------------------------------------------------------
line_df = pd.read_csv(CSV_DIR / "line_score.csv",
                      usecols=["game_id", "team_abbreviation_home", "team_abbreviation_away", "pts_home"])

# bring in the date + season
kaggle_long = line_df.merge(game_df,
                            on="game_id", how="inner")

In [124]:
# ---------------------------------------------------------------------
# 3.  Load & reshape FiveThirtyEight Elo → long format
# ---------------------------------------------------------------------
elo_538_raw = pd.read_csv(DATA_DIR / "nbaallelo.csv")
elo_538_raw["date_game"] = pd.to_datetime(elo_538_raw["date_game"]).dt.date

elo_team1 = (
    elo_538_raw[["date_game", "team_id", "elo_i", "elo_n"]]
      .rename(columns={"team_id": "TEAM_ABBREVIATION",
                       "elo_i": "elo_pre_538",
                       "elo_n": "elo_post_538"})
)
elo_team2 = (
    elo_538_raw[["date_game", "opp_id", "opp_elo_i", "opp_elo_n"]]
      .rename(columns={"opp_id": "TEAM_ABBREVIATION",
                       "opp_elo_i": "elo_pre_538",
                       "opp_elo_n": "elo_post_538"})
)

elo_538_long = pd.concat([elo_team1, elo_team2], ignore_index=True)

In [130]:
kaggle_long.head()

Unnamed: 0,game_id,team_abbreviation_home,pts_home,team_abbreviation_away,season_id,team_id_home,game_date,wl_home,fgm_home,fga_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,date_game
0,24600001,HUS,66.0,NYK,21946,1610610035,1946-11-01 00:00:00,L,25.0,,...,,,,,,,,68.0,2,1946-11-01
1,24600003,BOM,56.0,PIT,21946,1610610034,1946-11-02 00:00:00,W,20.0,59.0,...,,,,,,,25.0,51.0,-5,1946-11-02
2,24600002,BOS,53.0,PRO,21946,1610610032,1946-11-02 00:00:00,W,21.0,,...,,,,,,,,53.0,-6,1946-11-02
3,24600004,CHS,63.0,NYK,21946,1610610025,1946-11-02 00:00:00,W,21.0,,...,,,,,,,22.0,47.0,-16,1946-11-02
4,24600005,WAS,50.0,DEF,21946,1610610028,1946-11-02 00:00:00,L,10.0,,...,,,,,,,,50.0,17,1946-11-02


In [132]:
# ---------------------------------------------------------------------
# 4.  Reconcile historical team codes (minimal starter dict)
# ---------------------------------------------------------------------
alias = {
    "NJN": "BKN", "BRK": "BKN",   # Nets
    "NOH": "NOP", "NOK": "NOP",   # Pelicans
    "CHH": "CHA",                 # Old Hornets
    "SEA": "OKC",                 # Sonics → Thunder
}
kaggle_long["team_abbreviation_home"] = kaggle_long["team_abbreviation_home"].replace(alias)

kaggle_long.rename(columns={"team_abbreviation_home": "TEAM_ABBREVIATION"}, inplace=True)

elo_538_long.rename(columns={"team_id": "TEAM_ABBREVIATION"}, inplace=True)
elo_538_long["TEAM_ABBREVIATION"] = elo_538_long["TEAM_ABBREVIATION"].replace(alias)


In [134]:
# ---------------------------------------------------------------------
# 5.  Merge on date + franchise code
# ---------------------------------------------------------------------
merged = kaggle_long.merge(
    elo_538_long,
    on=["date_game", "TEAM_ABBREVIATION"],
    how="inner"
)

In [135]:
merged.head(-10)

Unnamed: 0,game_id,TEAM_ABBREVIATION,pts_home,team_abbreviation_away,season_id,team_id_home,game_date,wl_home,fgm_home,fga_home,...,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,date_game,elo_pre_538,elo_post_538
0,24600002,BOS,53.0,PRO,21946,1610610032,1946-11-02 00:00:00,W,21.0,,...,,,,,,53.0,-6,1946-11-02,1300.0000,1294.8458
1,24600002,BOS,53.0,PRO,21946,1610610032,1946-11-02 00:00:00,W,21.0,,...,,,,,,53.0,-6,1946-11-02,1300.0000,1294.8458
2,24600004,CHS,63.0,NYK,21946,1610610025,1946-11-02 00:00:00,W,21.0,,...,,,,,22.0,47.0,-16,1946-11-02,1300.0000,1309.6521
3,24600004,CHS,63.0,NYK,21946,1610610025,1946-11-02 00:00:00,W,21.0,,...,,,,,22.0,47.0,-16,1946-11-02,1300.0000,1309.6521
4,24600008,BOS,55.0,CHS,21946,1610612738,1946-11-05 00:00:00,L,23.0,,...,,,,,,57.0,2,1946-11-05,1294.8458,1288.4139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80717,41400303,CLE,114.0,ATL,42014,1610612739,2015-05-24 00:00:00,W,40.0,97.0,...,20.0,9.0,3.0,7.0,25.0,111.0,-3,2015-05-24,1701.5128,1703.7124
80718,41400314,HOU,128.0,GSW,42014,1610612745,2015-05-25 00:00:00,W,43.0,76.0,...,26.0,9.0,9.0,15.0,29.0,115.0,-13,2015-05-25,1643.4324,1658.9934
80719,41400314,HOU,128.0,GSW,42014,1610612745,2015-05-25 00:00:00,W,43.0,76.0,...,26.0,9.0,9.0,15.0,29.0,115.0,-13,2015-05-25,1643.4324,1658.9934
80720,41400304,CLE,118.0,ATL,42014,1610612739,2015-05-26 00:00:00,W,43.0,87.0,...,22.0,4.0,3.0,8.0,20.0,88.0,-30,2015-05-26,1703.7124,1712.1232


In [136]:
# ---------------------------------------------------------------------
# 6.  Save the merged DataFrame to a CSV file
# ---------------------------------------------------------------------
merged.to_csv("data/raw/combined_kaggle_538_elo.csv", index=False)
