# Transform and Load Stages

In [1]:
import os
import time
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from pandas import DataFrame
from rich.pretty import pprint
from sqlalchemy import (
    Boolean,
    Column,
    Float,
    ForeignKey,
    Integer,
    MetaData,
    String,
    Table,
    create_engine,
)
from sqlalchemy.engine import URL

### Read in the CSVs

In [2]:
# Directory that holds every input file.
data_dir = Path.cwd() / "data"

# Team Stats ---------------------------------------------------------------------------
# One stat type per line; the final element of the split is always discarded.
stat_types = (data_dir / "stat_types_and_seasons.txt").read_text().split("\n")[:-1]

# Playoff CSVs share the regular-season file names, prefixed with "playoffs_".
season_file_prefixes = {"Regular Season": "", "Playoffs": "playoffs_"}

all_stats = {}
for season_type, file_prefix in season_file_prefixes.items():
    stats = {}
    for stat_type in stat_types:
        # "Teams General Misc" -> "teams_general_misc"
        file_stem = file_prefix + "_".join(stat_type.split()).lower()
        stats[stat_type] = pd.read_csv(data_dir / f"{file_stem}.csv")
    all_stats[season_type] = stats

team_stats = all_stats["Regular Season"]
team_stats_playoffs = all_stats["Playoffs"]

# Playoffs -----------------------------------------------------------------------------
playoff_teams_filepath = data_dir / "playoff_teams.csv"
playoff_teams = pd.read_csv(playoff_teams_filepath)

# Champions ----------------------------------------------------------------------------
champions_filepath = data_dir / "champions.csv"
champions = pd.read_csv(champions_filepath)

### Test That the Data Loaded Correctly

In [3]:
# Report the row count of every regular-season table.
for stat_type, stat_table in team_stats.items():
    row_count = len(stat_table)
    print(f"Regular Season {stat_type} length: {row_count}")

# Show one table as a sample.
team_stats["Teams General Advanced"]

Regular Season Teams General Traditional length: 832
Regular Season Teams General Advanced length: 832
Regular Season Teams General Misc length: 832
Regular Season Teams Clutch Traditional length: 832


Unnamed: 0,SEASON,TEAM,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,...,ASTRATIO,OREB%,DREB%,REB%,TOV%,EFG%,TS%,PACE,PIE,POSS
0,2023-24,Boston Celtics,51,39,12,2473.0,120.3,110.6,9.7,60.1,...,18.6,28.8,72.5,51.5,12.6,56.7,60.1,99.11,54.7,5103
1,2023-24,Denver Nuggets,52,36,16,2496.0,118.2,113.6,4.5,65.7,...,20.5,31.0,71.9,51.7,12.8,55.9,58.6,97.19,52.6,5051
2,2023-24,Minnesota Timberwolves,52,36,16,2511.0,114.7,108.6,6.1,64.0,...,19.2,27.8,72.9,51.4,15.1,56.4,59.9,98.15,54.3,5142
3,2023-24,Oklahoma City Thunder,51,35,16,2468.0,119.3,111.4,7.9,61.1,...,19.5,24.9,68.0,47.7,12.4,57.6,61.2,100.51,54.0,5163
4,2023-24,Cleveland Cavaliers,50,34,16,2410.0,115.8,110.2,5.6,63.0,...,19.3,28.9,72.3,51.1,13.8,55.7,58.7,98.78,53.4,4959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,1996-97,Philadelphia 76ers,82,22,60,3956.0,102.6,109.5,-6.9,56.4,...,15.2,35.9,65.4,50.4,17.9,47.0,51.8,97.05,45.3,8006
828,1996-97,Denver Nuggets,82,21,61,3986.0,103.1,109.7,-6.6,64.4,...,17.4,32.4,66.6,49.6,17.5,48.6,53.0,93.67,46.1,7776
829,1996-97,San Antonio Spurs,82,20,62,3946.0,102.1,110.8,-8.8,58.8,...,16.3,34.4,64.3,49.0,17.1,47.2,51.2,88.45,44.0,7268
830,1996-97,Boston Celtics,82,15,67,3981.0,102.9,110.1,-7.2,58.4,...,16.1,32.9,65.8,48.2,16.7,47.4,52.0,96.78,43.4,8014


In [4]:
# Report the row count of every playoff table.
for stat_type, stat_table in team_stats_playoffs.items():
    row_count = len(stat_table)
    print(f"Playoffs {stat_type} length: {row_count}")

# Show one table as a sample.
team_stats_playoffs["Teams General Advanced"]

Playoffs Teams General Traditional length: 432
Playoffs Teams General Advanced length: 432
Playoffs Teams General Misc length: 432
Playoffs Teams Clutch Traditional length: 421


Unnamed: 0,SEASON,TEAM,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,...,ASTRATIO,OREB%,DREB%,REB%,TOV%,EFG%,TS%,PACE,PIE,POSS
0,2022-23,Denver Nuggets,20,16,4,965.0,118.2,110.2,8.0,60.9,...,19.1,30.0,76.2,53.8,12.1,55.7,59.3,95.18,55.3,1919
1,2022-23,Miami Heat,23,13,10,1109.0,113.8,111.9,1.9,59.3,...,17.9,27.2,70.8,48.7,12.3,53.1,56.7,94.87,49.8,2189
2,2022-23,Boston Celtics,20,11,9,965.0,116.1,112.9,3.2,59.1,...,18.2,26.6,71.6,49.7,12.9,56.0,59.4,95.83,52.6,1929
3,2022-23,Los Angeles Lakers,16,8,8,773.0,112.7,110.3,2.4,60.7,...,18.2,26.1,70.9,49.8,12.0,52.9,57.3,99.04,53.2,1595
4,2022-23,Philadelphia 76ers,11,7,4,533.0,110.2,112.1,-2.0,52.0,...,15.2,29.6,76.6,52.7,13.3,51.0,55.5,92.08,48.3,1024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,1996-97,Portland Trail Blazers,4,1,3,192.0,102.6,109.9,-7.4,60.3,...,16.4,29.9,66.5,47.8,17.7,47.9,52.8,87.75,41.8,350
428,1996-97,Charlotte Hornets,3,0,3,144.0,107.1,117.7,-10.6,50.5,...,14.9,35.5,74.8,52.3,14.9,49.2,53.5,89.00,42.0,268
429,1996-97,Los Angeles Clippers,3,0,3,144.0,104.9,118.9,-13.9,50.0,...,14.0,26.8,63.5,43.2,13.6,47.6,52.7,88.17,36.2,264
430,1996-97,Minnesota Timberwolves,3,0,3,144.0,111.2,122.9,-11.7,60.0,...,18.6,31.6,63.6,46.1,10.4,48.7,51.6,90.00,42.5,269


In [5]:
playoff_teams

Unnamed: 0,2022-23,2021-22,2020-21,2019-20,2018-19,2017-18,2016-17,2015-16,2014-15,2013-14,...,2005-06,2004-05,2003-04,2002-03,2001-02,2000-01,1999-00,1998-99,1997-98,1996-97
0,Denver Nuggets,Golden State Warriors,Milwaukee Bucks,Los Angeles Lakers,Toronto Raptors,Golden State Warriors,Golden State Warriors,Cleveland Cavaliers,Golden State Warriors,San Antonio Spurs,...,Miami Heat,San Antonio Spurs,Detroit Pistons,San Antonio Spurs,Los Angeles Lakers,Los Angeles Lakers,Los Angeles Lakers,San Antonio Spurs,Chicago Bulls,Chicago Bulls
1,Miami Heat,Boston Celtics,Phoenix Suns,Miami Heat,Golden State Warriors,Cleveland Cavaliers,Cleveland Cavaliers,Golden State Warriors,Cleveland Cavaliers,Miami Heat,...,Dallas Mavericks,Detroit Pistons,Los Angeles Lakers,New Jersey Nets,New Jersey Nets,Philadelphia 76ers,Indiana Pacers,New York Knicks,Utah Jazz,Utah Jazz
2,Boston Celtics,Miami Heat,Atlanta Hawks,Boston Celtics,Milwaukee Bucks,Houston Rockets,Boston Celtics,Oklahoma City Thunder,Houston Rockets,Indiana Pacers,...,Detroit Pistons,Miami Heat,Indiana Pacers,Dallas Mavericks,Sacramento Kings,Milwaukee Bucks,Portland Trail Blazers,Indiana Pacers,Indiana Pacers,Houston Rockets
3,Los Angeles Lakers,Dallas Mavericks,LA Clippers,Denver Nuggets,Portland Trail Blazers,Boston Celtics,San Antonio Spurs,Toronto Raptors,Atlanta Hawks,Oklahoma City Thunder,...,Phoenix Suns,Phoenix Suns,Minnesota Timberwolves,Detroit Pistons,Boston Celtics,San Antonio Spurs,New York Knicks,Portland Trail Blazers,Los Angeles Lakers,Miami Heat
4,Philadelphia 76ers,Milwaukee Bucks,Brooklyn Nets,Toronto Raptors,Philadelphia 76ers,New Orleans Pelicans,Washington Wizards,Miami Heat,Los Angeles Clippers,Washington Wizards,...,Los Angeles Clippers,Seattle SuperSonics,New Jersey Nets,Sacramento Kings,Dallas Mavericks,Charlotte Hornets,Miami Heat,Utah Jazz,Charlotte Hornets,New York Knicks
5,New York Knicks,Phoenix Suns,Philadelphia 76ers,LA Clippers,Denver Nuggets,Philadelphia 76ers,Houston Rockets,San Antonio Spurs,Washington Wizards,Los Angeles Clippers,...,Cleveland Cavaliers,Dallas Mavericks,Sacramento Kings,Los Angeles Lakers,Charlotte Hornets,Toronto Raptors,Philadelphia 76ers,Los Angeles Lakers,San Antonio Spurs,Seattle SuperSonics
6,Phoenix Suns,Memphis Grizzlies,Utah Jazz,Milwaukee Bucks,Houston Rockets,Utah Jazz,Toronto Raptors,Portland Trail Blazers,Memphis Grizzlies,Portland Trail Blazers,...,San Antonio Spurs,Indiana Pacers,San Antonio Spurs,Philadelphia 76ers,Detroit Pistons,Dallas Mavericks,Phoenix Suns,Philadelphia 76ers,New York Knicks,Los Angeles Lakers
7,Golden State Warriors,Philadelphia 76ers,Denver Nuggets,Houston Rockets,Boston Celtics,Toronto Raptors,Utah Jazz,Atlanta Hawks,Chicago Bulls,Brooklyn Nets,...,New Jersey Nets,Washington Wizards,Miami Heat,Boston Celtics,San Antonio Spurs,Sacramento Kings,Utah Jazz,Atlanta Hawks,Seattle SuperSonics,Atlanta Hawks
8,Sacramento Kings,Minnesota Timberwolves,Dallas Mavericks,Oklahoma City Thunder,San Antonio Spurs,Indiana Pacers,LA Clippers,Charlotte Hornets,San Antonio Spurs,Atlanta Hawks,...,Los Angeles Lakers,Boston Celtics,New Orleans Hornets,Orlando Magic,Indiana Pacers,New York Knicks,Milwaukee Bucks,Detroit Pistons,Houston Rockets,Detroit Pistons
9,Atlanta Hawks,New Orleans Pelicans,Los Angeles Lakers,Utah Jazz,LA Clippers,Milwaukee Bucks,Atlanta Hawks,Indiana Pacers,Brooklyn Nets,Dallas Mavericks,...,Chicago Bulls,Houston Rockets,Dallas Mavericks,Portland Trail Blazers,Philadelphia 76ers,Utah Jazz,Sacramento Kings,Miami Heat,Miami Heat,Orlando Magic


In [6]:
champions

Unnamed: 0,SEASON,TEAM
0,2022-23,Denver Nuggets
1,2021-22,Golden State Warriors
2,2020-21,Milwaukee Bucks
3,2019-20,Los Angeles Lakers
4,2018-19,Toronto Raptors
5,2017-18,Golden State Warriors
6,2016-17,Golden State Warriors
7,2015-16,Cleveland Cavaliers
8,2014-15,Golden State Warriors
9,2013-14,San Antonio Spurs


<br>
<hr>
<br>

## Transform Stage
### Current Variables

#### Playoff Teams
* `playoff_teams`: ***DataFrame***:
    * each season is a column
    * each row value for a given column is a playoff team for that season
    * values within the same row have no relation to each other

#### Champions
* `champions`: ***DataFrame***:
    * each row is a season and the champion team from that season

#### Team Stats and Playoff Team Stats
* `team_stats` and `team_stats_playoffs`: ***dict*** of the form `{<stat_type>: <DataFrame>, ...}` with the following 4 stat_type keys:
    * "Teams General Traditional"
    * "Teams General Advanced"
    * "Teams General Misc"
    * "Teams Clutch Traditional"
    
### Transformations
* transformations are described in each block below

### New Variables Created From Current Variables
* `playoff_teams_long`: ***DataFrame***:
    * created from `playoff_teams`
    * creates a "long" format of the `playoff_teams` DataFrame with a season and team column
* `season_records`: ***DataFrame***:
    * created from `team_stats["Teams General Traditional"]`
    * collects the stat categories that the traditional, advanced, and misc tables have in common:
        * `GP`, `W`, `L`, `MIN`, `PLAYOFFS`, and `CHAMPION` columns
* `playoff_records`: ***DataFrame***:
    * created from `team_stats_playoffs["Teams General Traditional"]`
    * collects the stat categories that the traditional, advanced, and misc tables have in common:
        * `GP`, `W`, `L`, `MIN`, and `CHAMPION` columns
* `teams`: ***DataFrame***:
    * created from `team_stats["Teams General Traditional"]`
    * collects all the team names
* `seasons`: ***DataFrame***:
    * created from `team_stats["Teams General Traditional"]`
    * collects all the seasons

### Queries for Traditional, Advanced and Misc Stat Tables
* Because common categories will be stored only once and in a separate table (see `season_records` and `playoff_records`) from their original source table (the traditional tables from `team_stats` and `team_stats_playoffs`), these original source tables will have to be built back up via a SQL join when reading from the database:
    * Retrieve `team_stats["Teams General Traditional"]` using raw SQL:
        ```SQL
        SELECT *
          FROM season_records
               INNER JOIN teams_traditional
               ON season_records.TEAM = teams_traditional.TEAM
               AND season_records.SEASON = teams_traditional.SEASON
         WHERE season_records.SEASON <= 2022
        ```        
    * Retrieve `team_stats["Teams General Traditional"]` using SQLAlchemy and ORM mapped classes `SeasonRecords` and `TeamsTraditional`:
        ```python
        with app.app_context():
            query = (
                db.select(SeasonRecords, TeamsTraditional)
                .join(
                    TeamsTraditional,
                    (SeasonRecords.SEASON == TeamsTraditional.SEASON)
                    & (SeasonRecords.TEAM == TeamsTraditional.TEAM),
                )
                .where(SeasonRecords.SEASON <= 2022)
            )
            teams_traditional = pd.read_sql(sql=query, con=db.engine)
            teams_traditional = teams_traditional.drop(columns=["SEASON_1", "TEAM_1"])
        ```
        * See `query_db_prototype.ipynb` for more information on retrieving data from the database.

### Season Variables

In [7]:
# Season labels look like "2023-24"; parsing with "%Y-%y" and taking `.year`
# yields the second (ending) year of each range.
regular_season_labels = team_stats["Teams General Traditional"]["SEASON"].unique()
seasons = pd.to_datetime(regular_season_labels, format="%Y-%y").year

# Playoff seasons are kept as strings because they become column labels.
playoff_season_labels = team_stats_playoffs["Teams General Traditional"][
    "SEASON"
].unique()
playoff_seasons = (
    pd.to_datetime(playoff_season_labels, format="%Y-%y").year.astype("string")
)

### Transform `playoff_teams` and Create `playoff_teams_long`
#### Convert Column Names of `playoff_teams` to the Second Year as a String

In [8]:
# Store every team name with the pandas string dtype.
playoff_teams = playoff_teams.astype("string")
# Relabel the season columns with the ending year of each season.
# NOTE(review): this is a positional relabel; it assumes the CSV's columns are in
# the same order as the seasons in the playoff stats table. Confirm against the
# extract stage.
playoff_teams.columns = playoff_seasons
playoff_teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   2023    16 non-null     string
 1   2022    16 non-null     string
 2   2021    16 non-null     string
 3   2020    16 non-null     string
 4   2019    16 non-null     string
 5   2018    16 non-null     string
 6   2017    16 non-null     string
 7   2016    16 non-null     string
 8   2015    16 non-null     string
 9   2014    16 non-null     string
 10  2013    16 non-null     string
 11  2012    16 non-null     string
 12  2011    16 non-null     string
 13  2010    16 non-null     string
 14  2009    16 non-null     string
 15  2008    16 non-null     string
 16  2007    16 non-null     string
 17  2006    16 non-null     string
 18  2005    16 non-null     string
 19  2004    16 non-null     string
 20  2003    16 non-null     string
 21  2002    16 non-null     string
 22  2001    16 non-null     stri

In [9]:
playoff_teams.head()

Unnamed: 0,2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,...,2006,2005,2004,2003,2002,2001,2000,1999,1998,1997
0,Denver Nuggets,Golden State Warriors,Milwaukee Bucks,Los Angeles Lakers,Toronto Raptors,Golden State Warriors,Golden State Warriors,Cleveland Cavaliers,Golden State Warriors,San Antonio Spurs,...,Miami Heat,San Antonio Spurs,Detroit Pistons,San Antonio Spurs,Los Angeles Lakers,Los Angeles Lakers,Los Angeles Lakers,San Antonio Spurs,Chicago Bulls,Chicago Bulls
1,Miami Heat,Boston Celtics,Phoenix Suns,Miami Heat,Golden State Warriors,Cleveland Cavaliers,Cleveland Cavaliers,Golden State Warriors,Cleveland Cavaliers,Miami Heat,...,Dallas Mavericks,Detroit Pistons,Los Angeles Lakers,New Jersey Nets,New Jersey Nets,Philadelphia 76ers,Indiana Pacers,New York Knicks,Utah Jazz,Utah Jazz
2,Boston Celtics,Miami Heat,Atlanta Hawks,Boston Celtics,Milwaukee Bucks,Houston Rockets,Boston Celtics,Oklahoma City Thunder,Houston Rockets,Indiana Pacers,...,Detroit Pistons,Miami Heat,Indiana Pacers,Dallas Mavericks,Sacramento Kings,Milwaukee Bucks,Portland Trail Blazers,Indiana Pacers,Indiana Pacers,Houston Rockets
3,Los Angeles Lakers,Dallas Mavericks,LA Clippers,Denver Nuggets,Portland Trail Blazers,Boston Celtics,San Antonio Spurs,Toronto Raptors,Atlanta Hawks,Oklahoma City Thunder,...,Phoenix Suns,Phoenix Suns,Minnesota Timberwolves,Detroit Pistons,Boston Celtics,San Antonio Spurs,New York Knicks,Portland Trail Blazers,Los Angeles Lakers,Miami Heat
4,Philadelphia 76ers,Milwaukee Bucks,Brooklyn Nets,Toronto Raptors,Philadelphia 76ers,New Orleans Pelicans,Washington Wizards,Miami Heat,Los Angeles Clippers,Washington Wizards,...,Los Angeles Clippers,Seattle SuperSonics,New Jersey Nets,Sacramento Kings,Dallas Mavericks,Charlotte Hornets,Miami Heat,Utah Jazz,Charlotte Hornets,New York Knicks


#### Create `playoff_teams_long` DataFrame

In [10]:
# Unpivot: one row per (season, playoff team) pair.
playoff_teams_long = pd.melt(playoff_teams, var_name="SEASON", value_name="TEAM")
# The season labels were strings while they served as column names.
season_as_int = playoff_teams_long["SEASON"].astype("int32")
playoff_teams_long["SEASON"] = season_as_int
playoff_teams_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SEASON  432 non-null    int32 
 1   TEAM    432 non-null    string
dtypes: int32(1), string(1)
memory usage: 5.2 KB


In [11]:
playoff_teams_long.head()

Unnamed: 0,SEASON,TEAM
0,2023,Denver Nuggets
1,2023,Miami Heat
2,2023,Boston Celtics
3,2023,Los Angeles Lakers
4,2023,Philadelphia 76ers


### Transform `champions`

In [12]:
# Ending year of each "YYYY-YY" season label.
champion_season_years = pd.to_datetime(champions["SEASON"], format="%Y-%y").dt.year
champion_team_names = champions["TEAM"].astype("string")

champions["SEASON"] = champion_season_years
champions["TEAM"] = champion_team_names
champions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SEASON  27 non-null     int32 
 1   TEAM    27 non-null     string
dtypes: int32(1), string(1)
memory usage: 456.0 bytes


In [13]:
champions.head()

Unnamed: 0,SEASON,TEAM
0,2023,Denver Nuggets
1,2022,Golden State Warriors
2,2021,Milwaukee Bucks
3,2020,Los Angeles Lakers
4,2019,Toronto Raptors


### Column Name Map

In [14]:
# Maps column headers as they appear in the CSVs to the names used in the database.
# "\xa0" is a non-breaking space, so these keys must match the source headers
# character for character.
col_name_map = {
    "ASTRATIO": "AST_RATIO",
    "PTSOFF\xa0TO": "PTS_OFF_TO",
    "2NDPTS": "2ND_PTS",
    "OPP\xa0PTSOFF\xa0TO": "OPP_PTS_OFF_TO",
    "OPP2ND\xa0PTS": "OPP_2ND_PTS",
    "OPPFBPS": "OPP_FBPS",
    "OPPPITP": "OPP_PITP",
}

### Transform `team_stats`

#### Helper Function

In [15]:
def transform_team_stats_table(
    stats_table_input: DataFrame,
    col_mapper: dict[str, str],
    is_clutch_table: bool = False,
    include_records_table: bool = False,
) -> DataFrame | tuple[DataFrame, DataFrame]:
    """Transform regular season stat tables for loading into database."""
    stats_table = stats_table_input.copy()

    # Correct the multi-line column names via `col_mapper`.
    stats_table = stats_table.rename(columns=col_mapper)

    # Convert `SEASON` column's year range to an integer of the second year.
    stats_table["SEASON"] = pd.to_datetime(
        stats_table["SEASON"], format="%Y-%y"
    ).dt.year

    # Intialize PLAYOFFS and CHAMPION boolean columns
    stats_table["PLAYOFFS"] = False
    stats_table["CHAMPION"] = False

    # Loop over each season and that season's playoff teams
    for season, playoff_teams_df in playoff_teams.items():
        # Compare the teams from the `season` to `playoff_teams_df` from the same
        # `season`, the boolean will be True if the team was/is a playoff team.
        season = int(season)
        playoffs_boolean_column = stats_table.loc[
            stats_table["SEASON"] == season, "TEAM"
        ].isin(playoff_teams_df)
        stats_table.loc[
            stats_table["SEASON"] == season, "PLAYOFFS"
        ] = playoffs_boolean_column

        # Compare the teams from the `season` to the champion from the same `season`,
        # the boolean will be True if the team was the champion team.
        champion_series = champions.loc[
            champions["SEASON"] == season, "TEAM"
        ].reset_index(drop=True)
        champion = champion_series.loc[0]
        stats_table.loc[
            (stats_table["SEASON"] == season) & (stats_table["TEAM"] == champion),
            "CHAMPION",
        ] = True

    # Change int64 columns to int32.
    dtypes = stats_table.dtypes
    dtypes_int64 = dtypes.loc[dtypes == "int64"]
    stats_table[[*dtypes_int64.index]] = stats_table[[*dtypes_int64.index]].astype(
        "int32"
    )

    # Change "TEAM" column to string dtype.
    stats_table["TEAM"] = stats_table["TEAM"].astype("string")

    # Either return the clutch table as is or drop the common categories and possibly
    # add a records table.
    records_cols = ["SEASON", "TEAM", "GP", "W", "L", "MIN", "PLAYOFFS", "CHAMPION"]
    if is_clutch_table:
        return stats_table
    elif include_records_table:
        records_table = stats_table[records_cols]
        stats_table = stats_table.drop(columns=records_cols[2:])
        return stats_table, records_table
    else:
        stats_table = stats_table.drop(columns=records_cols[2:])
        return stats_table

#### Do the Transformation

In [16]:
# Replace every raw regular-season table with its transformed version. Only
# existing keys are reassigned, so iterating over the dict while updating is safe.
for stat_type, stats_table in team_stats.items():
    if stat_type == "Teams General Traditional":
        # The traditional table also yields the shared records table.
        transformed_table, season_records = transform_team_stats_table(
            stats_table, col_mapper=col_name_map, include_records_table=True
        )
    else:
        transformed_table = transform_team_stats_table(
            stats_table,
            col_mapper=col_name_map,
            is_clutch_table=(stat_type == "Teams Clutch Traditional"),
        )
    team_stats[stat_type] = transformed_table
team_stats["Teams General Advanced"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 832 entries, 0 to 831
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEASON     832 non-null    int32  
 1   TEAM       832 non-null    string 
 2   OFFRTG     832 non-null    float64
 3   DEFRTG     832 non-null    float64
 4   NETRTG     832 non-null    float64
 5   AST%       832 non-null    float64
 6   AST/TO     832 non-null    float64
 7   AST_RATIO  832 non-null    float64
 8   OREB%      832 non-null    float64
 9   DREB%      832 non-null    float64
 10  REB%       832 non-null    float64
 11  TOV%       832 non-null    float64
 12  EFG%       832 non-null    float64
 13  TS%        832 non-null    float64
 14  PACE       832 non-null    float64
 15  PIE        832 non-null    float64
 16  POSS       832 non-null    int32  
dtypes: float64(14), int32(2), string(1)
memory usage: 104.1 KB


#### Results

In [17]:
team_stats["Teams General Misc"].columns

Index(['SEASON', 'TEAM', 'PTS_OFF_TO', '2ND_PTS', 'FBPS', 'PITP',
       'OPP_PTS_OFF_TO', 'OPP_2ND_PTS', 'OPP_FBPS', 'OPP_PITP'],
      dtype='object')

In [18]:
team_stats["Teams Clutch Traditional"].head(5)

Unnamed: 0,SEASON,TEAM,GP,W,L,WIN%,MIN,PTS,FGM,FGA,...,AST,TOV,STL,BLK,BLKA,PF,PFD,+/-,PLAYOFFS,CHAMPION
0,2024,Dallas Mavericks,22,16,6,72.7,3.0,8.6,2.6,6.0,...,1.6,0.5,0.5,0.4,0.4,1.6,1.9,2.4,False,False
1,2024,Los Angeles Lakers,22,15,7,68.2,4.4,11.3,3.2,6.9,...,1.7,1.1,0.6,0.6,0.3,1.9,3.5,0.9,False,False
2,2024,Boston Celtics,24,16,8,66.7,3.9,10.0,2.9,6.3,...,1.7,0.9,0.4,0.8,0.3,2.0,2.2,2.4,False,False
3,2024,Denver Nuggets,27,18,9,66.7,3.4,8.9,3.0,5.8,...,1.8,0.9,0.6,0.3,0.3,1.8,2.2,2.1,False,False
4,2024,Milwaukee Bucks,27,18,9,66.7,3.7,10.1,2.8,6.1,...,1.7,1.0,0.4,0.4,0.4,1.7,2.8,1.7,False,False


In [19]:
season_records.head(5)

Unnamed: 0,SEASON,TEAM,GP,W,L,MIN,PLAYOFFS,CHAMPION
0,2024,Boston Celtics,51,39,12,48.5,False,False
1,2024,Denver Nuggets,52,36,16,48.0,False,False
2,2024,Minnesota Timberwolves,52,36,16,48.3,False,False
3,2024,Oklahoma City Thunder,51,35,16,48.4,False,False
4,2024,Cleveland Cavaliers,50,34,16,48.2,False,False


### Transform `team_stats_playoffs`
#### Helper Function

In [20]:
def transform_team_stats_playoffs_table(
    stats_table_input: DataFrame,
    col_mapper: dict[str, str],
    is_clutch_table: bool = False,
    include_records_table: bool = False,
) -> DataFrame | tuple[DataFrame, DataFrame]:
    """Transform playoff stat tables for loading into database."""
    stats_table = stats_table_input.copy()

    # Correct the multi-line column names via `col_mapper`.
    stats_table = stats_table.rename(columns=col_mapper)

    # Convert `SEASON` column's year range to an integer of the second year.
    stats_table["SEASON"] = pd.to_datetime(
        stats_table["SEASON"], format="%Y-%y"
    ).dt.year
    # Intialize CHAMPION boolean column
    stats_table["CHAMPION"] = False

    # Loop over each season and that season's playoff teams
    for season in playoff_seasons:
        # Compare the teams from the `season` to the champion from the same `season`,
        # the boolean will be True if the team was the champion team.
        season = int(season)
        champion_series = champions.loc[
            champions["SEASON"] == season, "TEAM"
        ].reset_index(drop=True)
        champion = champion_series.loc[0]
        stats_table.loc[
            (stats_table["SEASON"] == season) & (stats_table["TEAM"] == champion),
            "CHAMPION",
        ] = True

    # Change int64 columns to int32.
    dtypes = stats_table.dtypes
    dtypes_int64 = dtypes.loc[dtypes == "int64"]
    stats_table[[*dtypes_int64.index]] = stats_table[[*dtypes_int64.index]].astype(
        "int32"
    )

    # Change "TEAM" column to string dtype.
    stats_table["TEAM"] = stats_table["TEAM"].astype("string")

    # Either return the clutch table as is or drop the common categories and possibly
    # add a records table.
    records_cols = ["SEASON", "TEAM", "GP", "W", "L", "MIN", "CHAMPION"]
    if is_clutch_table:
        return stats_table
    elif include_records_table:
        records_table = stats_table[records_cols]
        stats_table = stats_table.drop(columns=records_cols[2:])
        return stats_table, records_table
    else:
        stats_table = stats_table.drop(columns=records_cols[2:])
        return stats_table

#### Do the Transformation

In [21]:
# Replace every raw playoff table with its transformed version. Only existing keys
# are reassigned, so iterating over the dict while updating is safe.
for stat_type, playoff_stats_table in team_stats_playoffs.items():
    if stat_type == "Teams General Traditional":
        # Collect the records table from the traditional stats table.
        transformed_table, playoff_records = transform_team_stats_playoffs_table(
            playoff_stats_table, col_mapper=col_name_map, include_records_table=True
        )
    else:
        # The clutch table is treated differently due to the different number of
        # games played compared to the other tables.
        transformed_table = transform_team_stats_playoffs_table(
            playoff_stats_table,
            col_mapper=col_name_map,
            is_clutch_table=(stat_type == "Teams Clutch Traditional"),
        )
    team_stats_playoffs[stat_type] = transformed_table

#### Results

In [22]:
team_stats_playoffs["Teams General Advanced"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEASON     432 non-null    int32  
 1   TEAM       432 non-null    string 
 2   OFFRTG     432 non-null    float64
 3   DEFRTG     432 non-null    float64
 4   NETRTG     432 non-null    float64
 5   AST%       432 non-null    float64
 6   AST/TO     432 non-null    float64
 7   AST_RATIO  432 non-null    float64
 8   OREB%      432 non-null    float64
 9   DREB%      432 non-null    float64
 10  REB%       432 non-null    float64
 11  TOV%       432 non-null    float64
 12  EFG%       432 non-null    float64
 13  TS%        432 non-null    float64
 14  PACE       432 non-null    float64
 15  PIE        432 non-null    float64
 16  POSS       432 non-null    int32  
dtypes: float64(14), int32(2), string(1)
memory usage: 54.1 KB


In [23]:
team_stats_playoffs["Teams General Advanced"].head(10)

Unnamed: 0,SEASON,TEAM,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,AST_RATIO,OREB%,DREB%,REB%,TOV%,EFG%,TS%,PACE,PIE,POSS
0,2023,Denver Nuggets,118.2,110.2,8.0,60.9,2.22,19.1,30.0,76.2,53.8,12.1,55.7,59.3,95.18,55.3,1919
1,2023,Miami Heat,113.8,111.9,1.9,59.3,2.01,17.9,27.2,70.8,48.7,12.3,53.1,56.7,94.87,49.8,2189
2,2023,Boston Celtics,116.1,112.9,3.2,59.1,1.95,18.2,26.6,71.6,49.7,12.9,56.0,59.4,95.83,52.6,1929
3,2023,Los Angeles Lakers,112.7,110.3,2.4,60.7,2.09,18.2,26.1,70.9,49.8,12.0,52.9,57.3,99.04,53.2,1595
4,2023,Philadelphia 76ers,110.2,112.1,-2.0,52.0,1.54,15.2,29.6,76.6,52.7,13.3,51.0,55.5,92.08,48.3,1024
5,2023,Golden State Warriors,110.9,111.1,-0.2,67.2,1.96,19.2,30.4,70.8,50.4,14.1,52.5,55.3,101.92,49.8,1325
6,2023,New York Knicks,107.7,107.8,0.0,53.1,1.28,14.6,34.8,70.7,52.5,15.8,49.2,53.7,92.73,50.0,1022
7,2023,Phoenix Suns,116.1,118.3,-2.2,57.0,2.03,18.2,27.0,68.1,48.1,12.3,55.0,58.8,98.41,49.5,1082
8,2023,Sacramento Kings,109.3,111.6,-2.2,53.7,1.56,15.1,33.9,70.0,51.3,13.6,49.3,53.2,103.93,46.7,728
9,2023,Atlanta Hawks,114.3,118.8,-4.5,55.7,1.91,17.3,29.7,76.4,51.1,12.7,52.9,55.8,101.67,45.8,608


In [24]:
playoff_records.head()

Unnamed: 0,SEASON,TEAM,GP,W,L,MIN,CHAMPION
0,2023,Denver Nuggets,20,16,4,48.3,True
1,2023,Philadelphia 76ers,11,7,4,48.5,False
2,2023,Miami Heat,23,13,10,48.2,False
3,2023,Boston Celtics,20,11,9,48.3,False
4,2023,New York Knicks,11,6,5,48.0,False


### Create `teams` and `seasons` DataFrames
#### `teams` DataFrame

In [25]:
# `df` is reused below to build the `seasons` table.
df = team_stats["Teams General Traditional"].copy()

# One row per distinct team name, in order of first appearance.
unique_team_names = df["TEAM"].unique()
teams = pd.DataFrame({"TEAM": unique_team_names})

#### Results

In [26]:
teams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   TEAM    38 non-null     string
dtypes: string(1)
memory usage: 436.0 bytes


In [27]:
len(teams)

38

In [28]:
teams.head()

Unnamed: 0,TEAM
0,Boston Celtics
1,Denver Nuggets
2,Minnesota Timberwolves
3,Oklahoma City Thunder
4,Cleveland Cavaliers


#### `seasons` DataFrame

In [29]:
# One row per distinct season, in order of first appearance.
unique_seasons = df["SEASON"].unique()
seasons = pd.DataFrame({"SEASON": unique_seasons})

#### Results

In [30]:
seasons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   SEASON  28 non-null     int32
dtypes: int32(1)
memory usage: 244.0 bytes


In [31]:
seasons.head()

Unnamed: 0,SEASON
0,2024
1,2023
2,2022
3,2021
4,2020


<br>
<hr>
<br>

## Load Stage
### Build Dict Containing All DataFrames and a Mapping of Types

In [32]:
# Keys will be table names.
# `teams` and `seasons` come first: the other tables declare foreign keys to them,
# and the tables are loaded in this dict's order.
tables = {
    "teams": teams,
    "seasons": seasons,
    "season_records": season_records,
    "playoff_records": playoff_records,
    "teams_traditional": team_stats["Teams General Traditional"],
    "teams_advanced": team_stats["Teams General Advanced"],
    "teams_misc": team_stats["Teams General Misc"],
    "teams_clutch": team_stats["Teams Clutch Traditional"],
    "playoffs_traditional": team_stats_playoffs["Teams General Traditional"],
    "playoffs_advanced": team_stats_playoffs["Teams General Advanced"],
    "playoffs_misc": team_stats_playoffs["Teams General Misc"],
    "playoffs_clutch": team_stats_playoffs["Teams Clutch Traditional"],
    "playoff_teams": playoff_teams,
    "playoff_teams_long": playoff_teams_long,
    "champions": champions,
}

# Mapping of DataFrame data types to SQLAlchemy data types.
# Keyed by `str(dtype)`; a column with any other dtype raises a KeyError when the
# Column objects are built.
type_map = {
    "int32": Integer,
    "string": String,
    "float64": Float,
    "bool": Boolean,
}

### Build SQLAlchemy `Column` Objects for each Table

In [33]:
# Key columns and the lookup table that owns each of them. The owning table
# declares the column as a typed primary key; every other table declares it as a
# primary-key column that is a foreign key to the owner.
key_owners = {"TEAM": "teams", "SEASON": "seasons"}

table_columns = {}
for name, table in tables.items():
    columns = []
    for col_name, dtype in table.dtypes.items():
        owner = key_owners.get(col_name)
        if owner is None:
            # Plain data column.
            column = Column(col_name, type_map[str(dtype)])
        elif name == owner:
            column = Column(col_name, type_map[str(dtype)], primary_key=True)
        else:
            column = Column(
                col_name, ForeignKey(f"{owner}.{col_name}"), primary_key=True
            )
        columns.append(column)
    table_columns[name] = columns

#### Verify Table Columns

In [34]:
table_columns.keys()

dict_keys(['teams', 'seasons', 'season_records', 'playoff_records', 'teams_traditional', 'teams_advanced', 'teams_misc', 'teams_clutch', 'playoffs_traditional', 'playoffs_advanced', 'playoffs_misc', 'playoffs_clutch', 'playoff_teams', 'playoff_teams_long', 'champions'])

In [35]:
# Pretty-print the Column objects of each table under the table's name.
for name, columns in table_columns.items():
    header = f"\n{name}"
    print(header)
    pprint(columns, indent_guides=False, expand_all=True)


teams



seasons



season_records



playoff_records



teams_traditional



teams_advanced



teams_misc



teams_clutch



playoffs_traditional



playoffs_advanced



playoffs_misc



playoffs_clutch



playoff_teams



playoff_teams_long



champions


### Construct Database URI

In [36]:
# Create environment variables from .env.
load_dotenv(dotenv_path=Path.cwd().parents[1] / "src" / ".env")

# Host, port and database values.
db = os.environ.get("PGDATABASE")
host = os.environ.get("PGHOST")
port = os.environ.get("PGPORT")

# User credentials.
user = os.environ.get("PGUSER")
pwd = os.environ.get("PGPASSWORD")

# Build the database URL from its components rather than by string formatting:
# special characters in the credentials (e.g. "@", ":" or "/") are then handled
# correctly, and an unset variable is omitted instead of being embedded as the
# literal text "None".
db_uri = URL.create(
    drivername="postgresql",
    username=user,
    password=pwd,
    host=host,
    port=int(port) if port else None,
    database=db,
)

### Create Database Tables

In [37]:
# Create the engine and an empty metadata collection.
engine = create_engine(db_uri)
metadata = MetaData()

# Constructing a Table registers it on `metadata`; the object itself is not kept.
for name in tables:
    Table(name, metadata, *table_columns[name])

# Start from a clean slate: drop any of these tables that already exist, then
# create them from the metadata.
metadata.drop_all(engine)
metadata.create_all(engine)

### Load Database

In [38]:
start_time = time.perf_counter()
# `engine.begin()` opens a single transaction that is committed when the block
# exits normally and rolled back if any table fails to load. Passing the
# connection (not the engine) to `to_sql` makes every insert part of it.
with engine.begin() as connection:
    for name, table in tables.items():
        table.to_sql(name=name, con=connection, if_exists="append", index=False)

print(f"Database loading complete. {time.perf_counter() - start_time:.2f}s elapsed.")

Database loading complete. 10.90s elapsed.
