In [1]:
import duckdb
import polars as pl
from rich import print
from IPython.display import display
%load_ext rich
con = duckdb.connect()

For the sake of simplicty, we clean out certain aspects of the data before:

In [2]:
df = pl.read_csv('all_seasons.csv')
df = df.unique(subset = ["player_name","season"])
qr = """
SELECT
    player_name,
    age,
    team_abbreviation,
    college,
    player_height,
    player_weight,
    draft_year,
    draft_round,
    draft_number,
    season
    FROM df,
    LIMIT 5;
"""
con.sql(qr)


┌─────────────────┬────────┬───────────────────┬───────────────────────┬───────────────┬────────────────────┬────────────┬─────────────┬──────────────┬─────────┐
│   player_name   │  age   │ team_abbreviation │        college        │ player_height │   player_weight    │ draft_year │ draft_round │ draft_number │ season  │
│     varchar     │ double │      varchar      │        varchar        │    double     │       double       │  varchar   │   varchar   │   varchar    │ varchar │
├─────────────────┼────────┼───────────────────┼───────────────────────┼───────────────┼────────────────────┼────────────┼─────────────┼──────────────┼─────────┤
│ Anthony Johnson │   [1;36m34.0[0m │ ORL               │ College of Charleston │         [1;36m190.5[0m │           [1;36m88.45044[0m │ [1;36m1997[0m       │ [1;36m2[0m           │ [1;36m39[0m           │ [1;36m2008[0m-[1;36m09[0m │
│ Joel Bolomboy   │   [1;36m23.0[0m │ UTA               │ Weber State           │        [1;36m205

In [3]:
qr = """
DROP TABLE IF EXISTS players_seasons;

DROP TYPE IF EXISTS season_stats;
CREATE TYPE season_stats AS STRUCT(
    season INTEGER,
    gp INTEGER,
    pts FLOAT,
    reb FLOAT,
    ast FLOAT
);

CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season, team_abbreviation, college)
)
"""
con.sql(qr)

In [4]:
qr="""
SELECT
    MIN(season[:4]::INTEGER) AS min_year,
    MAX(season[:4]::INTEGER) AS max_year,
    MIN(season) AS min_year_text,
    MAX(season) AS max_year_text,
FROM read_csv('all_seasons.csv')
"""
dfr = con.sql(qr).pl()
print(dfr)


In [5]:
qr = """
FROM players
LIMIT 5
"""
con.sql(qr)


┌─────────────┬───────┬───────────────────┬─────────┬───────────────┬───────────────┬────────────┬─────────────┬──────────────┬───────────────────────────────────────────────────────────────────────┬────────────────┐
│ player_name │  age  │ team_abbreviation │ college │ player_height │ player_weight │ draft_year │ draft_round │ draft_number │                             season_stats                              │ current_season │
│   varchar   │ int32 │      varchar      │ varchar │     float     │     float     │  varchar   │   varchar   │   varchar    │ [1;35mstruct[0m[1m([0mseason integer, gp integer, pts float, reb float, ast float[1m)[0m[1m[[0m[1m][0m │     int32      │
├─────────────┴───────┴───────────────────┴─────────┴───────────────┴───────────────┴────────────┴─────────────┴──────────────┴───────────────────────────────────────────────────────────────────────┴────────────────┤
│                                                                                       

In [6]:
qr="""
WITH today AS (
    SELECT * FROM df
    WHERE season='1996-97'
)
FROM today
LIMIT 5
"""
con.sql(qr)


┌───────┬─────────────────┬───────────────────┬────────┬───────────────┬────────────────────┬──────────────────┬─────────┬────────────┬─────────────┬──────────────┬───────┬────────┬────────┬────────┬────────────┬──────────────────────┬─────────────────────┬─────────┬────────┬─────────┬─────────┐
│  v0   │   player_name   │ team_abbreviation │  age   │ player_height │   player_weight    │     college      │ country │ draft_year │ draft_round │ draft_number │  gp   │  pts   │  reb   │  ast   │ net_rating │       oreb_pct       │      dreb_pct       │ usg_pct │ ts_pct │ ast_pct │ season  │
│ int64 │     varchar     │      varchar      │ double │    double     │       double       │     varchar      │ varchar │  varchar   │   varchar   │   varchar    │ int64 │ double │ double │ double │   double   │        double        │       double        │ double  │ double │ double  │ varchar │
├───────┼─────────────────┼───────────────────┼────────┼───────────────┼────────────────────┼───────────────

In [7]:
qr = """
WITH yesterday AS (
    SELECT * FROM players
    WHERE current_season={current_season}
), today AS (
    SELECT * FROM df
    WHERE season='{season}'
)
SELECT
    COALESCE(t.player_name, y.player_name) AS player_name,
    COALESCE(t.age, y.age) AS age,
    COALESCE(t.team_abbreviation, y.team_abbreviation) AS team_abbreviation,
    COALESCE(t.college, y.college) AS college,
    COALESCE(t.player_height, y.player_height) AS player_height,
    COALESCE(t.player_weight, y.player_weight) AS player_weight,
    COALESCE(t.draft_year, y.draft_year) AS draft_year,
    COALESCE(t.draft_round, y.draft_round) AS draft_round,
    COALESCE(t.draft_number, y.draft_number) AS draft_number,
    -- Update for season_stats
    CASE WHEN y.season_stats IS NULL
        THEN ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    WHEN t.season IS NOT NULL
        THEN y.season_stats || ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    ELSE y.season_stats
    END AS season_stats,
    -- Update for current_season,
    COALESCE(t.season[:4]::INTEGER, y.current_season+1) AS current_season
FROM today t
FULL OUTER JOIN yesterday y
ON t.player_name = y.player_name;
"""
query = qr.format(current_season=1995, season='1996-97')
print(query)
dfr = con.sql(query).pl()
display(df.head())

Unnamed: 0_level_0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,draft_number,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
i64,str,str,f64,f64,f64,str,str,str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
5773,"""Anthony Johnson""","""ORL""",34.0,190.5,88.45044,"""College of Charleston""","""USA""","""1997""","""2""","""39""",80,5.3,1.8,2.5,4.0,0.023,0.089,0.157,0.505,0.223,"""2008-09"""
9488,"""Joel Bolomboy""","""UTA""",23.0,205.74,106.59412,"""Weber State""","""Ukraine""","""2016""","""2""","""52""",12,1.8,1.4,0.2,11.3,0.083,0.255,0.18,0.59,0.065,"""2016-17"""
4351,"""Earl Boykins""","""DEN""",30.0,165.1,60.327736,"""Eastern Michigan""","""USA""","""Undrafted""","""Undrafted""","""Undrafted""",60,12.6,1.4,3.8,-3.3,0.015,0.047,0.234,0.513,0.254,"""2005-06"""
1257,"""Jelani McCoy""","""SEA""",21.0,208.28,111.13004,"""UCLA""","""USA""","""1998""","""2""","""33""",26,5.1,3.0,0.2,-6.6,0.096,0.171,0.147,0.704,0.023,"""1998-99"""
9723,"""Terrence Ross""","""ORL""",27.0,200.66,93.439952,"""Washington""","""USA""","""2012""","""1""","""8""",24,8.7,3.0,1.6,-3.8,0.016,0.102,0.166,0.505,0.088,"""2017-18"""


In [8]:
query_insert = """INSERT INTO players""" + query
con.sql(query_insert)

In [9]:
for year in range(1997,2022):
    season = '-'.join([str(year+1),str(year+2)[2:]])
    query = qr.format(current_season=year, season=season)
    # print(query)
    query_insert = """INSERT INTO players""" + query
    # print(f"Inserted: {season}")
    con.sql(query_insert)

In [10]:
qr = """
SELECT * FROM players
WHERE current_season=2021
AND player_name='Kobe Bryant'
"""
con.sql(qr)



┌─────────────┬───────┬───────────────────┬─────────┬───────────────┬───────────────┬────────────┬─────────────┬──────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [11]:
qr = """
SELECT
    player_name,
    UNNEST(season_stats) FROM players
WHERE current_season=2021
AND player_name='Kobe Bryant'
"""
con.sql(qr)


┌─────────────┬─────────────────────────────────────────────────────────────────────┐
│ player_name │                        [1;35munnest[0m[1m([0mseason_stats[1m)[0m                         │
│   varchar   │ [1;35mstruct[0m[1m([0mseason integer, gp integer, pts float, reb float, ast float[1m)[0m │
├─────────────┼─────────────────────────────────────────────────────────────────────┤
│ Kobe Bryant │ [1m{[0m[32m'season'[0m: [1;36m1998[0m, [32m'gp'[0m: [1;36m50[0m, [32m'pts'[0m: [1;36m19.9[0m, [32m'reb'[0m: [1;36m5.3[0m, [32m'ast'[0m: [1;36m3.8[0m[1m}[0m     │
│ Kobe Bryant │ [1m{[0m[32m'season'[0m: [1;36m1999[0m, [32m'gp'[0m: [1;36m66[0m, [32m'pts'[0m: [1;36m22.5[0m, [32m'reb'[0m: [1;36m6.3[0m, [32m'ast'[0m: [1;36m4.9[0m[1m}[0m     │
│ Kobe Bryant │ [1m{[0m[32m'season'[0m: [1;36m2000[0m, [32m'gp'[0m: [1;36m68[0m, [32m'pts'[0m: [1;36m28.5[0m, [32m'reb'[0m: [1;36m5.9[0m, [32m'ast'[0m: [1;36m5.0[0m[1m}[0m  

In [12]:

qr = """
WITH expansion AS (
    SELECT
        player_name,
        UNNEST(season_stats)::season_stats AS season_stats
    FROM players
    WHERE current_season=2021
    AND player_name='Kobe Bryant'
)
SELECT
    player_name,
    season_stats.*
FROM expansion
"""
con.sql(qr)


┌─────────────┬────────┬───────┬───────┬───────┬───────┐
│ player_name │ season │  gp   │  pts  │  reb  │  ast  │
│   varchar   │ int32  │ int32 │ float │ float │ float │
├─────────────┼────────┼───────┼───────┼───────┼───────┤
│ Kobe Bryant │   [1;36m1998[0m │    [1;36m50[0m │  [1;36m19.9[0m │   [1;36m5.3[0m │   [1;36m3.8[0m │
│ Kobe Bryant │   [1;36m1999[0m │    [1;36m66[0m │  [1;36m22.5[0m │   [1;36m6.3[0m │   [1;36m4.9[0m │
│ Kobe Bryant │   [1;36m2000[0m │    [1;36m68[0m │  [1;36m28.5[0m │   [1;36m5.9[0m │   [1;36m5.0[0m │
│ Kobe Bryant │   [1;36m2001[0m │    [1;36m80[0m │  [1;36m25.2[0m │   [1;36m5.5[0m │   [1;36m5.5[0m │
│ Kobe Bryant │   [1;36m2002[0m │    [1;36m82[0m │  [1;36m30.0[0m │   [1;36m6.9[0m │   [1;36m5.9[0m │
│ Kobe Bryant │   [1;36m2003[0m │    [1;36m65[0m │  [1;36m24.0[0m │   [1;36m5.5[0m │   [1;36m5.1[0m │
│ Kobe Bryant │   [1;36m2004[0m │    [1;36m66[0m │  [1;36m27.6[0m │   [1;36m5.9[0m │   [1;

In [13]:
qr = """
CREATE TYPE scoring_class AS ENUM ('star', 'good', 'average', 'bad')
"""
qr = """
DROP TABLE IF EXISTS players
CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    scoring_class scoring_class,
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season, team_abbreviation, college)
)
"""