In [1]:
import duckdb
import polars as pl
from rich import print
from IPython.display import display
%load_ext rich
con = duckdb.connect()

For the sake of simplicty, we clean out certain aspects of the data before:

In [2]:
df = pl.read_csv('data/all_seasons.csv')
df = df.unique(subset= ["player_name","season"])
qr = """
SELECT
    player_name,
    age,
    team_abbreviation,
    college,
    player_height,
    player_weight,
    draft_year,
    draft_round,
    draft_number,
    season
    FROM df,
    LIMIT 5;
"""
print(
    con.sql(qr).pl()
)

In [3]:
qr = """
DROP TABLE IF EXISTS players_seasons;

DROP TYPE IF EXISTS season_stats;
CREATE TYPE season_stats AS STRUCT(
    season INTEGER,
    gp INTEGER,
    pts FLOAT,
    reb FLOAT,
    ast FLOAT
);

CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season, team_abbreviation, college)
)
"""
con.sql(qr)

In [4]:
qr="""
SELECT
    MIN(season[:4]::INTEGER) AS min_year,
    MAX(season[:4]::INTEGER) AS max_year,
    MIN(season) AS min_year_text,
    MAX(season) AS max_year_text,
FROM read_csv('data/all_seasons.csv')
"""
dfr = con.sql(qr).pl()
display(dfr)


min_year,max_year,min_year_text,max_year_text
i32,i32,str,str
1996,2022,"""1996-97""","""2022-23"""


In [5]:
qr = """
FROM players
LIMIT 5
"""
display(
    con.sql(qr).pl()
)

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,current_season
str,i32,str,str,f32,f32,str,str,str,list[struct[5]],i32


In [6]:
qr="""
WITH today AS (
    SELECT * FROM df
    WHERE season='1996-97'
)
FROM today
LIMIT 5
"""
display(
    con.sql(qr).pl()
)

v0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,draft_number,gp,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
i64,str,str,f64,f64,f64,str,str,str,str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
404,"""Antonio Lang""","""CLE""",25.0,203.2,104.32616,"""Duke""","""USA""","""1994""","""2""","""29""",64,2.7,2.0,0.5,0.7,0.081,0.118,0.136,0.467,0.068,"""1996-97"""
426,"""Aleksandar Djordjevic""","""POR""",29.0,187.96,89.811216,"""None""","""USA""","""Undrafted""","""Undrafted""","""Undrafted""",8,3.1,0.6,0.6,4.8,0.021,0.07,0.179,0.687,0.135,"""1996-97"""
195,"""Brett Szabo""","""BOS""",29.0,210.82,104.32616,"""Augustana (SD)""","""USA""","""Undrafted""","""Undrafted""","""Undrafted""",70,2.2,2.4,0.2,-11.1,0.089,0.196,0.119,0.517,0.039,"""1996-97"""
409,"""Bo Outlaw""","""LAC""",26.0,203.2,95.25432,"""Houston""","""USA""","""Undrafted""","""Undrafted""","""Undrafted""",82,7.6,5.5,1.9,4.0,0.092,0.152,0.129,0.602,0.113,"""1996-97"""
207,"""Scott Brooks""","""NYK""",31.0,180.34,74.84268,"""California-Irvine""","""USA""","""Undrafted""","""Undrafted""","""Undrafted""",38,1.5,0.5,0.8,3.8,0.032,0.058,0.116,0.625,0.191,"""1996-97"""


In [7]:
qr = """
WITH yesterday AS (
    SELECT * FROM players
    WHERE current_season={current_season}
), today AS (
    SELECT * FROM df
    WHERE season='{season}'
)
SELECT
    COALESCE(t.player_name, y.player_name) AS player_name,
    COALESCE(t.age, y.age) AS age,
    COALESCE(t.team_abbreviation, y.team_abbreviation) AS team_abbreviation,
    COALESCE(t.college, y.college) AS college,
    COALESCE(t.player_height, y.player_height) AS player_height,
    COALESCE(t.player_weight, y.player_weight) AS player_weight,
    COALESCE(t.draft_year, y.draft_year) AS draft_year,
    COALESCE(t.draft_round, y.draft_round) AS draft_round,
    COALESCE(t.draft_number, y.draft_number) AS draft_number,
    -- Update for season_stats
    CASE WHEN y.season_stats IS NULL
        THEN ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    WHEN t.season IS NOT NULL
        THEN y.season_stats || ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    ELSE y.season_stats
    END AS season_stats,
    -- Update for current_season,
    COALESCE(t.season[:4]::INTEGER, y.current_season+1) AS current_season
FROM today t
FULL OUTER JOIN yesterday y
ON t.player_name = y.player_name;
"""
query = qr.format(current_season=1995, season='1996-97')
print(query)
dfr = con.sql(query).pl()
display(dfr.head())

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,current_season
str,f64,str,str,f64,f64,str,str,str,list[struct[5]],i32
"""Antonio Lang""",25.0,"""CLE""","""Duke""",203.2,104.32616,"""1994""","""2""","""29""","[{1996,64,2.7,2.0,0.5}]",1996
"""Aleksandar Djordjevic""",29.0,"""POR""","""None""",187.96,89.811216,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,8,3.1,0.6,0.6}]",1996
"""Brett Szabo""",29.0,"""BOS""","""Augustana (SD)""",210.82,104.32616,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,70,2.2,2.4,0.2}]",1996
"""Bo Outlaw""",26.0,"""LAC""","""Houston""",203.2,95.25432,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,82,7.6,5.5,1.9}]",1996
"""Scott Brooks""",31.0,"""NYK""","""California-Irvine""",180.34,74.84268,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,38,1.5,0.5,0.8}]",1996


In [8]:
query_insert = """INSERT INTO players""" + query
con.sql(query_insert)

In [9]:
for year in range(1997,2022):
    season = '-'.join([str(year+1),str(year+2)[2:]])
    query = qr.format(current_season=year, season=season)
    # print(query)
    query_insert = """INSERT INTO players""" + query
    # print(f"Inserted: {season}")
    con.sql(query_insert)

In [10]:
qr = """
SELECT * FROM players
WHERE current_season=2021
AND player_name='Kobe Bryant'
"""
display(
    con.sql(qr).pl()
)


player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,current_season
str,i32,str,str,f32,f32,str,str,str,list[struct[5]],i32
"""Kobe Bryant""",37,"""LAL""","""None""",198.119995,96.161507,"""1996""","""1""","""13""","[{1998,50,19.9,5.3,3.8}, {1999,66,22.5,6.3,4.9}, … {2015,66,17.6,3.7,2.8}]",2021


In [11]:
qr = """
SELECT
    player_name,
    UNNEST(season_stats) FROM players
WHERE current_season=2021
AND player_name='Kobe Bryant'
"""
display(
    con.sql(qr).pl()
)

player_name,unnest(season_stats)
str,struct[5]
"""Kobe Bryant""","{1998,50,19.9,5.3,3.8}"
"""Kobe Bryant""","{1999,66,22.5,6.3,4.9}"
"""Kobe Bryant""","{2000,68,28.5,5.9,5.0}"
"""Kobe Bryant""","{2001,80,25.200001,5.5,5.5}"
"""Kobe Bryant""","{2002,82,30.0,6.9,5.9}"
…,…
"""Kobe Bryant""","{2011,58,27.9,5.4,4.6}"
"""Kobe Bryant""","{2012,78,27.299999,5.6,6.0}"
"""Kobe Bryant""","{2013,6,13.8,4.3,6.3}"
"""Kobe Bryant""","{2014,35,22.299999,5.7,5.6}"


In [12]:

qr = """
WITH expansion AS (
    SELECT
        player_name,
        UNNEST(season_stats)::season_stats AS season_stats
    FROM players
    WHERE current_season=2021
    AND player_name='Kobe Bryant'
)
SELECT
    player_name,
    season_stats.*
FROM expansion
"""
display(
    con.sql(qr).pl()
)

player_name,season,gp,pts,reb,ast
str,i32,i32,f32,f32,f32
"""Kobe Bryant""",1998,50,19.9,5.3,3.8
"""Kobe Bryant""",1999,66,22.5,6.3,4.9
"""Kobe Bryant""",2000,68,28.5,5.9,5.0
"""Kobe Bryant""",2001,80,25.200001,5.5,5.5
"""Kobe Bryant""",2002,82,30.0,6.9,5.9
…,…,…,…,…,…
"""Kobe Bryant""",2011,58,27.9,5.4,4.6
"""Kobe Bryant""",2012,78,27.299999,5.6,6.0
"""Kobe Bryant""",2013,6,13.8,4.3,6.3
"""Kobe Bryant""",2014,35,22.299999,5.7,5.6


In [13]:
qr = """
DROP TYPE IF EXISTS scoring_class;
CREATE TYPE scoring_class AS ENUM ('star', 'good', 'average', 'bad');
"""
con.sql(qr)
qr = """
DROP TABLE IF EXISTS players;
CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    scoring_class scoring_class,
    years_since_last_season INTEGER,
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season, team_abbreviation, college)
)
"""
con.sql(qr)

In [14]:
qr = """
WITH yesterday AS (
    SELECT * FROM players
    WHERE current_season={current_season}
), today AS (
    SELECT * FROM df
    WHERE season='{season}'
)
SELECT
    COALESCE(t.player_name, y.player_name) AS player_name,
    COALESCE(t.age, y.age) AS age,
    COALESCE(t.team_abbreviation, y.team_abbreviation) AS team_abbreviation,
    COALESCE(t.college, y.college) AS college,
    COALESCE(t.player_height, y.player_height) AS player_height,
    COALESCE(t.player_weight, y.player_weight) AS player_weight,
    COALESCE(t.draft_year, y.draft_year) AS draft_year,
    COALESCE(t.draft_round, y.draft_round) AS draft_round,
    COALESCE(t.draft_number, y.draft_number) AS draft_number,
    -- Update for season_stats
    CASE WHEN y.season_stats IS NULL
        THEN ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    WHEN t.season IS NOT NULL
        THEN y.season_stats || ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    ELSE y.season_stats
    END AS season_stats,
    -- Scoring
    CASE
        WHEN t.season IS NOT NULL THEN
            CASE
                WHEN t.pts > 20 THEN 'star'
                WHEN t.pts > 15 THEN 'good'
                WHEN t.pts > 10 THEN 'average'
                ELSE 'bad'
            END::scoring_class
        ELSE y.scoring_class
    END AS scoring_class,
    -- Years active
    CASE WHEN t.season IS NOT NULL THEN 0
        ELSE y.years_since_last_season + 1
    END AS years_since_last_season,
    -- Update for current_season,
    COALESCE(t.season[:4]::INTEGER, y.current_season+1) AS current_season
FROM today t
FULL OUTER JOIN yesterday y
ON t.player_name = y.player_name;
"""
query = qr.format(current_season=1995, season='1996-97')
print(query)
dfr = con.sql(query).pl()
display(dfr.head())

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,scoring_class,years_since_last_season,current_season
str,f64,str,str,f64,f64,str,str,str,list[struct[5]],cat,i32,i32
"""Antonio Lang""",25.0,"""CLE""","""Duke""",203.2,104.32616,"""1994""","""2""","""29""","[{1996,64,2.7,2.0,0.5}]","""bad""",0,1996
"""Aleksandar Djordjevic""",29.0,"""POR""","""None""",187.96,89.811216,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,8,3.1,0.6,0.6}]","""bad""",0,1996
"""Brett Szabo""",29.0,"""BOS""","""Augustana (SD)""",210.82,104.32616,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,70,2.2,2.4,0.2}]","""bad""",0,1996
"""Bo Outlaw""",26.0,"""LAC""","""Houston""",203.2,95.25432,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,82,7.6,5.5,1.9}]","""bad""",0,1996
"""Scott Brooks""",31.0,"""NYK""","""California-Irvine""",180.34,74.84268,"""Undrafted""","""Undrafted""","""Undrafted""","[{1996,38,1.5,0.5,0.8}]","""bad""",0,1996


In [15]:
query = qr.format(current_season=1996, season='1997-98')
print(query)
dfr = con.sql(query).pl()
display(dfr.head())

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,scoring_class,years_since_last_season,current_season
str,f64,str,str,f64,f64,str,str,str,list[struct[5]],cat,i32,i32
"""Grant Hill""",25.0,"""DET""","""Duke""",203.2,102.0582,"""1994""","""1""","""3""","[{1997,81,21.1,7.7,6.8}]","""star""",0,1997
"""Stanley Roberts""",28.0,"""MIN""","""Louisiana State""",213.36,131.54168,"""1991""","""1""","""23""","[{1997,74,6.2,4.9,0.4}]","""bad""",0,1997
"""Robert Pack""",29.0,"""DAL""","""Southern California""",187.96,86.18248,"""Undrafted""","""Undrafted""","""Undrafted""","[{1997,12,7.8,2.8,3.5}]","""bad""",0,1997
"""Vinny Del Negro""",31.0,"""SAS""","""North Carolina State""",193.04,90.7184,"""1988""","""2""","""29""","[{1997,54,9.5,2.8,3.4}]","""bad""",0,1997
"""Sean Rooks""",28.0,"""LAL""","""Arizona""",208.28,117.93392,"""1992""","""2""","""30""","[{1997,41,3.4,2.9,0.6}]","""bad""",0,1997


In [16]:
for year in range(1998,2022):
    season = '-'.join([str(year+1),str(year+2)[2:]])
    query = qr.format(current_season=year, season=season)
    # print(query)
    query_insert = """INSERT INTO players""" + query
    # print(f"Inserted: {season}")
    con.sql(query_insert)

In [17]:
qr="""
SELECT COUNT(*), player_name FROM players
GROUP BY player_name
HAVING COUNT(*)>1
LIMIT 5;
"""
display(
    con.sql(qr).pl()
)

count_star(),player_name
i64,str
24,"""Elden Campbell"""
24,"""Shawn Kemp"""
24,"""Matt Bullard"""
24,"""Tim Thomas"""
24,"""Bryant Stith"""


In [18]:
qr = """
SELECT * FROM players
WHERE player_name='Michael Curry'
AND current_season=2022
"""
dfr=con.sql(qr).pl()
display(dfr)

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,scoring_class,years_since_last_season,current_season
str,i32,str,str,f32,f32,str,str,str,list[struct[5]],cat,i32,i32
"""Michael Curry""",36,"""IND""","""Georgia Southern""",195.580002,95.254318,"""Undrafted""","""Undrafted""","""Undrafted""","[{1999,82,6.2,1.3,1.1}, {2000,68,5.2,1.8,1.9}, … {2004,18,1.7,1.5,0.8}]","""bad""",18,2022


In [19]:
qr = """
SELECT
    player_name,
    array_length(season_stats) AS total_seasons,
    (season_stats[1]::season_stats).pts AS first_season,
    (season_stats[array_length(season_stats)]::season_stats).pts AS last_season,
FROM players
WHERE player_name='Michael Curry'
AND current_season=2022
"""
dfr=con.sql(qr).pl()
display(dfr)

player_name,total_seasons,first_season,last_season
str,i64,f32,f32
"""Michael Curry""",6,6.2,1.7


In [20]:
qr = """
WITH points_fl AS (
SELECT
    player_name,
    array_length(season_stats) AS total_seasons,
    (season_stats[1]::season_stats).pts AS first_season,
    (season_stats[array_length(season_stats)]::season_stats).pts AS last_season,
    current_season
FROM players
)
SELECT
    player_name,
    total_seasons,
    last_season /
        CASE
            WHEN first_season=0 THEN 1 ELSE first_season
        END
    AS ratio,
    current_season
FROM points_fl
WHERE current_season=2022
ORDER BY ratio DESC
LIMIT 10
"""
dfr=con.sql(qr).pl()
display(dfr)

player_name,total_seasons,ratio,current_season
str,i64,f32,i32
"""Brandon Williams""",2,42.999996,2022
"""David Wingate""",2,20.0,2022
"""Ndudi Ebi""",2,16.875,2022
"""Pavel Podkolzin""",2,15.0,2022
"""Julius Randle""",9,12.55,2022
"""Terry Rozier""",8,11.722222,2022
"""Nathan Jawai""",2,10.666666,2022
"""Jared Harper""",3,10.571429,2022
"""Nick Richards""",3,10.25,2022
"""Louis King""",4,10.0,2022
