In [1]:
import duckdb
import polars as pl
from rich import print
from IPython.display import display
# Load the `rich` IPython extension (presumably for rich-formatted output and
# tracebacks in this notebook -- confirm against the rich documentation).
%load_ext rich
# Single DuckDB connection used by the query cells below. No database path is
# passed to connect().
con = duckdb.connect()

In [2]:
# Load the raw per-season player rows and keep exactly one row per
# (player_name, season).
#
# keep="first" together with maintain_order=True makes the de-duplication
# deterministic: the first occurrence in file order always wins, so a
# Restart & Run All reproduces the same frame. (The default keep="any" gives
# no guarantee about which duplicate survives.)
#
# The frame must stay bound to the name `df`: the SQL in this notebook refers
# to it by that name (`FROM df`).
df = pl.read_csv('data/all_seasons.csv').unique(
    subset=["player_name", "season"],
    keep="first",
    maintain_order=True,
)

# Preview of the descriptive (non-stat) columns. ORDER BY makes the five
# previewed rows stable; LIMIT without an ordering returns arbitrary rows.
qr = """
SELECT
    player_name,
    age,
    team_abbreviation,
    college,
    player_height,
    player_weight,
    draft_year,
    draft_round,
    draft_number,
    season
FROM df
ORDER BY player_name, season
LIMIT 5;
"""
# Rich display of the frame, consistent with how the later cell shows results.
display(con.sql(qr).pl())

In [3]:
# (Re)create the schema for the cumulative `players` table.
#
# Drop order matters for re-running this cell: `players` has columns typed as
# `season_stats[]` and `scoring_class`, so the table is dropped first and the
# types afterwards. That way the cell never tries to drop a type that is still
# in use by an existing table.
qr = """
DROP TABLE IF EXISTS players;
DROP TYPE IF EXISTS season_stats;
DROP TYPE IF EXISTS scoring_class;
CREATE TYPE season_stats AS STRUCT(
    season INTEGER,
    gp INTEGER,
    pts FLOAT,
    reb FLOAT,
    ast FLOAT
);
CREATE TYPE scoring_class AS ENUM ('star', 'good', 'average', 'bad');
"""
con.sql(qr)

# One row per player per season snapshot. The key is therefore
# (player_name, current_season); descriptive attributes such as team and
# college are not part of the grain and are left out of the key so they are
# not forced to be NOT NULL.
qr = """
CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    scoring_class scoring_class,
    years_since_last_season INTEGER,
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season)
)
"""
con.sql(qr)

In [4]:
# Cumulative ("yesterday + today") load of the `players` table.
#
# The template below takes two placeholders:
#   {current_season} - snapshot year already stored in `players` ("yesterday")
#   {season}         - season label in `df` to fold in ("today"), e.g. '1996-97'
#
# For every player present in either side (FULL OUTER JOIN on player_name) it
# produces the next snapshot row:
#   * descriptive columns: today's value, falling back to yesterday's;
#   * season_stats: a new one-element array when there is no history, the
#     history with today's stats appended when the player has a row today,
#     otherwise the history unchanged;
#   * scoring_class: derived from today's pts (>20 star, >15 good,
#     >10 average, else bad) when the player has a row today, otherwise
#     carried over;
#   * years_since_last_season: 0 when the player has a row today, otherwise
#     yesterday's value + 1;
#   * current_season: first four characters of today's season label as an
#     integer, or yesterday's current_season + 1.
qr = """
WITH yesterday AS (
    SELECT * FROM players
    WHERE current_season={current_season}
), today AS (
    SELECT * FROM df
    WHERE season='{season}'
)
SELECT
    COALESCE(t.player_name, y.player_name) AS player_name,
    COALESCE(t.age, y.age) AS age,
    COALESCE(t.team_abbreviation, y.team_abbreviation) AS team_abbreviation,
    COALESCE(t.college, y.college) AS college,
    COALESCE(t.player_height, y.player_height) AS player_height,
    COALESCE(t.player_weight, y.player_weight) AS player_weight,
    COALESCE(t.draft_year, y.draft_year) AS draft_year,
    COALESCE(t.draft_round, y.draft_round) AS draft_round,
    COALESCE(t.draft_number, y.draft_number) AS draft_number,
    -- Update for season_stats
    CASE WHEN y.season_stats IS NULL
        THEN ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    WHEN t.season IS NOT NULL
        THEN y.season_stats || ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    ELSE y.season_stats
    END AS season_stats,
    -- Scoring
    CASE
        WHEN t.season IS NOT NULL THEN
            CASE
                WHEN t.pts > 20 THEN 'star'
                WHEN t.pts > 15 THEN 'good'
                WHEN t.pts > 10 THEN 'average'
                ELSE 'bad'
            END::scoring_class
        ELSE y.scoring_class
    END AS scoring_class,
    -- Years active
    CASE WHEN t.season IS NOT NULL THEN 0
        ELSE y.years_since_last_season + 1
    END AS years_since_last_season,
    -- Update for current_season,
    COALESCE(t.season[:4]::INTEGER, y.current_season+1) AS current_season
FROM today t
FULL OUTER JOIN yesterday y
ON t.player_name = y.player_name;
"""

# Snapshot years used as "yesterday". Each iteration folds in the season that
# starts the following year, so 1995..2021 loads seasons '1996-97'..'2022-23'.
PRIOR_SEASONS = range(1995, 2022)

# Rebuild from an empty table. Without this, re-running the cell would insert
# the first season a second time and fail on the primary key.
con.sql("DELETE FROM players")

for year in PRIOR_SEASONS:
    # Season label as used in `df`: start year, dash, two-digit end year.
    season = f"{year + 1}-{(year + 2) % 100:02d}"
    query = qr.format(current_season=year, season=season)
    query_insert = """INSERT INTO players""" + query
    con.sql(query_insert)

# Show the SQL of the last step, then re-run that SELECT (without inserting)
# and display the head of the resulting snapshot.
print(query)
dfr = con.sql(query).pl()
display(dfr.head())

player_name,age,team_abbreviation,college,player_height,player_weight,draft_year,draft_round,draft_number,season_stats,scoring_class,years_since_last_season,current_season
str,f64,str,str,f64,f64,str,str,str,list[struct[5]],cat,i32,i32
"""Udonis Haslem""",43.0,"""MIA""","""Florida""",200.66,106.59412,"""Undrafted""","""Undrafted""","""Undrafted""","[{2003,75,7.3,6.3,0.7}, {2004,80,10.9,9.1,1.4}, … {2022,7,3.9,1.6,0.0}]","""bad""",0,2022
"""Andre Iguodala""",39.0,"""GSW""","""Arizona""",198.12,97.52228,"""2004""","""1""","""9""","[{2004,82,9.0,5.7,3.0}, {2005,82,12.3,5.9,3.1}, … {2022,8,2.1,2.1,2.4}]","""bad""",0,2022
"""Rudy Gay""",36.0,"""UTA""","""Connecticut""",203.2,113.398,"""2006""","""1""","""8""","[{2006,78,10.8,4.5,1.3}, {2007,81,20.1,6.2,2.0}, … {2022,56,5.2,2.9,1.0}]","""bad""",0,2022
"""Thaddeus Young""",35.0,"""TOR""","""Georgia Tech""",203.2,106.59412,"""2007""","""1""","""12""","[{2007,74,8.2,4.2,0.8}, {2008,75,15.3,5.0,1.1}, … {2022,54,4.4,3.1,1.4}]","""bad""",0,2022
"""Russell Westbrook""",34.0,"""LAC""","""UCLA""",190.5,90.7184,"""2008""","""1""","""4""","[{2008,82,15.3,4.9,5.3}, {2009,82,16.1,4.9,8.0}, … {2022,73,15.9,5.8,7.5}]","""good""",0,2022
