In [1]:
import duckdb
import polars as pl
from rich import print
from IPython.display import display
%load_ext rich
con = duckdb.connect()

For the sake of simplicty, we clean out certain aspects of the data before:

In [9]:
df = pl.read_csv('all_seasons.csv')
df = df.unique(subset = ["player_name","team_abbreviation","season"])
qr = """
SELECT
    player_name,
    age,
    team_abbreviation,
    college,
    player_height,
    player_weight,
    draft_year,
    draft_round,
    draft_number,
    season
    FROM df,
    LIMIT 5;
"""
con.sql(qr)


┌───────────────────┬────────┬───────────────────┬─────────────┬───────────────┬───────────────────┬────────────┬─────────────┬──────────────┬─────────┐
│    player_name    │  age   │ team_abbreviation │   college   │ player_height │   player_weight   │ draft_year │ draft_round │ draft_number │ season  │
│      varchar      │ double │      varchar      │   varchar   │    double     │      double       │  varchar   │   varchar   │   varchar    │ varchar │
├───────────────────┼────────┼───────────────────┼─────────────┼───────────────┼───────────────────┼────────────┼─────────────┼──────────────┼─────────┤
│ Alonzo Gee        │   [1;36m24.0[0m │ CLE               │ Alabama     │        [1;36m198.12[0m │          [1;36m99.79024[0m │ Undrafted  │ Undrafted   │ Undrafted    │ [1;36m2010[0m-[1;36m11[0m │
│ Willy Hernangomez │   [1;36m24.0[0m │ CHA               │ [3;35mNone[0m        │        [1;36m210.82[0m │         [1;36m108.86208[0m │ [1;36m2015[0m       │ [1;36m2[

In [3]:
qr = """
DROP TABLE IF EXISTS players_seasons;

DROP TYPE IF EXISTS season_stats;
CREATE TYPE season_stats AS STRUCT(
    season INTEGER,
    gp INTEGER,
    pts FLOAT,
    reb FLOAT,
    ast FLOAT
);

CREATE TABLE players (
    player_name TEXT,
    age INTEGER,
    team_abbreviation TEXT,
    college TEXT,
    player_height FLOAT,
    player_weight FLOAT,
    draft_year TEXT,
    draft_round TEXT,
    draft_number TEXT,
    season_stats season_stats[],
    current_season INTEGER,
    PRIMARY KEY(player_name, current_season, team_abbreviation)
)
"""
con.sql(qr)

In [4]:
qr="""
SELECT
    MIN(season[:4]::INTEGER) AS min_year,
    MAX(season[:4]::INTEGER) AS max_year,
    MIN(season) AS min_year_text,
    MAX(season) AS max_year_text,
FROM read_csv('all_seasons.csv')
"""
dfr = con.sql(qr).pl()
print(dfr)


In [5]:
qr = """
FROM players
LIMIT 5
"""
con.sql(qr)


┌─────────────┬───────┬───────────────────┬─────────┬───────────────┬───────────────┬────────────┬─────────────┬──────────────┬───────────────────────────────────────────────────────────────────────┬────────────────┐
│ player_name │  age  │ team_abbreviation │ college │ player_height │ player_weight │ draft_year │ draft_round │ draft_number │                             season_stats                              │ current_season │
│   varchar   │ int32 │      varchar      │ varchar │     float     │     float     │  varchar   │   varchar   │   varchar    │ [1;35mstruct[0m[1m([0mseason integer, gp integer, pts float, reb float, ast float[1m)[0m[1m[[0m[1m][0m │     int32      │
├─────────────┴───────┴───────────────────┴─────────┴───────────────┴───────────────┴────────────┴─────────────┴──────────────┴───────────────────────────────────────────────────────────────────────┴────────────────┤
│                                                                                       

In [8]:
qr = """
WITH yesterday AS (
    SELECT * FROM players
    WHERE current_season={current_season}
), today AS (
    --SELECT * FROM read_csv('all_seasons.csv')
    SELECT * FROM df
    WHERE season='{season}'
)
SELECT
    COALESCE(t.player_name, y.player_name) AS player_name,
    COALESCE(t.age, y.age) AS age,
    COALESCE(t.team_abbreviation, y.team_abbreviation) AS team_abbreviation,
    COALESCE(t.college, y.college) AS college,
    COALESCE(t.player_height, y.player_height) AS player_height,
    COALESCE(t.player_weight, y.player_weight) AS player_weight,
    COALESCE(t.draft_year, y.draft_year) AS draft_year,
    COALESCE(t.draft_round, y.draft_round) AS draft_round,
    COALESCE(t.draft_number, y.draft_number) AS draft_number,
    -- Update for season_stats
    CASE WHEN y.season_stats IS NULL
        THEN ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    WHEN t.season IS NOT NULL
        THEN y.season_stats || ARRAY[ROW(
            t.season[:4]::INTEGER,
            t.gp,
            t.pts,
            t.reb,
            t.ast
        )::season_stats]
    ELSE y.season_stats
    END AS season_stats,
    -- Update for current_season,
    COALESCE(t.season[:4]::INTEGER, y.current_season+1) AS current_season
FROM today t
FULL OUTER JOIN yesterday y
ON t.player_name = y.player_name;
"""
query = qr.format(current_season=1995, season='1996-97')
print(query)
df = con.sql(query).pl()
display(df.head())

In [7]:
query_insert = """INSERT INTO players""" + query
con.sql(query_insert)

In [None]:
for year in range(1997,2022):
    season = '-'.join([str(year+1),str(year+2)[2:]])
    query = qr.format(current_season=year, season=season)
    # print(query)
    query_insert = """INSERT INTO players""" + query
    con.sql(query_insert)

In [None]:
tdf = con.sql(query).pl()
tdf.sql("""
    SELECT
        *
    FROM self
    WHERE player_name='Marcus Williams'""")

In [None]:
kwyear