In [15]:
import json
import graphviz
import re
import polars as pl
import datetime as dt
from numpy import log

In [16]:
pl.Config.set_tbl_rows(20)
pl.Config.set_tbl_cols(20)
pl.Config.set_float_precision(2)


polars.config.Config

In [17]:
filepath: str = "../data/records05-29_23-57.json"
lf: pl.LazyFrame = pl.read_json(filepath).lazy()

In [41]:
print(lf.collect())

shape: (339_287, 5)
┌──────────┬────────────┬────────────────────────────┬──────────────────────────────┬──────────────┐
│ position ┆ date       ┆ song                       ┆ artist                       ┆ wks_on_chart │
│ ---      ┆ ---        ┆ ---                        ┆ ---                          ┆ ---          │
│ i64      ┆ str        ┆ str                        ┆ str                          ┆ str          │
╞══════════╪════════════╪════════════════════════════╪══════════════════════════════╪══════════════╡
│ 1        ┆ 2025-04-26 ┆ Luther                     ┆ Kendrick Lamar & SZA         ┆ 21           │
│ 2        ┆ 2025-04-26 ┆ Die With A Smile           ┆ Lady Gaga & Bruno Mars       ┆ 35           │
│ 3        ┆ 2025-04-26 ┆ Nokia                      ┆ Drake                        ┆ 9            │
│ 4        ┆ 2025-04-26 ┆ Pink Pony Club             ┆ Chappell Roan                ┆ 44           │
│ 5        ┆ 2025-04-26 ┆ Ordinary                   ┆ Alex Warren     

In [42]:
def get_record_table(lf) -> pl.LazyFrame:
    return lf.with_columns(
                date=pl.col("date").cast(pl.Date),
                position=pl.col("position").cast(pl.UInt8),
                id=pl.arange(0, pl.len()).sort(descending=True),
                # artists=(pl.col("artist").str.split("Featuring").list.first().str.split("With").list.first().str.strip_chars().str.split("&")),
                # features=pl.col("artist")
                # .str.split("Featuring").list.first().str.split("With")
                # .list.get(index=1, null_on_oob=True).str.strip_chars()
                # .str.split("&"),
            ).filter(
                pl.col("date") >= dt.date(1960, 1, 1)
            ).select(
                [
                    "id",
                    "date",
                    "position",
                    "song",
                ]
            ).collect()

In [33]:
record_tbl: pl.DataFrame = get_record_table(lf)

In [43]:
record_tbl

id,date,position,song
i64,date,u8,str
339286,2025-04-26,1,"""Luther"""
339285,2025-04-26,2,"""Die With A Smile"""
339284,2025-04-26,3,"""Nokia"""
339283,2025-04-26,4,"""Pink Pony Club"""
339282,2025-04-26,5,"""Ordinary"""
339281,2025-04-26,6,"""A Bar Song (Tipsy)"""
339280,2025-04-26,7,"""Lose Control"""
339279,2025-04-26,8,"""All The Way"""
339278,2025-04-26,9,"""Beautiful Things"""
339277,2025-04-26,10,"""I'm The Problem"""


In [None]:
def get_song_table(lf) -> pl.DataFrame:
    decades: range = range(1970, 2030, 10)
    return (
        lf.cast({"date": pl.Date})
        .group_by(["song", "artist"])
        .agg(
            power=(1 / pl.col("position")).sum(),
            longevity=(1 / (pl.col("position").log1p())).sum(),
            weeks_on_chart=pl.len(),
            proportion_top10=((pl.col("position") <= 10).sum() / pl.len()),
            earliest=pl.min("date"),
            latest=pl.max("date"),
        )
        .with_row_index("id")
        .sort(by="earliest")
        .with_columns(
            decade=(
                pl.col("earliest")
                .dt.year()
                .cut(
                    breaks=decades,
                    labels=[f"{x - 10}s" for x in decades] + ["2020s"],
                    left_closed=True,
                )
            )
        )
        .sort(by="power", descending=True)
        .select(["id", "song", "power", "longevity", "earliest", "latest", "decade"])
        .collect()
    )


In [None]:
def rating_expr(col: str) -> pl.Expr:
    return 10 * (pl.col(col).rank("average") / pl.len())

In [None]:
def percentile_expr(col: str, decade_cut: bool=False) -> pl.Expr:
        rank: pl.Expr = pl.col(col).rank("average")
        rank: pl.Expr = rank.over("decade") if decade_cut else rank
        denom: pl.Expr = pl.col(col).len()
        denom: pl.Expr = denom.over("decade") if decade_cut else denom
        return (rank / denom) * 100

In [49]:
song_tbl = get_song_table(lf)

In [50]:
song_tbl

id,song,power,longevity,earliest,latest,decade
u32,str,f64,f64,date,date,cat
7820,"""A Bar Song (Tipsy)""",28.08,50.40,2024-04-27,2025-05-31,"""2020s"""
30444,"""All I Want For Christmas Is Yo…",24.26,47.25,2000-01-08,2025-01-04,"""2000s"""
19890,"""As It Was""",23.83,45.61,2022-04-16,2023-06-10,"""2020s"""
1187,"""Last Night""",23.64,45.24,2023-02-11,2024-03-30,"""2020s"""
16121,"""Old Town Road""",21.11,36.78,2019-03-16,2020-01-18,"""2010s"""
24589,"""Uptown Funk!""",19.91,38.96,2014-11-29,2016-03-05,"""2010s"""
3938,"""Despacito""",19.15,36.07,2017-02-04,2018-01-20,"""2010s"""
4254,"""Stay""",18.83,40.86,2021-07-24,2022-10-01,"""2020s"""
23183,"""Shape Of You""",18.38,38.21,2017-01-28,2018-03-03,"""2010s"""
3250,"""Closer""",18.19,36.14,2016-08-20,2017-08-12,"""2010s"""


In [None]:
record_tbl

id,date,position,song
i64,date,u8,str
339286,2025-04-26,1,"""Luther"""
339285,2025-04-26,2,"""Die With A Smile"""
339284,2025-04-26,3,"""Nokia"""
339283,2025-04-26,4,"""Pink Pony Club"""
339282,2025-04-26,5,"""Ordinary"""
339281,2025-04-26,6,"""A Bar Song (Tipsy)"""
339280,2025-04-26,7,"""Lose Control"""
339279,2025-04-26,8,"""All The Way"""
339278,2025-04-26,9,"""Beautiful Things"""
339277,2025-04-26,10,"""I'm The Problem"""


In [82]:
def artist_table(lf: pl.LazyFrame) -> pl.DataFrame:
    return lf.select(
        pl.col("artist")
        .str.split(" Featuring ")
        .list.first().str.split(" With ")
        .list.eval(pl.element().str.split("&"))
        .explode().explode()
    ).reverse().with_row_index("id").collect()

In [83]:
artist_table(lf)

id,artist
u32,str
0,"""The Kirby Stone Four"""
1,"""Nat King Cole"""
2,"""Eydie Gorme"""
3,"""Tennessee Ernie Ford"""
4,"""Jimmy Reed"""
5,"""The Champs"""
6,"""The Champs"""
7,""" Johnny"""
8,"""Robert """
9,"""Billy Vaughn And His Orchestra"""


In [84]:
lf.collect()

position,date,song,artist,wks_on_chart
i64,str,str,str,str
1,"""2025-04-26""","""Luther""","""Kendrick Lamar & SZA""","""21"""
2,"""2025-04-26""","""Die With A Smile""","""Lady Gaga & Bruno Mars""","""35"""
3,"""2025-04-26""","""Nokia""","""Drake""","""9"""
4,"""2025-04-26""","""Pink Pony Club""","""Chappell Roan""","""44"""
5,"""2025-04-26""","""Ordinary""","""Alex Warren""","""10"""
6,"""2025-04-26""","""A Bar Song (Tipsy)""","""Shaboozey""","""53"""
7,"""2025-04-26""","""Lose Control""","""Teddy Swims""","""87"""
8,"""2025-04-26""","""All The Way""","""BigXthaPlug Featuring Bailey Z…","""2"""
9,"""2025-04-26""","""Beautiful Things""","""Benson Boone""","""64"""
10,"""2025-04-26""","""I'm The Problem""","""Morgan Wallen""","""11"""


In [None]:
def records_table(lf) -> pl.DataFrame:
    

In [79]:
def get_artist_table(df, record_lf) -> pl.DataFrame:
    cols = ["artists", "features"]
    lfs = []
    for col in cols:
        lfs.append(df.lazy().explode(col).group_by(col).agg(
            power=pl.mean("power"),
            longevity=pl.mean("longevity"),
            artist_percentile=pl.mean("percentile"),
            earliest=pl.col("earliest").min(),
            latest=pl.col("latest").max(),
            top_track=pl.col("song").filter(percentile=pl.max("percentile")).first(),
            top_track_id=pl.col("track_id").filter(percentile=pl.max("percentile")).first(),
            num_tracks=pl.len().cast(pl.UInt32),
            weeks_on_chart=pl.sum("weeks_on_chart"),
        ).sort("earliest", descending=True).with_row_index("id")
        .join(record_lf.lazy().explode(col).group_by(col).agg(
            peak_position=pl.min("position")
        ), how="inner", on=col).select([
            "id",
            col,
            "power",
            "longevity",
            "artist_percentile",
            "top_track",
            "top_track_id",
            "earliest",
            "latest",
            "weeks_on_chart",
            "num_tracks",
            "peak_position"
        ]).rename({col: "artist"}).collect())
    return lfs[0].vstack(lfs[1])

In [80]:
artist_tbl = get_artist_table(song_tbl, record_tbl)

In [81]:
artist_tbl

id,artist,power,longevity,artist_percentile,top_track,top_track_id,earliest,latest,weeks_on_chart,num_tracks,peak_position
u32,str,f64,f64,f64,str,u32,date,date,u32,u32,u8
3460,"""Matthew Sweet""",6.48,7.67,70.74,"""Sick Of Myself""",21344,1995-05-27,1995-10-07,20,1,58
7891,""" Kim Weston""",5.35,4.85,51.01,"""It Takes Two""",8699,1964-10-24,1967-03-25,17,2,14
1033,""" Khalid""",7.54,8.32,79.29,"""Eastside""",4362,2018-06-09,2022-10-29,90,3,9
6039,"""Paul Anka/Odia Coates""",7.72,6.90,73.11,"""(I Believe) There's Nothing St…",19645,1975-07-26,1975-10-18,13,1,15
660,""" Juhn""",2.67,2.81,27.38,"""Bandido""",14683,2021-02-20,2021-04-03,5,1,82
8390,"""Nathaniel Mayer And The Fabulo…",6.78,6.27,65.26,"""Village Of Love""",17086,1962-04-28,1962-07-14,12,1,22
3011,"""Memphis Bleek (""",5.16,5.76,54.61,"""It's Alright""",14155,1998-08-08,1998-10-24,12,1,61
4609,"""Patti LaBelle """,9.82,9.72,97.71,"""On My Own""",8861,1986-03-22,1986-08-23,23,1,1
132,"""Gracie Abrams""",5.18,5.57,53.73,"""That's So True""",2087,2024-06-22,2025-05-31,76,5,6
5504,"""Arpeggio""",2.83,2.94,28.84,"""Love And Desire (Part I)""",29027,1979-03-10,1979-04-07,5,1,70


In [205]:
song_tbl.filter(pl.col("track_id") == 4361)

track_id,song,artists,features,power_score,longevity_score,weeks_on_chart,proportion_top10,earliest,latest,decade,overall_percentile,decade_percentile
u32,str,list[str],list[str],f64,f64,u32,f64,date,date,cat,f64,f64
4361,"""Young Wheezy""","[""NAV ""]","["" Gunna""]",0.01,0.23,1,0.0,2020-11-21,2020-11-21,"""2020s""",5.3,13.15
