In [1]:
import polars as pl
import json
from datetime import date
from transform import load_data, get_artist_table, get_song_table, get_junction_table
from pathlib import Path


In [2]:
# Widen polars' table rendering for the previews in this notebook:
# up to 20 columns, up to 100 rows, and string cells up to 70 characters.
pl.Config.set_tbl_cols(20)
pl.Config.set_tbl_rows(100)
pl.Config.set_fmt_str_lengths(70)

polars.config.Config

In [3]:
# Load the chart data through the project's `load_data` helper, then
# materialize it with `.collect()` so the cell renders the table.
data_table = load_data()
data_table.collect()

id,date,position,song,artist
u32,date,u8,str,str
0,1958-08-09,1,"""Poor Little Fool""","""Ricky Nelson"""
1,1958-08-09,2,"""Nel Blu Dipinto Di Blu (Volare)""","""Domenico Modugno"""
2,1958-08-09,3,"""Patricia""","""Perez Prado And His Orchestra"""
3,1958-08-09,4,"""Splish Splash""","""Bobby Darin"""
4,1958-08-09,5,"""When""","""Kalin Twins"""
5,1958-08-09,6,"""My True Love""","""Jack Scott"""
6,1958-08-09,7,"""Hard Headed Woman""","""Elvis Presley With The Jordanaires"""
7,1958-08-09,8,"""Rebel-'rouser""","""Duane Eddy His Twangy Guitar And The Rebels"""
8,1958-08-09,9,"""Just A Dream""","""Jimmy Clanton And His Rockets"""
9,1958-08-09,10,"""Willie And The Hand Jive""","""The Johnny Otis Show"""


In [4]:
# Derive the song lookup table from the chart data, using the same `.pipe`
# call style as the artist and junction table cells.
song_tbl = data_table.pipe(get_song_table)

In [5]:
# Materialize the song table and preview its first rows.
song_tbl.collect().head()

id,song
u32,str
0,"""Itsy Bitsy Teenie Weenie Yellow Polkadot Bikini"""
1,"""Can I"""
2,"""Video Killed The Radio Star"""
3,"""Change Of Heart"""
4,"""Endless Nights"""


In [6]:
# Derive the artist lookup table from the chart data.
artist_tbl = data_table.pipe(get_artist_table)

In [10]:
# Show only the junction rows whose role is "featuring".
# The junction plan is built inline because `junction_tbl` is assigned in a
# cell further down the notebook; reading that name here fails on a fresh
# Restart & Run All. Filtering before `.collect()` lets polars apply the
# predicate inside the lazy plan instead of on the fully materialized frame.
(
    data_table.pipe(get_junction_table, song_tbl, artist_tbl)
    .filter(pl.col("role") == "featuring")
    .collect()
)

song_id,artist_id,role
u32,u32,str
27502,8906,"""featuring"""
19487,6180,"""featuring"""
2960,2611,"""featuring"""
10583,2611,"""featuring"""
12072,2611,"""featuring"""
29155,2611,"""featuring"""
30324,2611,"""featuring"""
26268,1981,"""featuring"""
15069,8906,"""featuring"""
21575,8906,"""featuring"""


In [8]:
# Build the junction table from the chart data together with the song and
# artist tables (the preview below shows song_id, artist_id and role columns).
junction_tbl = data_table.pipe(get_junction_table, song_tbl, artist_tbl)

In [9]:
# Materialize the junction table and preview its first 20 rows.
junction_tbl.collect().head(20)

song_id,artist_id,role
u32,u32,str
8715,6143,"""main"""
6753,505,"""main"""
22000,7875,"""main"""
12707,531,"""main"""
6104,2474,"""main"""
8994,4055,"""main"""
19563,6660,"""main"""
716,1811,"""main"""
12782,839,"""main"""
17311,839,"""main"""


In [None]:
def create_table_song(lf):
    """Build one row per (song, artist) pair with its chart span and debut decade.

    Parameters
    ----------
    lf : pl.LazyFrame | pl.DataFrame
        Chart entries containing at least ``song``, ``artist`` and ``date``
        columns. ``date`` is cast to ``pl.Date`` before aggregating.

    Returns
    -------
    pl.DataFrame
        Columns ``id``, ``song``, ``artist``, ``chart_debut``,
        ``latest_appearance`` and ``decade``, sorted by ``id`` descending.
        ``id`` is a 0-based row index assigned in ascending
        (``chart_debut``, ``song``, ``artist``) order, so it is reproducible
        across runs.
    """
    # Score aggregations that are currently disabled (each summed per group):
    #   position_score  = 1 / position
    #   longevity_score = (101 - position) / 100
    #   overall_score   = log1p(100) - log1p(position)

    # Breaks 1970, 1980, ..., 2020 produce seven left-closed bins. The first
    # bin has no lower bound, so every debut year before 1970 (pre-1960 years
    # included) is labelled "1960s"; years from 2020 on are labelled "2020s".
    decade_cuts = list(range(1970, 2030, 10))
    decade_labels = [f"{decade}s" for decade in range(1960, 2030, 10)]

    return (
        lf.lazy()  # no-op for a LazyFrame; lets an eager DataFrame be passed too
        .cast({"date": pl.Date})
        .group_by(["song", "artist"])
        .agg(
            chart_debut=pl.min("date"),
            latest_appearance=pl.max("date"),
        )
        # group_by does not guarantee output order, so number the rows only
        # after sorting on a total key; otherwise ids change between runs.
        .sort(by=["chart_debut", "song", "artist"])
        .with_row_index("id")
        .with_columns(
            decade=(
                pl.col("chart_debut")
                .dt.year()
                .cut(
                    breaks=decade_cuts,
                    labels=decade_labels,
                    left_closed=True,
                )
            )
        )
        .sort(by="id", descending=True)
        .select(
            [
                "id",
                "song",
                "artist",
                "chart_debut",
                "latest_appearance",
                "decade",
            ]
        )
        .collect()
    )


In [None]:
def z_score_normalize(song_df, cols):
    """Return per-decade absolute z-scores for the requested columns.

    Each value is centred on the mean of its ``decade`` group and divided by
    that group's standard deviation; the absolute value of the deviation is
    taken before dividing.

    Parameters
    ----------
    song_df : pl.DataFrame | pl.LazyFrame
        Frame containing a ``decade`` column and every column referred to by
        ``cols``.
    cols : iterable of str | pl.Expr
        Columns to normalize, given as column names or polars expressions.

    Returns
    -------
    Same frame type as ``song_df``
        One ``<name>_normalized`` column per entry of ``cols``, in the order
        given. A group whose standard deviation is zero or undefined does not
        produce finite scores.
    """
    normalized = []
    for col in cols:
        expr = pl.col(col) if isinstance(col, str) else col
        # `Expr.name` is a namespace object, not the column name; the
        # expression's metadata reports the name it will actually produce.
        label = expr.meta.output_name()
        mu = expr.mean().over("decade")
        sigma = expr.std().over("decade")
        # NOTE(review): the absolute value discards the sign of the deviation;
        # drop `.abs()` if a conventional signed z-score is wanted.
        normalized.append(((expr - mu).abs() / sigma).alias(f"{label}_normalized"))
    # Evaluate every expression against the original frame in a single select,
    # so the source columns are still available when the score is computed.
    return song_df.select(normalized)