In [None]:
import polars as pl
import json
from datetime import date

In [2]:
pl.Config.set_tbl_cols(20)
pl.Config.set_tbl_rows(100)
pl.Config.set_fmt_str_lengths(70)

polars.config.Config

In [3]:
def clean(lf):
    return (
        lf.cast({"date": pl.Date, "position": pl.UInt8})
        .sort(by="date")
        .with_row_index("id")
        .select(["id", "date", "position", "song", "artist"])
    )

In [4]:
lf0: pl.LazyFrame = clean(pl.read_json("../data/records05-29_23-57.json").lazy())

In [5]:
def handle_edge_cases(lf):
    edge_pat_1 = r"(?i)duet with"
    # matches any occurance of "duet with" (case insensitive)
    edge_pat_2 = r"(?i)\((feat\.*[a-z]*)|(&)|(with)"
    # matches any occurance of an opening parenthese immediately followed by a first class seperator (DEFINE CONST) or "&"
    edge_pat_3: str = r"(?i)&\s(the|his|her|original)(.*)"
    # matches any occurance of "& " followed by the, his, her, or original; captures the previous word and the rest of the string 
    return lf.unique(subset=["song", "artist"]).with_columns(
        # filter out duplicate song entries; include artist in the subset to avoid grouping songs with the same title together
        artist=pl.col("artist").replace(edge_pat_1, "&")
        # replace "duet with" with & since it practically acts as a second class seperator rather than a first class seperator 
    ).with_columns(
        artist=pl.when(pl.col("artist").str.contains(edge_pat_2))
        .then(pl.col("artist").str.replace_all(r"[()]", ""))
        .otherwise(pl.col("artist"))
        # strip parentheses from entries that only use parenthese to include a feature since it complicated whitespace when regexing later
    ).with_columns(
        artist=pl.col("artist").str.replace_all(edge_pat_3, r"and $1$2")
        # entried matching edge_pat_3 contain band names and should not be seperated, so "&"" is replaced with "and" which is not recognized as a seperator
    )

In [6]:
def split_features(lf: pl.LazyFrame):
    split_pattern: str = r"(?i)(\sfeat\.*[a-z]*\s)|(\swith\s)"
    roles = ["main", "featured"]
    sep: str = "-"
    mainlf = lf.with_columns(
        artist=pl.col("artist")
        .str.replace(split_pattern, sep)
        .str.split_exact(sep, 1)
        .struct[0],
        role=pl.lit("main"),
    ).drop_nulls()
    featlf = lf.with_columns(
        artist=pl.col("artist")
        .str.replace(split_pattern, sep)
        .str.split_exact(sep, 1)
        .struct[1],
        role=pl.lit("featured"),
    ).drop_nulls()
    junction_lf = pl.concat([mainlf, featlf]).cast({"role": pl.Enum(roles)})
    return junction_lf

In [7]:
def split_artists(lf: pl.LazyFrame):
    seperator_sub: str = r"(?i)(\s*[&/+,]\s*)|(x\s)"
    sep: str = "!~!"
    transformed_lf = lf.with_columns(
        artist=pl.col("artist")
        .str.replace_all(seperator_sub, sep)
        .str.split(sep).list.eval(pl.element().str.strip_chars()),
    ).explode(["artist"])
    return transformed_lf

In [8]:
lf1 = handle_edge_cases(lf0)

In [9]:
lf2 = split_features(lf1)

In [10]:
lf3 = split_artists(lf2)

In [11]:
lf3.filter(
    pl.col("song") == "Dance With Me"
).collect()

id,date,position,song,artist,role
u32,date,u8,str,str,enum
101081,1978-03-04,95,"""Dance With Me""","""Peter Brown""","""main"""
88188,1975-07-19,89,"""Dance With Me""","""Orleans""","""main"""
215477,2000-07-29,91,"""Dance With Me""","""Debelah Morgan""","""main"""
220759,2001-09-15,73,"""Dance With Me""","""112""","""main"""
6196,1959-10-10,97,"""Dance With Me""","""The Drifters""","""main"""
37788,1965-10-30,89,"""Dance With Me""","""The Mojo Men""","""main"""
101081,1978-03-04,95,"""Dance With Me""","""Betty Wright""","""featured"""


In [59]:
def create_table_song(lf):
    score_calcs: dict =  {
        "pos_weighted": (1 / pl.col("position")).sum,
        "longevity_weighted": (101 - pl.col("position")).truediv(100).sum,
        "unweighted": (pl.lit(100).log1p() - pl.col("position").log1p()).sum,
    }
    decade_cuts: range =  range(1970, 2030, 10)
    decade_labels = [f"{decade}s" for decade in range(1960, 2030, 10)]
    return (
        lf.cast({"date": pl.Date})
        .group_by(["song", "artist"])
        .agg(
            position_score=score_calcs["pos_weighted"](),
            longevity_score=score_calcs["longevity_weighted"](),
            overall_score=score_calcs["unweighted"](),
            chart_debut=pl.min("date"),
            latest_appearance=pl.max("date"),
        )
        .with_row_index("id")
        .sort(by="chart_debut")
        .with_columns(
            decade=(
                pl.col("chart_debut")
                .dt.year()
                .cut(
                    breaks=decade_cuts,
                    labels=decade_labels,
                    left_closed=True
                )
            )
        )
        .sort(by="overall_score", descending=True)
        .select(
            [
                "id",
                "song",
                "artist",
                "position_score",
                "longevity_score",
                "overall_score",
                "chart_debut",
                "latest_appearance",
                "decade",
            ]
        )
        .collect()
    )


In [47]:
song_df0 = create_table_song(lf0)

In [None]:
song_df0

In [13]:
song_df1 = handle_edge_cases(song_df0.lazy())

In [14]:
song_df2 = split_features(song_df1)

In [15]:
song_df3 = split_artists(song_df2)

In [None]:
song_df3.collect()

In [17]:
song_df3.filter(
    pl.col("song") == "Dance With Me"
).collect()

id,song,artist,power,longevity,earliest,latest,decade,role
u32,str,str,f64,f64,date,date,cat,enum
16830,"""Dance With Me""","""Debelah Morgan""",1.28647,8.88086,2000-07-29,2001-02-10,"""2000s""","""main"""
19365,"""Dance With Me""","""Orleans""",0.964578,5.766735,1975-07-19,1975-11-15,"""1970s""","""main"""
15218,"""Dance With Me""","""The Drifters""",0.495775,4.277965,1959-10-10,1960-01-16,"""1950s""","""main"""
29611,"""Dance With Me""","""Peter Brown""",1.132627,7.989054,1978-03-04,1978-09-09,"""1970s""","""main"""
11727,"""Dance With Me""","""112""",0.299309,4.17527,2001-09-15,2002-01-26,"""2000s""","""main"""
7229,"""Dance With Me""","""The Mojo Men""",0.082857,1.395303,1965-10-30,1965-12-04,"""1960s""","""main"""
29611,"""Dance With Me""","""Betty Wright""",1.132627,7.989054,1978-03-04,1978-09-09,"""1970s""","""featured"""


In [18]:
artist_df0 = song_df3.sort("latest", descending=True).unique(subset=["artist"]).select("artist")

In [19]:
artist_df1 = artist_df0.with_row_index("id").sort("id", descending=True)

In [20]:
junction_lf = song_df3.join(artist_df1, on="artist", how="inner", suffix="artist")

In [21]:
junction_lf = junction_lf.select(
        id_song="id",
        id_artist="idartist",
        role_artist="role",
)

In [22]:
junction_lf.collect()

id_song,id_artist,role_artist
u32,u32,enum
24272,695,"""main"""
14122,5215,"""main"""
13633,1832,"""main"""
26268,1514,"""main"""
1153,1903,"""main"""
1153,1847,"""main"""
33,3274,"""main"""
3189,8085,"""main"""
26161,3766,"""main"""
4352,3984,"""main"""


In [None]:
def z_score_normalize(song_df, cols):
    z_scores = []
    for col in cols:
        z_scores.append(song_df.select(
            col.mean().over("decade").alias("mu"),
            col.std().over("decade").alias("sigma"),
        ).select(
            (abs(col - pl.col("mu")) / pl.col("sigma")).alias(f"{col.name}_normalized")
        ))
    return 

In [56]:
temp = z_score_normalize(song_df0)

In [57]:
temp

id,song,artist,position_focused_score,longevity_focused_score,overall_score,chart_debut,latest_appearance,decade,normalized_zscore
u32,str,str,f64,f64,f64,date,date,cat,f64
18194,"""A Bar Song (Tipsy)""","""Shaboozey""",28.083405,55.74,182.722215,2024-04-27,2025-05-31,"""2020s""",19.110634
25072,"""As It Was""","""Harry Styles""",23.832485,55.86,166.386799,2022-04-16,2023-06-10,"""2020s""",16.179889
2369,"""Last Night""","""Morgan Wallen""",23.635364,55.54,166.888103,2023-02-11,2024-03-30,"""2020s""",16.043986
25834,"""Rockin' Around The Christmas Tree""","""Brenda Lee""",14.383943,49.12,132.550255,1960-12-10,2025-01-04,"""1960s""",15.487992
26306,"""All I Want For Christmas Is You""","""Mariah Carey""",24.257703,58.48,167.284979,2000-01-08,2025-01-04,"""2000s""",13.887734
31621,"""Stay""","""The Kid LAROI & Justin Bieber""",18.82777,57.64,169.379045,2021-07-24,2022-10-01,"""2020s""",12.729448
28269,"""Old Town Road""","""Lil Nas X Featuring Billy Ray Cyrus""",21.105596,38.64,117.616063,2019-03-16,2020-01-18,"""2010s""",12.072087
20336,"""Mack The Knife""","""Bobby Darin""",11.236275,22.02,66.567219,1959-08-22,1960-02-13,"""1960s""",12.00029
31051,"""The Twist""","""Chubby Checker""",10.923174,34.62,99.947806,1960-07-30,1962-04-07,"""1960s""",11.653365
9924,"""Hey Jude""","""The Beatles""",10.860703,17.7,57.988543,1968-09-14,1969-01-18,"""1960s""",11.584146
