In [6]:
import polars as pl
import json
from transform import clean


In [87]:
# Table display limits for this session: up to 20 columns, 100 rows, and
# 70 characters per string cell. Each setter hands back the Config class,
# so the calls chain and the cell still evaluates to that class.
(
    pl.Config
    .set_tbl_cols(20)
    .set_tbl_rows(100)
    .set_fmt_str_lengths(70)
)

polars.config.Config

In [7]:
# Read the scraped records JSON and run it through the project's `clean` step.
records_path = "../data/records05-29_23-57.json"
df: pl.DataFrame = clean(pl.read_json(records_path))

In [10]:
# Preview the first rows of the cleaned frame to check columns and dtypes.
df.head()

id,date,position,song,artist
u32,date,u8,str,str
0,1960-01-02,1,"""El Paso""","""Marty Robbins"""
1,1960-01-02,2,"""Why""","""Frankie Avalon"""
2,1960-01-02,3,"""The Big Hurt""","""Miss Toni Fisher"""
3,1960-01-02,4,"""Running Bear""","""Johnny Preston"""
4,1960-01-02,5,"""Way Down Yonder In New Orleans""","""Freddie Cannon"""


In [77]:
# One row per distinct artist string, narrowed to the names that contain
# a round or square bracket.
bracket_in_name = pl.col("artist").str.contains(r"[\(\)\[\]]")
edge_df = df.unique(subset=["artist"]).filter(bracket_in_name)

In [76]:
# Show the bracketed artist names in sorted order; the result is only
# displayed, `edge_df` itself is not reassigned.
edge_df.sort(by="artist")

id,date,position,song,artist
u32,date,u8,str,str
239575,2006-12-02,89,"""When Your Heart Stops Beating""","""(+44)"""
64393,1972-05-06,94,"""Woman's Gotta Have It""","""(The Preacher) Bobby Womack"""
1297,1960-03-26,98,"""I Need You Now""","""100 Strings and Jono (Choir of 40 Voices)"""
315436,2022-04-02,50,"""Nobody Like U""","""4*TOWN (From Disney And Pixar's Turning Red)"""
34874,1966-09-03,75,"""96 Tears""","""? (Question Mark) & The Mysterians"""
20995,1964-01-04,96,"""Java""","""Al (He's the King) Hirt"""
194779,1998-01-03,93,"""What If I Said""","""Anita Cochran (Duet With Steve Wariner)"""
84183,1976-03-06,84,"""Street Talk""","""B.C.G. (B.C. Generation)"""
177861,1994-10-08,75,"""U Will Know (From ""Jason's Lyric"")""","""B.M.U. (Black Men United)"""
188334,1996-10-12,48,"""No Diggity""","""BLACKstreet (Featuring Dr. Dre)"""


In [78]:
# Add a `stripped` column holding the artist name with every "(" and ")"
# removed, keeping the original `artist` column for comparison.
without_parens = pl.col("artist").str.replace_all(r"[()]", "")
stripped_df = edge_df.with_columns(stripped=without_parens)

In [None]:
# Display `artist` next to `stripped` to eyeball the effect of removing parentheses.
stripped_df

id,date,position,song,artist,stripped
u32,date,u8,str,str,str
177861,1994-10-08,75,"""U Will Know (From ""Jason's Lyric"")""","""B.M.U. (Black Men United)""","""B.M.U. Black Men United"""
184972,1996-02-17,86,"""Microphone Master""","""Das EFX (Featuring Mobb Deep)""","""Das EFX Featuring Mobb Deep"""
193254,1997-09-20,68,"""Avenues""","""Refugee Camp All Stars Featuring Pras (With Ky-mani)""","""Refugee Camp All Stars Featuring Pras With Ky-mani"""
167672,1992-10-24,86,"""Slow And Sexy""","""Shabba Ranks (Featuring Johnny Gill)""","""Shabba Ranks Featuring Johnny Gill"""
197852,1998-08-08,66,"""It's Alright""","""Memphis Bleek (& Jay-Z)""","""Memphis Bleek & Jay-Z"""
84183,1976-03-06,84,"""Street Talk""","""B.C.G. (B.C. Generation)""","""B.C.G. B.C. Generation"""
246247,2008-03-15,61,"""Falling Slowly""","""The Swell Season (Glen Hansard & Marketa Irglova)""","""The Swell Season Glen Hansard & Marketa Irglova"""
194779,1998-01-03,93,"""What If I Said""","""Anita Cochran (Duet With Steve Wariner)""","""Anita Cochran Duet With Steve Wariner"""
116974,1982-08-28,88,"""Ain't Nothing Like The Real Thing/You're All I Need To Get By""","""Chris Christian (with Amy Holland)""","""Chris Christian with Amy Holland"""
181558,1995-06-24,72,"""Sprinkle Me""","""E-40 (Featuring Suga T)""","""E-40 Featuring Suga T"""


In [159]:
# NOTE(review): this repeats the earlier load (same path, same `clean` call)
# and rebinds `df`. It looks redundant unless `clean` was edited between
# runs — confirm and drop this cell if so.
df: pl.DataFrame = clean(pl.read_json("../data/records05-29_23-57.json"))

In [83]:
# Rows whose artist credit mentions "His Orchestra" or contains a standalone
# " and " (both matched case-insensitively).
orchestra_or_and = pl.col("artist").str.contains(r"(?i)(His Orchestra)|(\sand\s)")
edge_df2 = df.filter(orchestra_or_and)

In [None]:
# Case-insensitive regex for "his" / "the" / "her". Presumably these mark a
# band-name suffix that should not be split into separate artists.
# NOTE(review): not referenced by any other visible cell — confirm it is
# still needed.
no_split_words = [r"(?i)(his)|(the)|(her)"]
# don't have to worry about "and"
# Display the rows selected into edge_df2 for inspection.
edge_df2 

In [192]:
def replace_fn(df: pl.LazyFrame) -> pl.LazyFrame:
    """De-duplicate (song, artist) pairs and normalise the artist credit.

    Every "(" and ")" is removed from ``artist`` and the first literal
    "Duet With" is rewritten to "&". All other columns pass through
    unchanged.

    Rows are de-duplicated twice. The first pass cuts the number of strings
    to rewrite. The second pass is needed because credits that differed only
    by parentheses, or by "Duet With" vs "&", collapse to the same string
    once normalised and would otherwise survive as duplicate pairs.

    Args:
        df: Lazy frame with at least ``song`` and ``artist`` string columns.

    Returns:
        A lazy frame with one row per normalised (song, artist) pair. Which
        of several duplicate rows is kept is left to ``unique``'s default.
    """
    dedup_keys = ["song", "artist"]
    normalised_artist = (
        pl.col("artist")
        .str.replace_all(r"[()]", "")
        # The phrase is a plain string, so match it literally rather than
        # as a regex.
        .str.replace("Duet With", "&", literal=True)
    )
    return (
        df.unique(subset=dedup_keys)
        .with_columns(artist=normalised_artist)
        .unique(subset=dedup_keys)
    )

In [181]:
def split_features(df: pl.LazyFrame) -> pl.LazyFrame:
    """Split each artist credit into a main part and a featured part.

    The first " feat… " or " with " marker (case-insensitive) in ``artist``
    is treated as the boundary. Text before it becomes a row with role
    "main"; text after it becomes a row with role "featured". Credits
    without a marker produce only a "main" row. Only the first marker is
    split, so any later "with" / "feat" stays inside the featured text.

    Args:
        df: Lazy frame with ``song`` and ``artist`` string columns.

    Returns:
        A lazy frame with columns ``song``, ``artist`` and ``role``: all
        "main" rows followed by all "featured" rows. Rows with a null in
        any column are dropped.
    """
    split_pattern: str = r"(?i)(\sfeat\.*[a-z]*\s)|(\swith\s)"
    # The marker is swapped for a sentinel that is then split on. A plain
    # "-" must not be used here: it occurs inside real names ("E-40",
    # "Jay-Z", "Ky-mani"), which would be cut at the hyphen and the actual
    # featured artist lost.
    new_uniform_seperator: str = "!~!"
    credit_parts = (
        pl.col("artist")
        .str.replace(split_pattern, new_uniform_seperator)
        .str.split_exact(new_uniform_seperator, 1)
    )

    def _rows_for(field_index: int, role: str) -> pl.LazyFrame:
        # Field 0 is the text before the marker, field 1 the text after it
        # (null when the credit has no marker, hence the drop_nulls).
        return df.select(
            song=pl.col("song"),
            artist=credit_parts.struct[field_index],
            role=pl.lit(role),
        ).drop_nulls()

    junction_df = pl.concat([_rows_for(0, "main"), _rows_for(1, "featured")])
    return junction_df

In [195]:
def split_artists(df: pl.LazyFrame) -> pl.DataFrame:
    """Explode multi-artist credits into one row per artist and collect.

    Credits are split on "&", "/", "+", "," and on a standalone "x" with
    whitespace on both sides (as in "A x B"). An "&" directly followed by
    the word "the", "his", "her" or "original" is first rewritten to "and",
    so that "X & The Ys" / "X & His Orchestra" stay a single artist.

    Known limitation: names that legitimately contain a separator (for
    example a comma inside a group name) are still split.

    Args:
        df: Lazy frame with ``song``, ``artist`` and ``role`` columns.

    Returns:
        A collected DataFrame with columns ``song``, ``artist`` and
        ``role``, one row per individual artist, names stripped of
        surrounding whitespace.
    """
    # `\b` keeps the protected words whole: without it "& Herb Alpert" or
    # "& Theresa" would match "her" / "the" and never be split.
    band_name_exclusion: str = r"(?i)&\s(the|his|her|original)\b(.*)"
    band_name_sub: str = r"and $1$2"
    punctuation_seperator: str = r"\s*[&/+,]\s*"
    # Whitespace is required on both sides, so the last letter of a word
    # ("Alex Clare", "Max Frost") is no longer taken for a separator.
    x_seperator: str = r"(?i)\s+x\s+"
    new_uniform_seperator: str = "!~!"
    transformed_df = df.select(
        song=pl.col("song"),
        artist=pl.col("artist")
        .str.replace_all(band_name_exclusion, band_name_sub)
        # Punctuation goes first and swallows the whitespace around it, so
        # a name ending in " X" right before "&" or "," (e.g. "Lil Nas X & ...")
        # is no longer followed by whitespace and survives the "x" pass.
        .str.replace_all(punctuation_seperator, new_uniform_seperator)
        .str.replace_all(x_seperator, new_uniform_seperator)
        .str.split(new_uniform_seperator)
        .list.eval(pl.element().str.strip_chars()),
        role=pl.col("role"),
    ).explode(["artist"])
    return transformed_df.collect()

In [196]:
# Run the three stages in order, keeping each intermediate result:
# normalise credits -> split main/featured -> one row per artist.
df1 = df.lazy().pipe(replace_fn)
df2 = df1.pipe(split_features)
df3 = df2.pipe(split_artists)

In [197]:
# Display the final song / artist / role table produced by split_artists.
df3

song,artist,role
str,str,str
"""All Star""","""Smash Mouth""","""main"""
"""I'll Walk""","""Bucky Covington""","""main"""
"""Sunday Sun""","""Neil Diamond""","""main"""
"""Good Morning Heartache""","""Diana Ross""","""main"""
"""Stronger Woman""","""Jewel""","""main"""
"""It's Just The Sun""","""Don McLean""","""main"""
"""Youth Of The Nation""","""P.O.D.""","""main"""
"""Sober Saturday Night""","""Chris Young""","""main"""
"""Fortuneteller""","""Bobby Curtola""","""main"""
"""Just Another Night""","""Mick Jagger""","""main"""
