In [41]:
import pandas as pd
from pathlib import Path

RAW = Path("../data/raw")        # where your CSVs are
PROC = Path("../data/processed") # where we'll save outputs
PROC.mkdir(parents=True, exist_ok=True)

charts_path = RAW / "Hot Stuff.csv"
feats_path  = RAW / "Hot 100 Audio Features.csv"

In [42]:
charts = pd.read_csv(charts_path)
feats  = pd.read_csv(feats_path)

As the datasets have identical matches for Song and Performer, I merge the charts and audio features files into one dataframe.

For column names that have identical matches, I append _chart or _feat to specify which data it is using (e.g. index_feat & index_chart).

In [43]:
merged = charts.merge(
    feats,
    on=["Song", "Performer"],  # columns that must match in both files
    how="inner",               # keep only rows that match on BOTH sides
    suffixes=("_chart", "_feat")
)

print("Merged shape:", merged.shape)
merged.head(3)

Merged shape: (330217, 32)


Unnamed: 0,index_chart,url,WeekID,Week Position,Song,Performer,SongID_chart,Instance,Previous Week Position,Peak Position,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,0,http://www.billboard.com/charts/hot-100/1965-0...,7/17/1965,34,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,45.0,34,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
1,1,http://www.billboard.com/charts/hot-100/1965-0...,7/24/1965,22,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,34.0,22,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
2,2,http://www.billboard.com/charts/hot-100/1965-0...,7/31/1965,14,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,22.0,14,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0


Remove any songs that are missing values within the audio feature columns.

In [44]:
feature_cols = [
    "danceability", "energy", "valence", "tempo", "loudness",
    "acousticness", "instrumentalness", "liveness", "speechiness"
]

before = len(merged)
merged = merged.dropna(subset=feature_cols)
after = len(merged)

print(f"Dropped {before - after} rows with missing feature values. Remaining: {after}")

Dropped 43709 rows with missing feature values. Remaining: 286508


Since this dataset tracks the Billboard Hot 100 per week, many songs appear several times within the chart. For the sake of simplicity and a broader view of the data, I will only be using the Peak Position of each song. This means all other instances less than the Peak Position will be deleted.

This results in sparser data per year but information like longevity is still kept in the "Weeks on Chart" column. 

In [19]:
merged_sorted = merged.sort_values("Week Position", ascending=True)
len(merged_sorted)

286508

Delete duplicates that have the same song name and artist, only keeping the first instance (the highest position based off our previous line of code)

In [20]:
peak_songs = (
    merged_sorted
    .drop_duplicates(subset=["Song", "Performer"], keep="first")
    .copy()
)

print("Unique songs kept:", len(peak_songs))

Unique songs kept: 24221


Keep only the songs between the Billboard position of 1-50, effectively cutting the data set in half.

In [21]:
peak_top50 = peak_songs.loc[peak_songs["Week Position"] <= 50].copy()
print("Songs that peaked in Top-50:", len(peak_top50))

Songs that peaked in Top-50: 13732


Convert the WeekID into a datetime format which lets us sort the data set properly.

In [22]:
peak_top50["chart_week"] = pd.to_datetime(peak_top50["WeekID"], errors="coerce")

Clean up the data by dropping irrelevant columns, renaming certain ones, and keeping consistency among column names.

In [25]:
cols_to_drop = [
    "url", "Instance", "Previous Week Position", "Peak Position",
    "index_feat", "SongID_chart", "SongID_feat",
    "spotify_track_id", "spotify_track_preview_url",
    "spotify_track_album", "WeekID"
]

peak_top50 = peak_top50.drop(columns=cols_to_drop, errors="ignore")

rename_map = {
    "index_chart": "index",
    "chart_week": "weekID",
    "spotify_genre": "genre",
    "spotify_track_explicit": "explicit",
    "spotify_track_duration_ms": "duration",
    "spotify_track_popularity" : "popularity"
}

peak_top50 = peak_top50.rename(columns=rename_map)

peak_top50.columns = [c.lower() for c in peak_top50.columns]

print(peak_top50.columns.tolist())
peak_top50.head(3)


['index', 'week position', 'song', 'performer', 'weeks on chart', 'genre', 'duration', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'popularity', 'weekid']


Unnamed: 0,index,week position,song,performer,weeks on chart,genre,duration,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity,weekid
93356,92943,1,(Just Like) Starting Over,John Lennon,10,"['album rock', 'classic rock', 'folk rock', 'm...",236546.0,False,0.701,0.79,...,1.0,0.0725,0.301,6.1e-05,0.179,0.421,99.104,4.0,57.0,1981-01-03
290479,288686,1,Centerfold,The J. Geils Band,19,"['album rock', 'blues rock', 'classic rock', '...",216533.0,False,0.653,0.677,...,1.0,0.0396,0.249,0.0,0.403,0.888,114.37,4.0,72.0,1982-03-13
290480,288687,1,Smooth,Santana Featuring Rob Thomas,19,"['blues rock', 'classic rock']",294986.0,False,0.609,0.923,...,1.0,0.0338,0.16,5e-06,0.295,0.961,115.996,4.0,71.0,1999-12-04


Move weekid to the 2nd column for legibility.

In [27]:
cols = peak_top50.columns.tolist()  # get current column order
cols.remove("weekid")                # temporarily remove it
cols.insert(1, "weekid")             # re-insert at position 1 (2nd place)
peak_top50 = peak_top50[cols]        # reorder DataFrame

Sort the dataframe from oldest to newest week. Within each week, sort from the highest rank (1) to the lowest.

In [29]:
peak_top50 = peak_top50.sort_values(
    by=["weekid", "week position"],
    ascending=[True, True]
).reset_index(drop=True)

In [39]:
peak_top50.head(10)

Unnamed: 0,index,weekid,week position,song,performer,weeks on chart,genre,duration,explicit,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity
0,964,1958-08-02,2,Patricia,Perez Prado And His Orchestra,1,"['mambo', 'space age pop']",140000.0,False,0.699,...,-5.976,1.0,0.0391,0.18,0.415,0.0704,0.81,137.373,4.0,27.0
1,980,1958-08-02,3,Splish Splash,Bobby Darin,1,"['adult standards', 'brill building pop', 'eas...",131719.0,False,0.645,...,-1.526,1.0,0.0393,0.385,0.0,0.37,0.965,147.768,4.0,60.0
2,292513,1958-08-02,7,Yakety Yak,The Coasters,1,"['brill building pop', 'bubblegum pop', 'doo-w...",113040.0,False,0.715,...,-9.491,1.0,0.128,0.705,0.000732,0.044,0.976,120.789,4.0,56.0
3,7501,1958-08-02,15,For Your Precious Love,Jerry Butler and The Impressions,1,"['funk', 'motown', 'soul']",251613.0,False,0.365,...,-16.317,1.0,0.309,0.659,0.000672,0.847,0.506,75.55,3.0,15.0
4,2304,1958-08-02,16,One Summer Night,The Danleers,1,['doo-wop'],133178.0,False,0.421,...,-6.889,1.0,0.0286,0.883,0.0,0.288,0.514,98.393,3.0,42.0
5,7533,1958-08-02,17,Endless Sleep,Jody Reynolds,1,[],143693.0,False,0.587,...,-10.737,1.0,0.0444,0.829,1.7e-05,0.116,0.529,111.195,4.0,28.0
6,2365,1958-08-02,20,Do You Want To Dance,Bobby Freeman,1,"['doo-wop', 'rhythm and blues', 'rock-and-roll...",165693.0,False,0.625,...,-12.003,1.0,0.0501,0.519,3.7e-05,0.155,0.957,155.537,4.0,27.0
7,2394,1958-08-02,22,A Certain Smile,Johnny Mathis,1,"['adult standards', 'brill building pop', 'eas...",168293.0,False,0.233,...,-10.031,1.0,0.0307,0.854,2e-06,0.274,0.218,73.576,4.0,39.0
8,67690,1958-08-02,24,The Purple People Eater,Sheb Wooley,1,"['comic', 'novelty']",137960.0,False,0.643,...,-5.006,1.0,0.0801,0.823,0.0,0.33,0.881,157.748,4.0,45.0
9,131373,1958-08-02,25,What Am I Living For,Chuck Willis,1,"['doo-wop', 'jazz blues', 'rhythm and blues', ...",146800.0,False,0.667,...,-8.712,1.0,0.0258,0.728,0.0,0.0809,0.682,101.3,3.0,24.0


In [40]:
from pathlib import Path
PROC = Path("../data/processed")
PROC.mkdir(parents=True, exist_ok=True)

clean_path = PROC / "merged_peak_top50_clean.csv"
peak_top50.to_csv(clean_path, index=False)
print("Saved cleaned dataset to:", clean_path)

Saved cleaned dataset to: ../data/processed/merged_peak_top50_clean.csv


In [45]:
peak_top50.head(10)

Unnamed: 0,index,weekid,week position,song,performer,weeks on chart,genre,duration,explicit,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,popularity
0,964,1958-08-02,2,Patricia,Perez Prado And His Orchestra,1,"['mambo', 'space age pop']",140000.0,False,0.699,...,-5.976,1.0,0.0391,0.18,0.415,0.0704,0.81,137.373,4.0,27.0
1,980,1958-08-02,3,Splish Splash,Bobby Darin,1,"['adult standards', 'brill building pop', 'eas...",131719.0,False,0.645,...,-1.526,1.0,0.0393,0.385,0.0,0.37,0.965,147.768,4.0,60.0
2,292513,1958-08-02,7,Yakety Yak,The Coasters,1,"['brill building pop', 'bubblegum pop', 'doo-w...",113040.0,False,0.715,...,-9.491,1.0,0.128,0.705,0.000732,0.044,0.976,120.789,4.0,56.0
3,7501,1958-08-02,15,For Your Precious Love,Jerry Butler and The Impressions,1,"['funk', 'motown', 'soul']",251613.0,False,0.365,...,-16.317,1.0,0.309,0.659,0.000672,0.847,0.506,75.55,3.0,15.0
4,2304,1958-08-02,16,One Summer Night,The Danleers,1,['doo-wop'],133178.0,False,0.421,...,-6.889,1.0,0.0286,0.883,0.0,0.288,0.514,98.393,3.0,42.0
5,7533,1958-08-02,17,Endless Sleep,Jody Reynolds,1,[],143693.0,False,0.587,...,-10.737,1.0,0.0444,0.829,1.7e-05,0.116,0.529,111.195,4.0,28.0
6,2365,1958-08-02,20,Do You Want To Dance,Bobby Freeman,1,"['doo-wop', 'rhythm and blues', 'rock-and-roll...",165693.0,False,0.625,...,-12.003,1.0,0.0501,0.519,3.7e-05,0.155,0.957,155.537,4.0,27.0
7,2394,1958-08-02,22,A Certain Smile,Johnny Mathis,1,"['adult standards', 'brill building pop', 'eas...",168293.0,False,0.233,...,-10.031,1.0,0.0307,0.854,2e-06,0.274,0.218,73.576,4.0,39.0
8,67690,1958-08-02,24,The Purple People Eater,Sheb Wooley,1,"['comic', 'novelty']",137960.0,False,0.643,...,-5.006,1.0,0.0801,0.823,0.0,0.33,0.881,157.748,4.0,45.0
9,131373,1958-08-02,25,What Am I Living For,Chuck Willis,1,"['doo-wop', 'jazz blues', 'rhythm and blues', ...",146800.0,False,0.667,...,-8.712,1.0,0.0258,0.728,0.0,0.0809,0.682,101.3,3.0,24.0
