In [77]:
import pandas as pd
from pathlib import Path

RAW = Path("../data/raw")        # where your CSVs are
PROC = Path("../data/processed") # where we'll save outputs
PROC.mkdir(parents=True, exist_ok=True)

charts_path = RAW / "Hot Stuff.csv"
feats_path  = RAW / "Hot 100 Audio Features.csv"

In [78]:
charts = pd.read_csv(charts_path)
feats  = pd.read_csv(feats_path)

print("CHARTS shape:", charts.shape)
print("FEATURES shape:", feats.shape)

print("\nCHARTS columns:")
print(charts.columns.tolist())

print("\nFEATURES columns:")
print(feats.columns.tolist())

CHARTS shape: (327895, 11)
FEATURES shape: (29503, 23)

CHARTS columns:
['index', 'url', 'WeekID', 'Week Position', 'Song', 'Performer', 'SongID', 'Instance', 'Previous Week Position', 'Peak Position', 'Weeks on Chart']

FEATURES columns:
['index', 'SongID', 'Performer', 'Song', 'spotify_genre', 'spotify_track_id', 'spotify_track_preview_url', 'spotify_track_duration_ms', 'spotify_track_explicit', 'spotify_track_album', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'spotify_track_popularity']


As the datasets have identical matches for Song and Performer, I merge the charts and audio features files into one dataframe.

In [79]:
merged = charts.merge(
    feats,
    on=["Song", "Performer"],  # columns that must match in both files
    how="inner",               # keep only rows that match on BOTH sides
    suffixes=("_chart", "_feat")
)

print("Merged shape:", merged.shape)
merged.head(3)

Merged shape: (330217, 32)


Unnamed: 0,index_chart,url,WeekID,Week Position,Song,Performer,SongID_chart,Instance,Previous Week Position,Peak Position,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,0,http://www.billboard.com/charts/hot-100/1965-0...,7/17/1965,34,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,45.0,34,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
1,1,http://www.billboard.com/charts/hot-100/1965-0...,7/24/1965,22,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,34.0,22,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
2,2,http://www.billboard.com/charts/hot-100/1965-0...,7/31/1965,14,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,22.0,14,...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0


Many columns contain irrelevant information to the data analysis, so we define a set of columns to keep, removing columns such as the track_id or preview_url.

In [80]:
keep_cols = ['index_chart',
 'WeekID',
 'Week Position',
 'Song',
 'Performer',
 'SongID_chart',
 'Instance',
 'Peak Position',
 'Weeks on Chart',
 'spotify_genre',
 'spotify_track_duration_ms',
 'spotify_track_explicit',
 'spotify_track_album',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'spotify_track_popularity']

In [81]:
merged = merged[keep_cols].copy()

In [82]:
merged.head(3)

Unnamed: 0,index_chart,WeekID,Week Position,Song,Performer,SongID_chart,Instance,Peak Position,Weeks on Chart,spotify_genre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity
0,0,7/17/1965,34,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,34,4,['deep adult standards'],...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
1,1,7/24/1965,22,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,22,5,['deep adult standards'],...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0
2,2,7/31/1965,14,Don't Just Stand There,Patty Duke,Don't Just Stand TherePatty Duke,1,14,6,['deep adult standards'],...,-15.044,1.0,0.0298,0.61,7.7e-05,0.1,0.568,82.331,3.0,21.0


Remove any songs that are missing values within the audio feature columns.

In [83]:
feature_cols = [
    "danceability", "energy", "valence", "tempo", "loudness",
    "acousticness", "instrumentalness", "liveness", "speechiness"
]

before = len(merged)
merged = merged.dropna(subset=feature_cols)
after = len(merged)

print(f"Dropped {before - after} rows with missing feature values. Remaining: {after}")


Dropped 43709 rows with missing feature values. Remaining: 286508


Since this dataset tracks the Billboard Hot 100 per week, many songs appear several times within the chart. For the sake of simplicity and a broader view of the data, I will only be using the Peak Position of each song. This means all other instances less than the Peak Position will be deleted.

This results in sparser data per year but information like longevity is still kept in the "Weeks on Chart" column. 

In [84]:
merged_sorted = merged.sort_values("Week Position", ascending=True)
len(merged_sorted)

286508

Delete duplicates that have the same song name and artist, only keeping the first instance (the highest position based off our previous line of code)

In [85]:
peak_songs = (
    merged_sorted
    .drop_duplicates(subset=["Song", "Performer"], keep="first")
    .copy()
)

print("Unique songs kept:", len(peak_songs))


Unique songs kept: 24221


Keep only the songs between the Billboard position of 1-50, effectively cutting the data set in half.

In [86]:
peak_top50 = peak_songs.loc[peak_songs["Week Position"] <= 50].copy()
print("Songs that peaked in Top-50:", len(peak_top50))

Songs that peaked in Top-50: 13732


Convert the WeekID into a more readable and consistent format.

In [87]:
peak_top50["chart_week"] = pd.to_datetime(peak_top50["WeekID"], errors="coerce")

In [88]:
peak_top50.columns.tolist()

['index_chart',
 'WeekID',
 'Week Position',
 'Song',
 'Performer',
 'SongID_chart',
 'Instance',
 'Peak Position',
 'Weeks on Chart',
 'spotify_genre',
 'spotify_track_duration_ms',
 'spotify_track_explicit',
 'spotify_track_album',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'spotify_track_popularity',
 'chart_week']

In [89]:
peak_top50.head(3)

Unnamed: 0,index_chart,WeekID,Week Position,Song,Performer,SongID_chart,Instance,Peak Position,Weeks on Chart,spotify_genre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,spotify_track_popularity,chart_week
93356,92943,1/3/1981,1,(Just Like) Starting Over,John Lennon,(Just Like) Starting OverJohn Lennon,1,1,10,"['album rock', 'classic rock', 'folk rock', 'm...",...,1.0,0.0725,0.301,6.1e-05,0.179,0.421,99.104,4.0,57.0,1981-01-03
290479,288686,3/13/1982,1,Centerfold,The J. Geils Band,CenterfoldThe J. Geils Band,1,1,19,"['album rock', 'blues rock', 'classic rock', '...",...,1.0,0.0396,0.249,0.0,0.403,0.888,114.37,4.0,72.0,1982-03-13
290480,288687,12/4/1999,1,Smooth,Santana Featuring Rob Thomas,SmoothSantana Featuring Rob Thomas,1,1,19,"['blues rock', 'classic rock']",...,1.0,0.0338,0.16,5e-06,0.295,0.961,115.996,4.0,71.0,1999-12-04


In [90]:
peak_top50.drop(['WeekID', 'Instance', 'Peak Position', 'spotify_track_album'], axis=1, inplace=True)
peak_top50.rename(columns={'index_chart': 'index', 'SongID_chart' : 'SongID', 'spotify_genre' : 'genre',
 'spotify_track_duration_ms' : 'duration',
 'spotify_track_explicit' : 'explicit', 'spotify_track_popularity' : 'popularity'
 }, inplace=True)

In [91]:
peak_top50.head(3)
peak_top50.columns.tolist()

['index',
 'Week Position',
 'Song',
 'Performer',
 'SongID',
 'Weeks on Chart',
 'genre',
 'duration',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'popularity',
 'chart_week']