#### Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_2023 = pd.read_csv("../data/spotify-2023.csv", encoding="utf-8", encoding_errors="ignore")
data_2024 = pd.read_csv("../data/spotify-2024.csv", encoding="utf-8", encoding_errors="ignore")

#### Dataset Analysis

In [3]:
# number of rows 

print("Number of rows in 2023 Dataset: ", len(data_2023))
print("Number of rows in 2024 Dataset: ", len(data_2024))

Number of rows in 2023 Dataset:  953
Number of rows in 2024 Dataset:  4600


In [4]:
# number of columns

print("Number of columns in 2023 Dataset: ", len(data_2023.columns.to_list()))
print("Number of columns in 2024 Dataset: ", len(data_2024.columns.to_list()))

Number of columns in 2023 Dataset:  24
Number of columns in 2024 Dataset:  29


In [5]:
# 2023 dataset datatype

data_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   track_name            953 non-null    object
 1   artist(s)_name        953 non-null    object
 2   artist_count          953 non-null    int64 
 3   released_year         953 non-null    int64 
 4   released_month        953 non-null    int64 
 5   released_day          953 non-null    int64 
 6   in_spotify_playlists  953 non-null    int64 
 7   in_spotify_charts     953 non-null    int64 
 8   streams               953 non-null    object
 9   in_apple_playlists    953 non-null    int64 
 10  in_apple_charts       953 non-null    int64 
 11  in_deezer_playlists   953 non-null    object
 12  in_deezer_charts      953 non-null    int64 
 13  in_shazam_charts      903 non-null    object
 14  bpm                   953 non-null    int64 
 15  key                   858 non-null    ob

In [6]:
# 2024 dataset datatype

data_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

## Data Preparation and Cleaning

### Naming Convention and Data Type

In [7]:
# Change name for 2023 dataset

data_2023.columns = [col.replace("_", " ").title() for col in data_2023.columns]
data_2023.rename(columns={"Track Name": "Track", "Artist(S) Name": "Artist", "In Spotify Playlists": "Spotify Playlist Count 2023",
                             "Streams": "Streams 2023", "In Spotify Charts" : "Spotify Chart Counts 2023", "In Apple Playlists":"Apple Music Playlist Count 2023",
                             "In Apple Charts":"Apple Music Chart Counts 2023", "In Deezer Playlists":"Deezer Playlist Count 2023", "In Deezer Charts":"Deezer Chart Counts 2023",
                             "In Shazam Charts":"Shazam Chart Counts 2023"}, inplace=True)
data_2023.drop(columns=["Artist Count"], inplace=True)

In [8]:
# Convert datatype for 2023 dataset

for col in ["Streams 2023", "Deezer Playlist Count 2023", "Shazam Chart Counts 2023"]:
    data_2023[col] = data_2023[col].replace(",", "", regex=True)
    data_2023[col] = pd.to_numeric(data_2023[col], errors='coerce')

In [9]:
# Change name for 2024 dataset

data_2024.columns = data_2024.columns + " 2024"
data_2024.rename(columns={"Track 2024": "Track", "Album Name 2024": "Album Name", "Artist 2024":"Artist",
                             "Release Date 2024": "Release Date", "ISRC 2024":"ISRC", "All Time Rank 2024": "All Time Rank"}, inplace=True)
data_2024.drop(columns={"ISRC", "TIDAL Popularity 2024", "Explicit Track 2024"}, inplace=True)

In [10]:
# Convert datatype for 2024 dataset

numeric_cols = [col for col in data_2024.columns.to_list() if col not in ["Track", "Artist", "Release Date", "Album Name"]]
for col in numeric_cols:
    data_2024[col] = data_2024[col].replace(",", "", regex=True)
    data_2024[col] = pd.to_numeric(data_2024[col], errors='coerce')

In [11]:
data_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Track                            953 non-null    object 
 1   Artist                           953 non-null    object 
 2   Released Year                    953 non-null    int64  
 3   Released Month                   953 non-null    int64  
 4   Released Day                     953 non-null    int64  
 5   Spotify Playlist Count 2023      953 non-null    int64  
 6   Spotify Chart Counts 2023        953 non-null    int64  
 7   Streams 2023                     952 non-null    float64
 8   Apple Music Playlist Count 2023  953 non-null    int64  
 9   Apple Music Chart Counts 2023    953 non-null    int64  
 10  Deezer Playlist Count 2023       953 non-null    int64  
 11  Deezer Chart Counts 2023         953 non-null    int64  
 12  Shazam Chart Counts 20

In [12]:
data_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Track                            4600 non-null   object 
 1   Album Name                       4600 non-null   object 
 2   Artist                           4595 non-null   object 
 3   Release Date                     4600 non-null   object 
 4   All Time Rank                    4600 non-null   int64  
 5   Track Score 2024                 4600 non-null   float64
 6   Spotify Streams 2024             4487 non-null   float64
 7   Spotify Playlist Count 2024      4530 non-null   float64
 8   Spotify Playlist Reach 2024      4528 non-null   float64
 9   Spotify Popularity 2024          3796 non-null   float64
 10  YouTube Views 2024               4292 non-null   float64
 11  YouTube Likes 2024               4285 non-null   float64
 12  TikTok Posts 2024   

In [13]:
# Make the realease day column

data_2023["Release Date"] = data_2023["Released Month"].astype(str) + "/" + data_2023["Released Day"].astype(str) + "/" + data_2023["Released Year"].astype(str)
data_2023.drop(columns=["Released Year", "Released Month", "Released Day"], inplace=True)

In [14]:
data_2023.columns

Index(['Track', 'Artist', 'Spotify Playlist Count 2023',
       'Spotify Chart Counts 2023', 'Streams 2023',
       'Apple Music Playlist Count 2023', 'Apple Music Chart Counts 2023',
       'Deezer Playlist Count 2023', 'Deezer Chart Counts 2023',
       'Shazam Chart Counts 2023', 'Bpm', 'Key', 'Mode', 'Danceability %',
       'Valence %', 'Energy %', 'Acousticness %', 'Instrumentalness %',
       'Liveness %', 'Speechiness %', 'Release Date'],
      dtype='object')

In [15]:
data_2024.columns

Index(['Track', 'Album Name', 'Artist', 'Release Date', 'All Time Rank',
       'Track Score 2024', 'Spotify Streams 2024',
       'Spotify Playlist Count 2024', 'Spotify Playlist Reach 2024',
       'Spotify Popularity 2024', 'YouTube Views 2024', 'YouTube Likes 2024',
       'TikTok Posts 2024', 'TikTok Likes 2024', 'TikTok Views 2024',
       'YouTube Playlist Reach 2024', 'Apple Music Playlist Count 2024',
       'AirPlay Spins 2024', 'SiriusXM Spins 2024',
       'Deezer Playlist Count 2024', 'Deezer Playlist Reach 2024',
       'Amazon Playlist Count 2024', 'Pandora Streams 2024',
       'Pandora Track Stations 2024', 'Soundcloud Streams 2024',
       'Shazam Counts 2024'],
      dtype='object')

In [16]:
# Reorder columns

cols_2023 = data_2023.columns.to_list()
cols_2023.remove("Release Date")
cols_2023.remove("Artist")

cols_2023.insert(1, "Artist")
cols_2023.insert(2, "Release Date")
data_2023 = data_2023[cols_2023]
data_2023

Unnamed: 0,Track,Artist,Release Date,Spotify Playlist Count 2023,Spotify Chart Counts 2023,Streams 2023,Apple Music Playlist Count 2023,Apple Music Chart Counts 2023,Deezer Playlist Count 2023,Deezer Chart Counts 2023,...,Bpm,Key,Mode,Danceability %,Valence %,Energy %,Acousticness %,Instrumentalness %,Liveness %,Speechiness %
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",7/14/2023,553,147,141381703.0,43,263,45,10,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,3/23/2023,1474,48,133716286.0,48,126,58,14,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,6/30/2023,1397,113,140003974.0,94,207,91,14,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,8/23/2019,7858,100,800840817.0,116,207,125,12,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,5/18/2023,3133,50,303236322.0,84,133,87,15,...,144,A,Minor,65,23,80,14,63,11,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,My Mind & Me,Selena Gomez,11/3/2022,953,0,91473363.0,61,13,37,1,...,144,A,Major,60,24,39,57,0,8,3
949,Bigger Than The Whole Sky,Taylor Swift,10/21/2022,1180,0,121871870.0,4,0,8,0,...,166,F#,Major,42,7,24,83,1,12,6
950,A Veces (feat. Feid),"Feid, Paulo Londra",11/3/2022,573,0,73513683.0,2,0,7,0,...,92,C#,Major,80,81,67,4,0,8,6
951,En La De Ella,"Feid, Sech, Jhayco",10/20/2022,1320,0,133895612.0,29,26,17,0,...,97,C#,Major,82,67,77,8,0,12,5


In [17]:
# Reorder columns

cols_2024 = data_2024.columns.to_list()
cols_2024.remove("Release Date")
cols_2024.remove("Artist")

cols_2024.insert(1, "Artist")
cols_2024.insert(2, "Release Date")
spotify_2024 = data_2024[cols_2024]
spotify_2024

Unnamed: 0,Track,Artist,Release Date,Album Name,All Time Rank,Track Score 2024,Spotify Streams 2024,Spotify Playlist Count 2024,Spotify Playlist Reach 2024,Spotify Popularity 2024,...,Apple Music Playlist Count 2024,AirPlay Spins 2024,SiriusXM Spins 2024,Deezer Playlist Count 2024,Deezer Playlist Reach 2024,Amazon Playlist Count 2024,Pandora Streams 2024,Pandora Track Stations 2024,Soundcloud Streams 2024,Shazam Counts 2024
0,MILLION DOLLAR BABY,Tommy Richman,4/26/2024,Million Dollar Baby - Single,1,725.4,3.904709e+08,30716.0,196631588.0,92.0,...,210.0,40975.0,684.0,62.0,17598718.0,114.0,18004655.0,22931.0,4818457.0,2669262.0
1,Not Like Us,Kendrick Lamar,5/4/2024,Not Like Us,2,545.9,3.237039e+08,28113.0,174597137.0,92.0,...,188.0,40778.0,3.0,67.0,10422430.0,111.0,7780028.0,28444.0,6623075.0,1118279.0
2,i like the way you kiss me,Artemas,3/19/2024,I like the way you kiss me,3,538.4,6.013093e+08,54331.0,211607669.0,92.0,...,190.0,74333.0,536.0,136.0,36321847.0,172.0,5022621.0,5639.0,7208651.0,5285340.0
3,Flowers,Miley Cyrus,1/12/2023,Flowers - Single,4,444.9,2.031281e+09,269802.0,136569078.0,85.0,...,394.0,1474799.0,2182.0,264.0,24684248.0,210.0,190260277.0,203384.0,,11822942.0
4,Houdini,Eminem,5/31/2024,Houdini,5,423.3,1.070349e+08,7223.0,151469874.0,88.0,...,182.0,12185.0,1.0,82.0,17660624.0,105.0,4493884.0,7006.0,207179.0,457017.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,For the Last Time,$uicideboy$,9/5/2017,For the Last Time,4585,19.4,3.050500e+08,65770.0,5103054.0,71.0,...,3.0,6.0,,2.0,14217.0,,20104066.0,13184.0,50633006.0,656337.0
4596,Dil Meri Na Sune,Atif Aslam,7/27/2018,"Dil Meri Na Sune (From ""Genius"")",4575,19.4,5.228236e+07,4602.0,1449767.0,56.0,...,1.0,412.0,,1.0,927.0,,,,,193590.0
4597,Grace (feat. 42 Dugg),Lil Baby,2/28/2020,My Turn,4571,19.4,1.899727e+08,72066.0,6704802.0,65.0,...,19.0,204.0,,1.0,74.0,6.0,84426740.0,28999.0,,1135998.0
4598,Nashe Si Chadh Gayi,Arijit Singh,11/8/2016,November Top 10 Songs,4591,19.4,1.454670e+08,14037.0,7387064.0,66.0,...,1.0,1200.0,,,,7.0,6817840.0,,,448292.0


#### Handle duplicated tracks

For the 2024 dataset, we don't have information about tracks' features, so I will only care about the tracks' names and artists. I will handle the duplicated as follow:

- For ranking or score column, only keep the highest rank or score
- For count columns, use the sum

In [18]:
# Handle duplicated tracks in All Time Rank, Track Score, and popularity columns, and other count columns

data_2024["All Time Rank"] = data_2024.groupby(['Track', 'Artist'])["All Time Rank"].transform('min')
data_2024["Track Score 2024"] = data_2024.groupby(['Track', 'Artist'])["Track Score 2024"].transform('max')
data_2024["Spotify Popularity 2024"] = data_2024.groupby(['Track', 'Artist'])["All Time Rank"].transform('max')

remove_cols = ["Track", "Artist", "Release Date", "Album Name", "All Time Rank", "Track Score 2024", "Spotify Popularity 2024"]
count_2024_cols = [col for col in data_2024.columns.to_list() if col not in remove_cols]
data_2024[count_2024_cols] = data_2024.groupby(['Track', 'Artist'])[count_2024_cols].transform('sum')

In [19]:
data_2024.drop_duplicates(subset=["Track", "Artist"], keep="first", inplace=True)

For the 2023 dataset

In [20]:
duplicated_song_2023 = data_2023[data_2023[["Track", "Artist"]].duplicated()]["Track"].values
data_2023[data_2023["Track"].isin(duplicated_song_2023)]

Unnamed: 0,Track,Artist,Release Date,Spotify Playlist Count 2023,Spotify Chart Counts 2023,Streams 2023,Apple Music Playlist Count 2023,Apple Music Chart Counts 2023,Deezer Playlist Count 2023,Deezer Chart Counts 2023,...,Bpm,Key,Mode,Danceability %,Valence %,Energy %,Acousticness %,Instrumentalness %,Liveness %,Speechiness %
178,SNAP,Rosa Linn,3/19/2022,3202,18,726307468.0,148,80,226,24,...,170,,Major,56,53,64,11,0,45,6
345,SPIT IN MY FACE!,ThxSoMch,10/31/2022,629,14,303216294.0,32,3,9,0,...,94,G#,Major,73,65,79,5,2,11,6
372,About Damn Time,Lizzo,7/15/2022,2332,2,723894473.0,0,0,25,0,...,109,A#,Minor,84,72,74,10,0,34,7
482,SPIT IN MY FACE!,ThxSoMch,10/31/2022,573,0,301869854.0,1,0,18,0,...,166,C#,Major,70,57,57,9,20,11,7
512,Take My Breath,The Weeknd,8/6/2021,2597,0,130655803.0,17,80,38,0,...,121,A#,Minor,70,35,77,1,0,26,4
616,Take My Breath,The Weeknd,8/6/2021,6392,0,432702334.0,174,73,344,0,...,121,G#,Major,75,53,74,2,0,11,5
764,About Damn Time,Lizzo,4/14/2022,9021,0,723894473.0,242,49,272,21,...,109,A#,Minor,84,72,74,10,0,34,7
873,SNAP,Rosa Linn,3/19/2022,1818,0,711366595.0,3,0,63,0,...,170,,Major,56,52,64,11,0,45,7


For Key, by manually searching, I got that:

SPIT IN MY FACE! : Key C#, BPM: 166

SNAP: Key C#, BPM : 170 (correct)

About Damn Time: Key A#, BPM: 109

Take My Breath: Key G#, BPM: 121, Major


So I will keep rows that having these features. For About Damn Time, just use their average for features

In [21]:
index_to_remove = [345, 178, 512]
data_2023.drop(index=index_to_remove, inplace=True)

In [22]:
list_of_numerical_features = ['Bpm',
 'Danceability %',
 'Valence %',
 'Energy %',
 'Acousticness %',
 'Instrumentalness %',
 'Liveness %',
 'Speechiness %']

data_2023[list_of_numerical_features] = data_2023.groupby(["Track", "Artist"])[list_of_numerical_features].transform('mean')
data_2023.drop_duplicates(subset=["Track", "Artist"], keep="first", inplace=True)

#### Handle missing datas

The 2023 dataset has 3 columns which have missing values: Key, Shazam Chart Count and Streams 

The column Streams only have 1 missing data, so we can drop it

Replace missing values in Shazam Chart Count by mean of Apple Music Chart Count and Spotify Chart Count

For Key, there are various missing values. We cannot mannually search and replace all of them, and cannot replace by value in any other rows. So just leave them as missing values since about 25 missing values in key will not affect the overall distribution of key among 900 tracks

However, we still have missing data in Track and Artist column, but they are not "NaN". Instead, they are blank column. We need to handle them as well


In [23]:
# handle missing data
data_2023[data_2023["Streams 2023"].isnull()]
data_2023 = data_2023.drop(574)
data_2023["Shazam Chart Counts 2023"]=data_2023["Shazam Chart Counts 2023"].fillna((data_2023["Apple Music Chart Counts 2023"] + data_2023["Spotify Chart Counts 2023"]) / 2)

In [24]:
data_2023.columns[data_2023.isnull().any()].tolist() # check after handling missing data (NaN)

['Key']

In [25]:
data_2023[data_2023['Track'].str.strip() == ""] # check missing data in Track column

Unnamed: 0,Track,Artist,Release Date,Spotify Playlist Count 2023,Spotify Chart Counts 2023,Streams 2023,Apple Music Playlist Count 2023,Apple Music Chart Counts 2023,Deezer Playlist Count 2023,Deezer Chart Counts 2023,...,Bpm,Key,Mode,Danceability %,Valence %,Energy %,Acousticness %,Instrumentalness %,Liveness %,Speechiness %
174,,YOASOBI,4/12/2023,356,16,143573775.0,35,102,8,1,...,166.0,C#,Major,57.0,84.0,94.0,11.0,0.0,37.0,9.0
374,,Fujii Kaze,5/20/2020,685,14,403097450.0,24,94,9,0,...,158.0,F#,Minor,60.0,52.0,76.0,17.0,0.0,19.0,5.0


In [26]:
data_2023.drop([174, 374], inplace=True) # drop them

In [27]:
data_2023[data_2023["Artist"].str.strip() == ""] # Check missing data in Artist column

Unnamed: 0,Track,Artist,Release Date,Spotify Playlist Count 2023,Spotify Chart Counts 2023,Streams 2023,Apple Music Playlist Count 2023,Apple Music Chart Counts 2023,Deezer Playlist Count 2023,Deezer Chart Counts 2023,...,Bpm,Key,Mode,Danceability %,Valence %,Energy %,Acousticness %,Instrumentalness %,Liveness %,Speechiness %


For 2024 dataset, there are only 5 rows having missing values, so we can safely delete them

In [28]:
data_2024[data_2024.isnull().any(axis=1)] # drop them

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score 2024,Spotify Streams 2024,Spotify Playlist Count 2024,Spotify Playlist Reach 2024,Spotify Popularity 2024,...,Apple Music Playlist Count 2024,AirPlay Spins 2024,SiriusXM Spins 2024,Deezer Playlist Count 2024,Deezer Playlist Reach 2024,Amazon Playlist Count 2024,Pandora Streams 2024,Pandora Track Stations 2024,Soundcloud Streams 2024,Shazam Counts 2024
311,Cool,JnD Mix,,5/25/2024,,,,,,,...,,,,,,,,,,
480,I Wanna Party,I Wanna Party - Single,,5/31/2024,,,,,,,...,,,,,,,,,,
1345,Marlboro Remix,Marlboro Remix - Single,,6/7/2024,,,,,,,...,,,,,,,,,,
1561,Melting,Melting - Single,,6/10/2024,,,,,,,...,,,,,,,,,,
3402,La ��ltima Vez (Yo Te Per,La ��ltima Vez (Yo Te Perd��),,5/2/2024,,,,,,,...,,,,,,,,,,


In [29]:
data_2024.dropna(inplace=True) # drop na values in 2024 dataset

In [30]:
data_2024[data_2024.isnull().any(axis=1)] # check

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score 2024,Spotify Streams 2024,Spotify Playlist Count 2024,Spotify Playlist Reach 2024,Spotify Popularity 2024,...,Apple Music Playlist Count 2024,AirPlay Spins 2024,SiriusXM Spins 2024,Deezer Playlist Count 2024,Deezer Playlist Reach 2024,Amazon Playlist Count 2024,Pandora Streams 2024,Pandora Track Stations 2024,Soundcloud Streams 2024,Shazam Counts 2024


In [31]:
data_2024[data_2024['Track'].str.strip() == ""] # check missing data in Track column

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score 2024,Spotify Streams 2024,Spotify Playlist Count 2024,Spotify Playlist Reach 2024,Spotify Popularity 2024,...,Apple Music Playlist Count 2024,AirPlay Spins 2024,SiriusXM Spins 2024,Deezer Playlist Count 2024,Deezer Playlist Reach 2024,Amazon Playlist Count 2024,Pandora Streams 2024,Pandora Track Stations 2024,Soundcloud Streams 2024,Shazam Counts 2024
146,,,FLI:P,4/7/2023,147.0,125.4,8028757.0,1972.0,488653.0,147.0,...,1.0,373.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,220198.0
161,,3,(),1/5/2022,162.0,119.5,548408.0,130.0,20518.0,162.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69763.0
351,,,YOASOBI,4/12/2023,351.0,81.8,1481006000.0,206671.0,32689574.0,351.0,...,177.0,6219.0,0.0,21.0,105975.0,34.0,3679638.0,2395.0,0.0,3443506.0
460,,,ALEX&RUS,7/3/2019,460.0,71.9,52830200.0,14893.0,1344212.0,460.0,...,2.0,430.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3415809.0
804,,Show - Single,Ado,9/6/2023,800.0,54.2,303439500.0,52668.0,7471859.0,800.0,...,64.0,1435.0,0.0,3.0,4313.0,10.0,0.0,0.0,0.0,538733.0
1020,,,HoneyWorks,11/21/2022,1014.0,47.7,23191930.0,5912.0,438018.0,1014.0,...,3.0,80.0,0.0,0.0,0.0,6.0,46380.0,60.0,0.0,188814.0
1079,,- Single,,3/22/2024,1073.0,46.2,5599896.0,404.0,264984.0,1073.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31984.0
1292,,- Single,Jazzdauren,3/1/2024,1287.0,41.5,2902263.0,968.0,474517.0,1287.0,...,11.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,904938.0
1489,,Bibbidiba - Single,Hoshimachi Suisei,3/23/2024,1482.0,38.1,21255070.0,1754.0,3985309.0,1482.0,...,2.0,38.0,0.0,1.0,7336.0,7.0,0.0,0.0,0.0,41209.0
1520,,- Single,Ptrp Studio,6/2/2022,1515.0,37.8,13219830.0,3136.0,395560.0,1515.0,...,0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
data_2024.drop(data_2024[data_2024["Track"].str.strip() == ""].index, inplace=True) # drop them

In [33]:
data_2024[data_2024["Artist"].str.strip() == ""] # check missing data in Artist column

Unnamed: 0,Track,Album Name,Artist,Release Date,All Time Rank,Track Score 2024,Spotify Streams 2024,Spotify Playlist Count 2024,Spotify Playlist Reach 2024,Spotify Popularity 2024,...,Apple Music Playlist Count 2024,AirPlay Spins 2024,SiriusXM Spins 2024,Deezer Playlist Count 2024,Deezer Playlist Reach 2024,Amazon Playlist Count 2024,Pandora Streams 2024,Pandora Track Stations 2024,Soundcloud Streams 2024,Shazam Counts 2024
344,kompa pasi�,kompa pasi�,,4/12/2024,345.0,82.5,29458027.0,2602.0,6588032.0,345.0,...,9.0,132.0,0.0,6.0,73462.0,0.0,0.0,0.0,365682.0,2085595.0
3289,Baby you,Baby you,,1/18/2023,3273.0,24.0,30361093.0,6140.0,1321340.0,3273.0,...,16.0,114.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,250538.0
4192,sup�,sup�,,8/11/2023,4188.0,20.7,63642484.0,15618.0,1035077.0,4188.0,...,1.0,0.0,0.0,0.0,0.0,0.0,89501.0,93.0,0.0,400490.0


In [34]:
data_2024.drop(data_2024[data_2024["Artist"].str.strip() == ""].index, inplace=True) # drop them

In [35]:
data_2023.to_csv("../data/data_2023_clean.csv")
data_2024.to_csv("../data/data_2024_clean.csv")