In [7]:
import pandas as pd


In [8]:
spotify_df = pd.read_csv('spotify_data.csv')
billboard_df = pd.read_csv('billboard_hot_stuff.csv')

In [9]:
# Standardize song names and artist names
spotify_df['name'] = spotify_df['name'].str.lower().str.replace('[^a-zA-Z0-9 ]', '', regex=True)
spotify_df['artists'] = spotify_df['artists'].str.lower().str.replace('[^a-zA-Z0-9 ]', '', regex=True)
billboard_df['Song'] = billboard_df['Song'].str.lower().str.replace('[^a-zA-Z0-9 ]', '', regex=True)
billboard_df['Performer'] = billboard_df['Performer'].str.lower().str.replace('[^a-zA-Z0-9 ]', '', regex=True)

# Attempt to parse dates in multiple formats
spotify_df['release_date'] = pd.to_datetime(spotify_df['release_date'], errors='coerce')
billboard_df['WeekID'] = pd.to_datetime(billboard_df['WeekID'], errors='coerce')

# Check for remaining parsing issues
print(spotify_df['release_date'].isna().sum())
print(billboard_df['WeekID'].isna().sum())

# Drop rows where dates couldn't be parsed
spotify_df = spotify_df.dropna(subset=['release_date'])
billboard_df = billboard_df.dropna(subset=['WeekID'])

119527
0


In [10]:
# Ensure consistent date format
spotify_df['release_date'] = spotify_df['release_date'].dt.strftime('%Y-%m-%d')
billboard_df['WeekID'] = billboard_df['WeekID'].dt.strftime('%Y-%m-%d')


In [20]:
spotify_df

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,carl woitschach,0.708,158648,0.1950,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563000,singende bataillone 1 teil carl woitschach,0.1510,-12.428,1,singende bataillone 1 teil,0,1928-01-01,0.0506,118.469,0.7790,1928
1,0.994,robert schumann vladimir horowitz,0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901000,fantasiestcke op 111 pi tosto lento robert sch...,0.0763,-28.454,1,fantasiestcke op 111 pi tosto lento,0,1928-01-01,0.0462,83.972,0.0767,1928
2,0.604,seweryn goszczyski,0.749,104300,0.2200,0,6L63VW0PibdM1HDSBoqnoM,0.000000,chapter 118 zamek kaniowski seweryn goszczyski,0.1190,-19.924,0,chapter 118 zamek kaniowski,0,1928-01-01,0.9290,107.177,0.8800,1928
4,0.990,frdric chopin vladimir horowitz,0.210,687733,0.2040,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908000,polonaisefantaisie in aflat major op 61 frdric...,0.0980,-16.829,1,polonaisefantaisie in aflat major op 61,1,1928-01-01,0.0424,62.149,0.0693,1928
5,0.995,felix mendelssohn vladimir horowitz,0.424,352600,0.1200,0,6NxAf7M8DNHOBTmEd3JSO5,0.911000,scherzo a capriccio presto felix mendelssohn v...,0.0915,-19.242,0,scherzo a capriccio presto,0,1928-01-01,0.0593,63.521,0.2660,1928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169168,0.188,ajr,0.697,236400,0.6660,0,2z00vFwIe6zaCoA0LWmiTX,0.000002,my calling ajr,0.1260,-5.123,1,my calling,52,2013-01-01,0.0538,98.013,0.3460,2013
169194,0.049,rfs du sol,0.686,275493,0.7120,0,0PHWXLXOQXGlyUGq7woVFZ,0.004200,sundream rfs du sol,0.3680,-7.580,0,sundream,59,2013-01-01,0.0383,121.966,0.2470,2013
169208,0.114,aap rocky santigold,0.669,231787,0.4890,1,5Uc07fEUpjjFcLEIleEHkJ,0.000000,hell feat santigold aap rocky santigold,0.1840,-9.130,1,hell feat santigold,54,2013-01-01,0.1370,142.218,0.2580,2013
169298,0.120,5 seconds of summer,0.624,210424,0.8910,0,3Y3nRbS2YvT8LsYxvDylny,0.000000,disconnected 5 seconds of summer,0.2230,-4.812,1,disconnected,58,2014-01-01,0.0830,99.945,0.6930,2014


In [21]:
billboard_df

Unnamed: 0,url,WeekID,Week Position,Song,Performer,SongID,Instance,Previous Week Position,Peak Position,Weeks on Chart,key
0,http://www.billboard.com/charts/hot-100/1965-0...,1965-07-17,34,dont just stand there,patty duke,Don't Just Stand TherePatty Duke,1,45.0,34,4,dont just stand there patty duke
1,http://www.billboard.com/charts/hot-100/1965-0...,1965-07-24,22,dont just stand there,patty duke,Don't Just Stand TherePatty Duke,1,34.0,22,5,dont just stand there patty duke
2,http://www.billboard.com/charts/hot-100/1965-0...,1965-07-31,14,dont just stand there,patty duke,Don't Just Stand TherePatty Duke,1,22.0,14,6,dont just stand there patty duke
3,http://www.billboard.com/charts/hot-100/1965-0...,1965-08-07,10,dont just stand there,patty duke,Don't Just Stand TherePatty Duke,1,14.0,10,7,dont just stand there patty duke
4,http://www.billboard.com/charts/hot-100/1965-0...,1965-08-14,8,dont just stand there,patty duke,Don't Just Stand TherePatty Duke,1,10.0,8,8,dont just stand there patty duke
...,...,...,...,...,...,...,...,...,...,...,...
327890,https://www.billboard.com/charts/hot-100/2018-...,2018-10-20,22,god is a woman,ariana grande,God Is A WomanAriana Grande,1,21.0,8,13,god is a woman ariana grande
327891,http://www.billboard.com/charts/hot-100/1977-0...,1977-05-21,22,i wanna get next to you,rose royce,I Wanna Get Next To YouRose Royce,1,10.0,10,13,i wanna get next to you rose royce
327892,http://www.billboard.com/charts/hot-100/1981-0...,1981-05-23,22,i cant stand it,eric clapton and his band,I Can't Stand ItEric Clapton And His Band,1,12.0,10,13,i cant stand it eric clapton and his band
327893,http://www.billboard.com/charts/hot-100/1973-0...,1973-09-29,22,here i am come take me,al green,Here I Am Come & Take MeAl Green,1,16.0,10,13,here i am come take me al green


In [16]:
print(spotify_df.info())
print(spotify_df.describe())

print(billboard_df.info())
print(billboard_df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 50382 entries, 0 to 169508
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      50382 non-null  float64
 1   artists           50382 non-null  object 
 2   danceability      50382 non-null  float64
 3   duration_ms       50382 non-null  int64  
 4   energy            50382 non-null  float64
 5   explicit          50382 non-null  int64  
 6   id                50382 non-null  object 
 7   instrumentalness  50382 non-null  float64
 8   key               50382 non-null  object 
 9   liveness          50382 non-null  float64
 10  loudness          50382 non-null  float64
 11  mode              50382 non-null  int64  
 12  name              50382 non-null  object 
 13  popularity        50382 non-null  int64  
 14  release_date      50382 non-null  object 
 15  speechiness       50382 non-null  float64
 16  tempo             50382 non-null  float64
 1

In [12]:
# Create a combined key for better matching
spotify_df['key'] = spotify_df['name'] + ' ' + spotify_df['artists']
billboard_df['key'] = billboard_df['Song'] + ' ' + billboard_df['Performer']


In [13]:
# Merge datasets on the key
combined_df = pd.merge(spotify_df, billboard_df, on='key', how='inner')


In [14]:
# Check the combined dataframe
print(combined_df.info())
print(combined_df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40931 entries, 0 to 40930
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   acousticness            40931 non-null  float64
 1   artists                 40931 non-null  object 
 2   danceability            40931 non-null  float64
 3   duration_ms             40931 non-null  int64  
 4   energy                  40931 non-null  float64
 5   explicit                40931 non-null  int64  
 6   id                      40931 non-null  object 
 7   instrumentalness        40931 non-null  float64
 8   key                     40931 non-null  object 
 9   liveness                40931 non-null  float64
 10  loudness                40931 non-null  float64
 11  mode                    40931 non-null  int64  
 12  name                    40931 non-null  object 
 13  popularity              40931 non-null  int64  
 14  release_date            40931 non-null

In [19]:
combined_df.to_csv("result.csv", index = False)