In [1]:
import pandas as pd
import re


1. Load data
2. Clean data (Million song subset)
3. Merge datasets
4. Save data

In [2]:
# Locations of the two raw inputs, relative to the notebook directory.
million_songs_path = '../data/raw/2_million_song_subset_raw.csv'
hot100_path = '../data/raw/1_hot100.csv'

# Read both CSV files into DataFrames.
million_songs_df = pd.read_csv(million_songs_path)
hot100_df = pd.read_csv(hot100_path)

# Preview the first rows of the Million Song subset.
million_songs_df.head()

Unnamed: 0,title;artist
0,b'Je Sais Que La Terre Est Plate';b'Rapha\xc3\...
1,b'On Efface';b'Julie Zenatti'
2,b'Howells Delight';b'The Baltimore Consort'
3,b'Martha Served';b'I Hate Sally'
4,b'Zip-A-Dee-Doo-Dah (Song of the South)';b'Orl...


In [3]:
# For each column, report whether it contains at least one missing value.
missing_columns = million_songs_df.isna().any(axis=0)
print(missing_columns)


title;artist    False
dtype: bool


Split the combined `title;artist` column of million_songs_df into separate `song_title` and `artist` columns

In [4]:
# Split title and artist into separate columns.
# Each raw value looks like  b'<title>';b'<artist>'  so we split on the ";b" that is
# immediately followed by a quote, and at most once (n=1). An unbounded split on ';b'
# would produce extra columns (and make this two-column assignment fail) as soon as
# any title or artist happens to contain the characters ";b".
million_songs_df[['song_title', 'artist']] = million_songs_df['title;artist'].str.split(
    r";b(?=['\"])", n=1, expand=True
)

# Clean the strings by removing b' or b" at the start and ' or " at the end.
# The artist part has already lost its leading "b" to the split above, so only the
# opening quote is stripped there.
million_songs_df['song_title'] = million_songs_df['song_title'].str.replace(r"^b['\"]|['\"]$", "", regex=True)
million_songs_df['artist'] = million_songs_df['artist'].str.replace(r"^['\"]|['\"]$", "", regex=True)

# Decode UTF-8 encoded characters
def decode_utf8(x):
    """Turn literal ``\\xNN`` escape sequences in a string into the characters they encode.

    The text is expected to come from the repr of a UTF-8 ``bytes`` object, where
    non-ASCII characters appear as backslash escapes (e.g. ``Rapha\\xc3\\xabl``).

    Parameters
    ----------
    x : object
        Value to decode. Anything that is not a ``str`` (None, NaN, numbers, ...)
        is returned unchanged.

    Returns
    -------
    object
        The decoded string, or ``x`` itself when it is not a string, contains no
        ``\\x`` escape, or cannot be decoded as UTF-8.
    """
    # Missing values and other non-string cells pass straight through.
    if not isinstance(x, str):
        return x
    if '\\x' not in x:
        return x
    try:
        # unicode_escape turns each \xNN into the code point U+00NN; re-encoding as
        # latin1 maps those code points back to raw bytes, which are then read as UTF-8.
        return bytes(x, 'utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
    except UnicodeError:
        # Malformed or non-UTF-8 escapes: keep the original text rather than
        # aborting the whole column transformation.
        return x

# Decode the escaped characters in both text columns.
for text_column in ('song_title', 'artist'):
    million_songs_df[text_column] = million_songs_df[text_column].apply(decode_utf8)

# The combined raw column is no longer needed once it has been split.
million_songs_df = million_songs_df.drop(columns='title;artist')

In [5]:
# Preview the first 30 rows of the cleaned frame.
million_songs_df.head(n=30)


Unnamed: 0,song_title,artist
0,Je Sais Que La Terre Est Plate,Raphaël
1,On Efface,Julie Zenatti
2,Howells Delight,The Baltimore Consort
3,Martha Served,I Hate Sally
4,Zip-A-Dee-Doo-Dah (Song of the South),Orlando Pops Orchestra
5,Liquid Time (composition by John Goodsall),Brand X
6,Misery Path (From the Privilege of Evil),Amorphis
7,Nuovi Re pt. II (feat. Tek money - Lady Tambler),Inoki
8,Halloween,Dead Kennedys
9,Parto em terras distantes,Brigada Victor Jara


Remove parenthesised text (e.g. subtitles or featured artists) from song_title


In [6]:
def remove_parentheses(text):
    """Remove every parenthesised segment from ``text`` and tidy the leftover whitespace.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` (None, NaN, ...) is
        returned unchanged.

    Returns
    -------
    object
        ``text`` without its ``(...)`` segments, with whitespace runs collapsed to a
        single space and leading/trailing whitespace removed.
    """
    if not isinstance(text, str):
        return text
    # Non-greedy match so each "(...)" group is removed on its own.
    without_parens = re.sub(r'\(.*?\)', '', text)
    # Removing "(...)" leaves the space that preceded it; normalise so that
    # "Title (Live)" becomes "Title" and not "Title " (which would not compare
    # equal to the plain title).
    return re.sub(r'\s+', ' ', without_parens).strip()

# Strip the parenthesised parts from every song title.
titles_without_parentheses = million_songs_df['song_title'].apply(remove_parentheses)
million_songs_df['song_title'] = titles_without_parentheses

# Preview the first 30 rows after cleaning.
million_songs_df.head(n=30)

Unnamed: 0,song_title,artist
0,Je Sais Que La Terre Est Plate,Raphaël
1,On Efface,Julie Zenatti
2,Howells Delight,The Baltimore Consort
3,Martha Served,I Hate Sally
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra
5,Liquid Time,Brand X
6,Misery Path,Amorphis
7,Nuovi Re pt. II,Inoki
8,Halloween,Dead Kennedys
9,Parto em terras distantes,Brigada Victor Jara


Check for duplicates

In [7]:
# Count rows that repeat an earlier (song_title, artist) pair. The count is printed
# explicitly because only the last expression of a cell is displayed automatically.
duplicate_mask = million_songs_df.duplicated(subset=['song_title', 'artist'])
print(f"Duplicate (song_title, artist) rows: {duplicate_mask.sum()}")

# drop_duplicates returns a new DataFrame; assign it back, otherwise the
# duplicates silently stay in million_songs_df.
million_songs_df = million_songs_df.drop_duplicates(subset=['song_title', 'artist'], keep='first')

display(million_songs_df)


Unnamed: 0,song_title,artist
0,Je Sais Que La Terre Est Plate,Raphaël
1,On Efface,Julie Zenatti
2,Howells Delight,The Baltimore Consort
3,Martha Served,I Hate Sally
4,Zip-A-Dee-Doo-Dah,Orlando Pops Orchestra
...,...,...
9995,One About Heaven,Brent Lamb
9996,October,U2
9997,Comin' Home,ZO2
9998,Pode Me Chamar,Eddie


Add column to indicate if song is in hot100