In [1]:
import pandas as pd
import re


1. Load data
2. Clean data (Million song subset)
3. Merge datasets
4. Save data

In [None]:
# Read the two raw CSV inputs, in order, from paths relative to the notebook's
# working directory: first the Million Song subset, then the Hot 100 file.
million_songs_df, hot100_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/2_million_song_subset_raw.csv',
        '../data/raw/1_hot100.csv',
    )
)

# Preview the first rows of the Million Song subset.
million_songs_df.head()

In [None]:
# For each column of the Million Song frame, record whether it holds at least
# one missing value, then print the resulting boolean Series.
missing_columns = million_songs_df.isnull().any(axis=0)
print(missing_columns)


Split million_songs_df into songs and artists

In [4]:
# Separate the combined 'title;artist' field into two columns.
# The split is done from the right and at most once, so a title that itself
# contains ';b' still yields exactly two pieces instead of three or more
# (which would make the two-column assignment below raise).
# NOTE(review): presumably each raw value looks like b'Title';b'Artist' -- confirm
# against the raw CSV.
million_songs_df[['song_title', 'artist']] = (
    million_songs_df['title;artist'].str.rsplit(';b', n=1, expand=True)
)

# clean strings
# song_title still carries the leading b-prefix and opening quote, plus a closing quote.
million_songs_df['song_title'] = million_songs_df['song_title'].str.replace(r"^b['\"]|['\"]$", "", regex=True)
# The artist's 'b' was consumed by the split delimiter, so only the quotes remain.
million_songs_df['artist'] = million_songs_df['artist'].str.replace(r"^['\"]|['\"]$", "", regex=True)

# decode UTF-8 characters
def decode_utf8(x):
    r"""Turn literal ``\xNN`` escape sequences in a string back into UTF-8 text.

    Parameters
    ----------
    x : object
        A cell value. Only ``str`` values containing a literal backslash-x
        sequence are transformed.

    Returns
    -------
    object
        The decoded string when ``x`` is a string containing ``\x`` escapes that
        form valid UTF-8; otherwise ``x`` unchanged. Non-string values
        (NaN, None, numbers) are passed through untouched.

    Notes
    -----
    A value whose escapes are malformed or do not form valid UTF-8 is returned
    as-is rather than raising, so a single bad row cannot abort a whole-column
    ``apply``.
    """
    # Missing values and any other non-text cell are left alone.
    if not isinstance(x, str):
        return x
    # Nothing to decode when there is no literal "\x" in the text.
    if '\\x' not in x:
        return x
    try:
        # unicode_escape turns "\xc3\xa9" into the two code points U+00C3 U+00A9;
        # re-encoding as latin1 recovers the raw bytes, which are then read as UTF-8.
        return bytes(x, 'utf-8').decode('unicode_escape').encode('latin1').decode('utf-8')
    except UnicodeError:
        # Malformed escape, byte sequence that is not valid UTF-8, or a code
        # point outside latin1: keep the original text instead of failing.
        return x

# Run the escape decoding over both text columns, title first, then artist.
for text_column in ('song_title', 'artist'):
    million_songs_df[text_column] = million_songs_df[text_column].apply(decode_utf8)

# The raw combined column is no longer needed once both parts are extracted.
million_songs_df = million_songs_df.drop(columns='title;artist')

In [None]:
# Show the first 30 rows to eyeball the split and decoded columns.
million_songs_df.head(n=30)


Remove parenthesized text from song_title


In [None]:
def remove_parentheses(text):
    """Strip every parenthesised segment from ``text``.

    Each ``( ... )`` span is removed together with the whitespace directly in
    front of it, and the result is trimmed, so ``"Song (Live)"`` becomes
    ``"Song"`` rather than ``"Song "``. Without this, titles that differ only by
    a removed suffix would not compare equal afterwards.

    Parameters
    ----------
    text : object
        A cell value. Non-string values (e.g. NaN) are returned unchanged.

    Returns
    -------
    object
        The cleaned string, or ``text`` itself when it is not a string.

    Notes
    -----
    Matching is non-greedy and not nesting-aware: for nested parentheses the
    span ends at the first closing parenthesis. An unclosed "(" is left as-is.
    """
    if not isinstance(text, str):
        return text
    # remove everything inside parentheses, plus the whitespace that preceded it
    return re.sub(r'\s*\(.*?\)', '', text).strip()

# Strip parenthesised segments from every Million Song title.
stripped_titles = million_songs_df['song_title'].apply(remove_parentheses)
million_songs_df['song_title'] = stripped_titles

# Show the first 30 rows to check the result.
million_songs_df.head(n=30)

Check for duplicates

In [None]:
# Count rows that repeat an earlier (song_title, artist) pair. The count is
# printed explicitly because only a cell's last expression is shown automatically.
duplicate_count = million_songs_df.duplicated(subset=['song_title', 'artist']).sum()
print(f'Duplicate (song_title, artist) rows: {duplicate_count}')

# drop_duplicates returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the duplicates to actually be removed.
million_songs_df = million_songs_df.drop_duplicates(subset=['song_title', 'artist'], keep='first')

display(million_songs_df)


## Clean hot100_df

In [8]:
# Strip parenthesised segments from every Hot 100 title.
hot100_titles_stripped = hot100_df['song_title'].apply(remove_parentheses)
hot100_df['song_title'] = hot100_titles_stripped

In [None]:
# Show the first 30 Hot 100 rows after the title clean-up.
hot100_df.head(n=30)

Clean string columns

In [10]:
def clean_string_columns(df):
    """Normalise every text column of ``df`` in place and return ``df``.

    For each column with object or string dtype the values are lower-cased,
    the abbreviation ``feat.`` is expanded to ``featuring`` and ``&`` is
    replaced with ``and``. Other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow ``df = clean_string_columns(df)``.

    Notes
    -----
    ``feat.`` is only expanded when it starts at a word boundary, so words that
    merely end in "feat." (for example "defeat.") are not rewritten.
    """
    for column in df.columns:
        values = df[column]
        # Accept the classic object dtype as well as dedicated string dtypes.
        if not (pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)):
            continue
        values = values.str.lower()  # Convert to lowercase
        # Replace the standalone abbreviation 'feat.' with 'featuring'
        values = values.str.replace(r'\bfeat\.', 'featuring', regex=True)
        values = values.str.replace('&', 'and', regex=False)  # Replace '&' with 'and'
        df[column] = values
    return df

In [None]:
# Normalise the Hot 100 text columns, then show the first 30 rows.
hot100_df = hot100_df.pipe(clean_string_columns)
hot100_df.head(n=30)

In [None]:
# Normalise the Million Song text columns, then show the first 30 rows.
million_songs_df = million_songs_df.pipe(clean_string_columns)
million_songs_df.head(n=30)

Check how many song titles and artist names match between the two datasets

In [None]:
def clean_text_for_matching(text):
    """Build a comparison key from ``text``.

    The value is converted with ``str()``, lower-cased, and every character
    that is neither a word character nor whitespace is removed. Because of the
    ``str()`` conversion, non-string inputs are keyed by their string form
    (``None`` becomes ``'none'``).
    """
    lowered = str(text).lower()
    # Keep only word characters and whitespace; everything else is punctuation here.
    return re.sub(r'[^\w\s]', '', lowered)

# Punctuation-free, lower-cased titles serve as the join key on both sides.
hot100_clean = hot100_df['song_title'].apply(clean_text_for_matching)
million_clean = million_songs_df['song_title'].apply(clean_text_for_matching)

# Left side: Hot 100 key plus the columns shown in the preview.
hot100_title_side = pd.DataFrame({
    'clean_title': hot100_clean,
    'original_title': hot100_df['song_title'],
    'artist': hot100_df['artist']
})

# Right side: Million Song key plus its own title and artist.
million_title_side = pd.DataFrame({
    'clean_title': million_clean,
    'million_title': million_songs_df['song_title'],
    'million_artist': million_songs_df['artist']
})

# Default (inner) join keeps only keys present in both frames.
# NOTE(review): if a key repeats on either side, every left/right pairing
# becomes its own row, so the length below counts row pairs rather than
# distinct titles -- confirm that is the intended number.
song_matches = hot100_title_side.merge(million_title_side, on='clean_title')

print(f'Number of matching song titles after cleaning: {len(song_matches)}')
print('\nExample matches:')
title_preview_columns = ['original_title', 'million_title', 'artist', 'million_artist']
display(song_matches[title_preview_columns].head(20))

In [None]:
# Punctuation-free, lower-cased artist names serve as the join key on both sides.
hot100_clean = hot100_df['artist'].apply(clean_text_for_matching)
million_clean = million_songs_df['artist'].apply(clean_text_for_matching)

# Left side: Hot 100 key plus the columns shown in the preview.
hot100_artist_side = pd.DataFrame({
    'clean_artist': hot100_clean,
    'original_title': hot100_df['song_title'],
    'artist': hot100_df['artist']
})

# Right side: Million Song key plus its own title and artist.
million_artist_side = pd.DataFrame({
    'clean_artist': million_clean,
    'million_title': million_songs_df['song_title'],
    'million_artist': million_songs_df['artist']
})

# Default (inner) join keeps only keys present in both frames.
# NOTE(review): an artist with several rows on each side produces one output
# row per left/right pairing, so the length below counts row pairs rather than
# distinct artists -- confirm that is the intended number.
artist_matches = hot100_artist_side.merge(million_artist_side, on='clean_artist')

print(f'Number of matching artist names after cleaning: {len(artist_matches)}')
print('\nExample matches:')
artist_preview_columns = ['original_title', 'million_title', 'artist', 'million_artist']
display(artist_matches[artist_preview_columns].head(20))