# Loading the data

In [1]:
import pandas as pd
muse_data = pd.read_csv('./data/raw-data/muse_v3.csv')
spotify_data = pd.read_csv('./data/raw-data/spotify_data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [103]:
merged_df = pd.merge(muse_data, spotify_data, left_on='spotify_id', right_on='id')

In [104]:
# Drop columns that are not needed or are duplicate
merged_df.drop(['lastfm_url', 'mbid', 'id', 'year', 'track', 'seeds'], axis=1, inplace=True)

In [105]:
# separate the artists from the featured artists

import ast
merged_df['artists'] = merged_df['artists'].apply(ast.literal_eval)

# Creating the 'Featured_Artists' column
merged_df['featured_artists'] = merged_df.apply(lambda row: [artist for artist in row['artists'] if artist != row['artist']], axis=1)
merged_df.drop('artists', axis=1, inplace=True) 

In [106]:
merged_df.head()

Unnamed: 0,artist,number_of_emotion_tags,valence_tags,arousal_tags,dominance_tags,spotify_id,genre,valence,acousticness,danceability,...,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,featured_artists
0,Eminem,6,4.55,5.273125,5.690625,4xkOaSrkexMciUUogZKVTS,rap,0.1,0.0622,0.548,...,1,0.0816,-3.237,1,'Till I Collapse,84,2002-05-26,0.186,171.447,[Nate Dogg]
1,Metallica,8,3.71,5.833,5.42725,3fOc9x06lKJBhz435mInlH,metal,0.498,0.00131,0.249,...,2,0.0953,-2.642,0,St. Anger,53,2003-06-05,0.0678,185.252,[]
2,Dope,7,3.771176,5.348235,5.441765,5bU4KX47KqtDKKaLM4QCzh,metal,0.567,0.00169,0.657,...,5,0.109,-3.524,0,Die MF Die,68,2001-12-06,0.07,126.02,[]
3,Drowning Pool,9,2.971389,5.5375,4.726389,4Q1w4Ryyi8KNxxaFlOQClK,metal,0.585,8e-06,0.431,...,6,0.321,-3.269,1,Step Up,53,2004-01-01,0.0789,156.103,[]
4,Kanye West,1,3.08,5.87,5.49,49fT6owWuknekShh9utsjv,hip-hop,0.169,0.563,0.811,...,8,0.104,-6.033,0,Feedback,61,2016-06-10,0.517,100.224,[]


## Scrape the lyrics from the web

In [107]:
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

def scrape_lyrics(artist, song_title):
    # Format the artist and song_title for the URL
    artist_formatted = re.sub(r'[^\w\s-]', '', artist).replace(' ', '-').lower()
    song_title_formatted = re.sub(r'[^\w\s-]', '', song_title).replace(' ', '-').lower()

    # Construct the URL for the lyrics page on Genius
    url = f'https://genius.com/{artist_formatted}-{song_title_formatted}-lyrics'
    # print(f"Fetching lyrics from URL: {url}")

    try:
        response = requests.get(url)
        response.raise_for_status()
        # print(f"Page fetched successfully: {url}")

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        # print("HTML content parsed successfully")
        
        # Find the div with id 'lyrics-root' and then find all divs with 'data-lyrics-container' attribute set to true
        lyrics_div = soup.find('div', id='lyrics-root')
        
        if lyrics_div:
            pass
            # print("Found the 'lyrics-root' container")
        else:
            # print("Could not find the 'lyrics-root' container")
            return None

        # Replace <br> tags with newlines
        for br in lyrics_div.find_all('br'):
            br.replace_with('\n')

        # Extracting the text as one block with preserved line breaks
        lyrics_parts = lyrics_div.find_all('div', {'data-lyrics-container': 'true'})
        lyrics = '\n'.join(part.get_text() for part in lyrics_parts if part.get_text(strip=True))
        # print("Lyrics extracted successfully")
        # print("Here they are:\n")
        # print(lyrics)
        return lyrics

    except requests.HTTPError as e:
        # print(f"HTTP error occurred for {artist} - {song_title}: {e}")
        # Add the song title and artist to the global list
        http_error_songs.append((artist, song_title))
    except Exception as e:
        pass
        # print(f"An error occurred: {e}")
        
    return None

# Define a global list to keep track of songs with HTTP errors
http_error_songs = []

# Print the progress of the lyrics scraping
tqdm.pandas(desc="Scraping lyrics")

# Add a new column to your DataFrame for the lyrics
merged_df['lyrics'] = merged_df.progress_apply(lambda row: scrape_lyrics(row['artist'], row['name']), axis=1)
print("Lyrics added to DataFrame")

for error in http_error_songs:
    print(error)

Scraping lyrics: 100%|██████████| 9574/9574 [2:18:56<00:00,  1.15it/s]  

Lyrics added to DataFrame
('Hüsker Dü', 'Chartered Trips')
('The Pussycat Dolls', 'Hush Hush; Hush Hush - Main')
('The Pussycat Dolls', 'Hush Hush; Hush Hush - Main')
('Clawfinger', 'Biggest & the Best')
('Prodigy', 'Invaders Must Die')
('Pendulum', 'Self vs Self (feat. In Flames)')
('Hüsker Dü', 'Turn on the News')
('White Zombie', "More Human Than Human - Meet Bambi In The King's Harem Mix (Explicit)")
('White Zombie', "More Human Than Human - Meet Bambi In The King's Harem Mix (Explicit)")
('Rilo Kiley', 'A Better Son/Daughter')
('Rilo Kiley', 'A Better Son/Daughter')
('Tina Turner', 'I Might Have Been Queen - 2015 Remaster')
('(hed) Planet Earth', 'Raise Hell')
('David Glen Eisley', 'Sweet Victory (As Heard on "SpongeBob SquarePants")')
('The Gun', 'Race with the Devil')
('The Clash', 'All the Young Punks (New Boots and Contracts) - Remastered')
('(hed) Planet Earth', 'Wake Up')
('Three Days Grace', 'I Hate Everything About You - Live Acoustic - Rolling Stone Original (EP)')
('The 




In [117]:
merged_df = merged_df[merged_df['lyrics'].notna()]
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6837 entries, 0 to 9571
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   artist                  6837 non-null   object 
 1   number_of_emotion_tags  6837 non-null   int64  
 2   valence_tags            6837 non-null   float64
 3   arousal_tags            6837 non-null   float64
 4   dominance_tags          6837 non-null   float64
 5   spotify_id              6837 non-null   object 
 6   genre                   6782 non-null   object 
 7   valence                 6837 non-null   float64
 8   acousticness            6837 non-null   float64
 9   danceability            6837 non-null   float64
 10  duration_ms             6837 non-null   int64  
 11  energy                  6837 non-null   float64
 12  explicit                6837 non-null   int64  
 13  instrumentalness        6837 non-null   float64
 14  key                     6837 non-null   int64

## Create the dataframes described by datadictionary.md and save them to disk

In [116]:
# Creating song_metadata_df
song_metadata_columns = ['spotify_id', 'artist', 'name', 'popularity', 
                         'release_date', 'genre', 'explicit', 'duration_ms', 'featured_artists']
song_metadata_df = merged_df[song_metadata_columns]

# Creating song_sentiment_df
song_sentiment_columns = ['spotify_id', 'valence', 'danceability', 'energy',
                          'number_of_emotion_tags', 'valence_tags', 'arousal_tags', 
                          'dominance_tags', 'tempo', 'key', 'mode', 'instrumentalness', 
                          'liveness', 'speechiness', 'acousticness', 'loudness']
song_sentiment_df = merged_df[song_sentiment_columns]

# Creating song_lyrics_df
song_lyrics_columns = ['spotify_id', 'lyrics']
song_lyrics_df = merged_df[song_lyrics_columns]

# save the dataframes to csv files
song_metadata_df.to_csv('./data/preprocessed-data/song_metadata.csv', index=False)
song_sentiment_df.to_csv('./data/preprocessed-data/song_sentiment.csv', index=False)
song_lyrics_df.to_csv('./data/preprocessed-data/song_lyrics.csv', index=False)