# Data Collection - Lyrics Scraping

+ Input: Kaggle dataset (max_weeks.csv) & self-scraped dataset (billboard_data_rest.csv)
+ Output: Lyric dictionary (all_song_lyrics_dict.json)

## Deal with Kaggle dataset

In [2]:
# Use genius API to get access to songs info
import requests
import pandas as pd

# Read the API token. The context manager closes the file even if the read
# fails, and strip() drops the trailing newline that would otherwise be sent
# as part of the token.
with open("token_genius.txt", "r") as file:
    token_genius = file.read().strip()
search_term = "David Bowie"
genius_search_url = "https://api.genius.com/search"

# Let requests build the query string so the space in the search term (and any
# other special character) is URL-encoded; HTTPS keeps the token off the wire
# in clear text.
response = requests.get(
    genius_search_url,
    params={"q": search_term, "access_token": token_genius},
    timeout=30,
)
# Fail with a clear HTTP error (e.g. an invalid token) instead of a KeyError
# further down when the payload has no 'response' key.
response.raise_for_status()
json_data = response.json()

# One [title, page views] pair per search hit. 'pageviews' is read with .get()
# so a hit whose stats lack that key yields None instead of aborting the cell.
david_bowie_info = [
    [hit['result']['full_title'], hit['result']['stats'].get('pageviews')]
    for hit in json_data['response']['hits']
]

# Passing the column names to the constructor also works when there are no hits.
david_bowie_df = pd.DataFrame(david_bowie_info, columns=['song_title', 'page_views'])
david_bowie_df

Unnamed: 0,song_title,page_views
0,Under Pressure by Queen & David Bowie,1314687
1,Space Oddity by David Bowie,1320084
2,Starman by David Bowie,1033004
3,"""Heroes"" by David Bowie",872479
4,★ (Blackstar) by David Bowie,792198
5,Life on Mars? by David Bowie,672264
6,Lazarus by David Bowie,498171
7,Changes by David Bowie,488448
8,The Man Who Sold the World by David Bowie,391169
9,Moonage Daydream by David Bowie,324916


In [4]:
# Load the chart data from max_weeks.csv and display it
file_path = 'max_weeks.csv'
charts_df = pd.read_csv(file_path)
charts_df

Unnamed: 0,index,date,rank,song,artist,last-week,peak-rank,max-weeks-on-board
0,163090,1990-08-11,91,"""B"" Girls",Young And Restless,83.0,54,15
1,252937,1973-05-19,51,"""Cherry Cherry"" from Hot August Night",Neil Diamond,31.0,31,10
2,250048,1973-12-08,62,"""Having A Party"" Medley",The Ovations (Featuring Louis Williams),56.0,56,9
3,249028,1974-02-16,42,"""Joy"" Pt. I",Isaac Hayes,30.0,30,9
4,232582,1977-04-16,83,"""Roots"" Medley",Quincy Jones,57.0,57,7
...,...,...,...,...,...,...,...,...
24615,1491,2021-07-31,92,transparentsoul,Willow Featuring Travis Barker,93.0,76,6
24616,14795,2019-01-12,96,whoa (mind in awe),XXXTENTACION,,37,3
24617,90399,2004-07-17,100,whoknows,Musiq,92.0,65,13
24618,108298,2001-02-10,99,www.memory,Alan Jackson,89.0,45,15


In [None]:
import logging
import lyricsgenius

# Setup logging
logging.basicConfig(filename='lyrics_fetch.log', level=logging.INFO, format='%(asctime)s %(message)s')


def fetch_lyrics(song, artist, genius_api):
    """
    Look up one song through the given Genius client and return its lyrics.

    Parameters:
        song: song title passed to ``genius_api.search_song``.
        artist: artist name passed to ``genius_api.search_song``.
        genius_api: object exposing ``search_song(song, artist)``.

    Returns:
        The ``lyrics`` attribute of the search result, the string
        "Lyrics not available" when the search returns nothing, or the string
        "Error" when any exception is raised (the exception is logged).
    """
    try:
        match = genius_api.search_song(song, artist)
        return match.lyrics if match else "Lyrics not available"
    except Exception as exc:
        # Best effort: one failed lookup must not abort the whole batch.
        logging.error(f"Error fetching lyrics for {song} by {artist}: {exc}")
        return "Error"


def process_batch(df, genius_api, start_index=0, batch_size=1000):
    """
    Process a batch of records and save to a CSV file.

    Rows ``start_index`` up to (but not including) ``start_index + batch_size``
    of ``df`` are looked up with ``fetch_lyrics`` and written, with an added
    'lyrics' column, to ``max_week_{start_index}_{end_index}.csv``.

    Parameters:
        df: DataFrame with 'song' and 'artist' columns.
        genius_api: client object forwarded to ``fetch_lyrics``.
        start_index: position of the first row of the batch.
        batch_size: maximum number of rows in the batch.
    """
    end_index = min(start_index + batch_size, len(df))
    # Work on an explicit copy: adding a column to a bare iloc slice triggers
    # pandas' SettingWithCopyWarning and may write through to `df`.
    batch_df = df.iloc[start_index:end_index].copy()
    # A plain list (rather than apply(axis=1)) also behaves for an empty batch,
    # where apply would hand back a DataFrame instead of a column.
    batch_df['lyrics'] = [
        fetch_lyrics(song, artist, genius_api)
        for song, artist in zip(batch_df['song'], batch_df['artist'])
    ]
    batch_df.to_csv(f'max_week_{start_index}_{end_index}.csv', index=False)
    logging.info(f"Processed records {start_index} to {end_index}")


def main():
    '''
    Fetch lyrics and do batch processing.

    Reads max_weeks.csv, builds a Genius client from the token stored in
    token.txt, and hands the chart rows to process_batch in chunks of 1000.
    '''
    file_path = 'max_weeks.csv'
    charts_df = pd.read_csv(file_path)

    # The context manager closes the file even if the read fails; strip()
    # removes the trailing newline so it is not sent as part of the token.
    with open("token.txt", "r") as file:
        token = file.read().strip()
    genius = lyricsgenius.Genius(token, timeout=60)

    batch_size = 1000
    for start_index in range(0, len(charts_df), batch_size):
        process_batch(charts_df, genius, start_index, batch_size)

    logging.info("All records processed successfully.")

# Run the batch job: fetches lyrics for every row of max_weeks.csv
main()

In [None]:
# Merge all data
import os


def merge_csv_files(directory):
    '''
    Merge the batch processed lyrics data

    Every ``.csv`` file in ``directory`` is read and concatenated (in sorted
    file-name order) into ``merged_max_weeks.csv`` inside the same directory.

    Parameters:
        directory: path of the folder that holds the batch CSV files.

    Raises:
        ValueError: if the directory contains no CSV file to merge.
    '''
    output_name = 'merged_max_weeks.csv'
    output_path = os.path.join(directory, output_name)

    # Skip the output of a previous run: it lives in the same directory and
    # would otherwise be merged back in, duplicating every row.
    # Sorting makes the row order reproducible (os.listdir order is arbitrary).
    csv_files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.csv') and name != output_name
    )
    if not csv_files:
        raise ValueError(f'No CSV files to merge in: {directory}')

    # Read each CSV file into a pandas DataFrame
    dataframes = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]

    # Concatenate all DataFrames into one
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save the concatenated DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)
    print(f'Merged CSV saved as: {output_path}')

# Directory that holds the batch CSV files to merge; replace this path with your own
merge_csv_files('/Users/qxlin/Library/CloudStorage/GoogleDrive-qxlin@uchicago.edu/My Drive/UChi/MACS-30122/fianl-proj/lyric scraping/for_merge')

In [None]:
# Load the merged lyrics table and display it.
# NOTE(review): merge_csv_files writes merged_max_weeks.csv into the merge
# directory, while this reads it from the working directory — confirm the
# file is available here.
df = pd.read_csv('merged_max_weeks.csv')

display(df)

Unnamed: 0,index,date,rank,song,artist,last-week,peak-rank,max-weeks-on-board,lyrics
0,34748,2015-03-14,49,Glory,Common & John Legend,99.0,49,3,95 ContributorsGlory Lyrics[Produced by John L...
1,39889,2014-03-22,90,Glory And Gore,Lorde,68.0,68,3,70 ContributorsTranslationsFrançaisItalianoEsp...
2,258631,1972-04-15,45,Glory Bound,The Grass Roots,44.0,34,10,3 ContributorsGlory Bound LyricsAll the sun on...
3,188497,1985-09-28,98,Glory Days,Bruce Springsteen,79.0,5,18,34 ContributorsGlory Days Lyrics[Intro]\nOh ye...
4,267152,1970-08-29,66,Glory Glory,The Rascals,58.0,58,6,"2 ContributorsGlory Glory LyricsI saw a child,..."
...,...,...,...,...,...,...,...,...,...
24615,280922,1968-01-06,36,By The Time I Get To Phoenix,Glen Campbell,33.0,26,11,17 ContributorsBy the Time I Get to Phoenix Ly...
24616,146683,1993-10-02,84,By The Time This Night Is Over,Kenny G With Peabo Bryson,92.0,25,20,Lyrics not available
24617,99375,2002-10-26,76,By The Way,Red Hot Chili Peppers,76.0,34,20,54 ContributorsBy the Way Lyrics[Chorus]\nStan...
24618,107594,2001-03-31,95,By Your Side,Sade,86.0,75,11,29 ContributorsBy Your Side Lyrics[Verse 1]\nY...


## Add lyrics for the new Billboard data & convert the CSV to a JSON dictionary for future use

In [None]:
# Load the merged table that already has a 'lyrics' column
old = pd.read_csv("merged_max_weeks.csv")

In [None]:
# Create a new column in the dataframe that combines 'song' and 'artist'
# as "<song> - <artist>"
old['song_artist'] = old['song'] + ' - ' + old['artist']

# Convert the dataframe to a dictionary mapping song_artist -> lyrics
# NOTE(review): rows sharing the same song_artist key collapse into a single
# entry — presumably the last one wins; confirm this is acceptable.
song_lyrics_dict = pd.Series(old.lyrics.values, index=old.song_artist).to_dict()

In [None]:
# Load the self-scraped Billboard data
new = pd.read_csv("billboard_data_rest.csv")

In [None]:
# Setup logging
logging.basicConfig(filename='lyrics_fetch.log', level=logging.INFO, format='%(asctime)s %(message)s')


# Redefine the helper functions so that lyrics already present in the old CSV are reused instead of being fetched again
def fetch_lyrics(song, artist, genius_api, dict):
    """
    Return the lyrics for a song, reusing a cached entry when one exists.

    Parameters:
        song: song title.
        artist: artist name.
        genius_api: object exposing ``search_song(song, artist)``.
        dict: mapping from "<song> - <artist>" to lyrics; it is consulted
            first and updated in place when new lyrics are fetched.

    Returns:
        The cached or freshly fetched lyrics, the string
        "Lyrics not available" when the search returns nothing (not cached),
        or the string "Error" when any exception is raised (the exception is
        logged).
    """
    try:
        cache_key = song + ' - ' + artist
        if cache_key not in dict:
            # Cache miss: query Genius and remember a successful result.
            result = genius_api.search_song(song, artist)
            if not result:
                return "Lyrics not available"
            dict[cache_key] = result.lyrics
            return result.lyrics
        print("Fetch from dictionary")
        return dict[cache_key]
    except Exception as err:
        # Best effort: one failed lookup must not abort the whole batch.
        logging.error(f"Error fetching lyrics for {song} by {artist}: {err}")
        return "Error"


def process_batch(df, genius_api, start_index=0, batch_size=1000):
    """
    Process a batch of records and save to a CSV file.

    Rows ``start_index`` up to (but not including) ``start_index + batch_size``
    of ``df`` are looked up with ``fetch_lyrics`` (using the module-level
    ``song_lyrics_dict`` as cache) and written, with an added 'lyrics' column,
    to ``billboard_data_rest_{start_index}_{end_index}.csv``.

    Parameters:
        df: DataFrame with 'title' and 'author' columns.
        genius_api: client object forwarded to ``fetch_lyrics``.
        start_index: position of the first row of the batch.
        batch_size: maximum number of rows in the batch.
    """
    end_index = min(start_index + batch_size, len(df))
    # Work on an explicit copy: adding a column to a bare iloc slice triggers
    # pandas' SettingWithCopyWarning and may write through to `df`.
    batch_df = df.iloc[start_index:end_index].copy()
    # A plain list (rather than apply(axis=1)) also behaves for an empty batch,
    # where apply would hand back a DataFrame instead of a column.
    batch_df['lyrics'] = [
        fetch_lyrics(title, author, genius_api, song_lyrics_dict)
        for title, author in zip(batch_df['title'], batch_df['author'])
    ]
    batch_df.to_csv(f'billboard_data_rest_{start_index}_{end_index}.csv', index=False)
    logging.info(f"Processed records {start_index} to {end_index}")


def main():
    '''
    Fetch lyrics and do batch processing.

    Reads billboard_data_rest.csv, builds a Genius client from the token
    stored in token.txt, and hands the rows to process_batch in chunks of 1000.
    '''
    file_path = 'billboard_data_rest.csv'
    charts_df = pd.read_csv(file_path)

    # The context manager closes the file even if the read fails; strip()
    # removes the trailing newline so it is not sent as part of the token.
    with open("token.txt", "r") as file:
        token = file.read().strip()
    genius = lyricsgenius.Genius(token, timeout=60)

    batch_size = 1000
    for start_index in range(0, len(charts_df), batch_size):
        process_batch(charts_df, genius, start_index, batch_size)

    logging.info("All records processed successfully.")


# Run the batch job: fetches lyrics for every row of billboard_data_rest.csv
main()

In [None]:
import json

# Convert dictionary to JSON formatted string
# (song_lyrics_dict also holds the lyrics fetch_lyrics cached while processing the new data)
json_str = json.dumps(song_lyrics_dict)

# To print the JSON string to the console
# NOTE(review): this prints the whole lyrics dictionary; consider dropping it if the output is too large.
print(json_str)

# To write the JSON data to a file
with open('all_song_lyrics_dict.json', 'w') as json_file:
    json.dump(song_lyrics_dict, json_file)