# Data preprocessing

In [1]:
# Import libraries
import pandas as pd
import os
import csv
import billboard
import json
import re
import lyricsgenius
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from langdetect import detect
pd.options.mode.chained_assignment = None

### Genius and Spotify APIs' settings

In [2]:
# Genius token
genius_token = os.environ.get('genius_token')

# Spotify authorization
spotify_client_id = os.environ.get('spotify_client_id')
spotify_client_secret = os.environ.get('spotify_client_secret')
client_credentials_manager = SpotifyClientCredentials(client_id=spotify_client_id, client_secret=spotify_client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### CSV data

In [3]:
# Loading data and adding year columns and wrapping up billboard songs (1950-2015)
wd = os.getcwd()
data_path = os.chdir(wd + '\data')
all_csv_files = os.listdir(data_path)
songs_50_15 = []

for filename in all_csv_files:
    songs = pd.read_csv(filename, header = 0, sep = ';')
    year = os.path.basename(filename)[:-4]
    songs['Year'] = year
    songs_50_15.append(songs)

### billboard library data

In [4]:
# Downloading data from years 2016-2020 from billboard library
playlist = 'hot-100-songs'
dates = [2016, 2017, 2018, 2019, 2020]
songs_billboard = []
for y in dates:
    songs = billboard.ChartData(playlist, date = None, year = y, fetch = True, timeout = 25)
    songs_billboard.append(songs)

In [5]:
# Preprocessing data from years 2016-2020
song_titles, artists = [[] for i in range(2)]

for year in songs_billboard:
    for playlist in year:
        song = str(playlist).split(' by ')
        song_titles.append(song[0].strip("\'"))
        artists.append(song[1])
        
# Adding missed song on 87th position in 2016 chart
song_titles.insert(86, 'All the Way Up')
artists.insert(86, 'Fat Joe, Remy Ma and Jay-Z featuring French Montana and Infared')

### Concatenating CSV and billboard library data

In [6]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Splitting list into same-length elements lists
def list_split(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

split_artists = list(list_split(artists, 100))
split_song_titles = list(list_split(song_titles, 100))

# Creating list of dictionaries for each year to convert them into pandas DataFrame
def dicts_of_dates(lst_of_dates):
    songs = []
    for i in range(len(lst_of_dates)):
        ranger = list(range(1, 101))
        year = lst_of_dates[i]
        dictionary = {'Position': ranger, 'Artist': split_artists[i], 'Song Title': split_song_titles[i], 'Year': [dates[i] for year in range(1, 101)]}
        as_pandas_df = pd.DataFrame(dictionary)
        songs.append(as_pandas_df)
    return songs

# Concatenate all songs from years 1950 - 2020
songs_16_20 = dicts_of_dates(dates)
all_songs = songs_50_15 + songs_16_20
all_time_billboard_wrap_up = pd.concat(all_songs)
all_time_billboard_wrap_up.reset_index(drop = True, inplace = True) # reset indexing
# Creating columns for json files' names
all_time_billboard_wrap_up['JSON file'] = [str(i) + '.json' for i in all_time_billboard_wrap_up.index]
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json
1,2,Nat King Cole,Mona Lisa,1950,1.json
2,3,Anton Karas,Third Man Theme,1950,2.json
3,4,Gary and Bing Crosby,Sam's Song,1950,3.json
4,5,Gary and Bing Crosby,Simple Melody,1950,4.json
...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,6675.json
6676,97,Luke Combs,Lovin' On You,2020,6676.json
6677,98,Moneybagg Yo,Said Sum,2020,6677.json
6678,99,H.E.R. Featuring YG,Slide,2020,6678.json


In [7]:
# all_time_billboard_wrap_up dataframe nans checking
nans_table = [all_time_billboard_wrap_up['Position'].isna().any(), all_time_billboard_wrap_up['Artist'].isna().any(), all_time_billboard_wrap_up['Song Title'].isna().any(), all_time_billboard_wrap_up['Year'].isna().any()]
nans_proj = pd.DataFrame({'Column': ['Position', 'Artist', 'Song Title', 'Year'], 'Is nan?': nans_table}).set_index('Column')
display(nans_proj)

Unnamed: 0_level_0,Is nan?
Column,Unnamed: 1_level_1
Position,False
Artist,False
Song Title,False
Year,False


In [8]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Spliting authors by 'and' and 'Featuring' to make api searching easier and more efficient
artists = list(all_time_billboard_wrap_up.loc[:,'Artist'])
split_artists = []
for artist in artists:
    if any(re.findall(r'and|featuring|feat.|Featuring|Feat.|&', str(artist))):
        result = re.split(r'and|featuring|feat.|Featuring|Feat.|&', str(artist))[0].strip()
        split_artists.append(result)
    else:
        split_artists.append(artist)

all_time_billboard_wrap_up['Split Names'] = split_artists
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole
2,3,Anton Karas,Third Man Theme,1950,2.json,Anton Karas
3,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary
4,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary
...,...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,6675.json,Morgan Wallen
6676,97,Luke Combs,Lovin' On You,2020,6676.json,Luke Combs
6677,98,Moneybagg Yo,Said Sum,2020,6677.json,Moneybagg Yo
6678,99,H.E.R. Featuring YG,Slide,2020,6678.json,H.E.R.


### Genius data into json files

In [9]:
# lyricsgenius settings
genius = lyricsgenius.Genius(genius_token, timeout = 30, retries = 3)
songs = list(all_time_billboard_wrap_up.loc[:, 'Song Title'])
artists = list(all_time_billboard_wrap_up.loc[:,'Split Names'])
jsons_path = wd + '\\jsons\\'

# Saving lyrics to jsons function
def return_lyrics_json(song, artist, json_name):
    song_lyrics = genius.search_song(song, artist)
    with open(jsons_path + json_name, 'w', encoding = 'UTF-8') as f:
        json.dump({'lyrics': song_lyrics.lyrics}, f, ensure_ascii = False, indent = 4)

# No lyrics function (only instrumental songs)
def no_lyrics(json_name):
    with open(jsons_path + json_name, 'w', encoding = 'UTF-8') as f:
        json.dump({'lyrics': ""}, f, ensure_ascii = False, indent = 4)  

In [10]:
# Applying return_lyrics_json and no_lyrics functions on lyrics:
def json_data():
    error_list = []
    for i in range(len(songs)):
        try:
            return_lyrics_json(songs[i], artists[i], str(i)+'.json')
        except AttributeError:
            no_lyrics(str(i)+'.json')
            error_list.append(i)
# json_data()

In [11]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# err based on error_list
# Fixed 6140 Beyonce 7/11 (date format in csv)
err = [12, 23, 25, 53, 60, 74, 84, 100, 101, 103, 111, 116, 141, 160, 173, 190, 196, 210, 233, 239, 266, 277, 334, 365, 380, 385, 386, 401, 428, 439, 440, 448, 501, 515, 519, 537, 554, 565, 604, 616, 619, 633, 690, 694, 696, 708, 723, 771, 774, 784, 800, 816, 832, 838, 853, 862, 866, 871, 906, 920, 937, 969, 991, 1039, 1111, 1126, 1150, 1180, 1211, 1245, 1277, 1381, 1387, 1397, 1400, 1422, 1454, 1537, 1542, 1569, 1598, 1613, 1615, 1668, 1801, 1807, 1822, 1850, 1873, 1895, 1940, 1965, 1982, 2027, 2095, 2144, 2167, 2177, 2189, 2208, 2269, 2350, 2370, 2400, 2528, 2559, 2603, 2633, 2690, 2738, 2791, 2835, 2843, 3140, 3334, 3460, 3742, 3908, 3952, 4121, 4169, 4171, 4180, 4208, 4244, 4245, 4280, 4351, 4361, 4387, 4518, 4579, 4760, 4777, 4823, 5134, 5235]

### Deleting empty or invalid json files

In [12]:
# Deleting empty json files
# Adding string '.json' to err
json_delete = [str(j) + '.json' for j in err]
# Deteling json files that are in json_delete list and bigger than 5KB
all_json_files = os.listdir(jsons_path)
for x in all_json_files:
    filepath = jsons_path + str(x)
    json_size = os.path.getsize(filepath)
    if (json_size > 5 * 1024 or x in json_delete): #error_list
        os.remove(filepath)
# Selecting all_billboard_wrap_up data corresponding with actual list of jsons
ordered_jsons = sorted([int(x[:-5]) for x in os.listdir(jsons_path)])
all_time_billboard_wrap_up_cleaned = all_time_billboard_wrap_up.iloc[ordered_jsons]

### Detecting language of lyrics

In [13]:
# Language detection
def language_detect(json_name):
    with open(jsons_path + json_name, 'r', encoding = 'UTF-8') as f:
        data = json.load(f)
    return detect(data['lyrics'])
all_time_billboard_wrap_up_cleaned['Song Language'] = [language_detect(json_name) for json_name in all_time_billboard_wrap_up_cleaned['JSON file']]
all_time_billboard_wrap_up_cleaned = all_time_billboard_wrap_up_cleaned[all_time_billboard_wrap_up_cleaned['Song Language'].isin(['en'])]
# Sava all_time_billboard_wrap_up_cleaned to csv
all_time_billboard_wrap_up_cleaned.to_csv(wd + '\\all_time_billboard_wrap_up_cleaned.csv', index = False)

### Spotify IDs

In [14]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Function that enables finding Spotify ID from spotipy
def find_spotify_id(artist, track):
    try:
        track_id = sp.search(q='artist:' + artist + ' track:' + track, type='track')
        return track_id['tracks']['items'][0]['id']
    except IndexError:
        return ''    
all_time_billboard_wrap_up_cleaned['Spotify ID'] = [find_spotify_id(all_time_billboard_wrap_up_cleaned['Split Names'].iloc[i], all_time_billboard_wrap_up_cleaned['Song Title'].iloc[i]) for i in range(len(all_time_billboard_wrap_up_cleaned))]
display(all_time_billboard_wrap_up_cleaned)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language,Spotify ID
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,0.json,Gordon Jenkins,en,1fhLgOJgIIZEsWWffk8ljs
1,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en,5dae01pKNjRQtgOeAkFzPY
3,4,Gary and Bing Crosby,Sam's Song,1950,3.json,Gary,en,1v4OqleXWTeORpLcPtAxyH
4,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en,33FdwvfiSBRbE2AQ4UehaR
5,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en,0lO5EKoz1Rb1pJoPoldE4D
...,...,...,...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,6675.json,Morgan Wallen,en,5OELUCYgOHKFAvCERnAvfS
6676,97,Luke Combs,Lovin' On You,2020,6676.json,Luke Combs,en,0nYvjcSlCgjcwogQAwIwNp
6677,98,Moneybagg Yo,Said Sum,2020,6677.json,Moneybagg Yo,en,3sKz6Sd72K0ofPWcJPPk6H
6678,99,H.E.R. Featuring YG,Slide,2020,6678.json,H.E.R.,en,6MO2bfLHKykUgCChFdw91H


In [15]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
sum(all_time_billboard_wrap_up_cleaned['Spotify ID'] == '')

498

In [16]:
# Check artist and song name for n Spotify IDs
spotify_ids = all_time_billboard_wrap_up_cleaned['Spotify ID'].tolist()
spotify_artists = []
spotify_songs = []
for i in range(0, len(spotify_ids)):
    if spotify_ids[i] != '':
        track = sp.track(spotify_ids[i])
        song_name = track['name']
        spotify_songs.append(song_name)
        artist = track['album']['artists'][0].get('name')
        spotify_artists.append(artist)
    else:
        song_name, artist = '', ''
        spotify_songs.append(song_name)
        spotify_artists.append(artist)

In [17]:
all_time_billboard_wrap_up_cleaned['Spotify Artist'] = spotify_artists
all_time_billboard_wrap_up_cleaned['Spotify Song'] = spotify_songs

# Filtering data by Artist and Spotify Artist inclusion
all_time_billboard_wrap_up_cleaned_sp_data = all_time_billboard_wrap_up_cleaned[all_time_billboard_wrap_up_cleaned['Spotify Artist'].isin(all_time_billboard_wrap_up_cleaned['Artist'])]

### Spotify audio features

In [18]:
temp_audio_features_list = []
for i in all_time_billboard_wrap_up_cleaned_sp_data['Spotify ID']:
    temp_song_audio_features = pd.DataFrame(sp.audio_features(i))
    temp_audio_features_list.append(temp_song_audio_features)
temp_audio_features_df = pd.concat(temp_audio_features_list)
all_time_billboard_wrap_up_cleaned_sp_data = all_time_billboard_wrap_up_cleaned_sp_data.merge(temp_audio_features_df, how = 'left', left_on = 'Spotify ID', right_on = 'id')
all_time_billboard_wrap_up_cleaned_sp_data = all_time_billboard_wrap_up_cleaned_sp_data.drop(all_time_billboard_wrap_up_cleaned_sp_data.columns[11], axis=1)
display(all_time_billboard_wrap_up_cleaned_sp_data)

Unnamed: 0,Position,Artist,Song Title,Year,JSON file,Split Names,Song Language,Spotify ID,Spotify Artist,Spotify Song,...,liveness,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence
0,2,Nat King Cole,Mona Lisa,1950,1.json,Nat King Cole,en,5dae01pKNjRQtgOeAkFzPY,Nat King Cole,Mona Lisa,...,0.1130,-12.201,1.0,0.0371,174.578,3.0,https://api.spotify.com/v1/tracks/5dae01pKNjRQ...,audio_features,spotify:track:5dae01pKNjRQtgOeAkFzPY,0.303
1,5,Gary and Bing Crosby,Simple Melody,1950,4.json,Gary,en,33FdwvfiSBRbE2AQ4UehaR,Bing Crosby,Play A Simple Melody - Single Version,...,0.1670,-13.755,1.0,0.0473,138.800,3.0,https://api.spotify.com/v1/tracks/33FdwvfiSBRb...,audio_features,spotify:track:33FdwvfiSBRbE2AQ4UehaR,0.829
2,6,Teresa Brewer,"Music, Music, Music",1950,5.json,Teresa Brewer,en,0lO5EKoz1Rb1pJoPoldE4D,Teresa Brewer,(Put Another Nickel In) Music! Music! Music!,...,0.1540,-14.392,1.0,0.0398,99.136,4.0,https://api.spotify.com/v1/tracks/0lO5EKoz1Rb1...,audio_features,spotify:track:0lO5EKoz1Rb1pJoPoldE4D,0.919
3,7,Guy Lombardo,Third Man Theme,1950,6.json,Guy Lombardo,en,6kVfwQOc2PjVPwbk0aEmN9,Guy Lombardo,Third Man Theme,...,0.0849,-10.808,1.0,0.0358,115.590,4.0,https://api.spotify.com/v1/tracks/6kVfwQOc2PjV...,audio_features,spotify:track:6kVfwQOc2PjVPwbk0aEmN9,0.536
4,8,Red Foley,Chattanoogie Shoe Shine Boy,1950,7.json,Red Foley,en,7Jf323ttHKUnPylFWiaGl3,Red Foley,Chattanoogie Shoe Shine Boy - 1949 Single Version,...,0.1180,-15.925,1.0,0.0494,148.367,4.0,https://api.spotify.com/v1/tracks/7Jf323ttHKUn...,audio_features,spotify:track:7Jf323ttHKUnPylFWiaGl3,0.846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4716,93,Luke Combs Featuring Eric Church,Does To Me,2020,6672.json,Luke Combs,en,0nGXi46VcQQ56ZJR428MKS,Luke Combs,Does To Me (feat. Eric Church),...,0.3110,-4.897,1.0,0.0373,113.065,4.0,https://api.spotify.com/v1/tracks/0nGXi46VcQQ5...,audio_features,spotify:track:0nGXi46VcQQ56ZJR428MKS,0.638
4717,96,Morgan Wallen,More Than My Hometown,2020,6675.json,Morgan Wallen,en,5OELUCYgOHKFAvCERnAvfS,Morgan Wallen,More Than My Hometown,...,0.1310,-5.479,1.0,0.0462,126.019,4.0,https://api.spotify.com/v1/tracks/5OELUCYgOHKF...,audio_features,spotify:track:5OELUCYgOHKFAvCERnAvfS,0.597
4718,97,Luke Combs,Lovin' On You,2020,6676.json,Luke Combs,en,0nYvjcSlCgjcwogQAwIwNp,Luke Combs,Lovin' On You,...,0.1630,-4.865,1.0,0.0600,118.974,4.0,https://api.spotify.com/v1/tracks/0nYvjcSlCgjc...,audio_features,spotify:track:0nYvjcSlCgjcwogQAwIwNp,0.530
4719,98,Moneybagg Yo,Said Sum,2020,6677.json,Moneybagg Yo,en,3sKz6Sd72K0ofPWcJPPk6H,Moneybagg Yo,Said Sum,...,0.1000,-6.789,0.0,0.3530,126.998,4.0,https://api.spotify.com/v1/tracks/3sKz6Sd72K0o...,audio_features,spotify:track:3sKz6Sd72K0ofPWcJPPk6H,0.274


In [19]:
# Save all_time_billboard_wrap_up_cleaned_sp_data to csv
all_time_billboard_wrap_up_cleaned_sp_data.to_csv(wd + '\\all_time_billboard_wrap_up_cleaned_spotify.csv', index = False)