# Data preprocessing

In [1]:
# Import libraries
import csv
import json
import os
import re

import billboard
import lyricsgenius
import pandas as pd
import spotipy
import spotipy as sp
from spotipy.oauth2 import SpotifyClientCredentials

### Genius and Spotify APIs' settings

In [2]:
# Genius token (read from the environment so it never lands in the notebook)
genius_token = os.environ.get('genius_token')

# Spotify authorization (client-credentials flow, also read from the environment)
spotify_client_id = os.environ.get('spotify_client_id')
spotify_client_secret = os.environ.get('spotify_client_secret')
client_credentials_manager = SpotifyClientCredentials(client_id=spotify_client_id, client_secret=spotify_client_secret)
# Build the client from the un-aliased module name: `sp` is rebound to the
# client below, so `sp.Spotify(...)` would fail with AttributeError as soon as
# this cell is executed a second time in the same kernel.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

### CSV data

In [3]:
# Loading data and adding year columns and wrapping up billboard songs (1950-2015)
wd = os.getcwd()
# This cell changes the working directory to the data folder (see below). When
# it is re-run in the same kernel, step back to the parent so `wd` keeps
# pointing at the project folder instead of at `data` itself.
if os.path.basename(wd) == 'data' and not os.path.isdir(os.path.join(wd, 'data')):
    wd = os.path.dirname(wd)
# os.chdir() returns None, so the path has to be built explicitly;
# os.path.join also avoids the invalid '\d' escape and the Windows-only separator.
data_path = os.path.join(wd, 'data')
# Kept from the original cell: other cells may rely on the working directory
# being the data folder -- TODO confirm and drop if nothing depends on it.
os.chdir(data_path)
# Only CSV files, in sorted order: os.listdir() gives no ordering guarantee, and
# row positions of the combined table are referenced by index later on.
all_csv_files = sorted(name for name in os.listdir(data_path) if name.lower().endswith('.csv'))
songs_50_15 = []

for filename in all_csv_files:
    songs = pd.read_csv(os.path.join(data_path, filename), header = 0, sep = ';')
    # The file name without its extension is used as the year label (a string)
    year = os.path.splitext(os.path.basename(filename))[0]
    songs['Year'] = year
    songs_50_15.append(songs)

### billboard library data

In [4]:
# Fetch the 'hot-100-songs' chart for each year 2016-2020 via the billboard library
playlist = 'hot-100-songs'
dates = list(range(2016, 2021))
songs_billboard = [
    billboard.ChartData(playlist, date = None, year = chart_year, fetch = True, timeout = 25)
    for chart_year in dates
]

In [5]:
# Preprocessing data from years 2016-2020
# Flat lists of titles and artists, in chart order, across all fetched years
song_titles, artists = [], []

for chart in songs_billboard:
    for entry in chart:
        # Read the entry's fields directly instead of parsing str(entry):
        # splitting the text on ' by ' and stripping quotes corrupts titles that
        # contain " by " or that start/end with an apostrophe.
        song_titles.append(entry.title)
        artists.append(entry.artist)

# Adding missed song on 87th position in 2016 chart
song_titles.insert(86, 'All the Way Up')
artists.insert(86, 'Fat Joe, Remy Ma and Jay-Z featuring French Montana and Infared')

### Concatenating CSV and billboard library data

In [6]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Splitting list into same-length elements lists
def list_split(l, n):
    """Lazily cut ``l`` into consecutive slices of at most ``n`` items.

    A chunk size below 1 is treated as 1. The last slice is shorter when
    ``len(l)`` is not a multiple of the chunk size.

    Returns a generator that yields the slices in order.
    """
    size = max(1, n)
    offsets = range(0, len(l), size)
    return (l[start:start + size] for start in offsets)

# Cut the flat artist / title lists into chunks of up to 100 entries
split_artists = [*list_split(artists, 100)]
split_song_titles = [*list_split(song_titles, 100)]

# Creating list of dictionaries for each year to convert them into pandas DataFrame
def dicts_of_dates(lst_of_dates, artists_by_year=None, titles_by_year=None):
    """Build one chart DataFrame per year.

    Parameters
    ----------
    lst_of_dates : sequence
        Year labels; element ``i`` labels the ``i``-th chunk of artists/titles.
    artists_by_year : sequence of lists, optional
        Artists grouped per year. Defaults to the notebook-level
        ``split_artists``.
    titles_by_year : sequence of lists, optional
        Song titles grouped per year. Defaults to the notebook-level
        ``split_song_titles``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per year with the columns 'Position', 'Artist',
        'Song Title' and 'Year'. Positions are numbered from 1.
    """
    if artists_by_year is None:
        artists_by_year = split_artists
    if titles_by_year is None:
        titles_by_year = split_song_titles

    songs = []
    # The year label comes from the argument itself, not from the global
    # `dates`, so the function works for any list of years passed in.
    for i, year in enumerate(lst_of_dates):
        year_artists = artists_by_year[i]
        year_titles = titles_by_year[i]
        # Size the columns from the data rather than assuming exactly 100 rows
        n_entries = len(year_artists)
        dictionary = {
            'Position': list(range(1, n_entries + 1)),
            'Artist': year_artists,
            'Song Title': year_titles,
            'Year': [year] * n_entries,
        }
        songs.append(pd.DataFrame(dictionary))
    return songs

# Stack the 1950-2015 CSV frames and the 2016-2020 library frames into one table
songs_16_20 = dicts_of_dates(dates)
all_songs = [*songs_50_15, *songs_16_20]
# ignore_index=True gives the combined frame a fresh 0..n-1 index
all_time_billboard_wrap_up = pd.concat(all_songs, ignore_index = True)
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950
1,2,Nat King Cole,Mona Lisa,1950
2,3,Anton Karas,Third Man Theme,1950
3,4,Gary and Bing Crosby,Sam's Song,1950
4,5,Gary and Bing Crosby,Simple Melody,1950
...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020
6676,97,Luke Combs,Lovin' On You,2020
6677,98,Moneybagg Yo,Said Sum,2020
6678,99,H.E.R. Featuring YG,Slide,2020


In [7]:
# Report, per column, whether the combined table contains any missing value
checked_columns = ['Position', 'Artist', 'Song Title', 'Year']
nans_table = [all_time_billboard_wrap_up[name].isna().any() for name in checked_columns]
nans_proj = pd.DataFrame({'Column': checked_columns, 'Is nan?': nans_table}).set_index('Column')
display(nans_proj)

Unnamed: 0_level_0,Is nan?
Column,Unnamed: 1_level_1
Position,False
Artist,False
Song Title,False
Year,False


In [8]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Splitting artists on 'and' / 'Featuring' to make API searching easier and more efficient:
# only the text before the first separator is kept as the search name.
# The separators are matched as whole words. A bare 'and' also matches inside
# names ("Ariana Grande" -> "Ariana Gr"), which corrupts the search name.
# Lowercase 'featuring' is accepted too, since it occurs in the chart data.
# NOTE(review): index lists hard-coded from earlier API runs may need
# regenerating after this change -- confirm.
artist_separator = re.compile(r'\b(?:and|[Ff]eaturing)\b')
artists = list(all_time_billboard_wrap_up.loc[:,'Artist'])
split_artists = []
for artist in artists:
    artist_text = str(artist)
    if artist_separator.search(artist_text):
        result = artist_separator.split(artist_text, maxsplit=1)[0].strip()
        split_artists.append(result)
    else:
        # No separator: keep the original value untouched
        split_artists.append(artist)

all_time_billboard_wrap_up['Split Names'] = split_artists
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year,Split Names
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,Gordon Jenkins
1,2,Nat King Cole,Mona Lisa,1950,Nat King Cole
2,3,Anton Karas,Third Man Theme,1950,Anton Karas
3,4,Gary and Bing Crosby,Sam's Song,1950,Gary
4,5,Gary and Bing Crosby,Simple Melody,1950,Gary
...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,Morgan Wallen
6676,97,Luke Combs,Lovin' On You,2020,Luke Combs
6677,98,Moneybagg Yo,Said Sum,2020,Moneybagg Yo
6678,99,H.E.R. Featuring YG,Slide,2020,H.E.R.


In [9]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# Function that enables to find Spotify ID
def find_spotify_id(artist, track):
    """Look up a track on Spotify and return the id of the first search hit.

    The query has the form 'artist:<artist> track:<track>' and is sent through
    the notebook-level ``sp`` client. Returns an empty string when the
    search yields no items.
    """
    query = ''.join(['artist:', artist, ' track:', track])
    try:
        found = sp.search(q=query, type='track')
        return found['tracks']['items'][0]['id']
    except IndexError:
        # Empty result list: nothing matched the query
        return ''
# One Spotify lookup per row, pairing the shortened artist name with the song title
all_time_billboard_wrap_up['Spotify ID'] = [
    find_spotify_id(search_name, title)
    for search_name, title in zip(all_time_billboard_wrap_up['Split Names'], all_time_billboard_wrap_up['Song Title'])
]
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year,Split Names,Spotify ID
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,Gordon Jenkins,1fhLgOJgIIZEsWWffk8ljs
1,2,Nat King Cole,Mona Lisa,1950,Nat King Cole,5dae01pKNjRQtgOeAkFzPY
2,3,Anton Karas,Third Man Theme,1950,Anton Karas,1jWbae7ESwl2fWHGwcLvLk
3,4,Gary and Bing Crosby,Sam's Song,1950,Gary,1v4OqleXWTeORpLcPtAxyH
4,5,Gary and Bing Crosby,Simple Melody,1950,Gary,33FdwvfiSBRbE2AQ4UehaR
...,...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,Morgan Wallen,5OELUCYgOHKFAvCERnAvfS
6676,97,Luke Combs,Lovin' On You,2020,Luke Combs,0nYvjcSlCgjcwogQAwIwNp
6677,98,Moneybagg Yo,Said Sum,2020,Moneybagg Yo,3sKz6Sd72K0ofPWcJPPk6H
6678,99,H.E.R. Featuring YG,Slide,2020,H.E.R.,6MO2bfLHKykUgCChFdw91H


### Genius data into json files

In [10]:
# lyricsgenius settings
genius = lyricsgenius.Genius(genius_token, timeout = 30, retries = 3)
# Parallel lists: entry i of both belongs to row i of the combined table
songs = list(all_time_billboard_wrap_up.loc[:, 'Song Title'])
artists = list(all_time_billboard_wrap_up.loc[:,'Split Names'])
# Build the folder path with the platform's separator instead of hard-coded
# backslashes. The trailing empty component keeps the trailing separator,
# because file names are appended to this string directly.
jsons_path = os.path.join(wd, 'jsons', '')
# Writing the first JSON file would fail if the folder did not exist yet
os.makedirs(jsons_path, exist_ok = True)

# Saving lyrics to jsons function
def return_lyrics_json(song, artist, json_name):
    """Search Genius for a song and save its lyrics as a JSON file.

    Parameters
    ----------
    song : song title passed to ``genius.search_song``.
    artist : artist name passed to ``genius.search_song``.
    json_name : file name appended to ``jsons_path``.

    The file contains a single object ``{"lyrics": <text>}``.

    Raises
    ------
    AttributeError
        When the search result has no ``lyrics`` attribute (the calling loop
        treats this as "no lyrics found").
    """
    song_lyrics = genius.search_song(song, artist)
    # Read the attribute before opening the file, so a failed lookup raises
    # without leaving an empty, half-written JSON file behind.
    lyrics = song_lyrics.lyrics
    with open(jsons_path + json_name, 'w', encoding = 'UTF-8') as f:
        json.dump({'lyrics': lyrics}, f, ensure_ascii = False, indent = 4)

# No lyrics function (only instrumental songs)
def no_lyrics(json_name):
    """Write a placeholder JSON file whose 'lyrics' value is an empty string.

    The file is created at ``jsons_path + json_name``.
    """
    target = jsons_path + json_name
    payload = json.dumps({'lyrics': ""}, ensure_ascii = False, indent = 4)
    with open(target, 'w', encoding = 'UTF-8') as f:
        f.write(payload)

In [11]:
# Applying return_lyrics_json and no_lyrics functions on lyrics:
def json_data():
    """Save one '<row index>.json' lyrics file per song.

    For every position ``i`` in the notebook-level ``songs`` / ``artists``
    lists the lyrics are fetched and written with ``return_lyrics_json``. When
    that raises AttributeError, an empty-lyrics placeholder is written with
    ``no_lyrics`` instead and ``i`` is recorded.

    Returns
    -------
    list of int
        Indices of the songs for which the placeholder was written.
    """
    error_list = []
    for i, (song, artist) in enumerate(zip(songs, artists)):
        json_name = str(i) + '.json'
        try:
            return_lyrics_json(song, artist, json_name)
        except AttributeError:
            no_lyrics(json_name)
            error_list.append(i)
    # Previously the collected indices were discarded when the function ended
    return error_list
# json_data()

In [12]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# err based on error_list: hard-coded row positions of all_time_billboard_wrap_up
# (it is used with .iloc further down) that are excluded from the final table.
# The values are frozen here, not recomputed, so they are only valid for the
# row order of the run they were taken from.
# Fixed 6140 Beyonce 7/11 (date format in csv)
err = [12, 23, 25, 53, 60, 74, 84, 100, 101, 103, 111, 116, 141, 160, 173, 190, 196, 210, 233, 239, 266, 277, 334, 365, 380, 385, 386, 401, 428, 439, 440, 448, 501, 515, 519, 537, 554, 565, 604, 616, 619, 633, 690, 694, 696, 708, 723, 771, 774, 784, 800, 816, 832, 838, 853, 862, 866, 871, 906, 920, 937, 969, 991, 1039, 1111, 1126, 1150, 1180, 1211, 1245, 1277, 1381, 1387, 1397, 1400, 1422, 1454, 1537, 1542, 1569, 1598, 1613, 1615, 1668, 1801, 1807, 1822, 1850, 1873, 1895, 1940, 1965, 1982, 2027, 2095, 2144, 2167, 2177, 2189, 2208, 2269, 2350, 2370, 2400, 2528, 2559, 2603, 2633, 2690, 2738, 2791, 2835, 2843, 3140, 3334, 3460, 3742, 3908, 3952, 4121, 4169, 4171, 4180, 4208, 4244, 4245, 4280, 4351, 4361, 4387, 4518, 4579, 4760, 4777, 4823, 5134, 5235]

In [13]:
# Skipping error_list - songs without lyrics or unable to find
# Drop exactly the rows at the positions listed in `err`. The earlier
# concat + drop_duplicates(keep=False) idiom would also delete every copy of
# any row that happens to be duplicated in the table itself.
rows_without_lyrics = all_time_billboard_wrap_up.index[err]
billboard_songs = all_time_billboard_wrap_up.drop(index=rows_without_lyrics)
# Keyword form: the positional axis argument drop('Split Names', 1) is
# rejected by pandas 2.x.
billboard_songs = billboard_songs.drop(columns='Split Names')
display(billboard_songs)

Unnamed: 0,Position,Artist,Song Title,Year,Spotify ID
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,1fhLgOJgIIZEsWWffk8ljs
1,2,Nat King Cole,Mona Lisa,1950,5dae01pKNjRQtgOeAkFzPY
2,3,Anton Karas,Third Man Theme,1950,1jWbae7ESwl2fWHGwcLvLk
3,4,Gary and Bing Crosby,Sam's Song,1950,1v4OqleXWTeORpLcPtAxyH
4,5,Gary and Bing Crosby,Simple Melody,1950,33FdwvfiSBRbE2AQ4UehaR
...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,5OELUCYgOHKFAvCERnAvfS
6676,97,Luke Combs,Lovin' On You,2020,0nYvjcSlCgjcwogQAwIwNp
6677,98,Moneybagg Yo,Said Sum,2020,3sKz6Sd72K0ofPWcJPPk6H
6678,99,H.E.R. Featuring YG,Slide,2020,6MO2bfLHKykUgCChFdw91H


In [14]:
# Deleting empty json files
# File names ('<index>.json') for every index listed in err
json_delete = [f'{j}.json' for j in err]
# Names of all files currently present in the jsons folder
all_json_files = os.listdir(jsons_path)
#for x in all_json_files:
#    if os.path.getsize(jsons_path + '\\' + x + '.json' > 10000) or x in error_list: #err
#        os.remove(x)
#
# Deleting files which size is bigger than 10 KB