# Data preprocessing

In [1]:
# Import libraries
import pandas as pd
import os
import csv
import billboard
import lyricsgenius
import json
import re

In [2]:
# Loading data and adding year columns and wrapping up billboard songs (1950-2015)
cwd = os.getcwd()
data_path = os.chdir(cwd + "\data")
all_csv_files = os.listdir(data_path)
songs_50_15 = []

for filename in all_csv_files:
    songs = pd.read_csv(filename, header = 0, sep = ',')
    year = os.path.basename(filename)[:-4]
    songs["Year"] = year
    songs_50_15.append(songs)

In [3]:
# Downloading data from years 2016-2020 from billboard library
playlist = 'hot-100-songs'
dates = [2016, 2017, 2018, 2019, 2020]
songs_billboard = []
for y in dates:
    songs = billboard.ChartData(playlist, date = None, year = y, fetch = True, timeout = 25)
    songs_billboard.append(songs)

In [4]:
# Preprocessing data from years 2016-2020
song_titles = []
artists = []
for year in songs_billboard:
    for playlist in year:
        song = str(playlist).split(" by ")
        song_titles.append(song[0].strip("\'"))
        artists.append(song[1])
        
# Adding missed song on 87th position in 2016 chart
song_titles.insert(86, 'All the Way Up')
artists.insert(86, 'Fat Joe, Remy Ma and Jay-Z featuring French Montana and Infared')

# Splitting list into same-length elements lists
def list_split(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

split_artists = list(list_split(artists, 100))
split_song_titles = list(list_split(song_titles, 100))

# Creating list of dictionaries for each year to convert them into pandas DataFrame
def dicts_of_dates(lst_of_dates):
    songs = []
    for i in range(len(lst_of_dates)):
        ranger = list(range(1, 101))
        year = lst_of_dates[i]
        dict_ = {'Position': ranger, 'Artist': split_artists[i], 'Song Title': split_song_titles[i], 'Year': [dates[i] for year in range(1, 101)]}
        as_pandas_df = pd.DataFrame(dict_)
        songs.append(as_pandas_df)
    return songs

songs_16_20 = dicts_of_dates(dates)

all_songs = songs_50_15 + songs_16_20
all_time_billboard_wrap_up = pd.concat(all_songs)
all_time_billboard_wrap_up.reset_index(drop = True, inplace = True) # reset indexing
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950
1,2,Nat King Cole,Mona Lisa,1950
2,3,Anton Karas,Third Man Theme,1950
3,4,Gary and Bing Crosby,Sam's Song,1950
4,5,Gary and Bing Crosby,Simple Melody,1950
...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020
6676,97,Luke Combs,Lovin' On You,2020
6677,98,Moneybagg Yo,Said Sum,2020
6678,99,H.E.R. Featuring YG,Slide,2020


In [5]:
# Pandas dataframe nans fixing
print(all_time_billboard_wrap_up['Position'].isna().any())
print(all_time_billboard_wrap_up['Artist'].isna().any())
print(all_time_billboard_wrap_up['Song Title'].isna().any())
print(all_time_billboard_wrap_up['Year'].isna().any())
print(all_time_billboard_wrap_up['Artist'].isna().any() == all_time_billboard_wrap_up['Song Title'].isna().any())
# results to dataframe

False
True
True
False
True


In [6]:
# Deleting nans from pandas dataframe
nans = all_time_billboard_wrap_up[all_time_billboard_wrap_up['Artist'].isna()]
nans[['Position', 'Artist', 'Song Title']] = nans['Position'].str.split(",", 2).to_list()
nans['Artist'] = nans.Artist.str.replace('"', "")
nans['Song Title'] = nans['Song Title'].str.replace('"', "")

# Automation!!!

# Manual editing
#pd.set_option("display.max_rows", None, "display.max_columns", None)

# 1071
nans.loc[1071,:]['Song Title'] = "Shangri-La"
nans.loc[1071,:]['Artist'] = "Robert Maxwell, His Harp and Orch."
# 1176
nans.loc[1176,:]['Song Title'] = "I'm A Fool"
nans.loc[1176,:]['Artist'] = "Dino, Desi and Billy"
# 1370
nans.loc[1370,:]['Song Title'] = "I Dig Rock And Roll Music"
nans.loc[1370,:]['Artist'] = "Peter, Paul and Mary"
# 1524
nans.loc[1524,:]['Song Title'] = "You've Made Me So Very Happy"
nans.loc[1524,:]['Artist'] = "Blood, Sweat and Tears"
# 1545
nans.loc[1545,:]['Song Title'] = "Black Pearl"
nans.loc[1545,:]['Artist'] = "Sonny Charles and The Checkmates, Ltd."
# 1721
nans.loc[1721,:]['Song Title'] = "Don't Pull Your Love"
nans.loc[1721,:]['Artist'] = "Hamilton, Joe Frank and Reynolds"
# 2138
nans.loc[2138,:]['Song Title'] = "That's The Way Of The World"
nans.loc[2138,:]['Artist'] = "Earth, Wind and Fire"
# 2238
nans.loc[2238,:]['Song Title'] = "Sing A Song"
nans.loc[2238,:]['Artist'] = "Earth, Wind and Fire"
# 2258
nans.loc[2258,:]['Song Title'] = "Getaway"
nans.loc[2258,:]['Artist'] = "Earth, Wind and Fire"
# 2326
nans.loc[2326,:]['Song Title'] = "Just A Song Before I Go"
nans.loc[2326,:]['Artist'] = "Crosby, Stills and Nash"
#2443
nans.loc[2443,:]['Artist'] = 'Evelyn "Champagne" King'
# 2456
nans.loc[2456,:]['Song Title'] = "Serpentine Fire"
nans.loc[2456,:]['Artist'] = "Earth, Wind and Fire"
# 2517
nans.loc[2517,:]['Song Title'] = "The Love Has Gone"
nans.loc[2517,:]['Artist'] = "Earth, Wind and Fire"
# 2536
nans.loc[2536,:]['Song Title'] = "Boogie Wonderland"
nans.loc[2536,:]['Artist'] = "Earth, Wind and Fire"
# 2557
nans.loc[2557,:]['Song Title'] = "September"
nans.loc[2557,:]['Artist'] = "Earth, Wind and Fire"
# 2812
nans.loc[2812,:]['Song Title'] = "Let's Groove"
nans.loc[2812,:]['Artist'] = "Earth, Wind and Fire"
# 2979
nans.loc[2979,:]['Song Title'] = "Fall In Love With Me"
nans.loc[2979,:]['Artist'] = "Earth, Wind and Fire"
# 3987
nans.loc[3987,:]['Song Title'] = "All For Love"
nans.loc[3987,:]['Artist'] = "Bryan Adams, Rod Stewart and Sting"
# 4019
nans.loc[4019,:]['Song Title'] = "Because The Night"
nans.loc[4019,:]['Artist'] = "10,000 Maniacs"
# 4703
nans.loc[4703,:]['Song Title'] = "Lady Marmalade"
nans.loc[4703,:]['Artist'] = "Christina Aguilera, Lil' Kim, Mya and Pink"
# 4797
nans.loc[4797,:]['Song Title'] = "I Need a Girl..."
nans.loc[4797,:]['Artist'] = "P. Diddy and Ginuwine feat. Loon, Mario Winans and Tammy Ruggeri"
# 4823
nans.loc[4823,:]['Song Title'] = "Down 4 U"
nans.loc[4823,:]['Artist'] = "Irv Gotti Presents The Inc. feat. Ja Rule, Ashanti, Charli Baltimore and Vita"
# 4832
nans.loc[4832,:]['Song Title'] = "Hey Ma"
nans.loc[4832,:]['Artist'] = "Cam'ron feat. Juelz Santana, Freekey Zekey and Toya"
# 4892
nans.loc[4892,:]['Song Title'] = "Shake Ya Tailfeather"
nans.loc[4892,:]['Artist'] = "Nelly, P. Diddy and Murphy Lee"
# 4919
nans.loc[4919,:]['Song Title'] = "Air Force Ones"
nans.loc[4919,:]['Artist'] = "Nelly feat. Kyjuan, Ali and Murphy Lee"
# 5031
nans.loc[5031,:]['Song Title'] = "I Like That"
nans.loc[5031,:]['Artist'] = "Houston feat. Chingy, Nate Dogg and I-20"
# 5134
nans.loc[5134,:]['Song Title'] = "Sugar (Gimme Some)"
nans.loc[5134,:]['Artist'] = "Trick Daddy feat. Ludacris, Lil' Kim and Cee-Lo"
# 5191
nans.loc[5191,:]['Song Title'] = "Grillz"
nans.loc[5191,:]['Artist'] = "Nelly feat. Paul Wall, Ali and Gipp"
# 5264
nans.loc[5264,:]['Song Title'] = "I Think They Like Me"
nans.loc[5264,:]['Artist'] = "Dem Franchize Boyz feat. Jermaine Dupri, Da Brat and Bow Wow"
# 5435
nans.loc[5435,:]['Song Title'] = "Sweetest Girl (Dollar Bill)"
nans.loc[5435,:]['Artist'] = "Wyclef Jean feat. Akon, Lil Wayne and Niia"
# 5449
nans.loc[5449,:]['Song Title'] = "Lolli Lolli (Pop That Body)"
nans.loc[5449,:]['Artist'] = "Three 6 Mafia feat. Project Pat, Young D and Superpower"
# 5510
nans.loc[5510,:]['Song Title'] = "Run This Town"
nans.loc[5510,:]['Artist'] = "Jay-Z, Rihanna and Kanye West"
# 5526
nans.loc[5526,:]['Song Title'] = "Crack A Bottle"
nans.loc[5526,:]['Artist'] = "Eminem, Dr. Dre and 50 Cent"
# 5567
nans.loc[5567,:]['Song Title'] = "Forever"
nans.loc[5567,:]['Artist'] = "Drake feat. Kanye West, Lil Wayne and Eminem"
# 5650
nans.loc[5650,:]['Song Title'] = "Forever"
nans.loc[5650,:]['Artist'] = "Drake feat. Kanye West, Lil Wayne and Eminem"
# 5658
nans.loc[5658,:]['Song Title'] = "All I Do Is Win"
nans.loc[5658,:]['Artist'] = "DJ Khaled feat. T-Pain, Ludacris, Snoop Dogg and Rick Ross"
# 5659
nans.loc[5659,:]['Song Title'] = "I Made It (Cash Money Heroes)"
nans.loc[5659,:]['Artist'] = "Kevin Rudolf feat. Birdman, Jay Sean, and Lil Wayne"
display(nans)
# Updating allall_time_billboard_wrap_up dataframe by nans


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nans[['Position', 'Artist', 'Song Title']] = nans['Position'].str.split(",", 2).to_list()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

Unnamed: 0,Position,Artist,Song Title,Year
340,61,Tune Weavers,"Happy, Happy Birthday Baby",1957
982,3,Louis Armstrong,"Hello, Dolly!",1964
983,4,Roy Orbison,"Oh, Pretty Woman",1964
1026,47,Major Lance,"Um, Um, Um, Um, Um, Um",1964
1071,92,"Robert Maxwell, His Harp and Orch.",Shangri-La,1964
...,...,...,...,...
5582,3,Train,"Hey, Soul Sister",2010
5650,71,"Drake feat. Kanye West, Lil Wayne and Eminem",Forever,2010
5658,79,"DJ Khaled feat. T-Pain, Ludacris, Snoop Dogg a...",All I Do Is Win,2010
5659,80,"Kevin Rudolf feat. Birdman, Jay Sean, and Lil ...",I Made It (Cash Money Heroes),2010


# Genius lyrics

In [7]:
# Spliting authors by 'and' and 'Featuring'
artists = list(all_time_billboard_wrap_up.loc[:,"Artist"])
split_artists = []
for artist in artists:
    if any(re.findall(r'and|Featuring', str(artist))):
        result = re.split(r'and|Featuring', str(artist))[0].strip()
        split_artists.append(result)
    else:
        split_artists.append(artist)

all_time_billboard_wrap_up["Split names"] = split_artists
display(all_time_billboard_wrap_up)

Unnamed: 0,Position,Artist,Song Title,Year,Split names
0,1,Gordon Jenkins and The Weavers,Goodnight Irene,1950,Gordon Jenkins
1,2,Nat King Cole,Mona Lisa,1950,Nat King Cole
2,3,Anton Karas,Third Man Theme,1950,Anton Karas
3,4,Gary and Bing Crosby,Sam's Song,1950,Gary
4,5,Gary and Bing Crosby,Simple Melody,1950,Gary
...,...,...,...,...,...
6675,96,Morgan Wallen,More Than My Hometown,2020,Morgan Wallen
6676,97,Luke Combs,Lovin' On You,2020,Luke Combs
6677,98,Moneybagg Yo,Said Sum,2020,Moneybagg Yo
6678,99,H.E.R. Featuring YG,Slide,2020,H.E.R.


In [8]:
'''
# lyricsgenius settings
genius_token = os.environ.get('genius_token')
genius = lyricsgenius.Genius(genius_token, timeout = 30, retries = 3)
songs = list(all_time_billboard_wrap_up.loc[:, "Song Title"])
artists = list(all_time_billboard_wrap_up.loc[:,"Split names"])

# Saving lyrics to jsons function
def return_lyrics_json(song, artist, json_name):
    song_lyrics = genius.search_song(song, artist)
    with open(json_name, 'w', encoding = 'UTF-8') as f:
        json.dump({'lyrics': song_lyrics.lyrics}, f, ensure_ascii = False, indent = 4)

# No lyrics function (only instrumental songs)
def no_lyrics(json_name):
    with open(json_name, 'w', encoding = 'UTF-8') as f:
        json.dump({'lyrics': ""}, f, ensure_ascii = False, indent = 4)
        
error_list = []

for i in range(100):
    try:
        return_lyrics_json(songs[i], artists[i], str(i)+'.json')
    except AttributeError:
        no_lyrics(str(i)+'.json')
        error_list.append(i)
'''

'\n# lyricsgenius settings\ngenius_token = os.environ.get(\'genius_token\')\ngenius = lyricsgenius.Genius(genius_token, timeout = 30, retries = 3)\nsongs = list(all_time_billboard_wrap_up.loc[:, "Song Title"])\nartists = list(all_time_billboard_wrap_up.loc[:,"Split names"])\n\n# Saving lyrics to jsons function\ndef return_lyrics_json(song, artist, json_name):\n    song_lyrics = genius.search_song(song, artist)\n    with open(json_name, \'w\', encoding = \'UTF-8\') as f:\n        json.dump({\'lyrics\': song_lyrics.lyrics}, f, ensure_ascii = False, indent = 4)\n\n# No lyrics function (only instrumental songs)\ndef no_lyrics(json_name):\n    with open(json_name, \'w\', encoding = \'UTF-8\') as f:\n        json.dump({\'lyrics\': ""}, f, ensure_ascii = False, indent = 4)\n        \nerror_list = []\n\nfor i in range(100):\n    try:\n        return_lyrics_json(songs[i], artists[i], str(i)+\'.json\')\n    except AttributeError:\n        no_lyrics(str(i)+\'.json\')\n        error_list.append(i)