In [90]:
# Importing the necessary library's

import pandas as pd
import base64
import os
from requests import get, post
import json
import time
import numpy as np

pd.set_option("display.max_rows", 50)
pd.set_option("display.expand_frame_repr", True)
pd.set_option('display.width', 1000)

Read in chart data, then isolate to only United States

In [91]:
# Reading the dataset and creating a dataframe
raw_data = pd.read_csv('charts.csv',
                       parse_dates=['date'])
raw_data = raw_data[raw_data['region'] == "United States"]
print(raw_data.isna().sum())
#stream numbers are missing only

title          0
rank           0
date           0
artist         0
url            0
region         0
chart          0
trend          0
streams    90873
dtype: int64


1. sort dataframe on date
2. drop any songs that show up on the same date, chart.. there are 57 of these and we are only keeping the first occurence
3. create new column 'song_num_days' which is a count of how many days a song is on either the top 50 playlist or the top 200 playlist
4. create new column 'num_days_cumulative' which is a cumulative count of how many days a song has been on each chart so far
5. create new column 'predict_days_stay'chart' which is how many more days the song will stay on that chart

In [92]:
def formatartist(x):
  return x.strip().upper().strip('#')


raw_data.sort_values(by='date', inplace=True)
raw_data.drop_duplicates(subset=['chart','title', 'artist', 'date'], inplace=True, keep='first')
raw_data['artist'] = raw_data['artist'].apply(lambda x: formatartist(x))


raw_data['predict_days_stay'] = raw_data.groupby(['chart','title','artist'])['date'].transform('nunique')
raw_data['num_days_cumulative'] = raw_data.groupby(['chart','title','artist'])['date'].cumcount()
raw_data['predict_days_stay'] = raw_data.apply(lambda row: row['predict_days_stay'] - row['num_days_cumulative'], axis = 1)

raw_data['Num_artists'] = raw_data.apply(lambda row: len(row.artist.split(',')), axis = 1)

raw_data

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams,predict_days_stay,num_days_cumulative,Num_artists
8246,Bad and Boujee (feat. Lil Uzi Vert),1,2017-01-01,MIGOS,https://open.spotify.com/track/4Km5HrUvYTaSUfi...,United States,top200,SAME_POSITION,1371493.0,429,0,1
8406,Wicked,160,2017-01-01,FUTURE,https://open.spotify.com/track/6BbINUfGabVyiNF...,United States,top200,NEW_ENTRY,164503.0,1,0,1
8407,Black Barbies,161,2017-01-01,"NICKI MINAJ, MIKE WILL MADE-IT",https://open.spotify.com/track/3y9cCbnBn0zjkJa...,United States,top200,MOVE_DOWN,163711.0,13,0,2
8408,Cut It (feat. Young Dolph),162,2017-01-01,O.T. GENASIS,https://open.spotify.com/track/376KnY4TrgBITxj...,United States,top200,MOVE_UP,163667.0,1,0,1
8409,What They Want,163,2017-01-01,RUSS,https://open.spotify.com/track/3pndPhlQWjuSoXh...,United States,top200,MOVE_DOWN,163498.0,338,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
22692001,The Spins,91,2021-12-31,MAC MILLER,https://open.spotify.com/track/51pshtuYkgUQnt5...,United States,top200,MOVE_UP,274426.0,1,306,1
22692002,Dos Oruguitas,92,2021-12-31,SEBASTIAN YATRA,https://open.spotify.com/track/5rohUzwEoRsUvAA...,United States,top200,MOVE_UP,273682.0,1,3,1
22692003,505,93,2021-12-31,ARCTIC MONKEYS,https://open.spotify.com/track/58ge6dfP91o9oXM...,United States,top200,MOVE_DOWN,272088.0,1,328,1
22691990,Streets,80,2021-12-31,DOJA CAT,https://open.spotify.com/track/60ynsPSSKe6O3sf...,United States,top200,MOVE_UP,287314.0,1,352,1


Next we will merge in the track data to the final dataset
I am merging based on track [title, artist]

In [186]:
tracks = pd.read_csv('charts_US_attributes_missing.csv')
track_columns = [ 'title', 
                 'url',
                 'artist', 
                 'danceability', 
                 'energy', 
                 'key', 
                 'loudness',
                 'mode', 
                 'speechiness', 
                 'acousticness', 
                 'instrumentalness', 
                 'liveness',
                 'valence', 
                 'tempo', 
                 'duration_ms', 
                 'time_signature']

tracks = tracks[track_columns]
#tracks['artist'] = tracks['artist'].apply(lambda x: formatartist(x))
print(tracks.isna().sum())
#tracks.drop_duplicates(subset=['url'], inplace=True)
print("missing track data we have: ",len(tracks))




title               0
url                 0
artist              0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64
missing track data we have:  1977


In [187]:
tracks2 = pd.read_csv('charts_US_attributes.csv')


tracks2 = tracks2[track_columns]
all_tracks = pd.concat([tracks2,tracks])
len(all_tracks)
all_tracks['artist'] = all_tracks['artist'].apply(lambda x: formatartist(x))
print(all_tracks.isna().sum())
all_tracks.drop_duplicates(subset=['url'], inplace=True)
print("track data we have: ",len(all_tracks))
all_tracks


title               0
url                 0
artist              0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64
track data we have:  14649


Unnamed: 0,title,url,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Bad and Boujee (feat. Lil Uzi Vert),https://open.spotify.com/track/4Km5HrUvYTaSUfi...,MIGOS,0.926,0.666,11,-5.314,1,0.2440,0.0611,0.000000,0.1230,0.168,127.079,343150,4
1,Fake Love,https://open.spotify.com/track/343YBumqHu19cGo...,DRAKE,0.928,0.481,9,-9.350,0,0.2870,0.1050,0.000000,0.1760,0.613,134.007,210937,4
2,Starboy,https://open.spotify.com/track/5aAx2yezTd8zXrk...,"THE WEEKND, DAFT PUNK",0.681,0.594,7,-7.028,1,0.2820,0.1650,0.000003,0.1340,0.535,186.054,230453,4
3,Closer,https://open.spotify.com/track/7BKLCZ1jbUBVqRi...,"THE CHAINSMOKERS, HALSEY",0.748,0.524,8,-5.599,1,0.0338,0.4140,0.000000,0.1110,0.661,95.010,244960,4
4,Black Beatles,https://open.spotify.com/track/6fujklziTHa8uoM...,"RAE SREMMURD, GUCCI MANE",0.794,0.632,0,-6.163,1,0.0649,0.1420,0.000000,0.1280,0.355,145.926,291893,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1972,God Rest Ye Merry Gentlemen,https://open.spotify.com/track/06IdUbnrpRZ0IKI...,BING CROSBY,0.450,0.202,0,-9.169,0,0.0365,0.9830,0.009130,0.1220,0.721,88.625,139333,5
1973,Want me!,https://open.spotify.com/track/0vvngjbg4Ilsm7f...,CL4PERS,0.753,0.675,9,-8.505,1,0.2450,0.5590,0.006210,0.1350,0.491,150.138,70078,4
1974,Winter Wonderland,https://open.spotify.com/track/1yxcKrEcxt4O03I...,TONY BENNETT,0.453,0.453,3,-9.354,1,0.0395,0.8710,0.000000,0.2180,0.586,116.098,133213,4
1975,We Wish You A Merry Christmas,https://open.spotify.com/track/0Xrev0e2wVRSkq8...,"JOHN DENVER, THE MUPPETS",0.677,0.353,9,-15.636,1,0.0716,0.8140,0.000000,0.0904,0.777,84.535,65347,1


In [188]:
#print(raw_data.columns)
#print(tracks.columns)

final_data = pd.merge(raw_data, all_tracks, 
                      on=['url', 'title', 'artist'],
                      how='left')
print("raw:   ",  len(raw_data))
print("final: " , len(final_data))  
print("diff:  " , len(raw_data) - len(final_data))  

print(final_data.columns)

final_data.sort_values(by='date', inplace=True)

print(final_data.isna().sum())

missing = final_data[final_data.danceability.isna()]
print(missing.columns)
#missing = missing[['title_x', 'artist_x', 'url']]
print(len(missing))
missing.drop_duplicates(subset=['url'],inplace= True)
print("dropping ", len(missing), " tracks")
#missing.rename(columns={'title_x': 'title', 
#                        'artist_x': 'artist', }, inplace=True )
length = len(final_data)
final_data.dropna(subset=['danceability'], inplace=True)
print("dropping ", length - len(final_data), " rows")
final_data.to_csv("final_data.csv", index=False)




raw:    455010
final:  455010
diff:   0
Index(['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend', 'streams', 'predict_days_stay', 'num_days_cumulative', 'Num_artists', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'], dtype='object')
title                      0
rank                       0
date                       0
artist                     0
url                        0
region                     0
chart                      0
trend                      0
streams                90863
predict_days_stay          0
num_days_cumulative        0
Num_artists                0
danceability             295
energy                   295
key                      295
loudness                 295
mode                     295
speechiness              295
acousticness             295
instrumentalness         295
liveness                 295
valence           

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


dropping  295  rows


next we will merge the artists data, queries from spotify, into the main dataset

In [189]:
artists_columns = [ 
                    'artist_id',
                    'artist_followers',
                    'genres',
                    'artist_popularity',
                    'artist_name',
                    'type',
                    'album_names',
                    'release_dates',
                    'artist_total_tracks'
                  ]
artists = pd.read_csv('artist_data.csv', converters={'genres': pd.eval})
artists.rename(columns={'id': 'artist_id', 
                        'followers': 'artist_followers', 
                        'popularity': 'artist_popularity', 
                        'name': 'artist', 
                        'total_tracks':'artist_total_tracks'}, inplace=True )


artists['artist'] = artists['artist'].apply(lambda x: formatartist(x))
artists.drop_duplicates(subset='artist', inplace=True)
print(artists.columns)


Index(['artist_id', 'artist_followers', 'genres', 'artist_popularity', 'artist', 'type', 'album_names', 'release_dates', 'artist_total_tracks'], dtype='object')


In [190]:
print("total needed: ", len(final_data))

singleartists = final_data[final_data['Num_artists'] == 1]
singleartists['artist'] = singleartists['artist'].str.upper()
print("single needed: ", len(singleartists))

multipleartists = final_data[final_data['Num_artists'] > 1]
print("multiple needed: ", len(multipleartists))


final_single = pd.merge(singleartists, artists, 
                        on='artist', 
                        how='left')
print("single found: ", len(final_single))
print("single missing: ",  len(singleartists) - len(final_single))
length = len(final_single[final_single.artist_id.isna()])


missing = final_single[final_single.artist_id.isna()]
missing = missing[['artist']]
missing.drop_duplicates(inplace= True)
print("dropping ",len(missing), " single artists")
print("dropping ",length, " rows")

final_single.dropna(subset = ['artist_id'], inplace=True)
final_single.to_csv("final_data.csv", index=False)


total needed:  454715


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  singleartists['artist'] = singleartists['artist'].str.upper()


single needed:  381829
multiple needed:  72886
single found:  381829
single missing:  0
dropping  248  single artists
dropping  3605  rows


artist    378224
Name: type, dtype: int64

In [185]:
multipleartists
failcount=0
multipleartists.groupby('artist').size()
m_artists = {}
for name, group in multipleartists.groupby('artist'):
    loop = name.split(',')
    info = {}
    ids = []
    followers=[]
    genres = []
    popularity = []
    album_names=[]
    release_dates= []
    total_tracks = []
    for a in loop:
        a=a.strip()
        a=a.strip('#')
        data = artists[artists.artist == a]
        if (len(data == 1)):
            ids.append(data.artist_id.values[0])
            followers.append(data.artist_followers.values[0])
            for g in data.genres.values[0]:
                genres.append(g)
            popularity.append(data.artist_popularity.values[0])
            album_names.append(data.album_names.values[0])
            release_dates.append(data.release_dates.values[0])
            total_tracks.append(data.artist_total_tracks.values[0])
        elif len(data) > 1:
            print("more than 1 artist found")
        else:
            failcount = failcount +1
            #print("fail: ",a)
    info['artist_id'] = ids
    info['artist'] = name
    info['artist_name'] = name
    info['artist_followers'] = sum(followers)
    if len(popularity) > 0:
        info['artist_popularity'] = sum(popularity) / len(popularity) 
    else:
        info['artist_popularity'] = sum(popularity) 
    info['genres'] = list(set(genres))
    info['album_names']=album_names
    info['release_dates'] = release_dates
    info['type'] = 'artist'
    info ['artist_total_tracks'] =sum(total_tracks)
    m_artists[name] = info

        
 
df2 = pd.DataFrame.from_dict(m_artists, orient='index', columns=artists_columns)
df2.rename(columns={'artist_name': 'artist'}, inplace=True )

df2.reset_index(drop=True, inplace=True)

final_multiple = pd.merge(multipleartists, df2, 
                        on='artist', 
                        how='left')

print("mult found: ", len(final_multiple))
print("mult missing: ",  len(multipleartists) - len(final_multiple))

length = len(final_multiple[final_multiple.artist_id.isna()])  
print(length)
print(failcount)  
print(final_multiple.isna().sum())
FINAL_df = pd.concat([final_single,final_multiple])
FINAL_df

mult found:  72886
mult missing:  0
0
360
title                      0
rank                       0
date                       0
artist                     0
url                        0
region                     0
chart                      0
trend                      0
streams                17799
predict_days_stay          0
num_days_cumulative        0
Num_artists                0
danceability               0
energy                     0
key                        0
loudness                   0
mode                       0
speechiness                0
acousticness               0
instrumentalness           0
liveness                   0
valence                    0
tempo                      0
duration_ms                0
time_signature             0
artist_id                  0
artist_followers           0
genres                     0
artist_popularity          0
type                       0
album_names                0
release_dates              0
artist_total_tracks        0
d

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams,predict_days_stay,...,duration_ms,time_signature,artist_id,artist_followers,genres,artist_popularity,type,album_names,release_dates,artist_total_tracks
0,Bad and Boujee (feat. Lil Uzi Vert),1,2017-01-01,MIGOS,https://open.spotify.com/track/4Km5HrUvYTaSUfi...,United States,top200,SAME_POSITION,1371493.0,429,...,343150.0,4.0,6oMuImdp5ZcFhWP0ESe6mG,13504155.0,"[atl hip hop, hip hop, pop rap, rap, trap]",76.0,artist,"['Culture III (Deluxe)', 'Culture III', 'Cultu...","['2021-06-11', '2021-06-11', '2018-01-26', '20...",243.0
1,You Can Call Me Al,125,2017-01-01,PAUL SIMON,https://open.spotify.com/track/0qxYx4F3vm1AOnf...,United States,top200,NEW_ENTRY,183669.0,1,...,280000.0,4.0,2CvCyf1gEVhI0mX6aFXmVI,2192625.0,"[classic rock, folk, folk rock, mellow gold, p...",67.0,artist,"['Seven Psalms', 'In the Blue Light', 'Gracela...","['2023-05-19', '2018-09-07', '2018-06-01', '20...",309.0
2,Fake Love,2,2017-01-01,DRAKE,https://open.spotify.com/track/343YBumqHu19cGo...,United States,top200,SAME_POSITION,1180074.0,455,...,210937.0,4.0,3TVXtAsR1Inumwj472S9r4,80854202.0,"[canadian hip hop, canadian pop, hip hop, pop ...",97.0,artist,"['For All The Dogs', 'Her Loss', 'Honestly, Ne...","['2023-10-06', '2022-11-04', '2022-06-17', '20...",279.0
4,Caroline,8,2017-01-01,AMINÉ,https://open.spotify.com/track/7FB8l7UA1HKqnuS...,United States,top200,MOVE_DOWN,714839.0,389,...,209640.0,4.0,3Gm5F95VdRxW3mqCn8RPBJ,1892909.0,"[pop rap, portland hip hop, rap, underground h...",68.0,artist,"['KAYTRAMINÉ (Instrumentals)', 'KAYTRAMINÉ', '...","['2023-09-15', '2023-05-19', '2021-11-05', '20...",97.0
5,Bounce Back,10,2017-01-01,BIG SEAN,https://open.spotify.com/track/0SGkqnVQo9KPytS...,United States,top200,MOVE_DOWN,682688.0,468,...,222360.0,4.0,0c173mlxpT3dSFRgMO8XPh,11110118.0,"[detroit hip hop, hip hop, pop rap, r&b, rap, ...",75.0,artist,"['Detroit 2 (Deluxe)', 'Detroit', 'Finally Fam...","['2022-09-19', '2022-09-05', '2021-06-25', '20...",192.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72881,Promiscuous,153,2021-12-31,"NELLY FURTADO, TIMBALAND",https://open.spotify.com/track/2gam98EZKrF9XuO...,United States,top200,NEW_ENTRY,219570.0,1,...,242293.0,4.0,"[2jw70GZXlAI8QzWeY2bgRc, 5Y5TRrQiqgUO4S36tzjIRZ]",6218559.0,"[pop rap, dance pop, canadian latin, canadian ...",76.5,artist,"[['Loose (Expanded Edition)', 'The Ride', 'The...","[['2021-06-04', '2017-03-31', '2012-01-01', '2...",277.0
72882,The Motto,149,2021-12-31,"TIËSTO, AVA MAX",https://open.spotify.com/track/18asYwWugKjjsih...,United States,top200,MOVE_UP,223199.0,1,...,164819.0,4.0,"[2o5jDhtHVPhrJdv3cEQ99Z, 4npEfmQ6YuiwW1GpUmaq3F]",13221374.0,"[pop dance, slap house, trance, pop, brostep, ...",80.0,artist,"[['DRIVE Continuous DJ Mix', 'DRIVE', 'The Lon...","[['2023-04-21', '2023-04-21', '2020-05-15', '2...",944.0
72883,Calling My Phone,144,2021-12-31,"LIL TJAY, 6LACK",https://open.spotify.com/track/3J8EOeKLTLXORtW...,United States,top200,MOVE_DOWN,226234.0,1,...,205458.0,4.0,"[6jGMq4yGs7aQzuGsMgVgZR, 4IVAbR2w4JJNJDDRFP3E83]",11814827.0,"[brooklyn drill, r&b, atl hip hop, rap, melodi...",76.0,artist,"[['222', 'Destined 2 Win', 'State of Emergency...","[['2023-07-14', '2021-04-02', '2020-05-08', '2...",154.0
72884,NEW MAGIC WAND,141,2021-12-31,"TYLER, THE CREATOR",https://open.spotify.com/track/0fv2KH6hac06J86...,United States,top200,MOVE_DOWN,226850.0,1,...,195320.0,4.0,[],0.0,[],0.0,artist,[],[],0.0
