# import necessary libraries for web scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# enter the url of the website to get the data from

In [2]:
url = "https://playback.fm/charts/top-100-songs/2021"

In [3]:
response = requests.get(url)
response.status_code

200

# look at the content of the data

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
soup

In [None]:
print(soup.prettify())

In [None]:
soup.select("td:nth-child(2) > a")

In [8]:
soup.select("td:nth-child(2) > a")[0].get_text()

'\nDua Lipa & DaBaby\n'

In [9]:
#myTable > tbody > tr:nth-child(1) > td:nth-child(2) > span

In [10]:
soup.select("td.mobile-hide > a > span.song")[0].get_text()

'Levitating'

In [11]:
soup.select("td:nth-child(2) > a")[0].get_text()

'\nDua Lipa & DaBaby\n'

# put the artist and song names into a dataframe

In [None]:
artist = []
song = []


# define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set

num_iter = len(soup.select("td:nth-child(2) > a"))

a_list = soup.select("td:nth-child(2) > a")
s_list = soup.select("td.mobile-hide > a > span.song")
# iterate through the result set and retrive all the data
for i in range(num_iter):
    artist.append(a_list[i].get_text())
    song.append(s_list[i].get_text())

print(artist)
print(song)

In [13]:
artist[0]

'\nDua Lipa & DaBaby\n'

In [14]:
song[0]

'Levitating'

In [15]:
song_singers = pd.DataFrame({"artist":artist, "song":song})

In [16]:
song_singers.head()

Unnamed: 0,artist,song
0,\nDua Lipa & DaBaby\n,Levitating
1,\nOlivia Rodrigo\n,Drivers License
2,\nThe Weeknd & Ariana Grande\n,Save Your Tears
3,\nLil Nas X\n,Montero (Call Me by Your Name)
4,\nThe Weeknd\n,Blinding Lights


# clean the dataframe 

In [17]:
song_singers['artist'] = song_singers['artist'].str.replace('\n', ' ')

In [18]:
song_singers.head()

Unnamed: 0,artist,song
0,Dua Lipa & DaBaby,Levitating
1,Olivia Rodrigo,Drivers License
2,The Weeknd & Ariana Grande,Save Your Tears
3,Lil Nas X,Montero (Call Me by Your Name)
4,The Weeknd,Blinding Lights


# compare the dataframe with an input
asking user for a song as an input, if the input is in our data
we give back a random song back from the data, if not then we say that 'sorry we don't have a song recommendation'

In [19]:
song_of_the_user = input("Enter your song: ")
print(song_of_the_user)

Enter your song: you
you


In [20]:
print(song_singers['song'].sample())

82    Good Time
Name: song, dtype: object


In [21]:
song_singers['song'].str.match(song_of_the_user).any()


False

In [22]:
if song_singers['song'].str.match(song_of_the_user).any() == True:
    print(song_singers['song'].sample())
else:
    print("We are sorry that we do not have a song recommendation for you now.")

We are sorry that we do not have a song recommendation for you now.


# write a function for easy web scraping from the top song websites

In [23]:
def webscraper(url):
    any_response = requests.get(url)
    any_soup = BeautifulSoup(any_response.content, 'html.parser')
    artist = []
    song = []
    num_iters = len(any_soup.select('td:nth-child(2) > a'))
    a_list = any_soup.select('td:nth-child(2) > a')
    s_list = any_soup.select('td.mobile-hide > a > span.song')
    for i in range(num_iters):
        artist.append(a_list[i].get_text())
        song.append(s_list[i].get_text())
    test = pd.DataFrame({'artist':artist,'song':song})
    return test

url1 = 'https://playback.fm/charts/rnb/2017'
url2 = 'https://playback.fm/charts/rnb/2021'
top_2017 = webscraper(url1)
top_2021 = webscraper(url2)


In [24]:
songs_list = pd.concat([top_2017, top_2021], ignore_index=True)

In [25]:
songs_list

Unnamed: 0,artist,song
0,\nBruno Mars\n,That's What I Like
1,\nDJ Khaled featuring Rihanna & Bryson Tiller\n,Wild Thoughts
2,\nPost Malone\n,Rockstar
3,\nDJ Khaled\n,I'm the One
4,\nBruno Mars\n,24K Magic
...,...,...
195,\nDrake featuring Lil Baby\n,Wants And Needs
196,\nSummer Walker\n,Broken Promises
197,\nMegan Thee Stallion\n,Body
198,\nYoungBoy Never Broke Again\n,Nevada


In [26]:
songs_list.artist = songs_list.artist.str.replace('\n', ' ')

In [64]:
songs_list

Unnamed: 0,artist,song
0,Bruno Mars,That's What I Like
1,DJ Khaled featuring Rihanna & Bryson Tiller,Wild Thoughts
2,Post Malone,Rockstar
3,DJ Khaled,I'm the One
4,Bruno Mars,24K Magic
...,...,...
195,Drake featuring Lil Baby,Wants And Needs
196,Summer Walker,Broken Promises
197,Megan Thee Stallion,Body
198,YoungBoy Never Broke Again,Nevada


In [None]:
# Concat the 3 playlist all together and run the function above again

In [65]:
top_300_songs = pd.concat([song_singers,songs_list])


In [66]:
top_300_songs 

Unnamed: 0,artist,song
0,Dua Lipa & DaBaby,Levitating
1,Olivia Rodrigo,Drivers License
2,The Weeknd & Ariana Grande,Save Your Tears
3,Lil Nas X,Montero (Call Me by Your Name)
4,The Weeknd,Blinding Lights
...,...,...
195,Drake featuring Lil Baby,Wants And Needs
196,Summer Walker,Broken Promises
197,Megan Thee Stallion,Body
198,YoungBoy Never Broke Again,Nevada


In [None]:
# ask a user for a song, if the song is in the top_300 songs list
# then suggest another song from the list. If not, say "sorry"

In [69]:
song_of_the_user = input("Enter your song: ")
print(song_of_the_user)

Enter your song: Nevada
Nevada


In [70]:
if top_300_songs['song'].str.match(song_of_the_user).any() == True:
    print(top_300_songs['song'].sample())
else:
    print("We are sorry that we do not have a song recommendation for you now.")

11    Passionfruit
Name: song, dtype: object


# Practice web scraping

In [27]:
# Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: 
# url ='https://en.wikipedia.org/wiki/Python'

In [28]:
url ='https://en.wikipedia.org/wiki/Python'
any_response = requests.get(url)
any_soup = BeautifulSoup(any_response.content, 'html.parser')

In [29]:
any_response.status_code

200

In [None]:
print(any_soup.prettify())

In [31]:
any_soup.select('link')


[<link href="/w/load.php?lang=en&amp;modules=ext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cskins.vector.styles.legacy%7Cwikibase.client.init&amp;only=styles&amp;skin=vector" rel="stylesheet"/>,
 <link href="/w/load.php?lang=en&amp;modules=site.styles&amp;only=styles&amp;skin=vector" rel="stylesheet"/>,
 <link href="//upload.wikimedia.org" rel="preconnect"/>,
 <link href="//en.m.wikipedia.org/wiki/Python" media="only screen and (max-width: 720px)" rel="alternate"/>,
 <link href="/w/index.php?title=Python&amp;action=edit" rel="alternate" title="Edit this page" type="application/x-wiki"/>,
 <link href="/static/apple-touch/wikipedia.png" rel="apple-touch-icon"/>,
 <link href="/static/favicon/wikipedia.ico" rel="icon"/>,
 <link href="/w/opensearch_desc.php" rel="search" title="Wikipedia (en)" type="application/opensearchdescription+xml"/>,
 <link href="//en.wikipedia.org/w/api.php?action=rsd" rel="EditURI" type="application/rsd+xml"/>,
 <link

In [32]:
any_soup.select('a')[1]['href']  

'#mw-head'

In [None]:
any_soup.select('a')

# Spotipy API

In [34]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [119]:
secrets_file = open("secrets.txt","r")
# open from python not from pandas, opening for reading 'r'

In [120]:
string = secrets_file.read()

In [None]:
string

In [None]:
string.split('\n')

In [123]:
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()

In [None]:
secrets_dict

In [125]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
                                                           client_secret=secrets_dict['csecret']))


# Handling large playlists

In [126]:
from random import randint
from time import sleep

def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,3000)/3000)
    return tracks

## Large playlist 1
checking its structure, then taking the artist and song names


In [43]:
playlist1 = get_playlist_tracks("36f5MuoelMRwVIet1HUqYC")
len(playlist1)

502

In [44]:
playlist1[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [None]:
playlist1[0]['track']['album']

# this album information holds both artist and song names

In [46]:
playlist1[0]['track']['album'].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'external_urls', 'href', 'id', 'images', 'name', 'release_date', 'release_date_precision', 'total_tracks', 'type', 'uri'])

In [47]:
playlist1[0]['track']['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4x1nvY2FN8jxqAFA0DA02H'},
  'href': 'https://api.spotify.com/v1/artists/4x1nvY2FN8jxqAFA0DA02H',
  'id': '4x1nvY2FN8jxqAFA0DA02H',
  'name': 'John Lennon',
  'type': 'artist',
  'uri': 'spotify:artist:4x1nvY2FN8jxqAFA0DA02H'}]

In [48]:
playlist1[0]['track']['name']

'Imagine - Remastered 2010'

In [None]:
len(playlist1)

In [49]:
def get_name_artist_from_playlist_item(playlist_item):
    return get_name_artists_from_track(playlist_item['track'])

In [50]:
def get_name_artists_from_track(track):
    return [(track["name"],artist["name"]) for artist in track["artists"]]



In [53]:
get_name_artists_from_track(my_track)

[('Imagine', 'John Lennon')]

In [52]:
my_track = playlist1[0]['track']['album']

In [54]:
my_track2 = playlist1[1]['track']['album']

In [55]:
get_name_artists_from_track(my_track2)

[('A Whiter Shade Of Pale', 'Procol Harum')]

In [None]:
playlist1_songs_artists =[]

for i in range (len(playlist1)):
    my_track = playlist1[i]['track']['album']
    result = get_name_artists_from_track(my_track)
    playlist1_songs_artists.append(result)
    print(result)

In [57]:
def flatten(input_list):
    return [item for sublist in input_list for item in sublist]

In [None]:
flatten(playlist1_songs_artists)

# list of all songs and their artists in the playlist1

## Large playlist 2
checking its structure, then taking the artist and song names

In [None]:
# A new playlist (playlist2)

In [132]:
playlist2 = get_playlist_tracks("5lIoXRJHLCdOnPAE8A4hxE")
len(playlist2)

772

In [None]:
playlist2[770]

In [60]:
playlist2[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [None]:
playlist1[0]['track']['album']

In [None]:
playlist2_songs_artists =[]

for i in range (len(playlist2)):
    my_track = playlist2[i]['track']['album']
    result = get_name_artists_from_track(my_track)
    playlist2_songs_artists.append(result)
    print(result)

In [None]:
flatten(playlist2_songs_artists)


## Large Playlists 3 

In [127]:
playlist3 = get_playlist_tracks("6tIxyT1Gq6O7DK7rIEUEZo")
len(playlist3)

9999

In [106]:
len(playlist3)

9999

In [107]:
display(playlist3[0]['track'].keys())
playlist1[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [None]:
artist_s.append(tracks_list[i]['track']['album']['artists'][0]['name'])
song_s.append(tracks_list[i]['track']['album']['name'])
song_uri_s.append(tracks_list[i]['track']['uri'])

In [114]:
playlist3[0]['track']['album']['artists'][0]['name']

'Queen'

In [115]:
playlist3[0]['track']['album']['name']

'A Kind Of Magic (2011 Remaster)'

In [116]:
playlist3[0]['track']['uri']

'spotify:track:5RYLa5P4qweEAKq5U1gdcK'

In [None]:
playlist3_songs_artists =[]

for i in range (len(playlist3)):
    my_track = playlist3[i]['track']['album']
    result = get_name_artists_from_track(my_track)
    playlist3_songs_artists.append(result)
    print(result)

In [None]:
flatten(playlist3_songs_artists)

# dealing with auido features 

In [78]:
playlist3[501]['track']['uri']

'spotify:track:5xZU3pah1nptTzFKDJBkc9'

In [79]:
song_uri = playlist3[501]['track']['uri']

In [81]:
features=sp.audio_features(song_uri)

In [82]:
features[0]

{'danceability': 0.737,
 'energy': 0.511,
 'key': 2,
 'loudness': -10.499,
 'mode': 1,
 'speechiness': 0.0456,
 'acousticness': 0.365,
 'instrumentalness': 0.00718,
 'liveness': 0.0891,
 'valence': 0.77,
 'tempo': 131.001,
 'type': 'audio_features',
 'id': '5xZU3pah1nptTzFKDJBkc9',
 'uri': 'spotify:track:5xZU3pah1nptTzFKDJBkc9',
 'track_href': 'https://api.spotify.com/v1/tracks/5xZU3pah1nptTzFKDJBkc9',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5xZU3pah1nptTzFKDJBkc9',
 'duration_ms': 118500,
 'time_signature': 4}

# Creating a dataframe with artist, song, and audio features

In [133]:
def flatten_it(data, col_list):
    for column in col_list:
        flattened = pd.DataFrame(dict(data[column])).transpose()
        columns = [str(col) for col in flattened.columns]
        flattened.columns = [column + '_' + colname for colname in columns]
        data = pd.concat([data, flattened], axis=1)
        data = data.drop(column, axis=1)
    return data

In [134]:
def playlist_to_df(tracks_list):
    artist_s = []
    song_s = []
    song_uri_s=[]
    features_s= []

    num_iter = len(tracks_list)

    for i in range(num_iter):
        artist_s.append(tracks_list[i]['track']['album']['artists'][0]['name'])
        song_s.append(tracks_list[i]['track']['album']['name'])
        song_uri_s.append(tracks_list[i]['track']['uri'])
        features_s.append(sp.audio_features(song_uri_s[i]))
        
    df_v0= pd.DataFrame({"artist":artist_s,"song":song_s,"audio_features":features_s})
    
    nested_columns = ['audio_features']
    df_v1 = flatten_it(df_v0, nested_columns)
    music_df=pd.concat([df_v1.drop(['audio_features_0'], axis=1), df_v1['audio_features_0'].apply(pd.Series)], axis=1)
    
    return music_df

In [90]:
spotify_df_1=playlist_to_df(playlist1)

In [91]:
display(spotify_df_1.head())
display(spotify_df_1.shape)

Unnamed: 0,artist,song,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,John Lennon,Imagine,0.547,0.257,0,-12.358,1,0.0252,0.907,0.183,0.0935,0.169,75.752,audio_features,7pKfPomDEeI4TPT6EOYjn9,spotify:track:7pKfPomDEeI4TPT6EOYjn9,https://api.spotify.com/v1/tracks/7pKfPomDEeI4...,https://api.spotify.com/v1/audio-analysis/7pKf...,187867,4
1,Procol Harum,A Whiter Shade Of Pale,0.249,0.66,0,-6.905,1,0.0342,0.504,0.0026,0.0891,0.435,149.813,audio_features,3qitymULqEibr7yknRMKU4,spotify:track:3qitymULqEibr7yknRMKU4,https://api.spotify.com/v1/tracks/3qitymULqEib...,https://api.spotify.com/v1/audio-analysis/3qit...,248947,4
2,Queen,A Night At The Opera (2011 Remaster),0.414,0.404,0,-9.928,0,0.0499,0.271,0.0,0.3,0.224,71.105,audio_features,1AhDOtG9vPSOmsWgNW0BEY,spotify:track:1AhDOtG9vPSOmsWgNW0BEY,https://api.spotify.com/v1/tracks/1AhDOtG9vPSO...,https://api.spotify.com/v1/audio-analysis/1AhD...,354320,4
3,Pink Floyd,The Wall [2011 - Remaster] (2011 Remastered Ve...,0.472,0.366,11,-12.595,0,0.0286,0.15,0.308,0.0837,0.171,127.167,audio_features,082cLCIXNPg2ruTrENz4Vt,spotify:track:082cLCIXNPg2ruTrENz4Vt,https://api.spotify.com/v1/tracks/082cLCIXNPg2...,https://api.spotify.com/v1/audio-analysis/082c...,382297,4
4,Bob Dylan,Pat Garrett & Billy The Kid (Soundtrack From T...,0.513,0.396,7,-13.061,1,0.0299,0.251,0.177,0.11,0.229,140.208,audio_features,6HSXNV0b4M4cLJ7ljgVVeh,spotify:track:6HSXNV0b4M4cLJ7ljgVVeh,https://api.spotify.com/v1/tracks/6HSXNV0b4M4c...,https://api.spotify.com/v1/audio-analysis/6HSX...,149880,4


(502, 20)

In [130]:
spotify_df_3 = playlist_to_df(playlist3)

KeyboardInterrupt: 

In [None]:
display(spotify_df_2.head())
display(spotify_df_2.shape)

In [135]:
spotify_df_2=playlist_to_df(playlist2)

TypeError: 'NoneType' object is not subscriptable

In [None]:
display(spotify_df_3.head())
display(spotify_df_3.shape)

In [149]:
artist_s =[]
# song_s = []
# song_uri_s = []
# features_s = []

for i in range(len(playlist2)):
    artist_s.append(playlist2[i]['track']['album']['artists'][0]['name'])
#     song_s.append(playlist2[i]['track']['album']['name'])
#     song_uri_s.append(playlist2[i]['track']['uri'])
#     features_s.append(sp.audio_features(song_uri_s[i]))
    
    
print(artist_s)
# print(song_s) 
# print(song_uri_s) 
# print(features_s) 
    
# df_v0= pd.DataFrame({"artist":artist_s,"song":song_s,"audio_features":features_s})

# nested_columns = ['audio_features']
# df_v1 = flatten_it(df_v0, nested_columns)
# music_df=pd.concat([df_v1.drop(['audio_features_0'], axis=1), df_v1['audio_features_0'].apply(pd.Series)], axis=1)



TypeError: 'NoneType' object is not subscriptable