# import necessary libraries for web scraping

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# enter the url of the website to get the data from

In [2]:
url = "https://playback.fm/charts/top-100-songs/2021"

In [3]:
response = requests.get(url)
response.status_code

200

# look at the content of the data

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
soup

In [None]:
print(soup.prettify())

In [None]:
soup.select("td:nth-child(2) > a")

In [8]:
soup.select("td:nth-child(2) > a")[0].get_text()

'\nDua Lipa & DaBaby\n'

In [9]:
#myTable > tbody > tr:nth-child(1) > td:nth-child(2) > span

In [10]:
soup.select("td.mobile-hide > a > span.song")[0].get_text()

'Levitating'

In [11]:
soup.select("td:nth-child(2) > a")[0].get_text()

'\nDua Lipa & DaBaby\n'

# put the artist and song names into a dataframe

In [12]:
artist = []
song = []


# define the number of iterations of our for loop
# by checking how many elements are in the retrieved result set

num_iter = len(soup.select("td:nth-child(2) > a"))

a_list = soup.select("td:nth-child(2) > a")
s_list = soup.select("td.mobile-hide > a > span.song")
# iterate through the result set and retrive all the data
for i in range(num_iter):
    artist.append(a_list[i].get_text())
    song.append(s_list[i].get_text())

print(artist)
print(song)

['\nDua Lipa & DaBaby\n', '\nOlivia Rodrigo\n', '\nThe Weeknd & Ariana Grande\n', '\nLil Nas X\n', '\nThe Weeknd\n', '\nOlivia Rodrigo\n', '\n24kGoldn featuring iann dior\n', '\nJustin Bieber featuring Daniel Caesar & Giveon\n', '\nSilk Sonic\n', '\nMasked Wolf\n', '\nDoja Cat featuring SZA\n', '\nThe Kid Laroi & Justin Bieber\n', '\nAdele\n', '\nBTS\n', '\nThe Kid LAROI\n', '\nOlivia Rodrigo\n', '\nPolo G\n', '\nAriana Grande\n', '\nCardi B\n', '\nPop Smoke\n', '\nEd Sheeran\n', '\nLil Tjay featuring 6LACK\n', '\nGlass Animals\n', '\nLil Nas X & Jack Harlow\n', '\nChris Brown & Young Thug\n', '\nTate McRae\n', '\nCJ\n', '\nBillie Eilish\n', '\nLuke Combs\n', '\nAriana Grande Feat. Doja Cat & Megan Thee Stallion\n', '\nDrake\n', '\nSaweetie featuring Doja Cat\n', '\nMachine Gun Kelly & BLACKBEAR\n', '\nBad Bunny & Jhay Cortez\n', '\nSZA\n', '\nJustin Bieber\n', '\nDrake featuring Lil Baby\n', '\nMariah Carey\n', '\nGiveon\n ', '\nDrake featuring Future & Young Thug\n', '\nWalker Hayes\

In [13]:
artist[0]

'\nDua Lipa & DaBaby\n'

In [14]:
song[0]

'Levitating'

In [15]:
song_singers = pd.DataFrame({"artist":artist, "song":song})

In [16]:
song_singers.head()

Unnamed: 0,artist,song
0,\nDua Lipa & DaBaby\n,Levitating
1,\nOlivia Rodrigo\n,Drivers License
2,\nThe Weeknd & Ariana Grande\n,Save Your Tears
3,\nLil Nas X\n,Montero (Call Me by Your Name)
4,\nThe Weeknd\n,Blinding Lights


# clean the dataframe 

In [17]:
song_singers['artist'] = song_singers['artist'].str.replace('\n', ' ')

In [18]:
song_singers.head()

Unnamed: 0,artist,song
0,Dua Lipa & DaBaby,Levitating
1,Olivia Rodrigo,Drivers License
2,The Weeknd & Ariana Grande,Save Your Tears
3,Lil Nas X,Montero (Call Me by Your Name)
4,The Weeknd,Blinding Lights


# compare the dataframe with an input
asking user for a song as an input, if the input is in our data
we give back a random song back from the data, if not then we say that 'sorry we don't have a song recommendation'

In [19]:
song_of_the_user = input("Enter your song: ")
print(song_of_the_user)

Enter your song: you
you


In [20]:
print(song_singers['song'].sample())

82    Good Time
Name: song, dtype: object


In [21]:
song_singers['song'].str.match(song_of_the_user).any()


False

In [22]:
if song_singers['song'].str.match(song_of_the_user).any() == True:
    print(song_singers['song'].sample())
else:
    print("We are sorry that we do not have a song recommendation for you now.")

We are sorry that we do not have a song recommendation for you now.


# write a function for easy web scraping from the top song websites

In [23]:
def webscraper(url):
    any_response = requests.get(url)
    any_soup = BeautifulSoup(any_response.content, 'html.parser')
    artist = []
    song = []
    num_iters = len(any_soup.select('td:nth-child(2) > a'))
    a_list = any_soup.select('td:nth-child(2) > a')
    s_list = any_soup.select('td.mobile-hide > a > span.song')
    for i in range(num_iters):
        artist.append(a_list[i].get_text())
        song.append(s_list[i].get_text())
    test = pd.DataFrame({'artist':artist,'song':song})
    return test

url1 = 'https://playback.fm/charts/rnb/2017'
url2 = 'https://playback.fm/charts/rnb/2021'
top_2017 = webscraper(url1)
top_2021 = webscraper(url2)


In [24]:
songs_list = pd.concat([top_2017, top_2021], ignore_index=True)

In [25]:
songs_list

Unnamed: 0,artist,song
0,\nBruno Mars\n,That's What I Like
1,\nDJ Khaled featuring Rihanna & Bryson Tiller\n,Wild Thoughts
2,\nPost Malone\n,Rockstar
3,\nDJ Khaled\n,I'm the One
4,\nBruno Mars\n,24K Magic
...,...,...
195,\nDrake featuring Lil Baby\n,Wants And Needs
196,\nSummer Walker\n,Broken Promises
197,\nMegan Thee Stallion\n,Body
198,\nYoungBoy Never Broke Again\n,Nevada


In [26]:
songs_list.artist = songs_list.artist.str.replace('\n', ' ')

In [64]:
songs_list

Unnamed: 0,artist,song
0,Bruno Mars,That's What I Like
1,DJ Khaled featuring Rihanna & Bryson Tiller,Wild Thoughts
2,Post Malone,Rockstar
3,DJ Khaled,I'm the One
4,Bruno Mars,24K Magic
...,...,...
195,Drake featuring Lil Baby,Wants And Needs
196,Summer Walker,Broken Promises
197,Megan Thee Stallion,Body
198,YoungBoy Never Broke Again,Nevada


In [None]:
# Concat the 3 playlist all together and run the function above again

In [65]:
top_300_songs = pd.concat([song_singers,songs_list])


In [66]:
top_300_songs 

Unnamed: 0,artist,song
0,Dua Lipa & DaBaby,Levitating
1,Olivia Rodrigo,Drivers License
2,The Weeknd & Ariana Grande,Save Your Tears
3,Lil Nas X,Montero (Call Me by Your Name)
4,The Weeknd,Blinding Lights
...,...,...
195,Drake featuring Lil Baby,Wants And Needs
196,Summer Walker,Broken Promises
197,Megan Thee Stallion,Body
198,YoungBoy Never Broke Again,Nevada


In [None]:
# ask a user for a song, if the song is in the top_300 songs list
# then suggest another song from the list. If not, say "sorry"

In [69]:
song_of_the_user = input("Enter your song: ")
print(song_of_the_user)

Enter your song: Nevada
Nevada


In [70]:
if top_300_songs['song'].str.match(song_of_the_user).any() == True:
    print(top_300_songs['song'].sample())
else:
    print("We are sorry that we do not have a song recommendation for you now.")

11    Passionfruit
Name: song, dtype: object


# Practice web scraping

In [27]:
# Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: 
# url ='https://en.wikipedia.org/wiki/Python'

In [28]:
url ='https://en.wikipedia.org/wiki/Python'
any_response = requests.get(url)
any_soup = BeautifulSoup(any_response.content, 'html.parser')

In [29]:
any_response.status_code

200

In [None]:
print(any_soup.prettify())

In [31]:
any_soup.select('link')


[<link href="/w/load.php?lang=en&amp;modules=ext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cskins.vector.styles.legacy%7Cwikibase.client.init&amp;only=styles&amp;skin=vector" rel="stylesheet"/>,
 <link href="/w/load.php?lang=en&amp;modules=site.styles&amp;only=styles&amp;skin=vector" rel="stylesheet"/>,
 <link href="//upload.wikimedia.org" rel="preconnect"/>,
 <link href="//en.m.wikipedia.org/wiki/Python" media="only screen and (max-width: 720px)" rel="alternate"/>,
 <link href="/w/index.php?title=Python&amp;action=edit" rel="alternate" title="Edit this page" type="application/x-wiki"/>,
 <link href="/static/apple-touch/wikipedia.png" rel="apple-touch-icon"/>,
 <link href="/static/favicon/wikipedia.ico" rel="icon"/>,
 <link href="/w/opensearch_desc.php" rel="search" title="Wikipedia (en)" type="application/opensearchdescription+xml"/>,
 <link href="//en.wikipedia.org/w/api.php?action=rsd" rel="EditURI" type="application/rsd+xml"/>,
 <link

In [32]:
any_soup.select('a')[1]['href']  

'#mw-head'

In [None]:
any_soup.select('a')

# Spotipy API

In [34]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [35]:
secrets_file = open("secrets.txt","r")
# open from python not from pandas, opening for reading 'r'

In [36]:
string = secrets_file.read()

In [None]:
string

In [None]:
string.split('\n')

In [39]:
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()

In [None]:
secrets_dict

In [41]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
                                                           client_secret=secrets_dict['csecret']))


# Handling large playlists

In [42]:
from random import randint
from time import sleep

def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,3000)/3000)
    return tracks

## Large playlist 1
checking its structure, then taking the artist and song names


In [43]:
playlist1 = get_playlist_tracks("36f5MuoelMRwVIet1HUqYC")
len(playlist1)

502

In [44]:
playlist1[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [None]:
playlist1[0]['track']['album']

# this album information holds both artist and song names

In [46]:
playlist1[0]['track']['album'].keys()

dict_keys(['album_type', 'artists', 'available_markets', 'external_urls', 'href', 'id', 'images', 'name', 'release_date', 'release_date_precision', 'total_tracks', 'type', 'uri'])

In [47]:
playlist1[0]['track']['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/4x1nvY2FN8jxqAFA0DA02H'},
  'href': 'https://api.spotify.com/v1/artists/4x1nvY2FN8jxqAFA0DA02H',
  'id': '4x1nvY2FN8jxqAFA0DA02H',
  'name': 'John Lennon',
  'type': 'artist',
  'uri': 'spotify:artist:4x1nvY2FN8jxqAFA0DA02H'}]

In [48]:
playlist1[0]['track']['name']

'Imagine - Remastered 2010'

In [49]:
def get_name_artist_from_playlist_item(playlist_item):
    return get_name_artists_from_track(playlist_item['track'])

In [50]:
def get_name_artists_from_track(track):
    return [(track["name"],artist["name"]) for artist in track["artists"]]



In [53]:
get_name_artists_from_track(my_track)

[('Imagine', 'John Lennon')]

In [52]:
my_track = playlist1[0]['track']['album']

In [54]:
my_track2 = playlist1[1]['track']['album']

In [55]:
get_name_artists_from_track(my_track2)

[('A Whiter Shade Of Pale', 'Procol Harum')]

In [None]:
playlist1_songs_artists =[]

for i in range (len(playlist1)):
    my_track = playlist1[i]['track']['album']
    result = get_name_artists_from_track(my_track)
    playlist1_songs_artists.append(result)
    print(result)

In [57]:
def flatten(input_list):
    return [item for sublist in input_list for item in sublist]

In [58]:
flatten(playlist1_songs_artists)

# list of all songs and their artists in the playlist1

[('Imagine', 'John Lennon'),
 ('A Whiter Shade Of Pale', 'Procol Harum'),
 ('A Night At The Opera (2011 Remaster)', 'Queen'),
 ('The Wall [2011 - Remaster] (2011 Remastered Version)', 'Pink Floyd'),
 ('Pat Garrett & Billy The Kid (Soundtrack From The Motion Picture)',
  'Bob Dylan'),
 ('Beatles Biggest Hits!', 'The Yesterdays'),
 ('Hotel California (2013 Remaster)', 'Eagles'),
 ('The Graduate', 'Simon & Garfunkel'),
 ("Don't Play That Song (Mono)", 'Ben E. King'),
 ('Meddle (2011 Remastered Version)', 'Pink Floyd'),
 ('Innuendo (2011 Remaster)', 'Queen'),
 ("Elvis 75 - Good Rockin' Tonight", 'Elvis Presley'),
 ('Nirvana (International Version)', 'Nirvana'),
 ('Forever Young', 'Alphaville'),
 ('The Doors', 'The Doors'),
 ('Top 100 90s', 'Various Artists'),
 ('Grace (Legacy Edition)', 'Jeff Buckley'),
 ('Greatest Hits II (2011 Remaster)', 'Queen'),
 ('Hot Rocks 1964-1971 (Remastered)', 'The Rolling Stones'),
 ('Edith Piaf - The Best Of', 'Édith Piaf'),
 ('Heroes (1999 Remaster)', 'David 

## Large playlist 2
checking its structure, then taking the artist and song names

In [None]:
# A new playlist (playlist2)

In [59]:
playlist2 = get_playlist_tracks("5lIoXRJHLCdOnPAE8A4hxE")
len(playlist2)

772

In [60]:
playlist2[0]['track'].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'episode', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track', 'track_number', 'type', 'uri'])

In [None]:
playlist1[0]['track']['album']

In [None]:
playlist2_songs_artists =[]

for i in range (len(playlist2)):
    my_track = playlist2[i]['track']['album']
    result = get_name_artists_from_track(my_track)
    playlist2_songs_artists.append(result)
    print(result)

In [63]:
flatten(playlist2_songs_artists)


[('17', 'XXXTENTACION'),
 ("Come Over When You're Sober, Pt. 2", 'Lil Peep'),
 ('Legends', 'Juice WRLD'),
 ('KIRK', 'DaBaby'),
 ('Camelot', 'NLE Choppa'),
 ('KIRK', 'DaBaby'),
 ('THE GOAT', 'Polo G'),
 ('KIRK', 'DaBaby'),
 ('Cottonwood', 'NLE Choppa'),
 ('So Much Fun (Deluxe)', 'Young Thug'),
 ('Cheat Codes for Hoes', 'Shotgun Willy'),
 ('Cheat Codes for Hoes', 'TRAQULA'),
 ('Floor Seats (Text Back)', 'Moxas'),
 ('Beautiful Thugger Girls', 'Young Thug'),
 ('3 Vets', 'The Future Kingz'),
 ('Daygo', 'KGodd'),
 ('Death Race For Love', 'Juice WRLD'),
 ('17', 'XXXTENTACION'),
 ('Fetty Wap (Deluxe)', 'Fetty Wap'),
 ('Rodeo (Expanded Edition)', 'Travis Scott'),
 ('Dark Knight Dummo (Feat. Travis Scott)', 'Trippie Redd'),
 ('Dark Knight Dummo (Feat. Travis Scott)', 'Travis Scott'),
 ('Pray 4 Love', 'Rod Wave'),
 ('Oso Different', 'MarMar Oso'),
 ('STOKELEY', 'Ski Mask The Slump God'),
 ('XXXTENTACION Presents: Members Only, Vol. 3', 'XXXTENTACION'),
 ('The Best of 2Pac', '2Pac'),
 ('?', 'XXXTE