# WebScraping

In [22]:
from bs4 import BeautifulSoup
import requests
import json

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

import pprint

import random 
from time import sleep

import pandas as pd

In [23]:
# Spotify API credentials are read from the local `config` module.
Client_ID, Client_Secret = config.client_id, config.client_secret

# Build the credentials manager first, then hand it to the API client that
# the rest of the notebook uses as `sp`.
credentials_manager = SpotifyClientCredentials(client_id=Client_ID, client_secret=Client_Secret)
sp = spotipy.Spotify(auth_manager=credentials_manager)

# Extracting Hot Songs (HTML Parsing)  
  
### Scraping the Billboard Website (2006-2021)

In [2]:
#------scraping the current Hot 100 chart (2021 data)-------#

# Download the chart page and parse its HTML with the stdlib parser.
url = "https://www.billboard.com/charts/hot-100"
response = requests.get(url=url)
newest_soup = BeautifulSoup(markup=response.content, features='html.parser')

In [3]:
# Show the HTTP status code of the Hot 100 request made above.
response.status_code

200

In [6]:
#--------2021 song and artist names--------#

# CSS selectors for the title and artist <span> of each chart entry.
SONG_SELECTOR = 'span.chart-element__information__song.text--truncate.color--primary'
ARTIST_SELECTOR = 'span.chart-element__information__artist.text--truncate.color--secondary'

# Keep only the text of every matched element, in page order.
newest_song = [tag.text for tag in newest_soup.select(SONG_SELECTOR)]
newest_artist = [tag.text for tag in newest_soup.select(ARTIST_SELECTOR)]

In [7]:
#------scraping 2006-2020 year-end data-------#

# Named `url_template` rather than `str` so the builtin `str` is not shadowed
# for every cell that runs after this one.
url_template = 'https://www.billboard.com/charts/year-end/{}/hot-100-songs'
years = list(range(2006, 2021))  # 2006 .. 2020 inclusive

# One entry per year: the list of title tags / artist tags found on that page.
song_older_soups = []
artist_older_soups = []

for year in years:
    url = url_template.format(year)
    response = requests.get(url)
    #print(response.status_code)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's
    # "no parser was explicitly specified" warning).
    soup = BeautifulSoup(response.content, 'html.parser')
    song_older_soups.append(soup.find_all('div', class_='ye-chart-item__title'))
    artist_older_soups.append(soup.find_all('div', class_='ye-chart-item__artist'))

    # Random pause between page requests.
    wait_time = random.randint(1, 4)
    sleep(wait_time)

In [8]:
#------song names & artist names 2006-2020------#

def _tag_texts(tags):
    """Return the text of each tag with newline characters removed."""
    return [tag.text.replace('\n','') for tag in tags]


# Replace each year's list of tags with the corresponding list of strings.
for idx, _ in enumerate(years):
    song_older_soups[idx] = _tag_texts(song_older_soups[idx])
    artist_older_soups[idx] = _tag_texts(artist_older_soups[idx])

In [9]:
#----------concatenate the 2006-2020 data, then append the 2021 data----------#

song_name = []
song_artist = []

# Walk the per-year song and artist lists in lockstep so that titles and
# artists stay aligned row by row.
for year_songs, year_artists in zip(song_older_soups, artist_older_soups):
    song_name += year_songs
    song_artist += year_artists

song_name += newest_song
# The artist list must be extended with the 2021 *artists*; extending it with
# `newest_song` would put the song title in the artist column for 2021 rows.
song_artist += newest_artist

In [18]:
#----- hot songs dataframe 2006-2021 ----------#

# Two columns built from the parallel title/artist lists.
chart_columns = {'song': song_name, 'artist': song_artist}
df = pd.DataFrame(chart_columns)
df.head()

Unnamed: 0,song,artist
0,Bad Day,Daniel Powter
1,Temperature,Sean Paul
2,Promiscuous,Nelly Furtado Featuring Timbaland
3,You're Beautiful,James Blunt
4,Hips Don't Lie,Shakira Featuring Wyclef Jean


In [17]:
# Show (rows, columns) of the combined chart dataframe.
df.shape

(1598, 2)

In [20]:
#---------lowercase the song and artist columns-----------#

# Both text columns get the same element-wise lowercase treatment.
for column in ("song", "artist"):
    df[column] = df[column].apply(lambda value: value.lower())

In [33]:
#------------write the hot-songs dataframe to a csv file ---------#

# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='scrapped_hot_songs.csv', index=False)

  
    
# Extracting Songs and Audio Features (Spotify API)
  
### Fetching various playlists through the Spotify API, together with their tracks' audio features, for further clustering

In [30]:
#----------list the category ids returned by the Spotify API---------#

# Pull the category entries out of the response, then keep only their ids.
category_items = sp.categories()['categories']['items']
cats = [category['id'] for category in category_items]
cats

['toplists',
 'at_home',
 'pop',
 'equal',
 'mood',
 'decades',
 'hiphop',
 'in_the_car',
 'frequency',
 'gaming',
 'wellness',
 'workout',
 'chill',
 'focus',
 'sleep',
 'party',
 'indie_alt',
 'metal',
 'rock',
 'edm_dance']

In [26]:
#cats.remove('radar')

In [31]:
#-----------collect the id of the first playlist in each category --------#

playlist_ids = []

for category_id in cats:
    category_playlists = sp.category_playlists(category_id)['playlists']['items']

    # Skip a category whose playlist list is empty (or whose first entry is
    # empty) instead of failing on `items[0]['id']`.
    first_playlist = category_playlists[0] if category_playlists else None
    if first_playlist:
        playlist_ids.append(first_playlist['id'])

    # Random pause between API requests.
    wait_time = random.randint(1, 3)
    sleep(wait_time)

In [32]:
# Show the collected playlist ids.
playlist_ids

['37i9dQZF1DWY4lFlS4Pnso',
 '37i9dQZF1DWTLSN7iG21yC',
 '37i9dQZF1DWZDMxPJhrYOE',
 '37i9dQZF1DWU8quswnFt3c',
 '37i9dQZF1DX3rxVfibe1L0',
 '37i9dQZF1DX4UtSsGT1Sbe',
 '37i9dQZF1DX0XUsuxWHRQd',
 '37i9dQZF1DX0BxHamIEkKV',
 '37i9dQZF1DWVgsJtp58d1t',
 '37i9dQZF1DWTyiBJ6yEqeu',
 '37i9dQZF1DX9uKNf5jGX6m',
 '37i9dQZF1DX76Wlfdnj7AP',
 '37i9dQZF1DX4WYpdgoIcn6',
 '37i9dQZF1DX4sWSpwq3LiO',
 '37i9dQZF1DXdbkmlag2h7b',
 '37i9dQZF1DXaXB8fQg7xif',
 '37i9dQZF1DXdTCdwCKzXwo',
 '37i9dQZF1DWTcqUzwhNmKv',
 '37i9dQZF1DWXRqgorJj26U',
 '37i9dQZF1DX4dyzvuaRJ0n']

In [None]:
#-----------getting the tracks inside each playlist --------#

def get_playlist_tracks(username ,playlist_id):
    """Return every item of a playlist, following result pagination.

    Args:
        username: user name passed through to ``sp.user_playlist_tracks``.
        playlist_id: id of the playlist to read.

    Returns:
        The ``'items'`` list of the first result page, extended in place with
        the ``'items'`` of each following page.
    """
    page = sp.user_playlist_tracks(username, playlist_id)
    collected = page['items']
    # Keep requesting pages for as long as the current one links to a next one.
    while page['next']:
        page = sp.next(page)
        collected += page['items']
    return collected

In [None]:
#-----------collect the name, artists and uri of every track in the playlists --------#

# Three parallel lists: index i in each one describes the same track.
track_names = []
track_artist = []
track_uri = []

for playlist_id in playlist_ids:
    tracks = get_playlist_tracks('breezz', playlist_id)

    for item in tracks:
        track = item['track']
        # NOTE(review): playlist entries can apparently carry a null 'track'
        # (e.g. a track that is no longer available); skip those so the
        # lookups below do not fail and the three lists stay aligned.
        if track is None:
            continue

        track_names.append(track['name'])
        track_uri.append(track['uri'])
        # Multiple artists are joined into one display string.
        track_artist.append(" & ".join(artist['name'] for artist in track['artists']))

    # Pause once per playlist, i.e. once per batch of API requests. Sleeping
    # per track only slowed the loop down: no request is made per track here.
    wait_time = random.randint(1, 5)
    sleep(wait_time)

In [None]:
#tracks = get_playlist_tracks('breezz', playlist_ids[0])
#len(tracks)
#track_names

In [None]:
#--------getting audio features for the collected track uris---------#

list_of_audio_feature_dict = []

# Number of tracks sent per request. The audio-features endpoint accepts
# several ids at once (documented maximum: 100), so batching replaces one
# request + sleep per track with one per hundred tracks.
AUDIO_FEATURES_BATCH_SIZE = 100

for start in range(0, len(track_uri), AUDIO_FEATURES_BATCH_SIZE):
    uri_batch = track_uri[start:start + AUDIO_FEATURES_BATCH_SIZE]
    batch_features = sp.audio_features(uri_batch)

    # Results come back in request order. A track without audio features is
    # reported as None; store an empty dict instead so the row keeps its
    # position (alignment with track_names / track_artist) and the list can
    # still be turned into a DataFrame.
    list_of_audio_feature_dict.extend(features or {} for features in batch_features)

    # Random pause between API requests.
    wait_time = random.randint(1, 3)
    sleep(wait_time)


In [None]:
#len(list_of_audio_feature_dict)

In [None]:
#---------build a dataframe from the audio features + artist + song name-----------#

# One row per audio-feature dict, then the artist and song-name columns are
# appended in that order.
df_sp = pd.DataFrame(list_of_audio_feature_dict).assign(
    artist=track_artist,
    song_name=track_names,
)

In [None]:
#------------write the audio-features dataframe to a csv file ---------#

# index=False keeps the row index out of the file.
df_sp.to_csv(path_or_buf='audio_features_album10.csv', index=False)