# Spotify Indie & Alternative Song Collection

## Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import getpass

## Set up Spotipy

In [2]:
# Store credentials 
client_id = str(getpass.getpass("Client ID: "))
client_secret = str(getpass.getpass("Client Secret: "))

Client ID: ········
Client Secret: ········


In [3]:
# Initialize Spotipy with user credentias
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id = client_id,
    client_secret = client_secret))

## Get Playlists

In [4]:
my_playlists = {"4d2IEVL30L65AztSfZM8XF": "mac_lovin",
                "08ITQREsf18K8L6kVVfN7S": "mac_lovin",
                "5BR1O6Eqs0m13FUYmzwVnp": "mac_lovin",
                "37i9dQZF1DWTpv9X85KmGq": "spotify",
                "17tok3VH0FuwyVXuMKwmAA": "annanassaft",
                "6Sr9J394N1si7jJDh3KuQQ": "default.rocks",
                "37i9dQZF1DX9VBcReWYoP0": "spotify",
                "37i9dQZF1DX8hcTuUCeYxa": "spotify",
                "37i9dQZF1DWWM6GBnxtToT": "spotify",
                "37i9dQZF1DWYJeWl6ior4d": "spotify",
                "37i9dQZF1DX873GaRGUmPl": "spotify",
                "37i9dQZF1DX0YKekzl0blG": "spotify",
                "37i9dQZF1DX2Nc3B70tvx0": "spotify",
                "37i9dQZF1DX35DWKgAk2B5": "spotify",
                "37i9dQZF1DX9GRpeH4CL0S": "spotify"
                }

In [5]:
# Request playlist details from Spotify
playlists = [sp.user_playlist_tracks(user, playlist) for playlist, user in tqdm(my_playlists.items())]

  0%|          | 0/15 [00:00<?, ?it/s]

In [6]:
len(playlists)

15

## Get Artists

In [7]:
# Create list of artist_ids
artist_ids = []
for playlist in playlists:
    for track in playlist["items"]:
        try: 
            for artist in track["track"]["artists"]:
                artist_ids.append(artist["id"])
        except:
            continue

In [8]:
len(artist_ids)

1453

In [9]:
# Convert to set to remove duplicate artists
artist_ids = list(set(artist_ids))

In [10]:
len(artist_ids)

813

## Get Albums

In [11]:
# Request album details from Spotify
albums = []
for artist_id in tqdm(artist_ids):
    try:
        album = sp.artist_albums(artist_id, album_type="album")
        albums.append(album)
    except:
        continue

  0%|          | 0/813 [00:00<?, ?it/s]

In [12]:
# Create list of album_ids
album_ids = []
for album in albums:
    for item in album["items"]:
        album_ids.append(item["id"])  

In [13]:
len(album_ids)

6382

In [14]:
# Convert to set to remove duplicate albums
album_ids = list(set(album_ids))

In [15]:
len(album_ids)

6364

## Get Tracks

In [16]:
# Request album tracks from Spotify
album_tracks = [sp.album_tracks(album_id) for album_id in tqdm(album_ids)]

  0%|          | 0/6364 [00:00<?, ?it/s]

In [17]:
album_tracks[0]["items"][0]["artists"][0]["name"]

'Beach House'

In [18]:
# Create tracks list including id, song and artist
tracks = []
for album_track in tqdm(album_tracks):
    for item in album_track["items"]:
        tracks.append([item["id"], item["name"], item["artists"][0]["name"]])

  0%|          | 0/6364 [00:00<?, ?it/s]

In [19]:
# Create tracks dataframe including id, song and artist
df_1 = pd.DataFrame(tracks, columns=["id", "song", "artist"])

In [20]:
df_1.head()

Unnamed: 0,id,song,artist
0,7aciebfpMZffzCTSg9rLIZ,Levitation,Beach House
1,6bzeIyoDKQdJU3NWc56u3u,Sparks,Beach House
2,1ZgMsA55GIY7ICkQh5MILA,Space Song,Beach House
3,0fbKFguQCxauLvVZ262f4c,Beyond Love,Beach House
4,66rCCXbN1ggzjTYibdJp3n,10:37,Beach House


In [21]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86772 entries, 0 to 86771
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      86772 non-null  object
 1   song    86772 non-null  object
 2   artist  86772 non-null  object
dtypes: object(3)
memory usage: 2.0+ MB


In [22]:
# Create list of track_ids
track_ids = []
for track in tracks:
    track_ids.append(track[0])

track_ids = set(track_ids)
track_ids = list(track_ids)

len(track_ids)

86772

## Get Audio Features

In [23]:
# Limit to request audio features is 100
chunk_size = 100

# Request audio features from Spotify
audio_feat = []
for start_index in tqdm(range(len(track_ids))[::chunk_size]):
    end_index = start_index + chunk_size
    chunk = track_ids[start_index:end_index]
    audio_feat.extend(sp.audio_features(tracks=chunk))

#audio_feat

  0%|          | 0/868 [00:00<?, ?it/s]

In [24]:
len(audio_feat)

86772

In [25]:
# Some songs don't have audio features (None)
# Skip those to be able to create a dataframe
audio_feat = [i for i in audio_feat if i is not None]
len(audio_feat)

86766

In [26]:
# Create dataframe for audio features
df_2 = pd.DataFrame(audio_feat)

In [27]:
df_2.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.589,0.615,11,-6.638,0,0.0308,0.856,0.867,0.272,0.596,133.982,audio_features,1mXwkgjw5NLxjc8rWWJnMk,spotify:track:1mXwkgjw5NLxjc8rWWJnMk,https://api.spotify.com/v1/tracks/1mXwkgjw5NLx...,https://api.spotify.com/v1/audio-analysis/1mXw...,510013,4
1,0.412,0.965,9,-5.652,1,0.0678,0.0332,0.0,0.327,0.361,137.022,audio_features,1qJUYVFcLOmyEjzPoML22x,spotify:track:1qJUYVFcLOmyEjzPoML22x,https://api.spotify.com/v1/tracks/1qJUYVFcLOmy...,https://api.spotify.com/v1/audio-analysis/1qJU...,399333,4
2,0.331,0.619,0,-7.77,1,0.028,0.0183,0.0847,0.137,0.223,101.853,audio_features,51tBxI8vMc8DY99TCjGb5h,spotify:track:51tBxI8vMc8DY99TCjGb5h,https://api.spotify.com/v1/tracks/51tBxI8vMc8D...,https://api.spotify.com/v1/audio-analysis/51tB...,264288,5
3,0.0892,0.913,4,-3.254,1,0.061,0.032,0.00492,0.286,0.508,165.684,audio_features,6TRu6X6gkRlsR4JTnOibMA,spotify:track:6TRu6X6gkRlsR4JTnOibMA,https://api.spotify.com/v1/tracks/6TRu6X6gkRls...,https://api.spotify.com/v1/audio-analysis/6TRu...,151427,4
4,0.546,0.319,11,-8.483,0,0.0278,0.643,5.4e-05,0.0827,0.135,108.913,audio_features,2J9rPzqtsatSzZNVFX0ea8,spotify:track:2J9rPzqtsatSzZNVFX0ea8,https://api.spotify.com/v1/tracks/2J9rPzqtsatS...,https://api.spotify.com/v1/audio-analysis/2J9r...,195600,4


In [28]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86766 entries, 0 to 86765
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      86766 non-null  float64
 1   energy            86766 non-null  float64
 2   key               86766 non-null  int64  
 3   loudness          86766 non-null  float64
 4   mode              86766 non-null  int64  
 5   speechiness       86766 non-null  float64
 6   acousticness      86766 non-null  float64
 7   instrumentalness  86766 non-null  float64
 8   liveness          86766 non-null  float64
 9   valence           86766 non-null  float64
 10  tempo             86766 non-null  float64
 11  type              86766 non-null  object 
 12  id                86766 non-null  object 
 13  uri               86766 non-null  object 
 14  track_href        86766 non-null  object 
 15  analysis_url      86766 non-null  object 
 16  duration_ms       86766 non-null  int64 

In [29]:
df = pd.merge(df_1, df_2, on="id")

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86766 entries, 0 to 86765
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                86766 non-null  object 
 1   song              86766 non-null  object 
 2   artist            86766 non-null  object 
 3   danceability      86766 non-null  float64
 4   energy            86766 non-null  float64
 5   key               86766 non-null  int64  
 6   loudness          86766 non-null  float64
 7   mode              86766 non-null  int64  
 8   speechiness       86766 non-null  float64
 9   acousticness      86766 non-null  float64
 10  instrumentalness  86766 non-null  float64
 11  liveness          86766 non-null  float64
 12  valence           86766 non-null  float64
 13  tempo             86766 non-null  float64
 14  type              86766 non-null  object 
 15  uri               86766 non-null  object 
 16  track_href        86766 non-null  object

In [31]:
# Check for duplicates
df.duplicated().value_counts()

False    86766
dtype: int64

In [34]:
# Export to CSV
df.to_csv("../Files/spotify_songs.csv", index=False)