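# Indie & Alternative Song Recommender

An interactive recommender: the user names a song, it is looked up on Spotify, and a recommendation is printed. A song from the popkultur.de indie top 100 leads to another top-100 song; any other song leads to a song from the same K-Means cluster of Spotify audio features.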

## Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import getpass

## Scrape Indie Top 100

In [2]:
def indie_top_100(url="https://popkultur.de/indie-songs/", timeout=30):
    """Scrape the popkultur.de indie ranking into a dataframe.

    The page lists ranks 1-20 as ``<h2>`` headings starting with "Platz" and
    the remaining ranks in table cells with class ``column-2``, so the two
    parts are scraped and cleaned separately and then concatenated.

    Parameters
    ----------
    url : str
        Page to scrape. Defaults to the popkultur.de indie songs page.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per song with the columns ``artist`` and ``song``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Download HTML with a get request; the timeout keeps the cell from hanging forever
    response = requests.get(url, timeout=timeout)

    # Check response status code
    print(f"HTTP status code: {response.status_code}")

    # Stop here on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()

    # Parse and store the contents of the URL call
    soup = BeautifulSoup(response.content, "html.parser")

    # Scrape top 1-20: headings that start with "Platz"
    top_20 = []
    for h2 in soup.find_all("h2"):
        heading = h2.get_text()
        if heading.startswith("Platz"):
            top_20.append(heading)

    # Create dataframe for top 1-20
    df_top_20 = pd.DataFrame(top_20, columns=["rank"])

    # Clean up dataframe for top 1-20
    # Turn the first " –" into ":" so rank, artist and song share one separator
    df_top_20["rank"] = df_top_20["rank"].str.replace(" –", ":", n=1, regex=False)
    # n=2 limits the split to three parts, so a ": " inside a song title is kept
    df_top_20[["rank", "artist", "song"]] = df_top_20["rank"].str.split(
        ": ", n=2, expand=True)
    # Remove the trailing four-digit group wrapped in one character on each side
    # (presumably the release year in brackets - TODO confirm against the page)
    df_top_20["song"] = df_top_20["song"].str.replace(
        r"\s\S\d\d\d\d\S", "", regex=True)
    df_top_20 = df_top_20.drop(columns=["rank"])

    # Scrape top 21-100: select the cells once instead of once per row
    artist_song = [cell.text for cell in soup.select("td.column-2")]

    # Create dataframe for top 21-100
    df_top_21_100 = pd.DataFrame(artist_song, columns=["artist_song"])

    # Clean up dataframe for top 21-100
    # Split on the first " - " only, so hyphenated names such as "Blink-182" survive
    df_top_21_100[["artist", "song"]] = df_top_21_100["artist_song"].str.split(
        " - ", n=1, expand=True)
    df_top_21_100 = df_top_21_100.drop(columns=["artist_song"])

    # Combine dataframes into one (DataFrame.append no longer exists in pandas 2.x)
    top_100 = pd.concat([df_top_20, df_top_21_100], ignore_index=True)

    return top_100

In [3]:
# Scrape the ranking once and keep it in `top_100`; the bare last expression
# renders the resulting dataframe as the cell output
top_100 = indie_top_100()
top_100

HTTP status code: 200


Unnamed: 0,artist,song
0,The Killers,Mr. Brightside
1,Radiohead,Creep
2,Kings of Leon,Sex on Fire
3,Blur,Song 2
4,Oasis,Wonderwall
...,...,...
95,Primal Scream,Come Together
96,The Maccabees,Toothpaste Kisses
97,Arctic Monkeys,A Certain Romance
98,Blink-182,I Miss You


## Create Dataframe

In [4]:
# Load the song collection (track id, song, artist and Spotify audio features) from CSV
collection = pd.read_csv("../Files/spotify_songs.csv")

In [5]:
# Inspect row count, column names, non-null counts and dtypes of the collection
collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86766 entries, 0 to 86765
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                86766 non-null  object 
 1   song              86766 non-null  object 
 2   artist            86766 non-null  object 
 3   danceability      86766 non-null  float64
 4   energy            86766 non-null  float64
 5   key               86766 non-null  int64  
 6   loudness          86766 non-null  float64
 7   mode              86766 non-null  int64  
 8   speechiness       86766 non-null  float64
 9   acousticness      86766 non-null  float64
 10  instrumentalness  86766 non-null  float64
 11  liveness          86766 non-null  float64
 12  valence           86766 non-null  float64
 13  tempo             86766 non-null  float64
 14  type              86766 non-null  object 
 15  uri               86766 non-null  object 
 16  track_href        86766 non-null  object

In [6]:
# Drop the columns that are not used for clustering.
# errors="ignore" keeps this cell idempotent: running it a second time, when the
# columns are already gone, no longer raises a KeyError.
collection = collection.drop(
    columns=["type", "uri", "track_href",
             "analysis_url", "duration_ms", "time_signature"],
    errors="ignore",
)
collection.head()

Unnamed: 0,id,song,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,7aciebfpMZffzCTSg9rLIZ,Levitation,Beach House,0.249,0.452,2,-7.126,1,0.0303,0.55,0.427,0.0614,0.0676,142.352
1,6bzeIyoDKQdJU3NWc56u3u,Sparks,Beach House,0.336,0.761,3,-6.259,1,0.0353,0.037,0.0155,0.163,0.423,96.008
2,1ZgMsA55GIY7ICkQh5MILA,Space Song,Beach House,0.507,0.79,0,-7.307,0,0.0294,0.233,0.139,0.145,0.6,147.065
3,0fbKFguQCxauLvVZ262f4c,Beyond Love,Beach House,0.376,0.56,11,-6.631,1,0.0253,0.568,0.193,0.156,0.179,160.329
4,66rCCXbN1ggzjTYibdJp3n,10:37,Beach House,0.572,0.424,5,-9.225,1,0.0268,0.729,0.0914,0.093,0.477,96.065


## Clustering

In [7]:
def kmeans(collection, n_clusters=13, random_state=40):
    """Assign every song in ``collection`` to a K-Means cluster.

    All columns except ``id``, ``song``, ``artist`` and an existing
    ``cluster`` column are used as features. They are standardised with
    ``StandardScaler`` before the model is fitted.

    Parameters
    ----------
    collection : pandas.DataFrame
        Songs with their audio features; must contain ``key`` and ``mode``.
    n_clusters : int
        Number of clusters to fit.
    random_state : int
        Seed passed to ``KMeans`` so repeated runs give the same clusters.

    Returns
    -------
    pandas.DataFrame
        A new dataframe equal to ``collection`` plus a ``cluster`` column.
        The input dataframe is left untouched.
    """
    # Drop columns that are not features (a previous "cluster" column included)
    features = collection.drop(
        columns=["id", "song", "artist", "cluster"], errors="ignore")

    # Convert the integer columns to float so every feature has the same dtype
    features = features.astype({"key": "float", "mode": "float"})

    # Scale data with the standard scaler and keep the column names
    features_scaled = pd.DataFrame(
        StandardScaler().fit_transform(features), columns=features.columns)

    # Define and fit the model (named `model` so it does not shadow this function)
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    model.fit(features_scaled)

    # Predict / assign the clusters
    clusters = model.predict(features_scaled)

    # assign() returns a new frame, so the caller's dataframe is never modified
    return collection.assign(cluster=clusters)

## Set up Spotipy

In [8]:
# Ask for the Spotify API credentials at run time instead of hard-coding them;
# getpass hides the typed values (the recorded cell output shows them masked)
client_id = str(getpass.getpass("Client ID: "))
client_secret = str(getpass.getpass("Client Secret: "))

Client ID: ········
Client Secret: ········


In [9]:
# Initialize SpotiPy with user credentials.
# `sp` is the notebook-wide client that answer_yes() and song_recommender() call.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id = client_id,
    client_secret = client_secret))

## Set up Functions

In [10]:
def answer_yes(song_id, df_1, collection):
    """Print a recommendation from the same cluster as the given song.

    If the song is not part of ``collection`` yet, its audio features are
    fetched from Spotify and the song is added to a copy of the collection
    before clustering. The collection passed in is not modified here.

    Parameters
    ----------
    song_id : str
        Spotify track id of the song the user confirmed.
    df_1 : pandas.DataFrame
        One-row dataframe with the columns ``id``, ``song`` and ``artist``.
    collection : pandas.DataFrame
        Known songs with their audio features.

    Returns
    -------
    None
        The recommendation is printed, not returned.
    """
    print("")
    print("Great! Please wait just a moment ...")

    # Check if song already exists in dataframe.
    # Only the id column is compared; scanning every cell of every column is
    # slow and could match the id against unrelated values.
    if not collection["id"].eq(song_id).any():

        print("")
        print("It looks like I don't know this song yet. Give me a few seconds to find a similar one ...")

        # Get audio features of the song
        features = sp.audio_features(song_id)

        # Spotify answers with a None entry when it has no features for a track
        if not features or features[0] is None:
            print("")
            print("Sorry, I couldn't find any audio features for this song. Feel free to try a different one.")
            print("")
            return

        df_2 = pd.DataFrame(features)

        # Drop irrelevant columns
        df_2 = df_2.drop(["type", "uri", "track_href",
                          "analysis_url", "duration_ms", "time_signature"], axis=1)

        # Merge song details and audio features
        df = pd.merge(df_1, df_2, on="id")

        # Add song to collection (DataFrame.append no longer exists in pandas 2.x)
        collection_new = pd.concat([collection, df], ignore_index=True)

        collection_cluster = kmeans(collection_new)

    else:
        collection_cluster = kmeans(collection)

    # Find the cluster of the song
    cluster = collection_cluster.loc[
        collection_cluster["id"] == song_id, "cluster"].values[0]

    # Find another song in the same cluster; the song itself is excluded so the
    # user is never recommended the song they just entered
    same_cluster = collection_cluster.loc[collection_cluster["cluster"] == cluster]
    candidates = same_cluster.loc[same_cluster["id"] != song_id]

    if candidates.empty:
        print("")
        print("Sorry, I couldn't find a similar song. Feel free to try a different one.")
        print("")
        return

    suggestion = candidates.sample()
    suggestion_artist = suggestion["artist"].values[0]
    suggestion_song = suggestion["song"].values[0]
    suggestion_id = suggestion["id"].values[0]

    print("")
    print("*" * 10)
    print(
        f"Here is my recommendation for you: '{suggestion_song}' by {suggestion_artist}.")
    print(
        f"Play the song on Spotify: https://open.spotify.com/track/{suggestion_id}")
    print("*" * 10)
    print("")

In [11]:
def _ask_yes_no(question):
    """Ask ``question`` until the user types 'yes' or 'no'; return that answer in lower case."""
    while True:
        print("")
        answer = input(question).strip().lower()

        if answer in ("yes", "no"):
            return answer

        print("")
        print("Please answer with 'yes' or 'no': ")


def _is_top_100(song_title, song_artist, top_100):
    """Return True if title and artist appear together in one row of ``top_100``."""
    same_row = (top_100["song"] == song_title) & (top_100["artist"] == song_artist)
    return bool(same_row.any())


def _recommend_from_top_100(song_title, song_artist, top_100):
    """Print a randomly chosen top 100 song other than the one the user entered."""
    is_input_song = (top_100["song"] == song_title) & (top_100["artist"] == song_artist)
    others = top_100.loc[~is_input_song]

    # Only fall back to the full list if nothing else is left to sample from
    if others.empty:
        others = top_100

    suggestion = others.sample()
    suggestion_song = suggestion["song"].values[0]
    suggestion_artist = suggestion["artist"].values[0]

    # Query Spotify for song to get a playable link
    top_song_search = sp.search(
        q=suggestion_song + " " + suggestion_artist, limit=1, type="track")
    items = top_song_search["tracks"]["items"]

    print("")
    print("*" * 10)
    print(
        f"You've got excellent taste! This is a top 100 song. Then you might also like '{suggestion_song}' by {suggestion_artist}.")
    # The search may come back empty; the recommendation is still shown without a link
    if items:
        print(
            f"Play the song on Spotify: {items[0]['external_urls']['spotify']}")
    print("*" * 10)
    print("")


def song_recommender(top_100, collection):
    """Run the interactive recommender until the user enters 'q'.

    The user's input is searched on Spotify and up to two of the returned
    tracks are offered for confirmation. A confirmed song that is in
    ``top_100`` leads to another top 100 song; any other confirmed song is
    handed to ``answer_yes`` for a cluster-based recommendation.

    Parameters
    ----------
    top_100 : pandas.DataFrame
        Ranking with the columns ``artist`` and ``song``.
    collection : pandas.DataFrame
        Known songs with their audio features, passed on to ``answer_yes``.

    Returns
    -------
    None
        All interaction happens through ``input`` and ``print``.
    """
    # First and second confirmation question, in the order they are asked
    questions = (
        "Do you mean '{title}' by {artist}? Please answer with 'yes' or 'no': ",
        "Okay, perhaps you mean '{title}' by {artist}? Please answer with 'yes' or 'no': ",
    )

    while True:
        # Ask user for a song
        user_input = input(
            "Please tell me a song you like. Enter 'q' to quit. ")
        user_input = user_input.strip().lower()

        if user_input == "q":
            print("")
            print("Thanks for using me. See ya!")
            break

        # Nothing typed: ask again instead of sending an empty query to Spotify
        if not user_input:
            continue

        # Query Spotify for song
        song_search = sp.search(q=user_input, limit=3, type="track")

        # At most two candidates are offered, one per question above
        candidates = song_search["tracks"]["items"][:len(questions)]

        if not candidates:
            print("")
            print("Sorry, I couldn't find any song for that. Feel free to try a different one.")
            print("")
            continue

        for question, track in zip(questions, candidates):
            # Get song details
            song_id = track["id"]
            song_title = track["name"]
            song_artist = track["artists"][0]["name"]

            # Check if song is the correct one
            answer = _ask_yes_no(
                question.format(title=song_title, artist=song_artist))

            if answer == "no":
                continue

            if _is_top_100(song_title, song_artist, top_100):
                _recommend_from_top_100(song_title, song_artist, top_100)
            else:
                df_1 = pd.DataFrame({"id": [song_id],
                                     "song": [song_title],
                                     "artist": [song_artist]})

                answer_yes(song_id, df_1, collection)
            break

        else:
            # Every offered candidate was rejected
            print("")
            print("Sorry, then I don't know which song you mean. Feel free to try a different one.")
            print("")

## Indie & Alternative Song Recommender

<img src="../Images/the_killers_2.jpeg" title="The Killers">

In [12]:
# Start the interactive recommender loop; it runs until the user enters 'q'
song_recommender(top_100, collection)

Please tell me a song you like. Enter 'q' to quit. underdog

Do you mean 'Underdog' by Alicia Keys? Please answer with 'yes' or 'no': no

Okay, perhaps you mean 'Underdog' by Kasabian? Please answer with 'yes' or 'no': yes

Great! Please wait just a moment ...

**********
Here is my recommendation for you: 'Too Bad, So Sad' by Metric.
Play the song on Spotify: https://open.spotify.com/track/5UqpZZE0TfGMRdv0b8RX8a
**********

Please tell me a song you like. Enter 'q' to quit. hello

Do you mean 'Hello (feat. A Boogie Wit da Hoodie)' by Pop Smoke? Please answer with 'yes' or 'no': no

Okay, perhaps you mean 'Hello' by KAROL G? Please answer with 'yes' or 'no': no

Sorry, then I don't know which song you mean. Feel free to try a different one.

Please tell me a song you like. Enter 'q' to quit. hello adele

Do you mean 'Hello' by Adele? Please answer with 'yes' or 'no': yes

Great! Please wait just a moment ...

It looks like I don't know this song yet. Give me a few seconds to find a sim