# 5. Model test and preparation for the streamlit app


In [6]:
import streamlit as st
import spotipy
import pandas as pd
import numpy as np
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import StandardScaler
import joblib
from dotenv import load_dotenv
import os
import time

# Populate the process environment from the local .env file, then read the
# two Spotify API credentials out of it.
load_dotenv()
client_id, client_secret = (
    os.getenv("SPOTIPY_CLIENT_ID"),
    os.getenv("SPOTIPY_CLIENT_SECRET"),
)

# Fail fast: every later cell needs both values, and an unset or empty
# variable would otherwise only surface as an authentication error later on.
if not (client_id and client_secret):
    raise ValueError("Spotify credentials are missing. Make sure SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET are set in your .env file.")
 


In [7]:
# Build the credentials manager from the values read from the environment and
# hand it to the Spotify client used by the lookup helpers below.
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
sp = Spotify(client_credentials_manager=client_credentials_manager)

In [8]:
# Read the clustered song catalogue and discard the 'is_hot' column in one
# chained expression, so the cell yields the same frame on every run.
df = pd.read_csv('../data/names_genres_clustered.csv').drop(columns=['is_hot'])
df.head()

Unnamed: 0,spotify_title,spotify_artist,release_date,popularity,duration_ms,explicit,is_rock,is_pop,is_jazz,is_electronic,is_classical,is_blues,is_indie,cluster
0,Je sais que la Terre est plate,Raphaël,2008,14,150040,0,1,1,0,0,0,0,0,0
1,On efface,Julie Zenatti,2004,1,253000,0,0,1,0,0,0,0,0,0
2,Howells Delight,Anonymous,2011,3,240400,0,0,1,0,0,0,0,0,4
3,Martha Served,I Hate Sally,2007,1,138760,1,0,1,0,0,0,0,1,8
4,"Zip-a-Dee-Doo-Dah (From ""Song of the South"")",Orlando Pops Orchestra,2022,0,199986,0,0,1,0,0,0,0,0,0


## 1. Load the model and the scaler, and create the functions for the Streamlit app

In [9]:
# Load the persisted KMeans model and the scaler used by the helpers below.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artefacts that come from a trusted source.
kmeans_model = joblib.load('../Model/kmeans_model_9.pkl')
# Presumably the scaler that was fitted on the model's training features;
# prepare_song_data() relies on it accepting the 11 columns it builds — TODO confirm.
scaler = joblib.load('../Scaler/scaler.pkl')

def prepare_song_data(song_info):
    """Build the scaled one-row feature matrix for a single song.

    The features are laid out in a fixed column order (release year,
    popularity, duration, explicit flag, then the seven genre flags) and
    passed through the module-level ``scaler``.

    Parameters
    ----------
    song_info : dict-like
        Must provide 'release_date', 'popularity', 'duration_ms' and
        'explicit'. Genre flags ('is_rock', 'is_pop', ...) that are absent
        are treated as 0. The mapping itself is NOT modified.

    Returns
    -------
    The result of ``scaler.transform`` on the one-row DataFrame.

    Raises
    ------
    KeyError
        If one of the four required non-genre keys is missing.
    ValueError
        If a feature value cannot be converted to a float.
    """
    base_columns = ['release_date', 'popularity', 'duration_ms', 'explicit']
    genre_columns = ['is_rock', 'is_pop', 'is_jazz', 'is_electronic', 'is_classical', 'is_blues', 'is_indie']
    feature_columns = base_columns + genre_columns

    # Collect the values in a private dict so the caller's song_info is left
    # untouched (the defaults for missing genres used to be written into it).
    row = {column: song_info[column] for column in base_columns}
    for genre in genre_columns:
        row[genre] = song_info[genre] if genre in song_info else 0

    # The Spotify lookup can deliver the year as a string and 'explicit' as a
    # bool; cast everything to float so the scaler always receives a purely
    # numeric frame instead of a mixed/object one.
    song_data = pd.DataFrame([row], columns=feature_columns).astype(float)

    # Scale the song data using the saved scaler
    scaled_song_data = scaler.transform(song_data)

    return scaled_song_data

def get_song_from_df_or_spotify(song_title, artist_name):
    """Look a song up in the local DataFrame first, then on Spotify.

    Uses the module-level ``df`` (local catalogue) and ``sp`` (Spotify client).

    Parameters
    ----------
    song_title : str
        Track title; compared case-insensitively against 'spotify_title'.
    artist_name : str
        Artist name; compared case-insensitively against 'spotify_artist'.

    Returns
    -------
    dict or None
        * Local hit: the first matching row of ``df`` as a dict (all columns).
        * Spotify hit: a dict with 'spotify_title', 'spotify_artist',
          'release_date' (year), 'popularity', 'duration_ms', 'explicit'
          (0/1) and the seven 'is_<genre>' flags (0/1).
        * ``None`` when neither source knows the song.
    """
    # Check if song is in the DataFrame (case-insensitive on both fields)
    title_matches = df['spotify_title'].str.lower() == song_title.lower()
    artist_matches = df['spotify_artist'].str.lower() == artist_name.lower()
    song = df[title_matches & artist_matches]

    if not song.empty:
        return song.iloc[0].to_dict()

    # If song not found in DataFrame, search using Spotify API
    results = sp.search(q=f"track:{song_title} artist:{artist_name}", type='track', limit=1)

    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    main_artist = track['artists'][0]

    # Keep the same value types as the local catalogue: the year as an int
    # (the API returns a 'YYYY[-MM[-DD]]' string) and 'explicit' as 0/1
    # (the API returns a bool). An unparsable year is passed through as-is.
    year_text = track['album']['release_date'][:4]
    release_year = int(year_text) if year_text.isdigit() else year_text

    genre_keywords = ['rock', 'pop', 'jazz', 'electronic', 'classical', 'blues', 'indie']

    song_info = {
        'spotify_title': track['name'],
        'spotify_artist': main_artist['name'],
        'release_date': release_year,
        'popularity': track['popularity'],
        'duration_ms': track['duration_ms'],
        'explicit': int(bool(track['explicit'])),
    }
    # Start with every genre flag cleared
    for keyword in genre_keywords:
        song_info[f'is_{keyword}'] = 0

    # Get the artist's genres using the artist id; tolerate a missing or
    # null 'genres' entry in the response.
    artist_info = sp.artist(main_artist['id'])
    genres = [genre.lower() for genre in (artist_info.get('genres') or [])]

    # A flag is set as soon as its keyword occurs inside any artist genre
    # (substring match, e.g. 'indie rock' sets both is_indie and is_rock).
    for keyword in genre_keywords:
        if any(keyword in genre for genre in genres):
            song_info[f'is_{keyword}'] = 1

    return song_info

def suggest_similar_songs(song_info, df, num_suggestions=10):
    """Recommend the most popular songs from the same cluster as a given song.

    Parameters
    ----------
    song_info : dict or None
        Song description as returned by ``get_song_from_df_or_spotify``.
        ``None`` means the song was not found.
    df : pandas.DataFrame
        Catalogue with at least 'spotify_title', 'spotify_artist',
        'popularity' and 'cluster' columns.
    num_suggestions : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or None
        Up to ``num_suggestions`` rows with the columns 'spotify_title',
        'spotify_artist', 'popularity' and 'cluster', ordered by descending
        popularity. ``None`` (after printing an error) if ``song_info`` is
        ``None``.
    """
    if song_info is None:
        print("Error: Song not found!")
        return None

    # Prepare the song data and predict its cluster with the KMeans model
    song_data = prepare_song_data(song_info)
    cluster = kmeans_model.predict(song_data)[0]

    # Songs of the same cluster, most popular first
    similar_songs = df[df['cluster'] == cluster]
    similar_songs = similar_songs.sort_values(by='popularity', ascending=False)

    # Exclude the query song itself. Match on title AND artist so that other
    # songs which merely share the title are still eligible; fall back to a
    # title-only match when the artist is not available in song_info.
    is_query_song = similar_songs['spotify_title'] == song_info['spotify_title']
    query_artist = song_info.get('spotify_artist')
    if query_artist is not None:
        is_query_song = is_query_song & (similar_songs['spotify_artist'] == query_artist)
    similar_songs = similar_songs[~is_query_song]

    # Keep the top rows and only the columns meant for display
    top_similar_songs = similar_songs.head(num_suggestions)
    recommended_songs = top_similar_songs[['spotify_title', 'spotify_artist', 'popularity', 'cluster']]

    return recommended_songs


## 2. Test the model on a song.

In [10]:
# Song details for testing (the local lookup is case-insensitive)
song_title = "fly me to the moon"
artist_name = "frank sinatra"

# Retrieve song information from the DataFrame or, failing that, from Spotify
song_info = get_song_from_df_or_spotify(song_title, artist_name)

if song_info is not None:
    # Get up to 10 songs from the same cluster, ordered by popularity
    recommended_songs = suggest_similar_songs(song_info, df, num_suggestions=10)
    
    # Display the recommended songs
    print(f"Recommended songs similar to '{song_info['spotify_title']}' by {song_info['spotify_artist']}:")
    print(recommended_songs)
else:
    # Neither the local catalogue nor the Spotify search returned a match
    print(f"Song '{song_title}' by '{artist_name}' was not found in the database or on Spotify.")


Recommended songs similar to 'Fly Me To The Moon (In Other Words)' by Frank Sinatra:
                      spotify_title    spotify_artist  popularity  cluster
2770            Ready For The Floor          Hot Chip          52        1
1829                            Sin   Nine Inch Nails          48        1
1023                    Nom Nom Nom          Hot Chip          47        1
1218      Chime (Edit) - Remastered           Orbital          46        1
2748                       Autobahn         Kraftwerk          45        1
4959  Computer Love - 2009 Remaster         Kraftwerk          43        1
2079                      The Frail   Nine Inch Nails          41        1
2940                 When I Grow Up         Fever Ray          40        1
2809                  Happy Cycling  Boards of Canada          38        1
1690                      Salva Mea         Faithless          37        1


