In [1]:
import plotly.io as pio
pio.renderers.default = "iframe"

In [2]:
import warnings
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')  

In [3]:
df = pd.read_csv("spotify_songs.csv")

In [4]:
df.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [5]:
df.info

<bound method DataFrame.info of                      track_id  \
0      6f807x0ima9a1j3VPbc7VN   
1      0r7CVbZTWZgbTCYdfa2P31   
2      1z1Hg7Vb0AhHDiEmnDE79l   
3      75FpbthrwQmzHlBJLuGdC7   
4      1e8PAfcKUYoKkxPhrHqw4x   
...                       ...   
32828  7bxnKAamR3snQ1VGLuVfC1   
32829  5Aevni09Em4575077nkWHz   
32830  7ImMqPP3Q1yfUHvsdn7wEo   
32831  2m69mhnfQ1Oq6lGtXuYhgX   
32832  29zWqhca3zt5NsckZqDf6c   

                                              track_name      track_artist  \
0      I Don't Care (with Justin Bieber) - Loud Luxur...        Ed Sheeran   
1                        Memories - Dillon Francis Remix          Maroon 5   
2                        All the Time - Don Diablo Remix      Zara Larsson   
3                      Call You Mine - Keanu Silva Remix  The Chainsmokers   
4                Someone You Loved - Future Humans Remix     Lewis Capaldi   
...                                                  ...               ...   
32828               City 

In [6]:
df.shape

(32833, 23)

In [7]:
df.isnull().sum()

track_id                    0
track_name                  5
track_artist                5
track_popularity            0
track_album_id              0
track_album_name            5
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [8]:
print(f'Shape before null removal: {df.shape}')
df.dropna(inplace=True)
print(f'Shape after null removal: {df.shape}')

Shape before null removal: (32833, 23)
Shape after null removal: (32828, 23)


In [9]:
for genre in df['playlist_genre'].unique():
    print(f"{genre}: {df['playlist_subgenre'][df['playlist_genre'] == genre].unique().tolist()}")

pop: ['dance pop', 'post-teen pop', 'electropop', 'indie poptimism']
rap: ['hip hop', 'southern hip hop', 'gangster rap', 'trap']
rock: ['album rock', 'classic rock', 'permanent wave', 'hard rock']
latin: ['tropical', 'latin pop', 'reggaeton', 'latin hip hop']
r&b: ['urban contemporary', 'hip pop', 'new jack swing', 'neo soul']
edm: ['electro house', 'big room', 'pop edm', 'progressive electro house']


In [10]:
names = []
parents = [''] * 6
for genre in df['playlist_genre'].unique():
    names.append(genre)
    parents.append(genre)
    parents.append(genre)
    parents.append(genre)
    parents.append(genre)

for genre in df['playlist_genre'].unique():
    x = df['playlist_subgenre'][df['playlist_genre'] == genre]
    names.extend(x.unique().tolist())

In [11]:
fig = go.Figure(go.Sunburst(
    labels=names,
    parents=parents,
    insidetextorientation='radial',
))

fig.update_layout(
    font=dict(family='Arial', size=14),
    title_text='Genres & their Subgenres',
    title_x=0.5,
)

fig.show()

In [12]:
plot_df = df.groupby(['track_artist', 'track_album_name'], as_index=False)['track_popularity'].mean()
plot_df['track_popularity'] = plot_df['track_popularity'].astype(int)
plot_df = plot_df.sort_values(by='track_popularity', ascending=False)

In [13]:
fig = px.treemap(
    data_frame=plot_df.iloc[:100, :],
    path=[px.Constant("All Artists"), 'track_artist', 'track_album_name'],
    values='track_popularity',
    color='track_popularity',
    color_continuous_scale='OrRd',
    labels={'track_popularity': 'Popularity'}
)

fig.update_layout(title=dict(
    text="Music Artists and their Most Popular Albums on Spotify",
    x=0.5
))

fig.show()

In [14]:
def artist_most_popular_songs(artist_name=''):
    while artist_name not in df['track_artist'].unique().tolist():
        artist_name = input("Artist not present in the database\nPlease enter another one: ")

    temp = df[df['track_artist'] == artist_name]
    plot_df = temp.groupby(['playlist_genre', 'track_name'], as_index=False)['track_popularity'].mean()
    plot_df = plot_df.sort_values(by='track_popularity', ascending=False)
    plot_df.drop_duplicates(subset='track_name', inplace=True)

    fig = px.treemap(
        data_frame=plot_df,
        path=[px.Constant(artist_name), 'playlist_genre', 'track_name'],
        values='track_popularity',
        color='track_popularity',
        color_continuous_scale='balance',
        labels={'track_popularity': 'Popularity'},
    )

    fig.update_layout(
        title=dict(
            text=f"Most popular tracks by {artist_name} divided by genre",
            x=0.5
        )
    )

    fig.show()

In [15]:
artist_most_popular_songs('Ed Sheeran')

In [16]:
artist_most_popular_songs('The Weeknd')


In [17]:
artist_most_popular_songs('Maroon 5')


In [18]:
df['year'] = df['track_album_release_date'].str[:4].astype(int)
plot_df = df.groupby(['playlist_genre', 'year'], as_index=False)['track_popularity'].mean()
plot_df = plot_df.sort_values(by='year')
plot_df = plot_df[plot_df['year'] >= 2000]
genres = plot_df['playlist_genre'].unique().tolist()

colors = ['black', 'lime', 'red', 'lavender', 'gold', 'cyan']
annotations = []

In [19]:
fig = go.Figure()

for i in range(0, 6):
    fig.add_trace(go.Scatter(
        x=plot_df['year'].unique().tolist(),
        y=plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]],
        name=genres[i],
        mode='lines',
        connectgaps=True,
        line=dict(color=colors[i]),
    ))

    fig.add_trace(go.Scatter(
        x=[plot_df['year'].unique().tolist()[0],
           plot_df['year'].unique().tolist()[-1]],
        y=[plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]].values.tolist()[0],
           plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]].values.tolist()[-1]],
        mode='markers',
        marker=dict(color=colors[i], size=12)
    ))

    annotations.append(dict(
        xref='paper', x=0.05, yref='y', y=plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]].values.tolist()[0],
        xanchor='right', yanchor='middle',
        text=f"{genres[i].capitalize()} {plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]].values.tolist()[0]:.1f}%",
        font=dict(family='Arial', size=16), showarrow=False
    ))
    annotations.append(dict(
        xref='paper', x=0.95, yref='y',
        y=plot_df['track_popularity'][plot_df['playlist_genre'] == genres[i]].values.tolist()[-1],
        xanchor='left', yanchor='middle', text=f'{genres[i].capitalize()}',
        font=dict(family='Arial', size=16), showarrow=False
    ))

annotations.append(dict(
    xref='paper', x=0, yref='paper', y=1.12,
    xanchor='left', yanchor='top', text='Popularity of each genre since the 2000s',
    font=dict(family='Arial', size=30, color='rgb(37,37,37)'), showarrow=False
))

In [20]:
fig.update_layout(
    xaxis=dict(
        showline=True,
        showticklabels=True,
        linewidth=5,
        showgrid=False,
        linecolor='rgb(204, 204, 204)',
        ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='rgb(82, 82, 82)',
        )
    ),
    yaxis=dict(
        showline=False,
        showticklabels=False,
        showgrid=False,
    ),
    showlegend=False,
    plot_bgcolor='white',
    annotations=annotations
)

fig.show()

In [21]:
features = ['danceability', 'energy',
            'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo', 'track_popularity', 'track_name']

In [22]:
temp = df[features]
temp.drop_duplicates(subset='track_name', inplace=True)
temp.set_index('track_name', inplace=True)

In [23]:
features = ['danceability', 'energy',
            'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo', 'track_popularity']

In [24]:
from sklearn.preprocessing import RobustScaler
from sklearn.metrics.pairwise import cosine_similarity

scaler = RobustScaler()
temp[features] = scaler.fit_transform(temp[features])
temp.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_popularity
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
I Don't Care (with Justin Bieber) - Loud Luxury Remix,0.38191,0.745247,1.010238,-0.055215,0.076831,-0.002759,-0.392244,0.013699,0.000891,0.657143
Memories - Dillon Francis Remix,0.271357,0.361217,0.364139,-0.269939,-0.041003,0.602126,1.462174,0.493151,-0.632968,0.685714
All the Time - Don Diablo Remix,0.015075,0.802281,0.78943,0.107362,-0.013137,0.000589,-0.108074,0.273973,0.057543,0.771429
Call You Mine - Keanu Silva Remix,0.231156,0.798479,0.693691,0.391616,-0.214968,-0.001404,0.48951,-0.646575,-0.001408,0.485714
Someone You Loved - Future Humans Remix,-0.110553,0.429658,0.44632,-0.284254,-0.009554,-0.002759,-0.277813,0.580822,0.056623,0.742857


In [25]:
# At this point, I was getting a Memory Error because the matrix is HUGE and computing it would require a lot of RAM.
# I was even struggling to keep webpages open because I didn't have enough memory.
# But after some research I found out that I don't need the whole similarity matrix so I decided to alter the code from this point onwards.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import gc  # For garbage collection

# Step 1: Select relevant features
features = ['danceability', 'energy', 'loudness', 'speechiness', 
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']  # or whatever your project uses

# Step 2: Standardise the features
scaler = StandardScaler()
X = scaler.fit_transform(df[features])

# Step 3: Setup NearestNeighbors with small top_n (e.g. 5)
top_n = 5
nn = NearestNeighbors(n_neighbors=top_n + 1,  # +1 because the closest will be itself
                      metric='cosine', 
                      algorithm='brute', 
                      n_jobs=1)  # Use 1 core to avoid memory spikes

# Step 4: Fit the model
nn.fit(X)

# Step 5: Generate recommendations one row at a time
recommendations = []

for i in range(len(X)):
    dist, idx = nn.kneighbors([X[i]], n_neighbors=top_n + 1)
    recs = idx[0][1:]  # skip the first one (it's the item itself)
    recommendations.append(recs)
    
    # Free up memory every 1000 rows
    if i % 1000 == 0:
        gc.collect()

# Step 6: Convert to DataFrame
rec_df = pd.DataFrame(recommendations, index=df.index, columns=[f"Rec_{i+1}" for i in range(top_n)])


In [39]:
import tkinter as tk
from tkinter import ttk
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# --- Data Prep ---
df = pd.read_csv("spotify_songs.csv")
df.columns = df.columns.str.strip().str.lower()
df.dropna(subset=['track_artist', 'track_name'], inplace=True)
df['track_artist_lower'] = df['track_artist'].str.lower()

features = ['danceability', 'energy', 'loudness', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features].fillna(0))

# --- Tkinter Root ---
root = tk.Tk()
root.title("Spotify Recommendation System")
root.geometry("800x700")
root.configure(bg="#121212")

wrapper = tk.Frame(root, bg="#121212")
wrapper.pack(expand=True)

# --- Header ---
tk.Label(wrapper, text="Spotify Song Recommendation System", font=("Helvetica", 16, "bold"),
         bg="#121212", fg="#1DB954").pack(pady=(20, 10))

# --- Mode Selection ---
mode_var = tk.StringVar(value="artist")

mode_frame = tk.Frame(wrapper, bg="#121212")
mode_frame.pack(pady=(0, 10))

tk.Radiobutton(mode_frame, text="🎤 By Artist", variable=mode_var, value="artist", bg="#121212", fg="white",
               selectcolor="#121212", activeforeground="#1DB954", font=("Helvetica", 11),
               command=lambda: switch_mode("artist")).pack(side="left", padx=10)
tk.Radiobutton(mode_frame, text="🎧 By Mood", variable=mode_var, value="mood", bg="#121212", fg="white",
               selectcolor="#121212", activeforeground="#1DB954", font=("Helvetica", 11),
               command=lambda: switch_mode("mood")).pack(side="left", padx=10)

# --- Artist Widgets ---
artist_var = tk.StringVar()
unique_artists = sorted(df['track_artist'].dropna().unique(), key=lambda x: x.lower())

artist_label = tk.Label(wrapper, text="Select an Artist:", font=("Helvetica", 11), bg="#121212", fg="white")
artist_menu = ttk.Combobox(wrapper, textvariable=artist_var, width=45, font=("Helvetica", 10))
artist_menu['values'] = unique_artists

song_label = tk.Label(wrapper, text="Pick Your Favourite Songs:", font=("Helvetica", 11), bg="#121212", fg="white")
song_listbox = tk.Listbox(wrapper, selectmode=tk.MULTIPLE, width=60, height=6,
                          bg="#1E1E1E", fg="white", font=("Helvetica", 10), selectbackground="#1DB954")

def update_songs(event=None):
    artist_input = artist_var.get().lower()
    match_artist = df[df['track_artist_lower'] == artist_input]
    songs = match_artist['track_name'].dropna().unique()
    song_listbox.delete(0, tk.END)
    for song in sorted(songs):
        song_listbox.insert(tk.END, song)

def open_dropdown(event=None):
    artist_menu.event_generate('<Down>')

def filter_artists():
    value = artist_var.get().lower()
    artist_menu['values'] = [a for a in unique_artists if value in a.lower()]

artist_menu.bind("<Return>", open_dropdown)
artist_menu.bind("<KeyRelease>", lambda e: filter_artists())
artist_menu.bind("<<ComboboxSelected>>", update_songs)

# --- Mood Sliders ---
sliders = {}

def create_slider(name):
    f = tk.Frame(wrapper, bg="#121212")
    tk.Label(f, text=name.capitalize(), font=("Helvetica", 10), bg="#121212", fg="white", width=15, anchor="w").pack(side="left")
    var = tk.DoubleVar(value=5)
    sliders[name] = var
    tk.Scale(f, from_=0, to=10, orient="horizontal", resolution=1,
             variable=var, bg="#1E1E1E", fg="white", troughcolor="#1DB954",
             highlightthickness=0, width=8, length=250).pack(side="left", padx=5)
    return f

slider_frames = [create_slider(f) for f in features]

# --- Output Text Area ---
output_text = tk.Text(wrapper, height=10, width=85, wrap="word", bg="#1E1E1E", fg="white", insertbackground="white")
recommend_btn = ttk.Button(wrapper, text="🎵 Get Recommendations", command=lambda: get_recommendations())
copy_btn = ttk.Button(wrapper, text="📋 Copy Recommendations", command=lambda: copy_to_clipboard())

# --- Mode Switch Logic ---
def switch_mode(mode):
    # Clear previous
    for widget in wrapper.pack_slaves():
        if widget not in (mode_frame, wrapper.pack_slaves()[0]):
            widget.pack_forget()

    if mode == "artist":
        artist_label.pack()
        artist_menu.pack(pady=2)
        song_label.pack()
        song_listbox.pack(pady=5)
    else:
        for slider in slider_frames:
            slider.pack(pady=2)

    recommend_btn.pack(pady=10)
    output_text.pack(pady=(0, 10))
    copy_btn.pack(pady=(0, 10))

switch_mode("artist")

# --- Recommendation Logic ---
def get_recommendations():
    output_text.delete('1.0', tk.END)

    if mode_var.get() == "artist":
        selected_songs = [song_listbox.get(i) for i in song_listbox.curselection()]
        artist_input = artist_var.get().lower()
        selected_df = df[(df['track_artist_lower'] == artist_input) & (df['track_name'].isin(selected_songs))]
        if selected_df.empty:
            output_text.insert(tk.END, "No matching songs found.\n")
            return
        genre = selected_df['playlist_genre'].dropna().iloc[0]
        genre_df = df[df['playlist_genre'] == genre].copy()
        selected_vec = selected_df[features].mean().values.reshape(1, -1)
    else:
        user_vector = [sliders[f].get() / 10 for f in features]
        selected_vec = scaler.transform([user_vector])
        genre_df = df.copy()

    genre_df['similarity'] = cosine_similarity(genre_df[features], selected_vec).flatten()
    top_recs = genre_df.sort_values(by='similarity', ascending=False).head(10)

    output_text.insert(tk.END, "Top Recommendations:\n\n")
    for _, row in top_recs.iterrows():
        release = row.get('track_album_release_date', 'Unknown')
        output_text.insert(tk.END, f"• {row['track_name']} by {row['track_artist']} — {row['playlist_genre']} / {row['playlist_subgenre']} — Released: {release}\n")

def copy_to_clipboard():
    recommendations = output_text.get("1.0", tk.END)
    root.clipboard_clear()
    root.clipboard_append(recommendations)
    root.update()

In [40]:
root.mainloop()