In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [80]:
#file load 
df_songs = pd.read_csv(
    Path("song_data.csv")
)

# Review the DataFrame
df_songs['song_name'] = df_songs['song_name'].str.lower()

df_songs = df_songs.drop_duplicates(keep='first')
df_songs

Unnamed: 0,song_name,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,boulevard of broken dreams,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,1,0.0294,167.060,4,0.474
1,in the end,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0,0.0498,105.256,4,0.370
2,seven nation army,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,1,0.0792,123.881,4,0.324
3,by the way,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,1,0.1070,122.444,4,0.198
4,how you remind me,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,1,0.0313,172.011,4,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18830,let it breathe,60,159645,0.893000,0.500,0.151,0.000065,11,0.1110,-16.107,1,0.0348,113.969,4,0.300
18831,answers,60,205666,0.765000,0.495,0.161,0.000001,11,0.1050,-14.078,0,0.0301,94.286,4,0.265
18832,sudden love (acoustic),23,182211,0.847000,0.719,0.325,0.000000,0,0.1250,-12.222,1,0.0355,130.534,4,0.286
18833,gentle on my mind,55,352280,0.945000,0.488,0.326,0.015700,3,0.1190,-12.020,1,0.0328,106.063,4,0.323


In [81]:
# Import the PCA module
from sklearn.decomposition import PCA

In [82]:
#make copy of df
df_songs_features = df_songs.copy()

#drop the song_name col
df_songs_features = df_songs_features.drop(['song_name'], axis=1)

In [83]:
df_songs_features

Unnamed: 0,song_popularity,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence
0,73,262333,0.005520,0.496,0.682,0.000029,8,0.0589,-4.095,1,0.0294,167.060,4,0.474
1,66,216933,0.010300,0.542,0.853,0.000000,3,0.1080,-6.407,0,0.0498,105.256,4,0.370
2,76,231733,0.008170,0.737,0.463,0.447000,0,0.2550,-7.828,1,0.0792,123.881,4,0.324
3,74,216933,0.026400,0.451,0.970,0.003550,0,0.1020,-4.938,1,0.1070,122.444,4,0.198
4,56,223826,0.000954,0.447,0.766,0.000000,10,0.1130,-5.065,1,0.0313,172.011,4,0.574
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18830,60,159645,0.893000,0.500,0.151,0.000065,11,0.1110,-16.107,1,0.0348,113.969,4,0.300
18831,60,205666,0.765000,0.495,0.161,0.000001,11,0.1050,-14.078,0,0.0301,94.286,4,0.265
18832,23,182211,0.847000,0.719,0.325,0.000000,0,0.1250,-12.222,1,0.0355,130.534,4,0.286
18833,55,352280,0.945000,0.488,0.326,0.015700,3,0.1190,-12.020,1,0.0328,106.063,4,0.323


In [84]:
pca = PCA(n_components=1)

# Fit the PCA model on the transformed credit card DataFrame
songs_pca = pca.fit_transform(df_songs_features)

# Review the first 5 rows of list data
songs_pca[:5]

array([[43383.41903362],
       [-2016.58134121],
       [12783.41874931],
       [-2016.58124887],
       [ 4876.41910664]])

In [85]:
pca.explained_variance_ratio_

array([0.99999967])

In [86]:
# Create the PCA DataFrame
songs_pca_df = pd.DataFrame(
    songs_pca,
    columns=["PCA"]
)

# Review the PCA DataFrame
songs_pca_df


Unnamed: 0,PCA
0,43383.419034
1,-2016.581341
2,12783.418749
3,-2016.581249
4,4876.419107
...,...
14921,-59304.581291
14922,-13283.581413
14923,-36738.581081
14924,133330.418677


In [91]:
song_names = df_songs[['song_name']]

song_names = song_names.reset_index(drop=True)
song_names

Unnamed: 0,song_name
0,boulevard of broken dreams
1,in the end
2,seven nation army
3,by the way
4,how you remind me
...,...
14921,let it breathe
14922,answers
14923,sudden love (acoustic)
14924,gentle on my mind


In [115]:
#concatenate the two dfs
horizontal_concat = pd.concat([song_names, songs_pca_df], axis=1)

horizontal_concat = horizontal_concat.sort_values(by=['PCA'])
horizontal_concat = horizontal_concat.reset_index(drop=True)

#order it by PCA value
horizontal_concat = horizontal_concat.sort_values(by=['PCA'])
horizontal_concat

Unnamed: 0,song_name,PCA
0,gina rodriguez - latinos trending intro,-2.069496e+05
1,the avengers,-1.927636e+05
2,thank you,-1.875766e+05
3,shirley chisholm - 1972,-1.830296e+05
4,twins,-1.689356e+05
...,...,...
14921,autobahn - 3-d,6.479394e+05
14922,i have a dream - the complete speech of martin...,8.289834e+05
14923,2112: overture / the temples of syrinx / disco...,1.014716e+06
14924,play,1.136988e+06


In [113]:
index_value = horizontal_concat.index[horizontal_concat['song_name'] == 'boulevard of broken dreams'].tolist()[0]
index_value

#get the index value of the next song


#return the actual song name 
new_song_name = horizontal_concat.iloc[index_value]
new_song_name

song_name    boulevard of broken dreams
PCA                        42316.419039
Name: 12322, dtype: object

In [102]:
#check the value for green day song 
index_value = horizontal_concat.index[horizontal_concat['song_name'] == 'thank you'].tolist()[0]
index_value
#get the index value of the next song
next_song = int(index_value) +1 


#return the actual song name 
new_song_name = horizontal_concat.iloc[next_song]['song_name']
new_song_name



'shirley chisholm - 1972'

In [103]:
index_value = horizontal_concat.index[horizontal_concat['song_name'] == 'boulevard of broken dreams'].tolist()[0]
index_value

#get the index value of the next song


#return the actual song name 
new_song_name = horizontal_concat.iloc[index_value]
new_song_name


song_name    boulevard of broken dreams
PCA                        42316.419039
Name: 12322, dtype: object

In [104]:
#given a song, ask the program to spit out the songs that are nearest to it in this df (1 above and 1 below.)

def song_prediction(songname): 
    index_value = horizontal_concat.index[horizontal_concat['song_name'] == songname].tolist()[0]
    song_behind = int(index_value) - 1 
    song_front = int(index_value) + 1 
    back_song_name = horizontal_concat.iloc[song_behind]['song_name']
    front_song_name = horizontal_concat.iloc[song_front]['song_name']
    
    return back_song_name, front_song_name


song_prediction('shirley chisholm - 1972')

('thank you', 'twins')

In [105]:
song_prediction('footloose')

('working for the weekend', 'work hard, play hard')

In [106]:
song_prediction('party in the u.s.a.')

('waze', 'shoegoo')

In [111]:
#return the playlist name based on the song given 
df_song_info = pd.read_csv(
    Path("song_info.csv")
)

df_song_info['song_name'] = df_song_info['song_name'].str.lower()
df_song_info.sort_values(by=['song_name'])

# df_song_info = df_song_info.drop_duplicates(keep='first')

Unnamed: 0,song_name,artist_name,album_names,playlist
5247,til i get it right,Tammy Wynette,The Essential Tammy Wynette,Country s Greatest Hits_ The 70 s
13859,till i collapse,Eminem,The Eminem Show,Power Workout
9732,till i collapse,Eminem,The Eminem Show,Jock Jams
17260,till i collapse,Eminem,The Eminem Show,Throwback Workout
16726,"""40"" - live version 1983",U2,Under A Blood Red Sky,The 80s All Lined Up
...,...,...,...,...
4531,通過驗證,艾福杰尼,通過驗證,Chinese Hip-Hop Stars
4513,都市森林,Force嘟嘟,都市森林,Chinese Hip-Hop Stars
9886,비상연락음,IndEgo Aid,비상연락음,K-Hip-Hop Beats
6694,빨간 맛 red flavor,Red Velvet,The Red Summer - Summer Mini Album,Fantastic K-Workout


In [112]:
#give it a song, and make spit back df of only those songs in that playlist
def playlist_filter(song):
    
    index_value = df_song_info.index[df_song_info['song_name'] == song].tolist()[0]
    playlist_name = df_song_info.iloc[index_value]['playlist']
    
    #from the song info df get all those songs names in that playlist
    df_playlist = df_song_info[(df_song_info.playlist == playlist_name)]

    
    #inner join the 2 DF based on the song_name 
    df_merged = df_playlist.merge(horizontal_concat, how = 'inner', left_on='song_name', right_on='song_name')
    

    
    
    return df_merged
    
playlist_filter('how you remind me')

Unnamed: 0,song_name,artist_name,album_names,playlist,PCA
0,boulevard of broken dreams,Green Day,Greatest Hits: God s Favorite Band,00s Rock Anthems,42316.419039
1,boulevard of broken dreams,Green Day,Greatest Hits: God s Favorite Band,00s Rock Anthems,43383.419034
2,in the end,Linkin Park,Hybrid Theory,00s Rock Anthems,-2016.581341
3,seven nation army,The White Stripes,Elephant,00s Rock Anthems,-46030.581343
4,seven nation army,The White Stripes,Elephant,00s Rock Anthems,12783.418749
...,...,...,...,...,...
106,remedy,Seether,Karma and Effect,00s Rock Anthems,-24298.581231
107,remedy,Seether,Karma and Effect,00s Rock Anthems,-11736.581207
108,plug in baby,Muse,Origin Of Symmetry,00s Rock Anthems,-949.581140
109,change (in the house of flies) - in the house ...,Deftones,White Pony (U.S. Version),00s Rock Anthems,80583.418881


In [112]:
def song_prediction(songname):
    
    playlist = playlist_filter(songname)
    
    
    index_value = horizontal_concat.index[horizontal_concat['song_name'] == songname].tolist()[0]
    song_behind = int(index_value) - 1 
    song_front = int(index_value) + 1 
    back_song_name = horizontal_concat.iloc[song_behind]['song_name']
    front_song_name = horizontal_concat.iloc[song_front]['song_name']
    return back_song_name, front_song_name

song_prediction('how you remind me')

('solitaire', 'what about me')