In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('dataset.csv')
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

(114000, 21)

In [3]:
duplicated_rows = df['track_id'].duplicated().sum()

if duplicated_rows != 0:
    df = df.drop_duplicates(subset=['track_id'])
    undroped_nrows, ncols = df.shape
    dropped_rows = duplicated_rows
    print(f'There is a total of {dropped_rows} rows dropped based on "track_id')
    
print(f'Final dataframe shape: {df.shape}')

There is a total of 24259 rows dropped based on "track_id
Final dataframe shape: (89741, 21)


In [4]:
total_rows_with_missing_values = (df.isnull().any(axis=1)).sum()
print(f'Total number of rows with missing values: {total_rows_with_missing_values}')

Total number of rows with missing values: 1


In [5]:
index_to_drop = df[df.isnull().any(axis=1)].index
df.drop(index_to_drop, inplace=True)
print(f'Rows with missing values dropped. Updated dataframe shape: {df.shape}')

Rows with missing values dropped. Updated dataframe shape: (89740, 21)


In [6]:
df = df[(df['duration_ms']<=600000) & (df['duration_ms']>=60000)]
print(f'The dataframe now has {df.shape[0]} rows')

The dataframe now has 88419 rows


In [7]:
df = df[df['popularity']>=10]
print(f'The dataframe now has {df.shape[0]} rows')

The dataframe now has 74778 rows


In [8]:
df = df.drop(['time_signature', 'key'], axis=1)

In [9]:
output_file = 'Project_data.csv'
df.to_csv(output_file, index=False)

In [10]:
df.shape

(74778, 19)

In [11]:
features = ['danceability', 'energy', 'valence', 'tempo', 'acousticness', 'liveness', 'speechiness', 'instrumentalness']
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(df[features])
pd.DataFrame(normalized_features, columns=features).head()

Unnamed: 0,danceability,energy,valence,tempo,acousticness,liveness,speechiness,instrumentalness
0,0.686294,0.461,0.718593,0.361245,0.032329,0.358,0.148494,1e-06
1,0.426396,0.166,0.268342,0.318397,0.927711,0.101,0.079232,6e-06
2,0.44467,0.359,0.120603,0.313643,0.210843,0.117,0.05784,0.0
3,0.270051,0.0596,0.143719,0.746758,0.908635,0.132,0.037695,7.1e-05
4,0.627411,0.443,0.167839,0.492863,0.470884,0.0829,0.054621,0.0


In [12]:
df['track_id'] = df.index
df['artists'] = df['artists'].astype(str)
df['album_name'] = df['album_name'].astype(str)
df['track_name'] = df['track_name'].astype(str)
df['track_genre'] = df['track_genre'].astype(str)

In [13]:
preprocessed_file_path = 'preprocessed_dataset.csv'
df.to_csv(preprocessed_file_path, index=False)

In [14]:
from sklearn.neighbors import NearestNeighbors

k = 5
knn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
knn_model.fit(normalized_features)

def recommend_songs(song_index, num_recommendations=5):
    distances, indices = knn_model.kneighbors([normalized_features[song_index]], n_neighbors=num_recommendations + 1)
    recommendations = df.iloc[indices[0][1:]]
    return recommendations[['track_name', 'artists', 'album_name', 'track_genre', 'popularity']]

sample_song_index = 21000

print(df.iloc[sample_song_index])
recommended_songs = recommend_songs(sample_song_index)
recommended_songs

Unnamed: 0              28137
track_id                28137
artists             GG Magree
album_name          Deja Reve
track_name          Deja Reve
popularity                 54
duration_ms            168073
explicit                False
danceability            0.611
energy                  0.618
loudness               -6.573
mode                        0
speechiness            0.0655
acousticness           0.0029
instrumentalness     0.000757
liveness                 0.24
valence                 0.515
tempo                  99.999
track_genre               dub
Name: 28137, dtype: object


Unnamed: 0,track_name,artists,album_name,track_genre,popularity
25565,Kylie - Remix,Akcent;Midi Culture,Kylie (Remix),disco,46
65806,DALLA DALLA,ITZY,IT'z Different,k-pop,68
23110,Pumped Up,Klingande,The Album,deep-house,58
66308,Wheels on the Bus,Hip Hop Harry,Nursery Rhymes (Bonus Tracks),kids,14
13200,House Party,Aaron Smith;Malik Hart,House Party,chicago-house,21


In [15]:
def recommend_by_song_name(song_name, num_recommendations=5):
    matches = df[df['track_name'].str.contains(song_name, case=False, na=False)]
    if matches.empty:
        return f"No matches found for song name: {song_name}"
    print(matches)
    song_index = matches.index[0]
    print(f"Recommendations based on: {matches.iloc[0]['track_name']} by {matches.iloc[0]['artists']}")
    return recommend_songs(song_index, num_recommendations)

song_name_input = "Rap God"

recommendations = recommend_by_song_name(song_name_input)
recommendations

       Unnamed: 0  track_id artists                         album_name  \
51614       51614     51614  Eminem  The Marshall Mathers LP2 (Deluxe)   

      track_name  popularity  duration_ms  explicit  danceability  energy  \
51614    Rap God          78       363521      True         0.708   0.843   

       loudness  mode  speechiness  acousticness  instrumentalness  liveness  \
51614     -2.66     1        0.314         0.397               0.0     0.799   

       valence   tempo track_genre  
51614    0.625  148.14     hip-hop  
Recommendations based on: Rap God by Eminem


Unnamed: 0,track_name,artists,album_name,track_genre,popularity
75096,Returning,Tony O'Connor,Mariner,new-age,32
101204,Sonder,Modal Colours,Sonder,sleep,65
11993,"Mother Of God, Here I Stand",John Tavener;Voces8,Lux,british,27
101765,Perspective,Jorin Williams,Perspective,sleep,60
101355,Amado Nervo,Sayulita,Amado Nervo,sleep,62


In [16]:
def recommend_by_track_id(track_id, num_recommendations=5):
    if track_id not in df.index:
        return f"No matches found for track ID: {track_id}"
    
    song_index = track_id
    print(f"Recommendations based on: {df.loc[song_index, 'track_name']} by {df.loc[song_index, 'artists']}")
     
    distances, indices = knn_model.kneighbors([normalized_features[song_index]], n_neighbors=num_recommendations + 1)
    recommendations = df.iloc[indices[0][1:]]
    
    return recommendations[['track_id', 'track_name', 'artists', 'album_name', 'track_genre', 'popularity']]

sample_track_id = 22
recommendations = recommend_by_track_id(sample_track_id)
print(recommendations)

Recommendations based on: Say Something by A Great Big World;Christina Aguilera
        track_id              track_name  \
6              6           Say Something   
164          164           Say Something   
75079      75079  Reflections of Passion   
55427      55427                    Ghar   
113653    113653               Our Jesus   

                                     artists                   album_name  \
6       A Great Big World;Christina Aguilera  Is There Anybody Out There?   
164     A Great Big World;Christina Aguilera                Say Something   
75079                                  Yanni          The Essential Yanni   
55427                         Bharat Chauhan                Ghar - Single   
113653               Bethel Music;David Funk                       Simple   

        track_genre  popularity  
6          acoustic          74  
164        acoustic          58  
75079       new-age          47  
55427        indian          49  
113653  world-music   

In [17]:
def recommend_by_features(track_id, filter_by="genre", num_recommendations=5):
    if track_id not in df.index:
        return f"No matches found for track ID: {track_id}"
    
    song_index = track_id
    print(f"Recommendations based on: {df.loc[song_index, 'track_name']} by {df.loc[song_index, 'artists']}")
    
    selected_song = df.loc[track_id]
    filter_value = selected_song[filter_by]
    
    filtered_data = df[df[filter_by] == filter_value]

    if filtered_data.empty:
        return f"No songs found with the same {filter_by} as the selected song."
    
    filtered_indices = filtered_data.index
    filtered_features = normalized_features[filtered_indices]
    
    knn_model_filtered = NearestNeighbors(n_neighbors=num_recommendations + 1, metric="cosine")
    knn_model_filtered.fit(filtered_features)
    
    song_index_in_filtered = list(filtered_indices).index(track_id)
    distances, indices = knn_model_filtered.kneighbors([filtered_features[song_index_in_filtered]], n_neighbors=num_recommendations + 1)
    
    recommended_indices = [filtered_indices[i] for i in indices[0][1:]]
    recommendations = df.loc[recommended_indices]
    
    return recommendations[['track_id', 'track_name', 'artists', 'album_name', 'track_genre', 'popularity']]

sample_track_id = 4
recommendations = recommend_by_features(sample_track_id, filter_by="track_genre",num_recommendations=5)
print(recommendations)

Recommendations based on: Hold On by Chord Overstreet
     track_id                            track_name               artists  \
896       896  Alone in the Dark Mansion of Madness            Harley Poe   
341       341                        Tell The World       Eric Hutchinson   
251       251                    Believe - Acoustic            John Adams   
272       272                                Arcade  Andrew Foy;Renee Foy   
613       613                          First & Last          Zack Tabudlo   

             album_name track_genre  popularity  
896           Horrorful    acoustic          26  
341        Pure Fiction    acoustic          58  
251  Believe (Acoustic)    acoustic          57  
272              Arcade    acoustic          37  
613             Episode    acoustic          49  


In [18]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack, csr_matrix

In [19]:
df = pd.read_csv('preprocessed_dataset.csv')

df['artists'] = df['artists'].astype(str)
df['track_genre'] = df['track_genre'].astype(str)
df['track_name'] = df['track_name'].astype(str)

In [20]:
numerical_features = ['danceability', 'energy', 'valence', 'tempo', 'acousticness', 'liveness', 'speechiness', 'instrumentalness']
scaler = MinMaxScaler()
normalized_numerical = scaler.fit_transform(df[numerical_features])
numerical_sparse = csr_matrix(normalized_numerical)

In [21]:
genre_vectorizer = TfidfVectorizer()
genre_features = genre_vectorizer.fit_transform(df['track_genre'])

artist_vectorizer = TfidfVectorizer()
artist_features = artist_vectorizer.fit_transform(df['artists'])

In [22]:
num_w, gen_w, art_w = 1.0, 1.5, 0.5
combined_features = hstack([
    numerical_sparse * num_w,
    genre_features * gen_w,
    artist_features * art_w
])

In [23]:
knn_model = NearestNeighbors(n_neighbors=51, metric='cosine')
knn_model.fit(combined_features)

In [24]:
def content_based_recommend(song_index, top_n=50):
    if song_index >= len(df):
        return f"Index {song_index} out of bounds."
    
    distances, indices = knn_model.kneighbors(combined_features[song_index], n_neighbors=top_n + 1)
    recommended_indices = indices[0][1:]
    return df.iloc[recommended_indices][['track_name', 'artists', 'track_genre', 'popularity']]

In [25]:
def recommend_by_song_name(song_name, top_n=50):
    matches = df[df['track_name'].str.contains(song_name, case=False, na=False)]
    if matches.empty:
        return f"No song found with name similar to '{song_name}'"
    
    song_index = matches.index[0]
    print(f"\nRecommendations based on: '{df.loc[song_index, 'track_name']}' by {df.loc[song_index, 'artists']}")
    return content_based_recommend(song_index, top_n)

In [26]:
song_index = 38155  
print(df.loc[song_index, ['track_name', 'artists']])
print(content_based_recommend(song_index, top_n=50))

print(recommend_by_song_name("Rap God", top_n=50))

track_name               Nanchaku
artists       Seedhe Maut;MC STAN
Name: 38155, dtype: object
                                             track_name  \
38026                                    Astaghfirullah   
37877                                      Ek Din Pyaar   
37705                                        Shana Bann   
37706                                    Basti Ka Hasti   
38119                                          Tadipaar   
38086                                     Kahan Par Hai   
37780                                             Bitch   
38098                                     Chalo Chalein   
38096                                        SICKO MODE   
37753                                         Freestyle   
38065                                        Positivity   
37948                                         Lil Bunty   
37881                                         They Know   
37974                                      KR L$DA SIGN   
37785               

In [27]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [28]:
def get_spotify_track_id(sp, track_name, artist_name):
    main_artist = artist_name.split(";")[0].strip()
    query = f"track:{track_name} artist:{main_artist}"
    try:
        results = sp.search(q=query, type='track', limit=1)
        tracks = results.get('tracks', {}).get('items', [])
        if tracks:
            return tracks[0]['id']
    except spotipy.exceptions.SpotifyException as e:
        print(f"Spotify search failed for '{track_name}' by '{main_artist}': {e}")
    return None

In [29]:
def recommend_and_create_playlist(base_track_id, num_recommendations=50):
    recommendations = content_based_recommend(base_track_id, top_n=num_recommendations)
    if isinstance(recommendations, str):
        print(recommendations)
        return

    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
        client_id='6ba6e7ce109a4f5386a882a14af9548e',
        client_secret='215a70945069468aa485345cfd0bc79b',
        redirect_uri='http://127.0.0.1:8000/callback',
        scope='playlist-modify-public playlist-modify-private'
    ))

    user_id = sp.current_user()['id']

    base_song_name = df.loc[base_track_id, 'track_name']
    playlist_name = f"Recommendations Based on {base_song_name}"
    playlist = sp.user_playlist_create(user=user_id, name=playlist_name, public=False)
    playlist_id = playlist['id']

    track_uris = []
    for _, row in recommendations.iterrows():
        track_id = get_spotify_track_id(sp, row['track_name'], row['artists'])
        if track_id:
            track_uris.append(f"spotify:track:{track_id}")

    if not track_uris:
        print("No valid Spotify tracks found.")
        return

    for i in range(0, len(track_uris), 100):
        sp.playlist_add_items(playlist_id=playlist_id, items=track_uris[i:i+100])

    print(f"✅ Playlist '{playlist_name}' created with {len(track_uris)} tracks.")

In [30]:
sample_track_id = 37938 
recommend_and_create_playlist(sample_track_id, num_recommendations=50)

✅ Playlist 'Recommendations Based on HIGHEST IN THE ROOM' created with 50 tracks.
