In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #For extracting tags and filtering out stop words

In [None]:
dataset_PATH = #Enter the path to dataset.csv

In [None]:
songs = pd.read_csv(dataset_PATH)

In [4]:
songs = songs[['index','track_name','artists','album_name','duration','danceability','energy','key','loudness','speechiness','acousticness','liveness','valence','tempo','track_genre']]

In [5]:
songs.isnull().sum()
songs.dropna(inplace=True)

In [6]:
songs['duration'] = songs['duration'].apply(lambda x:round(x/1000))

In [7]:
songs.head()

Unnamed: 0,index,track_name,artists,album_name,duration,danceability,energy,key,loudness,speechiness,acousticness,liveness,valence,tempo,track_genre
0,0,Comedy,Gen Hoshino,Comedy,231,0.676,0.461,1,-6.746,0.143,0.0322,0.358,0.715,87.917,acoustic
1,1,Ghost - Acoustic,Ben Woodward,Ghost (Acoustic),150,0.42,0.166,1,-17.235,0.0763,0.924,0.101,0.267,77.489,acoustic
2,2,To Begin Again,Ingrid Michaelson;ZAYN,To Begin Again,211,0.438,0.359,0,-9.734,0.0557,0.21,0.117,0.12,76.332,acoustic
3,3,Can't Help Falling In Love,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,202,0.266,0.0596,0,-18.515,0.0363,0.905,0.132,0.143,181.74,acoustic
4,4,Hold On,Chord Overstreet,Hold On,199,0.618,0.443,2,-9.681,0.0526,0.469,0.0829,0.167,119.949,acoustic


In [8]:
songs['track_name']=songs['track_name'].apply(lambda x:x.lower())
songs['artists']=songs['artists'].apply(lambda x:x.lower())
songs['album_name']=songs['album_name'].apply(lambda x:x.lower())

In [9]:
songs['artists']=songs['artists'].apply(lambda x: ';'.join([artist.strip() for artist in x.split(';')]))

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
hashing = HashingVectorizer(
    tokenizer=lambda x:x.split(';'),
    n_features=4096,
    binary=True,
    alternate_sign=False,
    norm=None
)

artist_features = hashing.fit_transform(songs['artists'])



In [11]:
import re

# Remove special chars but keep essential info (feat., remix etc.)
songs['track_name']=songs['track_name'].apply(
    lambda x: re.sub(r'[^\w\s\-]', '', x.lower())  # Keep hyphens and words
    .replace('-', ' ')  # Replace hyphens with spaces (optional)
    .strip()
)


In [12]:
# Clean album names in one line
songs['album_name']=songs['album_name'].apply(
    lambda x: ' '.join(
        ''.join(
            c if c.isalnum() else ' ' 
            for c in str(x).lower()
        ).split()  # Removes extra spaces
    )
)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TRACK NAMES (TF-IDF)
tfidf_track = TfidfVectorizer(
    max_features=4000,      # 4000 unique words
    stop_words='english',  # Filters "the", "and" etc.
    lowercase=True
)

# ARTISTS (Hashing)
hashing = HashingVectorizer(
    n_features=4096,       # 4096-dim vector
    tokenizer=lambda x: x.split(';'),
    binary=True
)

# ALBUMS (TF-IDF)
tfidf_album = TfidfVectorizer(
    max_features=4000       # 4000 unique album terms
)

In [14]:
track_features = tfidf_track.fit_transform(songs['track_name'])
artist_features = hashing.fit_transform(songs['artists'])
album_features = tfidf_album.fit_transform(songs['album_name'])

In [15]:
print(f"Track shape: {track_features.shape}") 
print(f"Artist shape: {artist_features.shape}")
print(f"Album shape: {album_features.shape}") 

Track shape: (113999, 4000)
Artist shape: (113999, 4096)
Album shape: (113999, 4000)


In [16]:
songs.head()

Unnamed: 0,index,track_name,artists,album_name,duration,danceability,energy,key,loudness,speechiness,acousticness,liveness,valence,tempo,track_genre
0,0,comedy,gen hoshino,comedy,231,0.676,0.461,1,-6.746,0.143,0.0322,0.358,0.715,87.917,acoustic
1,1,ghost acoustic,ben woodward,ghost acoustic,150,0.42,0.166,1,-17.235,0.0763,0.924,0.101,0.267,77.489,acoustic
2,2,to begin again,ingrid michaelson;zayn,to begin again,211,0.438,0.359,0,-9.734,0.0557,0.21,0.117,0.12,76.332,acoustic
3,3,cant help falling in love,kina grannis,crazy rich asians original motion picture soun...,202,0.266,0.0596,0,-18.515,0.0363,0.905,0.132,0.143,181.74,acoustic
4,4,hold on,chord overstreet,hold on,199,0.618,0.443,2,-9.681,0.0526,0.469,0.0829,0.167,119.949,acoustic


In [17]:
from sklearn.preprocessing import OneHotEncoder

# Reshape for encoder and fit
genre_encoder = OneHotEncoder()
genre_encoded = genre_encoder.fit_transform(songs[['track_genre']])  # returns sparse matrix

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

genre_vectorizer = CountVectorizer()
genre_encoded = genre_vectorizer.fit_transform(songs['track_genre'])  # returns sparse matrix


In [19]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
audio_features_columns = ['danceability', 'energy', 'duration', 'speechiness', 'liveness', 'tempo']

scaler = StandardScaler()
print("Fitting scaler...")
scaler.fit(songs[audio_features_columns])
print("Scaler fitted.")


# The scaler.transform() method is now ready to be used
final_features = hstack([
    scaler.transform(songs[audio_features_columns]),  # scaled numeric
    track_features,                                   # TF-IDF
    artist_features,                                  # Hashing
    album_features,                                   # TF-IDF
    genre_encoded                                     # One-hot or vectorized genre
])


print(f"Final shape: {final_features.shape}")

Fitting scaler...
Scaler fitted.
Final shape: (113999, 12216)


In [20]:
from sklearn.decomposition import TruncatedSVD

# Pick dimensionality — 128 or 256 are common choices
svd = TruncatedSVD(n_components=256, random_state=42)

print("Fitting TruncatedSVD on sparse features...")
final_reduced = svd.fit_transform(final_features).astype('float32')
print("Reduced shape:", final_reduced.shape)


Fitting TruncatedSVD on sparse features...
Reduced shape: (113999, 256)


In [21]:
import faiss

# Normalize for cosine similarity
faiss.normalize_L2(final_reduced)

# Build FAISS index (inner product acts like cosine now)
d = final_reduced.shape[1]
index = faiss.IndexFlatIP(d)
index.add(final_reduced)

print("FAISS index built with", index.ntotal, "vectors.")


FAISS index built with 113999 vectors.


In [22]:
import faiss

# Normalize vectors for cosine similarity
print("Normalizing vectors for cosine similarity...")
faiss.normalize_L2(final_reduced)

# Build FAISS index using inner product (acts as cosine after normalization)
d = final_reduced.shape[1]
index = faiss.IndexFlatIP(d)
index.add(final_reduced)

print("FAISS index built with", index.ntotal, "vectors.")


Normalizing vectors for cosine similarity...
FAISS index built with 113999 vectors.


In [23]:
def recommend_by_name(song_name, k=10):
    processed_input = re.sub(r'[^\w\s\-]', '', song_name.lower()).replace('-', ' ').strip()

    matches = songs[songs['track_name'] == processed_input]
    if matches.empty:
        matches = songs[songs['track_name'].str.contains(processed_input, case=False, na=False)]
        if matches.empty:
            print(f"No matches found for '{song_name}'.")
            return None

    track_index = matches.index[0]

    query_vector = final_reduced[track_index:track_index+1]
    faiss.normalize_L2(query_vector)

    D, I = index.search(query_vector, k + 5)

    recommended_indices = [
        idx for idx in I[0]
        if idx != track_index and songs.iloc[idx]['track_name'] != processed_input
    ][:k]

    print(f"\nTop {k} recommendations for: '{songs.iloc[track_index]['track_name']}' by {songs.iloc[track_index]['artists']}")
    return songs.iloc[recommended_indices][['track_name', 'artists', 'album_name', 'track_genre']]


In [24]:
name = input("Enter the song name: ")

recommend_by_name(name, k=25)



Top 25 recommendations for: 'smack that' by akon;eminem


Unnamed: 0,track_name,artists,album_name,track_genre
60913,find it,elephant man,find it,j-dance
20338,theres nothing holdin me back,shawn mendes,tutti in piscina canzoni per bambini,dance
60568,edison,wednesday campanella,maneki neko edison,j-dance
60026,edison,wednesday campanella,neon,j-dance
20811,ferrari,james hype;miggy dela rosa,ferrari,dance
20251,numb,marshmello;khalid,numb,dance
20717,temperature,sean paul,the trinity,dance
60435,entanglement raw,teejay,entanglement,j-dance
20962,ferrari,james hype;miggy dela rosa,beats electro mood,dance
60155,bubbles,tokyo machine,bubbles,j-dance
