In [1]:
import pandas as pd
import sklearn

In [2]:
data = pd.read_csv("song_dataset.csv", encoding='latin1', na_values=["??"])

In [3]:
data.head()

Unnamed: 0,Title,Lead Actor,Singer 1,Singer 2,Composer,Genre,Movie
0,Tum Hi Ho,Aditya Roy Kapur,Arijit Singh,,Mithoon,Romantic,Aashiqui 2
1,Tere Sang Yaara,Akshay Kumar,Atif Aslam,,Arko Pravo Mukherjee,Romantic,Rustom
2,Kesariya,Ranbir Kapoor,Arijit Singh,,Pritam,Romantic,Brahmastra: Part One  Shiva
3,Zinda,Farhan Akhtar,Siddharth Mahadevan,,Shankar Ehsaan Loy,Motivational,Bhaag Milkha Bhaag
4,Apna Bana Le,Varun Dhawan,Arijit Singh,,Sachin-Jigar,Romantic,Bhediya


In [4]:
data.shape

(66, 7)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
selected_cols = ['Title', 'Lead Actor', 'Singer 1', 'Singer 2', 'Composer', 'Genre', 'Movie']

data[selected_cols].fillna('',inplace=True)

data['combined'] = data[selected_cols].astype(str).agg(' '.join, axis=1)

vectorizer = TfidfVectorizer()
Model = vectorizer.fit_transform(data['combined'])

similarity = cosine_similarity(Model)

In [7]:
def recommend(input):
    input = input.lower()
    
    # Title case sensitive
    index = data[ data['combined'].str.lower().str.contains(input)].index.tolist()

    if len(index) == 0:
        return "Song not found in dataset."

    index = index[0]

    # Similarity scores
    scores = list(enumerate(similarity[index]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Top 5 similar movies
    top_movies = scores[1:6]

    print("\nTop 5 recommendations for",input,":")
    for i,score in top_movies:
        print(data['Title'][i],", Score: ",score)

In [8]:
recommend("Romantic")


Top 5 recommendations for romantic :
Tujhe Kitna Chahne Lage , Score:  0.22333822559876612
Tum Se Hi , Score:  0.21978659119965807
Phir Mohabbat , Score:  0.17884631275741103
Khairiyat , Score:  0.15569073288994187
Dil Ibaadat , Score:  0.1384674143760169


In [9]:
import joblib
joblib.dump(similarity, 'similarity.pkl')
joblib.dump(data, 'song_list.pkl') # This

['song_list.pkl']