In [2]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import pickle


In [3]:
# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load the dataset
df = pd.read_csv("song_dataset.csv")

In [6]:
# Display the last 15 rows
df.tail(15)

Unnamed: 0,Song Name,Artist Name,Release Date,Genres,Tracks,Popularity,Album Name,Album Type,Song Duration (ms)
3341,Peanut Butter Jelly,Galantis,2015-06-05,"dance pop, edm, pop, pop dance","Forever Tonight, Gold Dust, In My Head, Runawa...",64,Pharmacy,album,203133
3342,Pretty Low,Galantis,2024-06-21,"dance pop, edm, pop, pop dance",Pretty Low,62,Pretty Low,single,172655
3343,Run (with Galantis),Galantis,2022-03-18,"dance pop, edm, pop, pop dance","History (with Joel Corry), Crazy What Love Can...",53,Only Honest On The Weekend (Deluxe),album,190093
3344,Bones (feat. OneRepublic),Galantis,2020-02-07,"dance pop, edm, pop, pop dance","Steel, Faith (with Dolly Parton) [feat. Mr. Pr...",52,Church,album,205792
3345,One Cry (feat. Rosa Linn),Galantis,2024-05-17,"dance pop, edm, pop, pop dance","Dust, One, Two, & 3, BANG BANG! (My Neurodiver...",56,Rx,album,186790
3346,I Could Be The One (Avicii Vs. Nicky Romero) -...,Nicky Romero,2012-12-26,"big room, dutch edm, dutch house, edm, electro...",I Could Be The One (Avicii Vs. Nicky Romero) -...,69,I Could Be The One [Avicii vs Nicky Romero],single,208316
3347,All You Need Is Love,Nicky Romero,2023-05-19,"big room, dutch edm, dutch house, edm, electro...",All You Need Is Love,56,All You Need Is Love,single,139927
3348,Lights Up - Dimitri Vegas & Like Mike Edit,Nicky Romero,2024-06-07,"big room, dutch edm, dutch house, edm, electro...","Lights Up - Dimitri Vegas & Like Mike Edit, Li...",53,Lights Up (Dimitri Vegas & Like Mike Edit),single,190746
3349,Toulouse - Bobby Anthony Vocal Mix,Nicky Romero,2012-03-02,"big room, dutch edm, dutch house, edm, electro...",Toulouse - Bobby Anthony Vocal Mix,47,Toulouse (Bobby Anthony Vocal Mix),single,178124
3350,I Wanna Dance,Nicky Romero,2023-10-20,"big room, dutch edm, dutch house, edm, electro...",I Wanna Dance,48,I Wanna Dance,single,198805


In [7]:
# Count the number of singles in album type
print(df['Album Type'].value_counts())

album          1929
single         1255
compilation     172
Name: Album Type, dtype: int64


In [8]:
# Display the shape of the dataframe
print(df.shape)

(3356, 9)


In [9]:
# Count missing values
print(df.isnull().sum())

Song Name              0
Artist Name            0
Release Date           0
Genres                15
Tracks                 0
Popularity             0
Album Name             0
Album Type             0
Song Duration (ms)     0
dtype: int64


In [10]:

# Display rows where genres is null
print(df[df['Genres'].isnull()])

                                     Song Name          Artist Name  \
1472  Minnalgal Koothadum - From "Polladhavan"  G. V. Prakash Kumar   
1473     Engeyum Eppothum - From "Polladhavan"  G. V. Prakash Kumar   
1474                                  Hey Baby  G. V. Prakash Kumar   
1475                             Vinave Vinave  G. V. Prakash Kumar   
1476                               A Love Life  G. V. Prakash Kumar   
1477                                 Oday Oday  G. V. Prakash Kumar   
1478                                  Challaga  G. V. Prakash Kumar   
1479            Neeye Sol - From "Polladhavan"  G. V. Prakash Kumar   
1480                                 Nee Valle  G. V. Prakash Kumar   
1481      Alibaba Thangam - From "Polladhavan"  G. V. Prakash Kumar   
1612                                      NUMB               Chitra   
1613                                Aftertaste               Chitra   
1614                                 Throwaway               Chitra   
1615  

In [11]:
# Count distinct genres
print(df['Genres'].nunique())

181


In [None]:
# Display the most common genres with their counts
print(df['Genres'].value_counts().head(10))

filmi, modern bollywood                  640
filmi                                    130
k-pop, k-pop girl group                  130
pop                                      100
desi pop, filmi, modern bollywood         80
k-pop                                     50
melodic rap, rap, trap                    50
filmi, indian folk, modern bollywood      50
filmi, hare krishna, modern bollywood     40
dance pop, pop                            40
Name: Genres, dtype: int64


In [12]:
# Drop rows with null genres
df.dropna(subset=['Genres'], inplace=True)

In [13]:
# Count missing values again
print(df.isnull().sum())

# Display the shape of the dataframe
print(df.shape)

Song Name             0
Artist Name           0
Release Date          0
Genres                0
Tracks                0
Popularity            0
Album Name            0
Album Type            0
Song Duration (ms)    0
dtype: int64
(3341, 9)


In [14]:
# Convert release date to datetime
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Convert song duration from milliseconds to minutes
df['Song Duration (min)'] = (df['Song Duration (ms)'] / 60000).apply('{:.1f}'.format).astype(float)

In [15]:
# Drop the original song duration column
df.drop(columns=['Song Duration (ms)'], inplace=True)

In [16]:

# Check for duplicates
print(df.duplicated().sum())

# Display duplicate data
print(df[df["Song Name"].duplicated()])


479
                                              Song Name   Artist Name  \
101                       Fortnight (feat. Post Malone)   Post Malone   
160                                          LA CANCIÓN      J Balvin   
164                                           I Like It      J Balvin   
170                                lovely (with Khalid)        Khalid   
202                     Eastside (with Halsey & Khalid)        Halsey   
...                                                 ...           ...   
3275                                        The Spectre   Alan Walker   
3278                                            Secrets         KSHMR   
3340                                          Mountains      Galantis   
3346  I Could Be The One (Avicii Vs. Nicky Romero) -...  Nicky Romero   
3355                                         In My Head  Nicky Romero   

     Release Date                                             Genres  \
101    2024-04-18                     dfw rap, 

In [17]:
# Sort data in descending order of release date
df = df.sort_values(by='Release Date', ascending=False)

In [18]:



# Display summary of the dataframe

print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3341 entries, 2417 to 1307
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Song Name            3341 non-null   object        
 1   Artist Name          3341 non-null   object        
 2   Release Date         3341 non-null   datetime64[ns]
 3   Genres               3341 non-null   object        
 4   Tracks               3341 non-null   object        
 5   Popularity           3341 non-null   int64         
 6   Album Name           3341 non-null   object        
 7   Album Type           3341 non-null   object        
 8   Song Duration (min)  3341 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 261.0+ KB
None
        Popularity  Song Duration (min)
count  3341.000000          3341.000000
mean     63.495959             3.881802
std      12.389574             2.132188
min      11.0000

In [19]:

# Display top 5 rows with highest popularity
print(df.nlargest(5, 'Popularity'))

                                 Song Name    Artist Name Release Date  \
0                       BIRDS OF A FEATHER  Billie Eilish   2024-05-17   
1                                    LUNCH  Billie Eilish   2024-05-17   
100  I Had Some Help (Feat. Morgan Wallen)    Post Malone   2024-05-10   
2                                  CHIHIRO  Billie Eilish   2024-05-17   
101          Fortnight (feat. Post Malone)    Post Malone   2024-04-18   

                             Genres  \
0                      art pop, pop   
1                      art pop, pop   
100  dfw rap, melodic rap, pop, rap   
2                      art pop, pop   
101  dfw rap, melodic rap, pop, rap   

                                                Tracks  Popularity  \
0    SKINNY, LUNCH, CHIHIRO, BIRDS OF A FEATHER, WI...          98   
1    SKINNY, LUNCH, CHIHIRO, BIRDS OF A FEATHER, WI...          95   
100              I Had Some Help (Feat. Morgan Wallen)          95   
2    SKINNY, LUNCH, CHIHIRO, BIRDS OF A 

In [20]:
# Standardize text data to lowercase and remove leading/trailing whitespaces
text_columns = ['Song Name', 'Artist Name', 'Album Name', 'Album Type', 'Genres']
for col in text_columns:
    df[col] = df[col].str.lower().str.strip().replace(r'^\s+|\s+?$', '', regex=True)

# Remove punctuation from text columns
for col in text_columns:
    df[col] = df[col].str.translate(str.maketrans('', '', string.punctuation))

In [21]:

# Remove stop words
df['Song Name'] = df['Song Name'].apply(lambda x: ' '.join(word for word in x.split() if word not in ENGLISH_STOP_WORDS))
df['Tracks'] = df['Tracks'].apply(lambda x: ' '.join(word for word in x.split() if word not in ENGLISH_STOP_WORDS))

# Lemmatize text
lemmatizer = WordNetLemmatizer()
df['Song Name'] = df['Song Name'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))
df['Tracks'] = df['Tracks'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))


In [22]:

# Tokenize text
df['Song Name'] = df['Song Name'].apply(word_tokenize)
df['Tracks'] = df['Tracks'].apply(word_tokenize)

# Convert text to vectors using TF-IDF
vectorizer = TfidfVectorizer()
df['Song Name'] = df['Song Name'].apply(lambda x: ' '.join(x))
X_bow = vectorizer.fit_transform(df['Song Name'])
t_bow = vectorizer.transform(df['Tracks'].apply(lambda x: ' '.join(x)))


In [23]:
# Drop 'Album Type' column
df.drop(columns=['Album Type'], inplace=True)

In [24]:
# Scale numerical features
scaler = StandardScaler()
df[['Popularity', 'Song Duration (min)']] = scaler.fit_transform(df[['Popularity', 'Song Duration (min)']])



In [25]:

# Split and binarize genres
df['Genres'] = df['Genres'].str.split(',')
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['Genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
df = pd.concat([df, genres_df], axis=1)




In [26]:
# Drop rows with null values
df.dropna(inplace=True)
print(df.isnull().sum())

Song Name                                                   0
Artist Name                                                 0
Release Date                                                0
Genres                                                      0
Tracks                                                      0
                                                           ..
reggaeton reggaeton colombiano trap latino urbano latino    0
reggaeton trap latino urbano latino                         0
tamil pop                                                   0
tollywood                                                   0
vintage tollywood                                           0
Length: 189, dtype: int64


In [27]:

# Preprocess tracks column
def preprocess_tracks(track_list):
    track_list = [track.lower() for track in track_list]
    track_list = [track.translate(str.maketrans('', '', string.punctuation)) for track in track_list]
    track_list = [' '.join(word for word in track.split() if word not in ENGLISH_STOP_WORDS) for track in track_list]
    track_list = [' '.join(lemmatizer.lemmatize(word) for word in track.split()) for track in track_list]
    return track_list


In [28]:

df['Tracks'] = df['Tracks'].apply(preprocess_tracks)

In [29]:
# Save preprocessed dataframe to CSV
df.to_csv("preprocessed_song_dataset.csv", index=False)

In [30]:
# Load preprocessed dataset
df = pd.read_csv("preprocessed_song_dataset.csv")
df.drop_duplicates(subset=['Song Name'], inplace=True)

In [31]:
df.head(2)

Unnamed: 0,Song Name,Artist Name,Release Date,Genres,Tracks,Popularity,Album Name,Song Duration (min),afrofuturism alternative rb art pop escape room experimental rb trip hop,afrofuturism alternative rb rb urban contemporary,...,punjabi pop,rap slap house,rb,rb rap,rb rap uk contemporary rb urban contemporary,reggaeton reggaeton colombiano trap latino urbano latino,reggaeton trap latino urbano latino,tamil pop,tollywood,vintage tollywood
0,tough,quavo,2024-07-03,['atl hip hop melodic rap rap trap'],['tough'],1.00939,tough,-0.366721,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tobey feat big sean babytron,big sean,2024-07-02,['detroit hip hop hip hop pop rap rb rap south...,"['tobey', '', 'feat', '', 'big', 'sean', 'baby...",1.00939,tobey feat big sean and babytron,0.383794,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#drop null values
df.dropna(inplace=True)

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combine relevant features into a single string for each song
def combine_features(row):
    artist_name = str(row['Artist Name'])
    genres = ' '.join(eval(row['Genres'])) if isinstance(row['Genres'], str) else ''
    tracks = ' '.join(eval(row['Tracks'])) if isinstance(row['Tracks'], str) else ''
    album_name = str(row['Album Name']) if isinstance(row['Album Name'], str) else ''
    return ' '.join([artist_name, genres, tracks, album_name])

df['combined_features'] = df.apply(combine_features, axis=1)

# Vectorize the combined features using CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Function to get recommendations based on song name
def get_recommendations(song_name, cosine_sim=cosine_sim):
    # Check if the song name exists in the dataset
    if song_name.lower() not in df['Song Name'].str.lower().values:
        return "Song not found in the dataset."

    # Get the index of the song that matches the song name
    idx = df[df['Song Name'].str.lower() == song_name.lower()].index[0]

    # Get the pairwise similarity scores of all songs with that song
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the songs based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar songs
    sim_scores = sim_scores[1:11]

    # Get the song names
    song_indices = [i[0] for i in sim_scores]
    return df['Song Name'].iloc[song_indices]




In [35]:
# Example: Get recommendations for a given song name
recommendations = get_recommendations('tough')
print(recommendations)

48                 mink
1440            brainer
1762         pick phone
177               plomo
480                hurt
175              knocka
35      gim me second 2
966              thrill
16                 star
1883          traphouse
Name: Song Name, dtype: object


In [39]:
def evaluate_model(data, cosine_sim, top_k=10):
    hit_count = 0
    total_songs = len(data)
    
    for idx in range(total_songs):
        # Get the pairwise similarity scores of all songs with the current song
        sim_scores = list(enumerate(cosine_sim[idx]))
        
        # Sort the songs based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get the indices of the top K most similar songs
        sim_scores = sim_scores[1:top_k+1]
        
        # Get the song names
        song_indices = [i[0] for i in sim_scores]
        recommended_songs = data['Song Name'].iloc[song_indices].values
        
        # Check if the left-out song appears in the recommendations
        left_out_song = data['Song Name'].iloc[idx]
        if left_out_song in recommended_songs:
            hit_count += 1
    
    # Calculate the hit rate
    hit_rate = hit_count / total_songs
    return hit_rate

# Evaluate the model
hit_rate = evaluate_model(df, cosine_sim, top_k=10)
print(f"Hit Rate: {hit_rate:.2f}")


Hit Rate: 0.19


In [37]:

# Save the TF-IDF vectorizer and preprocessed dataframe
pickle.dump(count, open('count_vectorizer.pkl', 'wb'))
pickle.dump(df, open('song_dataset.pkl', 'wb'))

In [38]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming data is already loaded and preprocessed
# Load the preprocessed data
data = pd.read_csv("preprocessed_song_dataset.csv")

def combine_features(row):
    artist_name = str(row['Artist Name'])
    genres = ' '.join(eval(row['Genres'])) if isinstance(row['Genres'], str) else ''
    tracks = ' '.join(eval(row['Tracks'])) if isinstance(row['Tracks'], str) else ''
    album_name = str(row['Album Name']) if isinstance(row['Album Name'], str) else ''
    return ' '.join([artist_name, genres, tracks, album_name])

data['combined_features'] = data.apply(combine_features, axis=1)

# Vectorize the combined features using CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(data['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Save the similarity matrix and the preprocessed data
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

data.to_pickle('song_dataset.pkl')
