In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors 
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.stem.porter import PorterStemmer
import pickle

In [33]:
# Load data
# Draw a fixed-size random subset of the lyrics corpus. The sample is seeded so
# that the subset -- and every model and pickle derived from it further down --
# is the same after a kernel restart.
SAMPLE_SIZE = 5000
RANDOM_SEED = 42
df = (
    pd.read_csv("spotify_millsongdata.csv")
    .sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
    .reset_index(drop=True)  # positional 0..N-1 index, used later as matrix row numbers
)


In [34]:
# Preprocessing
# Fetch the 'punkt' tokenizer data that nltk.word_tokenize relies on, then
# build the shared Porter stemmer used by tokenization().
nltk.download('punkt')
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
def tokenization(txt):
    """Split ``txt`` into NLTK word tokens, Porter-stem each one, and return
    the stems joined by single spaces.

    Uses the module-level ``stemmer`` instance.
    """
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(txt))

In [36]:
# Normalise the lyrics: lowercase, then turn punctuation and newlines into spaces.
# The substitutions go through `.str.replace(..., regex=True)` on purpose:
# `Series.replace(pattern, value)` without regex=True only matches cells whose
# *entire* value equals the pattern, so the previous punctuation step never
# changed anything. The pattern is also written as the character class
# `[^\w\s]` (anything that is neither a word character nor whitespace).
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.replace(r'\n', ' ', regex=True)
)
# Tokenize and stem every lyric; the result is a single space-joined string per row.
df['text'] = df['text'].apply(tokenization)

In [37]:
# TF-IDF Vectorization
# Fit a TF-IDF vocabulary (English stop words removed) on the stemmed lyrics and
# encode every row of df['text']. tfidf_matrix is what the KNN model is fit on.
# NOTE(review): the name tfidf_vectorizer is re-bound in later cells to new
# vectorizers created without stop_words, so the object pickled at the end of
# the notebook is not this one -- confirm which vectorizer knn_model consumers
# are expected to load.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])


In [38]:
# Train KNN model
# Index the TF-IDF rows for nearest-neighbour lookup under cosine distance;
# n_neighbors=20 is the neighbour count used when a query does not specify one.
knn_model = NearestNeighbors(n_neighbors=20, metric='cosine')
knn_model.fit(tfidf_matrix)

In [39]:
# Show the column names available in the sampled frame.
print(df.columns)


Index(['artist', 'song', 'link', 'text'], dtype='object')


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Labels: the song title of each row is used as the class to predict.
y_train_text = df['song']

# Features: a fresh TF-IDF encoding of the stemmed lyrics.
# NOTE(review): this re-binds tfidf_vectorizer, replacing the
# stop_words='english' vectorizer that produced the matrix knn_model was fit on.
tfidf_vectorizer = TfidfVectorizer()
X_train_text = tfidf_vectorizer.fit_transform(df['text'])

# Fit the multinomial Naive Bayes classifier on the TF-IDF features.
nb_text_classifier = MultinomialNB()
nb_text_classifier.fit(X_train_text, y_train_text)


In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF features and song-title labels for the decision tree.
# NOTE(review): tfidf_vectorizer is re-bound here once more; this instance is
# the one that the serialization cell ends up pickling.
print("Vectorizing text data...")
y_dt = df['song']
tfidf_vectorizer = TfidfVectorizer()
X_dt = tfidf_vectorizer.fit_transform(df['text'])

# Fit the tree in one step; fit() returns the fitted estimator itself.
print("Training Decision Tree Classifier...")
dt_classifier = DecisionTreeClassifier().fit(X_dt, y_dt)
print("Training complete.")


In [42]:
# Serialize objects
# Every artifact is written inside a `with` block so the file handle is flushed
# and closed deterministically instead of being left to the garbage collector.
# NOTE(review): tfidf_vectorizer is whatever the name was last bound to (the
# Decision Tree cell's vectorizer), not the stop_words='english' instance whose
# matrix knn_model was fit on -- confirm the pickled pair is the intended one.
artifacts_to_save = {
    'knn_model.pkl': knn_model,
    'df.pkl': df,
    'tfidf_vectorizer.pkl': tfidf_vectorizer,
    'nb_text_classifier.pkl': nb_text_classifier,
    'dt_classifier.pkl': dt_classifier,
}
for artifact_path, artifact in artifacts_to_save.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [43]:
# Recommendation function
def recommendation(song_title, df, similarity_matrix, top_n=20):
    """Return the titles of the songs most similar to ``song_title``.

    Parameters
    ----------
    song_title : str
        Title to look up in ``df['song']``; the first matching row is used.
    df : pandas.DataFrame
        Frame with a ``'song'`` column whose row order matches the rows and
        columns of ``similarity_matrix``.
    similarity_matrix : 2-D indexable
        ``similarity_matrix[i]`` must yield the similarity scores of row ``i``
        against every row, higher meaning more similar.
    top_n : int, default 20
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``top_n`` song titles, most similar first.

    Raises
    ------
    IndexError
        If ``song_title`` does not occur in ``df['song']``.
    """
    titles = df['song'].tolist()

    # Look the song up by *position*, not by index label, so the row number is
    # valid for the matrix even when df does not carry a default RangeIndex.
    try:
        idx = titles.index(song_title)
    except ValueError:
        raise IndexError(f"song {song_title!r} not found in df['song']") from None

    # Drop the queried row explicitly. Assuming it always sorts first is wrong
    # when another row ties with it at the maximum score (e.g. duplicate lyrics).
    scored = [
        (position, score)
        for position, score in enumerate(similarity_matrix[idx])
        if position != idx
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [titles[position] for position, _ in scored[:max(top_n, 0)]]
