In [1]:
!pip install pandas numpy spotipy pyyaml tqdm scikit-learn matplotlib scikit-image



In [2]:
import pandas as pd
import re
import nltk
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV


In [3]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /Users/asray/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/asray/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/asray/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Functions for text processing
def clean_lyrics(text):
    text = text.replace('\r', ' ').replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [5]:
def preprocess_lyrics(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


In [6]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index


In [7]:
def get_average_word_vectors(words, embeddings_index, dim=100):
    word_vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if len(word_vectors) == 0:
        return np.zeros(dim)
    return np.mean(word_vectors, axis=0)


In [8]:
def text_to_embeddings(text, embeddings_index, dim=100):
    words = text.split()
    return get_average_word_vectors(words, embeddings_index, dim)


In [10]:
# Load GloVe embeddings
glove_file_path = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_file_path)


In [12]:
# Set up Spotify API credentials
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id='51bbdd6542df423d97fc26f78a62b3be',
                                               client_secret='2c07786b92a749fe939e28c2425ad638',
                                               redirect_uri='google.com',
                                               scope='user-library-read'))

In [13]:
def get_audio_features(song_link):
    try:
        # Extract the track ID from the Spotify URL/URI
        if "spotify.com" in song_link:
            track_id = song_link.split('/')[-1].split('?')[0]
        elif "spotify:track:" in song_link:
            track_id = song_link.split(':')[-1]
        else:
            raise ValueError("Unsupported URL / URI format")
        
        # Fetch the audio features
        features = sp.audio_features(track_id)
        return features[0] if features else None
    except Exception as e:
        print(f"Error fetching audio features for {song_link}: {e}")
        return None


In [15]:
# Load the dataset
df = pd.read_csv('spotify_millsongdata.csv')


In [16]:
# Clean and preprocess the text
df['cleaned_text'] = df['text'].apply(clean_lyrics)
df['preprocessed_text'] = df['cleaned_text'].apply(preprocess_lyrics)


In [17]:
# Convert preprocessed text to embeddings
df['embedding'] = df['preprocessed_text'].apply(lambda x: text_to_embeddings(x, embeddings_index))


In [18]:
# Fetch audio features and add them to the DataFrame
df['audio_features'] = df['link'].apply(get_audio_features)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:
# Drop rows where audio features could not be fetched
df.dropna(subset=['audio_features'], inplace=True)

In [26]:
# Load the dataset
df = pd.read_csv('spotify_millsongdata.csv')

# Check if the dataset is loaded correctly and has data
print(df.head())
print(f"Number of rows in the dataset: {len(df)}")


  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  
Number of rows in the dataset: 57650


In [27]:
def combine_features(row):
    embedding = row['embedding']
    audio_features = list(row['audio_features'].values())
    combined = np.hstack([embedding, audio_features])
    return combined

# Test combine_features on a single row
print(combine_features(df.iloc[0]))


KeyError: 'embedding'

In [23]:
# Combine embeddings and audio features
df['combined_features'] = df.apply(combine_features, axis=1)

# Convert combined features into a feature matrix
feature_matrix = np.vstack(df['combined_features'].values)


ValueError: Cannot set a DataFrame with multiple columns to the single column combined_features

In [22]:
# Convert combined features into a feature matrix
feature_matrix = np.array(df['combined_features'].tolist())


KeyError: 'combined_features'

In [None]:
# Split the data into training and testing sets
X_train, X_test = train_test_split(feature_matrix, test_size=0.2, random_state=42)

In [None]:
# Train a k-NN model
model = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
model.fit(X_train)


In [None]:
# Define a parameter grid for tuning
param_grid = {
    'n_neighbors': [5, 10, 15, 20],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [None]:
# Initialize GridSearchCV
grid_search = GridSearchCV(NearestNeighbors(), param_grid, cv=5)


In [None]:
# Fit GridSearchCV to the training data
grid_search.fit(X_train)


In [None]:
# Get the best parameters
best_params = grid_search.best_params_


In [None]:
# Train the model with the best parameters
best_model = NearestNeighbors(n_neighbors=best_params['n_neighbors'], algorithm=best_params['algorithm'])
best_model.fit(X_train)


In [None]:
def get_recommendations_for_song(song_name, df, model, n_neighbors=5):
    try:
        song_idx = df[df['song'] == song_name].index[0]
    except IndexError:
        return f"Song '{song_name}' not found in the dataset."
    
    song_features = df.iloc[song_idx]['combined_features'].reshape(1, -1)
    distances, indices = model.kneighbors(song_features, n_neighbors=n_neighbors)
    recommended_songs = df.iloc[indices[0]]
    return recommended_songs[['song', 'artist', 'link']]


In [None]:
def recommend_songs():
    user_input = input("Enter a song name: ")
    recommendations = get_recommendations_for_song(user_input, df, best_model)
    
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        print("Recommendations:")
        for i, row in recommendations.iterrows():
            print(f"{i+1}. {row['song']} by {row['artist']} - {row['link']}")


In [None]:
# Example usage
recommend_songs()


In [None]:
# Get the user ID
user_id = sp.current_user()['id']

def create_playlist(playlist_name):
    # Create a new playlist
    playlist = sp.user_playlist_create(user_id, playlist_name)
    return playlist['id']

def add_tracks_to_playlist(playlist_id, track_ids):
    # Add tracks to the playlist
    sp.user_playlist_add_tracks(user_id, playlist_id, track_ids)

In [None]:
def get_recommendations_for_song(song_name, df, model, n_neighbors=5):
    try:
        song_idx = df[df['song'] == song_name].index[0]
    except IndexError:
        return f"Song '{song_name}' not found in the dataset."
    
    song_features = df.iloc[song_idx]['combined_features'].reshape(1, -1)
    distances, indices = model.kneighbors(song_features, n_neighbors=n_neighbors)
    recommended_songs = df.iloc[indices[0]]
    return recommended_songs[['song', 'artist', 'link']]


In [None]:
def recommend_songs_and_create_playlist():
    user_input = input("Enter a song name: ")
    recommendations = get_recommendations_for_song(user_input, df, best_model)
    
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        print("Recommendations:")
        track_ids = []
        for i, row in recommendations.iterrows():
            print(f"{i+1}. {row['song']} by {row['artist']} - {row['link']}")
            track_id = row['link'].split('/')[-1].split('?')[0]
            track_ids.append(track_id)
        
        playlist_name = input("Enter a name for your new playlist: ")
        playlist_id = create_playlist(playlist_name)
        add_tracks_to_playlist(playlist_id, track_ids)
        print(f"Playlist '{playlist_name}' created and songs added.")


In [None]:
# Example usage
recommend_songs_and_create_playlist()