In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import base64
import logging
import spacy
from spacy_langdetect import LanguageDetector

# load_dotenv() reads variables from a .env file (when one is found) into the
# process environment, so the credentials never have to be hardcoded here.
load_dotenv()

CLIENT_ID = os.environ.get("CLIENT_ID")
CLIENT_SECRET = os.environ.get("CLIENT_SECRET")

# Module-level spotipy client shared by the helper functions defined below.
auth_manager = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
spotify = spotipy.Spotify(auth_manager=auth_manager)

In [None]:
def get_access_token(client_id, client_secret):
    """
    Get an access token for Spotify using the client-credentials grant.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        The value of the "access_token" field of the token endpoint's JSON reply.

    Raises:
        Exception: If the token endpoint answers with a non-2xx status code.
    """
    # The credentials travel as "id:secret", Base64-encoded, in a Basic auth header.
    client_creds = f"{client_id}:{client_secret}"
    client_creds_b64 = base64.b64encode(client_creds.encode()).decode()

    token_url = "https://accounts.spotify.com/api/token"
    token_data = {
        "grant_type": "client_credentials"
    }
    token_headers = {
        "Authorization": f"Basic {client_creds_b64}"
    }

    # requests has no default timeout; without one a stalled connection hangs the notebook.
    r = requests.post(token_url, data=token_data, headers=token_headers, timeout=30)

    # Accept the whole 2xx family (range(200, 299) wrongly rejected 299).
    if not 200 <= r.status_code < 300:
        raise Exception("Could not authenticate client.")

    return r.json()["access_token"]

def search_for_playlists(access_token, query, total=50):
    """
    Look for Spotify playlists relevant to specific search terms.

    Pages through the search endpoint 50 results at a time until `total`
    playlists have been collected, a page comes back empty, or a request fails.

    Args:
        access_token: Bearer token sent in the Authorization header.
        query: Free-text search string; it is URL-encoded by requests.
        total: Maximum number of playlists to return.

    Returns:
        A list of at most `total` playlist objects as returned by the API.
        `None` entries in a result page are dropped. On a non-2xx response a
        warning is logged and whatever was collected so far is returned.
    """
    search_url = "https://api.spotify.com/v1/search"
    search_headers = {
        "Authorization": f"Bearer {access_token}"
    }
    page_size = 50

    playlists = []
    offset = 0
    while len(playlists) < total:
        # Passing the query via `params` lets requests escape characters such as
        # '&', '#' or spaces that would corrupt a hand-built query string.
        search_params = {
            "q": query,
            "type": "playlist",
            "limit": page_size,
            "offset": offset,
        }
        response = requests.get(
            search_url, params=search_params, headers=search_headers, timeout=30
        )
        if not 200 <= response.status_code < 300:
            # Log the response body; the Response object's repr carries no detail.
            logging.warning(f"Error: {response.status_code} \nResponse: {response.text}")
            break

        search_results = response.json()
        items = (search_results.get('playlists') or {}).get('items') or []
        if not items:
            break

        # Result pages can contain null placeholders; keep only real playlist objects
        # so callers can index into every element safely.
        playlists.extend(item for item in items if item is not None)
        offset += page_size

    return playlists[:total]

def get_playlists(access_token, search_terms, max_total=1000):
    """
    Collect playlists for several search terms, de-duplicated by playlist ID.

    Args:
        access_token: Bearer token forwarded to `search_for_playlists`.
        search_terms: Sequence of search strings; the playlist budget is split
            evenly between them.
        max_total: Overall playlist budget shared by all terms (default 1000).

    Returns:
        A list of unique playlist objects. When the same ID is returned for more
        than one term, the object from the later term wins.
    """
    if not search_terms:
        return []

    # The per-term quota does not change between iterations, so compute it once.
    per_term_total = max_total // len(search_terms)

    unique_playlists = {}
    for term in search_terms:
        for playlist in search_for_playlists(access_token, term, per_term_total):
            # Skip null placeholders or entries without an ID instead of crashing on them.
            if not playlist or playlist.get('id') is None:
                continue
            # Use playlist ID as the key to ensure uniqueness
            unique_playlists[playlist['id']] = playlist

    return list(unique_playlists.values())


def get_playlist_tracks(access_token, all_unique_playlists):
    """
    Fetch the tracks of every playlist through the module-level `spotify` client.

    Args:
        access_token: Not used by this function; requests go through the
            module-level `spotify` client. Kept so existing callers keep working.
        all_unique_playlists: Iterable of playlist objects, each with an 'id' key.

    Returns:
        A dict mapping playlist ID to a list of track dicts with the keys
        "artist" (list of (name, id) tuples), "song_id" and "song_name".
        If fetching a playlist fails, a warning is logged and the tracks
        gathered before the failure (possibly none) are stored for it.
    """
    playlist_dictionary = {}

    for playlist in all_unique_playlists:
        playlist_id = playlist['id']
        tracks = []

        try:
            page = spotify.playlist_tracks(playlist_id)

            # Follow the paging links: reading only the first page silently
            # truncates playlists that are longer than one page.
            while page:
                for item in page.get('items') or []:
                    track = (item or {}).get('track')
                    if not track:  # Ensure the track details are available
                        continue
                    artist_data = [
                        (artist['name'], artist['id'])
                        for artist in track.get('artists') or []
                    ]
                    tracks.append({
                        "artist": artist_data,
                        "song_id": track['id'],
                        "song_name": track['name']
                    })
                page = spotify.next(page) if page.get('next') else None

        except Exception as e:
            # Best-effort: one failing playlist must not abort the whole crawl.
            logging.warning(f"Error occurred while processing playlist {playlist_id}: {e}")

        playlist_dictionary[playlist_id] = tracks

    return playlist_dictionary

def get_songs_dataframe(tracks):
    """
    Flatten the playlist -> tracks mapping into one row per unique song.

    Args:
        tracks: Dict mapping playlist ID to a list of track dicts with the keys
            "artist" (list of (name, id) pairs), "song_id" and "song_name".

    Returns:
        A DataFrame with the columns song_name, song_id, artist_name, artist_id
        and playlist, where `playlist` is the list of playlist IDs the song
        appears in. Only the first listed artist of each song is kept. Songs
        without any artist are skipped.
    """
    columns = ['song_name', 'song_id', 'artist_name', 'artist_id', 'playlist']

    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the whole frame on every row (quadratic time).
    records = []
    for playlist, songs in tracks.items():
        for song in songs:
            artists = song['artist']
            if not artists:
                # No artist to attribute the song to; indexing [0] would raise.
                continue
            first_artist = artists[0]
            records.append({
                'song_name': song['song_name'],
                'song_id': song['song_id'],
                'artist_name': first_artist[0],
                'artist_id': first_artist[1],
                'playlist': playlist,
            })

    df = pd.DataFrame(records, columns=columns)

    # NOTE(review): groupby drops rows whose key columns are missing by default
    # (e.g. a song_id of None) -- confirm that is the intended behaviour.
    grouped_df = df.groupby(['song_name', 'song_id', 'artist_name', 'artist_id'])['playlist'].apply(list).reset_index()
    return grouped_df

# Function to detect language
def detect_language(text, nlp=None):
    """
    Detect the language of a piece of text.

    Args:
        text: The string to classify.
        nlp: Callable pipeline whose result exposes `._.language['language']`.
            When omitted, a module-level `nlp` is used if one is defined.

    Returns:
        The value of `doc._.language['language']` for the processed text.

    Raises:
        NameError: If no pipeline was passed and no module-level `nlp` exists.
    """
    if nlp is None:
        # Backward-compatible fallback for callers that define a global pipeline.
        nlp = globals().get("nlp")
        if nlp is None:
            raise NameError(
                "name 'nlp' is not defined; pass a pipeline through the `nlp` argument"
            )
    doc = nlp(text)
    return doc._.language['language']

def remove_non_english_song_titles(songs):
    """
    Keep only the songs whose title is detected as English.

    Args:
        songs: DataFrame with a 'song_name' column. It is not modified.

    Returns:
        A new DataFrame holding the rows detected as 'en', with an added
        'language' column containing the detected language code.
    """
    # Load the spacy model
    nlp = spacy.load('en_core_web_sm')

    # Register the language detector factory only once, so that re-running this
    # cell in the same kernel does not attempt to register the name again.
    if not spacy.Language.has_factory("language_detector"):
        spacy.Language.factory("language_detector", func=lambda nlp, name: LanguageDetector())

    # Add the language detector to the pipeline
    nlp.add_pipe('language_detector', last=True)

    # The pipeline is handed to detect_language explicitly: `nlp` is local to this
    # function, so a lookup of a global `nlp` from detect_language raised NameError.
    # NOTE(review): detection on very short titles may be unreliable -- spot-check results.
    labelled_songs = songs.assign(
        language=songs['song_name'].apply(lambda title: detect_language(title, nlp))
    )

    # Filter out non-English songs
    english_songs = labelled_songs[labelled_songs['language'] == 'en']

    return english_songs


In [None]:
# Authenticate once, then run the pipeline: search -> playlist tracks -> songs table.
access_token = get_access_token(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
search_terms = [
    'party',
    'classical',
    'rock',
    'jazz',
    'rap',
    'blues',
    'pop',
]

playlists = get_playlists(access_token=access_token, search_terms=search_terms)
tracks = get_playlist_tracks(access_token=access_token, all_unique_playlists=playlists)
songs = get_songs_dataframe(tracks=tracks)

In [None]:
# Persist the unfiltered song table in both Excel and CSV form.
songs.to_excel(excel_writer="songs_wout_lyrics.xlsx", index=False)
songs.to_csv(path_or_buf="songs_wout_lyrics.csv", index=False)

In [None]:
# Keep only the songs whose title is detected as English.
english_songs = remove_non_english_song_titles(songs=songs)

In [None]:
# Persist the English-only song table in both Excel and CSV form.
english_songs.to_excel(excel_writer="english_songs_wout_lyrics.xlsx", index=False)
english_songs.to_csv(path_or_buf="english_songs_wout_lyrics.csv", index=False)