# Importing Libraries:

In [None]:
!pip install spotipy
!pip install langdetect
!pip install pandas

In [1]:
# os is used to read the Spotify credentials from environment variables.
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Spotipy library which is a Python client for the Spotify Web API.
import spotipy
#SpotifyClientCredentials class from Spotipy for authentication.
from spotipy.oauth2 import SpotifyClientCredentials
# ReadTimeout exception from the requests library to handle timeout errors.
from requests.exceptions import ReadTimeout
# SpotifyException from Spotipy to handle Spotify-specific errors.
from spotipy.exceptions import SpotifyException
# time module to use functions like sleep for pausing execution.
import time

#!pip install langdetect nltk
from langdetect import detect, DetectorFactory
import pandas as pd

#### The code below uses the Spotipy library to query the Spotify Web API and collect track data across different genres and markets. The inline comments give a breakdown of each step:

In [2]:
# Reading the credentials from the environment so that no secret is stored in the notebook.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names Spotipy itself looks for.
client_id = os.environ.get('SPOTIPY_CLIENT_ID')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET')
if not client_id or not client_secret:
    # Fail here with a clear message instead of a confusing authentication error later.
    raise RuntimeError("Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before running this notebook.")


# Initializing Spotify client
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# Retrieving available genres for recommendations
genres = (sp.recommendation_genre_seeds())['genres']

# Markets to search, chosen because we want only English tracks
markets = ['US', 'GB','IN','AU','CA']

# Defining a function to loop through the markets and genres
def get_tracks_for_genre_and_market(genre, market, max_tracks=30000):
    """Collect track metadata and audio features for one genre in one market.

    Runs a track search for ``genre:<genre>`` restricted to ``market`` and
    walks the result pages (50 tracks per request) until there is no further
    page or ``max_tracks`` rows have been gathered.

    Args:
        genre: Genre name placed in the ``genre:`` search filter.
        market: Market code passed to the search (e.g. 'US').
        max_tracks: Upper bound on the number of rows returned. Defaults to
            30000, the cap this function has always applied.

    Returns:
        A list of dicts, one per track, in the order the search returned them.
        Audio-feature fields are None when no features came back for a track.

    Uses the module-level Spotipy client ``sp``; any exception it raises
    propagates to the caller.
    """
    track_data = []
    # search() returns at most 50 tracks per call, wrapped as {'tracks': <paging object>}.
    results = sp.search(q=f'genre:{genre}', limit=50, type='track', market=market)

    while results and len(track_data) < max_tracks:
        page = results['tracks']
        # Skip null entries and tracks without an id: they cannot be looked up
        # and would otherwise raise while building the id list.
        items = [item for item in page['items'] if item and item.get('id')]
        # Never request features for tracks that the cap would discard anyway.
        items = items[:max_tracks - len(track_data)]
        if not items:
            break

        # Fetching the required audio features (same order as the ids sent).
        audio_features = sp.audio_features([item['id'] for item in items])

        for position, track in enumerate(items):
            # An entry is None when no features are available for that track.
            features = audio_features[position] or {}
            track_data.append(_build_track_row(track, features, genre))

        # The 'next' link lives on the paging object under 'tracks', not on the
        # top-level response, so that is what must be checked and passed on.
        results = sp.next(page) if page.get('next') else None

    return track_data


def _build_track_row(track, features, genre):
    """Flatten one track object and its audio-feature dict into a single record."""
    return {
        'Name': track['name'],
        'Artist': track['artists'][0]['name'],  # first listed artist only
        'Genre': genre,  # The genre the track was searched under.
        'Popularity': track['popularity'],  # Popularity of the track.
        'Acousticness': features.get('acousticness', None),  # Likelihood of the track being acoustic.
        'Danceability': features.get('danceability', None),  # Suitability for dancing.
        'Energy': features.get('energy', None),  # Intensity and activity of the track.
        'Instrumentalness': features.get('instrumentalness', None),  # Likelihood of the track being instrumental.
        'Liveness': features.get('liveness', None),  # Presence of audience in the recording.
        'Loudness': features.get('loudness', None),  # Overall loudness of the track.
        'Speechiness': features.get('speechiness', None),  # Presence of spoken words.
        'Tempo': features.get('tempo', None),  # Tempo of the track in beats per minute (BPM).
        'Valence': features.get('valence', None),  # Musical positiveness conveyed.
        'Mode': features.get('mode', None),  # Musical mode: 1 for Major, 0 for Minor.
        'Key': features.get('key', None),  # Musical key as an integer 0-11 (C = 0, C♯/D♭ = 1, etc.).
        'Duration_ms': track['duration_ms'],  # Duration of the track in milliseconds.
        'release_date': track['album']['release_date']
    }
    
#Creating an empty list to store the tracks
all_tracks = []
track_limit = 30000     # overall cap on the size of the dataset
max_attempts = 3        # tries per (genre, market) pair before giving up
stop_collecting = False

for genre in genres:
    for market in markets:
        track_data = None
        # Retry the same genre/market pair a few times so one timeout or rate-limit
        # response does not throw away the whole run.
        for attempt in range(1, max_attempts + 1):
            try:
                track_data = get_tracks_for_genre_and_market(genre, market)
                break
            except (ReadTimeout, SpotifyException) as e:
                print(f"Error occurred: {e}")
                if attempt < max_attempts:
                    time.sleep(5 * attempt)       # Pause (a little longer each time) before retrying

        if track_data is None:
            # Every attempt failed: stop here and keep what has been collected so far,
            # rather than hammering the API for every remaining pair.
            stop_collecting = True
            break

        all_tracks.extend(track_data)
        print(f"Tracks collected: {len(all_tracks)} from market {market}") # Printing the tracks collection

        if len(all_tracks) >= track_limit:
            all_tracks = all_tracks[:track_limit]
            stop_collecting = True
            break

    # A break in the market loop only leaves that loop; leave the genre loop too.
    if stop_collecting:
        break

print(f"Total tracks collected: {len(all_tracks)}")

Max Retries reached


Error occurred: http status: 429, code:-1 - /v1/audio-features/?ids=7jIAttgQTpLDoNtykIQXjH,4E6cwWJWZw2zWf7VFbH7wf,1EzrEOXmMH3G43AXT1y7pA,5vjLSffimiIP26QG5WcN2K,3S0OXQeoh0w6AY8WQVckRW,1jyddn36UN4tVsJGtaJfem,53QF56cjZA9RTuuMZDrSA6,24CcvMOaNniXXcxA8HjUw1,6UIxGIqWlO5wsddY44AV1R,6Uy6K3KdmUdAfelUp0SeXn,5htghP7rThIe6oXBN6uYI5,38YgZVHPWOWsKrsCXz6JyP,1fEGtTZjrjJW8eUeewnNJR,1MxHIIzcTeFnaNsDGY3pfj,07m8PuXxxv5J4qPEDq6ZkK,0IktbUcnAGrvD03AWnz3Q8,0RD3NWnHlyBCRwgNZy8QAn,5auMzVHFr5Zfw6IbKarZ56,6WuBXGHGpSqpkFCl7ei8xp,7aohwSiTDju51QmC54AUba,2rs6UMzlu1pMGGVw60tiHm,2qLMf6TuEC3ruGJg4SMMN6,0EKBV6GybPtALXUgWqWrym,7x4b0UccXSKBWxWmjcrG2T,1jHNQodYIGvk187xrISw1i,1umKFpJdXDtquGCz4oBVDJ,2D4BSm5Z8Hq5zYbSgJwEOh,4BU9s6QLGFCaxlgaPWWuFY,4oa14QBfWRDfJy2agySy0L,0oOY4xChdxmGgpyQrY2FBz,0S5MCBH1qTVCsckidwvS8G,0sKAZF2T0hD3SfEr0U0rqX,6lfxq3CG4xtTiEg7opyCyx,2WZyfujzMweFLnozyUJBkW,0OiCqRaoKK6nuSIty8zHjl,7BXW1QCg56yzEBV8pW8pah,2lxBZVbkiCXC1soks2RXwV,7jJH8F3PHlNvxfqEAAfFDl,0R7EWhquaAICmyE5MZqt3q,5fj76kVAnqRKKhAw5d06jj,6HbxpoChDH9T

In [None]:
# Building a DataFrame with one row per collected track
df = pd.DataFrame(data=all_tracks)

# Persisting the complete, unfiltered dataset
df.to_excel(excel_writer='tracks.xlsx', index=False)


#### Verifying and storing only English-language tracks, detected from the track name with the langdetect library:

In [3]:
# Loading the dataset written by the collection step
df = pd.read_excel(io='tracks.xlsx')


# Fixing the langdetect seed so repeated runs give the same answers
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Non-string values (e.g. NaN from an empty spreadsheet cell) and blank
    strings are reported as not English. Detection errors are also treated as
    "not English", but KeyboardInterrupt/SystemExit are no longer swallowed,
    so a long-running ``apply`` can still be interrupted.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best effort: text the detector cannot handle is counted as not English.
        return False

# Apply the language detection function to every track name
df['is_english'] = df['Name'].apply(is_english)

# Keep only English tracks. drop=True renumbers the rows without inserting the old
# index as an extra 'index' column, which would otherwise end up in the saved file.
english_tracks = df[df['is_english']].reset_index(drop=True)

english_tracks.info()

FileNotFoundError: [Errno 2] No such file or directory: 'tracks.xlsx'

#### Saving the final dataset:

In [None]:
# Writing the English-only dataset to its final Excel file
english_tracks.to_excel(excel_writer='Hollywood_tracks.xlsx', index=False)