# Importing Libraries:

In [None]:
!pip install spotipy
!pip install langdetect
!pip install pandas

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Spotipy library which is a Python client for the Spotify Web API.
import spotipy
#SpotifyClientCredentials class from Spotipy for authentication.
from spotipy.oauth2 import SpotifyClientCredentials
# ReadTimeout exception from the requests library to handle timeout errors.
from requests.exceptions import ReadTimeout
# SpotifyException from Spotipy to handle Spotify-specific errors.
from spotipy.exceptions import SpotifyException
# time module to use functions like sleep for pausing execution.
import time

#!pip install langdetect nltk
from langdetect import detect, DetectorFactory
import pandas as pd

#### The Below code interacts with the Spotify Web API using the Spotipy library to collect data on tracks from different genres and markets. Here's a breakdown of each step:

In [37]:
#Storing the Credentials
client_id = '696604424d2c48bd942131d6cfe4786b'
client_secret = '63c6aab43d0843d79d7403584ae42b34'


# Initializing Spotify client
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

# Retrieving available genres for recommendations
genres = (sp.recommendation_genre_seeds())['genres']

#  creating a list of markets as we want only english tracks
markets = ['US', 'GB','IN','AU','CA']

# Defining a function to loop through the markets and genres
def get_tracks_for_genre_and_market(genre, market):
    track_data = []
    # Search() :searches for tracks in a given genre and market, limited to 50 results at a time.
    results = sp.search(q=f'genre:{genre}', limit=50, type='track', market=market) 
    
    while results and len(track_data) < 30000:
        track_ids = [item['id'] for item in results['tracks']['items']]  
        if not track_ids:
            break
        
        # Fetching the required audio features
        audio_features = sp.audio_features(track_ids) 
        
        for i, item in enumerate(results['tracks']['items']): # returns an iterator that produces pairs of an index and the corresponding item from the iterable.
            track = item
            features = audio_features[i] if audio_features[i] else {}
            track_info = {
                'Name': track['name'],
                'Artist': track['artists'][0]['name'],
                'Genre': genre, # The genre of the track.
                'Popularity': track['popularity'], # Popularity of the track.
                'Acousticness': features.get('acousticness', None), # Likelihood of the track being acoustic.
                'Danceability': features.get('danceability', None), # Suitability for dancing.
                'Energy': features.get('energy', None), # Intensity and activity of the track.
                'Instrumentalness': features.get('instrumentalness', None), # Likelihood of the track being instrumental.
                'Liveness': features.get('liveness', None), # Presence of audience in the recording.
                'Loudness': features.get('loudness', None), # Overall loudness of the track.
                'Speechiness': features.get('speechiness', None), # Presence of spoken words.
                'Tempo': features.get('tempo', None), # Tempo of the track in beats per minute (BPM).
                'Valence': features.get('valence', None), # Musical positiveness conveyed.
                'Mode':features.get('mode', None), # Represents the musical mode (Major or Minor). It will be 1 for Major and 0 for Minor.
                'Key':features.get('key', None) ,  # Represents the musical key. It is an integer from 0 to 11 corresponding to musical notes (C = 0, C♯/D♭ = 1, etc.).
                'Duration_ms': track['duration_ms'], # Duration of the track in milliseconds.
                'release_date': track['album']['release_date']
            }
            track_data.append(track_info)
        
            if len(track_data) >= 30000:
                break
        
        results = sp.next(results) if results.get('next') else None # Fetches the next set of results if available, otherwise, stops. 
    
    return track_data
    
#Creating an empty list to store the tracks
all_tracks = []
try:
    for genre in genres:
        for market in markets:
            track_data = get_tracks_for_genre_and_market(genre, market)
            all_tracks.extend(track_data)
            print(f"Tracks collected: {len(all_tracks)} from market {market}") # Printing the tracks collection 
            
            if len(all_tracks) >= 30000:
                all_tracks = all_tracks[:30000]
                break
except (ReadTimeout, SpotifyException) as e:
    print(f"Error occurred: {e}")
    time.sleep(5)       # Pause before retrying to avoid rate limit issues

print(f"Total tracks collected: {len(all_tracks)}")

Tracks collected: 50 from market US
Tracks collected: 100 from market GB
Tracks collected: 150 from market AU
Tracks collected: 200 from market CA
Tracks collected: 250 from market NZ
Tracks collected: 300 from market US
Tracks collected: 350 from market GB
Tracks collected: 400 from market AU
Tracks collected: 450 from market CA
Tracks collected: 500 from market NZ
Tracks collected: 550 from market US
Tracks collected: 600 from market GB
Tracks collected: 650 from market AU
Tracks collected: 700 from market CA
Tracks collected: 750 from market NZ
Tracks collected: 800 from market US
Tracks collected: 850 from market GB
Tracks collected: 900 from market AU
Tracks collected: 950 from market CA
Tracks collected: 1000 from market NZ
Tracks collected: 1050 from market US
Tracks collected: 1100 from market GB
Tracks collected: 1150 from market AU
Tracks collected: 1200 from market CA
Tracks collected: 1250 from market NZ
Tracks collected: 1300 from market US
Tracks collected: 1350 from mark

Max Retries reached


Error occurred: http status: 429, code:-1 - /v1/audio-features/?ids=36gcliMRX1vCpgnrZE3dFZ,6BePGk3eCan4FqaW2X8Qy3,4EmH2iRucAgCOnhuJRotUi,5YaskwnGDZFDRipaqzbwQx,18asYwWugKjjsihZ0YvRxO,6f3Slt0GbA2bPZlz0aIFXN,3IhM5Mber8KA0NaRNpK2px,2nwmp06W0pxsQltnsW4ElE,57uNXnf4ciHYP4HktbIbzC,4wtR6HB3XekEengMX17cpc,5E1XI3JnHOoSEcY2hYnbhj,0zKbDrEXKpnExhGQRe9dxt,4kWO6O1BUXcZmaxitpVUwp,2x4JC9TxW2LACuuxbsncfG,6wLqNGHQIja6xqT0cfrzBB,3j11iDncb7ZeDMw7lFucqM,5f69wzvle0b9ColR7wuvEK,0BLxPjFEOG8G1q8AKLuz3V,5pNFibJLq7dvoDVIIcQBkn,5GjnIpUlLGEIYk052ISOw9,10EpXLXKHmNSVKvX7A5hg8,7mobUfp1aL8A6CdugCMWft,6FAmtZoa7jq6bH9GBjCRkM,7fZtgiJSNJ2QarKKKSssBp,3jksOdXCaDXyGiZ7L4YZbp,6ibDVMcMUNqZ5eXT9sD4Vy,2f2dRxjiUOoV5qhZFbnVO6,1moFkZDqcjQNeXtyoanLHv,2yEHX6MqAXPyD7bm47A2Br,77OuOZhDxJSQLYnNIpX2AG,0IAFmrpi9KF0PP3LONJonm,4vvnuJlgBeNVwq3TNmLMNX,4PNKy9P3xrzqMwVzRjxBKt,4bczCp8ojdY0R4EpCaqkfQ,2YOuBWIyHgyvHnLMpjzBf2,3qCCQas6tIP15Yjgu3gl9S,1uWUn8OYhfcc8xf94W3BqZ,1g5Jqwo02PuitYfv19B6Jn,05c4AAJKIulqI8vQQ41Rch,19bua05ITWjFrk9iGxOA7r,0W4N0KzHKWQp

In [57]:
# Converting the data to a Dataframe
df=pd.DataFrame(all_tracks)
# saving the complete dataset
df.to_excel('tracks.xlsx',index=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27450 entries, 0 to 27449
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              27450 non-null  object 
 1   Artist            27450 non-null  object 
 2   Genre             27450 non-null  object 
 3   Popularity        27450 non-null  int64  
 4   Acousticness      27450 non-null  float64
 5   Danceability      27450 non-null  float64
 6   Energy            27450 non-null  float64
 7   Instrumentalness  27450 non-null  float64
 8   Liveness          27450 non-null  float64
 9   Loudness          27450 non-null  float64
 10  Speechiness       27450 non-null  float64
 11  Tempo             27450 non-null  float64
 12  Valence           27450 non-null  float64
 13  Mode              27450 non-null  int64  
 14  Key               27450 non-null  int64  
 15  Duration_ms       27450 non-null  int64  
 16  release_date      27450 non-null  object

In [None]:
#### Verifying and storing only english language tracks using language detect library:

In [59]:
# Ensure consistent results
DetectorFactory.seed = 0


def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Apply the language detection function
df['is_english'] = df['Name'].apply(is_english)  

# Filter out only English tracks
english_tracks = df[df['is_english']]


english_tracks.reset_index(inplace=True)
english_tracks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13794 entries, 0 to 13793
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             13794 non-null  int64  
 1   Name              13794 non-null  object 
 2   Artist            13794 non-null  object 
 3   Genre             13794 non-null  object 
 4   Popularity        13794 non-null  int64  
 5   Acousticness      13794 non-null  float64
 6   Danceability      13794 non-null  float64
 7   Energy            13794 non-null  float64
 8   Instrumentalness  13794 non-null  float64
 9   Liveness          13794 non-null  float64
 10  Loudness          13794 non-null  float64
 11  Speechiness       13794 non-null  float64
 12  Tempo             13794 non-null  float64
 13  Valence           13794 non-null  float64
 14  Mode              13794 non-null  int64  
 15  Key               13794 non-null  int64  
 16  Duration_ms       13794 non-null  int64 

In [61]:
# Save the filtered dataset
english_tracks.to_excel('Hollywood_tracks.xlsx', index=False)

In [379]:
# Function to recommend songs from the same cluster
def recommend_songs(song_name, data, num_recommendations):
    song_name=song_name.lower()
    if song_name in data['Name'].values:
        cluster = data.loc[data['Name'] == song_name, 'Cluster_id'].values[0]
    recommendations = data[(data['Cluster_id'] == cluster) & (data['Name'] != song_name)].sample(num_recommendations)
    return recommendations[['Name']]


recommendations = recommend_songs('hold on',filtered_df,10)
recommendations

Unnamed: 0,Name
1159,all goes wrong
2231,now you can sleep
1166,tough talk (feat. kwengface)
2528,obey (with yungblud)
1237,satisfied
2260,gyal you a party animal - sped up
1679,heartwork
1785,when she loved me - john connearn version
1326,u are my high (with future)
1182,hollow


In [357]:
def recommend_songs(song_name, sim_matrix, data,top_n):
    song_name=song_name.lower()
    sim_values=sim_matrix[song_name].values.tolist()
    sim_values=enumerate(sim_values)
    sim_values=sorted(sim_values, key=lambda x: x[1], reverse=True)
    top_n_indices=[x[0] for x in sim_values[0:top_n+1]]
    return data.iloc[top_n_indices,0:3]

In [378]:
recommend_songs('hold on', sim_df, songs_df,10)

Unnamed: 0,Name,Genre,Artist
3116,photograph,singer-songwriter,Ed Sheeran
2711,your song,piano,Elton John
3032,can we kiss forever?,sad,Kina
445,easy on me,british,Adele
821,heading south,country,Zach Bryan
345,"up where we belong - from ""an officer and a ge...",blues,Joe Cocker
723,lost in the fire (feat. the weeknd),club,Gesaffelstein
4,i won't give up,acoustic,Jason Mraz
54,someone you loved - acoustic,acoustic,Plamina
2870,back to the old house - 2011 remaster,punk,The Smiths
