In [11]:
import pandas as pd
import spotipy as sp
import time
from spotipy.oauth2 import SpotifyClientCredentials

In [12]:
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os


In [25]:


# Look up every (Song, Artist) pair of the combined dataset on Spotify and
# store the best-matching track's metadata in `extended_df`.

# Load Spotify credentials from the .env file into the process environment.
load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

if not client_id or not client_secret:
    raise ValueError("Spotify credentials are missing. Make sure SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET are set in your .env file.")

# Initialize the Spotify client.
# NOTE: `sp` rebinds the alias created by `import spotipy as sp`; from here on
# `sp` is the authenticated client object, not the module.
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = Spotify(client_credentials_manager=client_credentials_manager)

# Load the dataset
df = pd.read_csv('../data/3_combined_clean_dataset.csv')
print(f"Loaded DataFrame with {len(df)} rows.")  # Debugging step

# Drop rows with missing Song or Artist (a search needs both).
df = df.dropna(subset=['Song', 'Artist'])

# Throttling knobs.
CALLS_PER_PAUSE = 95           # take a long pause after this many search requests
PAUSE_SECONDS = 30             # length of the long pause
DELAY_BETWEEN_CALLS = 0.25     # short delay after every request

# Track-level fields copied from the Spotify response; they stay None when the
# search finds nothing or fails.
TRACK_FIELDS = (
    'track_id',
    'album_id',
    'album_name',
    'artist_id',
    'popularity',
    'release_date',
    'track_duration_ms',
)

# One record (dict) per input row, turned into a DataFrame once at the end.
extended_data = []

# Number of search requests attempted so far.
api_call_count = 0
total_rows = len(df)

# Use a positional counter for progress: after dropna() the index labels are
# no longer contiguous, so `index + 1` could exceed the row count.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    song_name = row['Song']
    artist_name = row['Artist']

    print(f"Processing row {position} of {total_rows}: {song_name} by {artist_name}")

    # Start from the "no match" record and fill it in only on success, so every
    # outcome (match, no match, error) yields the same set of keys.
    track_data = {'song': song_name, 'artist': artist_name, **dict.fromkeys(TRACK_FIELDS)}

    # Count the attempt before sending it. Counting only successful calls left
    # the counter stuck on a multiple of CALLS_PER_PAUSE (including 0) whenever
    # a request failed, which triggered the long pause on every failing row.
    api_call_count += 1
    try:
        # Search for the song in Spotify and keep the top hit only.
        results = sp.search(q=f"track:{song_name} artist:{artist_name}", type='track', limit=1)
        items = results['tracks']['items']
        if items:
            track = items[0]
            # The dict literal is fully evaluated before update() runs, so a
            # malformed response cannot leave a half-filled record behind.
            track_data.update({
                'track_id': track['id'],
                'album_id': track['album']['id'],
                'album_name': track['album']['name'],
                'artist_id': track['artists'][0]['id'],
                'popularity': track['popularity'],
                'release_date': track['album']['release_date'],
                'track_duration_ms': track['duration_ms'],
            })
    except Exception as e:
        # Best effort: report the failure and keep the all-None record.
        print(f"Error searching for {song_name} by {artist_name}: {e}")

    extended_data.append(track_data)

    # Rate-limiting logic: one long pause every CALLS_PER_PAUSE requests.
    if api_call_count % CALLS_PER_PAUSE == 0:
        print(f"Rate limit reached. Waiting for {PAUSE_SECONDS} seconds...")
        time.sleep(PAUSE_SECONDS)

    time.sleep(DELAY_BETWEEN_CALLS)  # Short delay between API calls

# Convert the extended data into a DataFrame
extended_df = pd.DataFrame(extended_data)

# Save the extended dataset to a CSV file
extended_df.to_csv('extended_spotify_data.csv', index=False)

print(f"Extended dataset saved successfully with {len(extended_df)} rows!")


Loaded DataFrame with 10042 rows.
Processing row 1 of 10041: Die With A Smile by Lady Gaga & Bruno Mars
Processing row 2 of 10041: A Bar Song (Tipsy) by Shaboozey
Processing row 3 of 10041: Birds Of A Feather by Billie Eilish
Processing row 4 of 10041: Lose Control by Teddy Swims
Processing row 5 of 10041: APT. by ROSE & Bruno Mars
Processing row 6 of 10041: That's So True by Gracie Abrams
Processing row 7 of 10041: Luther by Kendrick Lamar & SZA
Processing row 8 of 10041: I Had Some Help by Post Malone Featuring Morgan Wallen
Processing row 9 of 10041: Espresso by Sabrina Carpenter
Processing row 10 of 10041: Taste by Sabrina Carpenter
Processing row 11 of 10041: TV Off by Kendrick Lamar Featuring Lefty Gunplay
Processing row 12 of 10041: Beautiful Things by Benson Boone
Processing row 13 of 10041: Love Somebody by Morgan Wallen
Processing row 14 of 10041: Too Sweet by Hozier
Processing row 15 of 10041: Timeless by The Weeknd & Playboi Carti
Processing row 16 of 10041: Squabble Up by 

In [3]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly; the head is left as the cell's displayed value.
print(extended_df.shape)
extended_df.head()

NameError: name 'extended_df' is not defined

In [18]:
# Number of missing values per column (`isnull` is pandas' alias of `isna`).
extended_df.isnull().sum()

song                    0
artist                  0
track_id                0
album_id                0
album_name              0
artist_id               0
popularity              0
release_date            0
track_duration_ms       0
artist_popularity    6277
genres               6277
dtype: int64

In [19]:
# Drop, in place, every row that has a missing value in any column.
extended_df.dropna(axis="index", how="any", inplace=True)

In [20]:
# Count fully duplicated rows; the first occurrence of each row is not counted.
extended_df.duplicated(keep="first").sum()

np.int64(0)

In [21]:
# Persist the cleaned track-level dataset without the index column.
extended_df.to_csv(
    path_or_buf='../data/4_extended_spotify_data.csv',
    index=False,
)

In [18]:
# Reload the cleaned track-level dataset saved to disk.
extended_df = pd.read_csv('../data/4_extended_spotify_data.csv')
# Only the last expression of a cell is rendered, so print the shape
# explicitly and leave the head as the cell's displayed value.
print(extended_df.shape)
extended_df.head()

(6277, 9)

### Test code to get the genres and artist popularity for the first 10 rows.

In [26]:
import pandas as pd
import spotipy as sp
import time
import ast
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os

# Trial run on the first 10 rows: add each artist's popularity and one boolean
# column per genre, then save the result to a separate test CSV.

# Load Spotify credentials from the .env file into the process environment.
load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

if not client_id or not client_secret:
    raise ValueError("Spotify credentials are missing. Make sure SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET are set in your .env file.")

# Initialize the Spotify client (`sp` rebinds the `import spotipy as sp` alias).
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = Spotify(client_credentials_manager=client_credentials_manager)

# Load your existing DataFrame
extended_df = pd.read_csv('../data/4_extended_spotify_data.csv')

# Create a copy of the first 10 rows for testing
test_df = extended_df.head(10).copy()

all_genres = set()        # every distinct genre seen
artist_cache = {}         # artist_id -> (popularity, genres); one request per artist
popularity_by_row = []    # per-row results, aligned with test_df's row order
genres_by_row = []
total_rows = len(test_df)

# Process test rows
for position, artist_id in enumerate(test_df['artist_id'], start=1):
    popularity, genres = None, []

    # A missing id read from CSV is NaN, which is truthy, so a plain
    # `if artist_id:` does not filter it out; test for NaN explicitly.
    if pd.notna(artist_id) and artist_id:
        print(f"Processing row {position} of {total_rows}")

        if artist_id not in artist_cache:
            try:
                # Fetch artist information from the Spotify API
                artist_info = sp.artist(artist_id)
                artist_cache[artist_id] = (
                    artist_info.get('popularity', None),
                    list(artist_info.get('genres') or []),
                )
            except Exception as e:
                # Best effort: report and leave this row empty. Failures are
                # not cached, so a later row with the same artist retries.
                print(f"Error fetching data for artist_id {artist_id}: {e}")

            # Throttle only when a request was actually sent.
            time.sleep(0.5)

        if artist_id in artist_cache:
            popularity, genres = artist_cache[artist_id]
            print(f"Found genres: {genres}")  # Debug print

    all_genres.update(genres)
    popularity_by_row.append(popularity)
    genres_by_row.append(genres)

test_df['artist_popularity'] = popularity_by_row

# Convert genres to one-hot encoded columns
print("\nCreating one-hot encoded columns for genres...")
# Sorted so the column order is the same on every run (set iteration order is
# not), and attached with one concat instead of one column insert per genre.
genre_flags = pd.DataFrame(
    {
        genre: [genre in row_genres for row_genres in genres_by_row]
        for genre in sorted(all_genres)
    },
    index=test_df.index,
)
test_df = pd.concat([test_df, genre_flags], axis=1)

# Save the test DataFrame to a new CSV
test_output_path = '../data/test_spotify_data_with_genres.csv'
test_df.to_csv(test_output_path, index=False)

print(f"\nTest dataset saved successfully to {test_output_path}")
print(f"Number of rows: {len(test_df)}")
print(f"Number of unique genres found: {len(all_genres)}")
print("\nUnique genres found:")
print(sorted(all_genres))

# Display the first few rows of the result
print("\nFirst few rows of the processed data:")
print(test_df.head())

Processing row 1 of 10




KeyboardInterrupt: 

### Code to get the genres and artist popularity for all rows.

In [21]:
import pandas as pd
import spotipy as sp
import time
import ast
from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os

# Full run: add each artist's popularity and one boolean column per genre to
# every row of the track-level dataset, then save the result.

# Load Spotify credentials from the .env file into the process environment.
load_dotenv()
client_id = os.getenv("SPOTIPY_CLIENT_ID")
client_secret = os.getenv("SPOTIPY_CLIENT_SECRET")

if not client_id or not client_secret:
    raise ValueError("Spotify credentials are missing. Make sure SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET are set in your .env file.")

# Initialize the Spotify client (`sp` rebinds the `import spotipy as sp` alias).
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = Spotify(client_credentials_manager=client_credentials_manager)

# Load your existing DataFrame
extended_df = pd.read_csv('../data/4_extended_spotify_data.csv')

# Throttling knobs.
REQUEST_DELAY_SECONDS = 0.5    # short delay after every request
REQUESTS_PER_PAUSE = 50        # take a long pause after this many requests
PAUSE_SECONDS = 30             # length of the long pause

all_genres = set()        # every distinct genre seen
artist_cache = {}         # artist_id -> (popularity, genres); one request per artist
popularity_by_row = []    # per-row results, aligned with extended_df's row order
genres_by_row = []
api_call_count = 0        # requests actually sent to Spotify
total_rows = len(extended_df)

# Process all rows
for position, artist_id in enumerate(extended_df['artist_id'], start=1):
    popularity, genres = None, []

    # A missing id read from CSV is NaN, which is truthy, so a plain
    # `if artist_id:` does not filter it out; test for NaN explicitly.
    if pd.notna(artist_id) and artist_id:
        print(f"Processing row {position} of {total_rows}")

        if artist_id not in artist_cache:
            try:
                # Fetch artist information from the Spotify API
                artist_info = sp.artist(artist_id)
                artist_cache[artist_id] = (
                    artist_info.get('popularity', None),
                    list(artist_info.get('genres') or []),
                )
            except Exception as e:
                # Best effort: report and leave this row empty. Failures are
                # not cached, so a later row with the same artist retries.
                print(f"Error fetching data for artist_id {artist_id}: {e}")

            # Throttle on requests actually sent, not on row labels: rows that
            # are served from the cache or skipped cost no API quota.
            api_call_count += 1
            time.sleep(REQUEST_DELAY_SECONDS)
            if api_call_count % REQUESTS_PER_PAUSE == 0:
                print(f"Processed {position} rows. Waiting for {PAUSE_SECONDS} seconds to avoid rate limit...")
                time.sleep(PAUSE_SECONDS)

        if artist_id in artist_cache:
            popularity, genres = artist_cache[artist_id]

    all_genres.update(genres)
    popularity_by_row.append(popularity)
    genres_by_row.append(genres)

extended_df['artist_popularity'] = popularity_by_row

# Convert genres to one-hot encoded columns
print("Creating one-hot encoded columns for genres...")
# Sorted so the column order is the same on every run (set iteration order is
# not), and attached with one concat instead of one column insert per genre.
genre_flags = pd.DataFrame(
    {
        genre: [genre in row_genres for row_genres in genres_by_row]
        for genre in sorted(all_genres)
    },
    index=extended_df.index,
)
extended_df = pd.concat([extended_df, genre_flags], axis=1)

# Save the updated DataFrame to a new CSV
output_path = '../data/5_extended_spotify_data_with_genres.csv'
extended_df.to_csv(output_path, index=False)

print(f"Dataset saved successfully to {output_path} with {len(extended_df)} rows and {len(all_genres)} genre columns!")
print(f"Total number of unique genres found: {len(all_genres)}")

Processing row 1 of 6277




KeyboardInterrupt: 