# Spotify Track Images Retrival using Python

## Workflow 
1. Read CSV
2. Fetch Unique ids from Main datframe  (spotify_history.csv)
3. fetch ids where image is there (not null or not 'no') from Music.csv
4. compare how many id from main are present in Music.csv
5. make a list of ids where image is not there 
6. fetch image using a loop . each loop will run for 10 times and then again fetch access token and run the loop to fetch images from spotify

In [3]:
# Library 
import numpy as np
import pandas as pd

# Load the main dataset
df_main = pd.read_csv("spotify_history.csv")

# Load the dataset with existing images
df_sub = pd.read_csv("Music.csv")

In [5]:
# Basic info 
print(df_main.shape)
print(df_sub.shape)

print(df_main.columns)
print(df_sub.columns)

(149860, 11)
(65795, 20)
Index(['spotify_track_uri', 'ts', 'platform', 'ms_played', 'track_name',
       'artist_name', 'album_name', 'reason_start', 'reason_end', 'shuffle',
       'skipped'],
      dtype='object')
Index(['name', 'artist', 'spotify_id', 'preview', 'img', 'danceability',
       'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'acousticness_artist', 'danceability_artist',
       'energy_artist', 'instrumentalness_artist', 'liveness_artist',
       'speechiness_artist', 'valence_artist'],
      dtype='object')


In [7]:
df_main.head(3)

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,False
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,False
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,False


In [9]:
df_sub.head(3)

Unnamed: 0,name,artist,spotify_id,preview,img,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,acousticness_artist,danceability_artist,energy_artist,instrumentalness_artist,liveness_artist,speechiness_artist,valence_artist
0,Mood (feat. iann dior),24kGoldn,3tjFYV6RSFtuktYl3ZtYcq,https://p.scdn.co/mp3-preview/45cb08fdb67744ab...,https://i.scdn.co/image/ab67616d0000b273ff8c98...,0.7,0.722,-3.558,0.0369,0.221,0.0,0.272,0.756,0.118269,0.731588,0.681235,2e-06,0.16,0.123765,0.566824
1,Blinding Lights,The Weeknd,0VjIjW4GlUZAMYd2vXMi3b,,https://i.scdn.co/image/ab67616d0000b2738863bc...,0.514,0.73,-5.934,0.0598,0.00146,9.5e-05,0.0897,0.334,0.271439,0.574808,0.607873,0.010594,0.201728,0.086811,0.305189
2,Dynamite,BTS,0t1kP63rueHleOhQkYSXFY,https://p.scdn.co/mp3-preview/a707728846c105f4...,https://i.scdn.co/image/ab67616d0000b273755995...,0.746,0.765,-4.41,0.0993,0.0112,0.0,0.0936,0.737,0.09935,0.614798,0.786452,4.4e-05,0.202608,0.128515,0.533369


## Conclussion : 
1. Main datagframe has no images but Music has image path. 
2. spotify_track_uri and spotify_id are same. 

In [12]:
# Get unique track IDs from df_main
unique_main_ids = set(df_main['spotify_track_uri'].unique())

In [14]:
# Filter IDs where image is available
available_image_ids = set(df_sub.loc[(df_sub['img'].notnull()) & (df_sub['img'] != 'no'), 'spotify_id'].unique())

In [18]:
# Find common IDs (IDs that already have images)
common_ids = unique_main_ids.intersection(available_image_ids)

# Count of matching IDs
print(f"Total IDs in main dataset: {len(unique_main_ids)}")
print(f"Total IDs with available images: {len(available_image_ids)}")
print(f"Total matching IDs: {len(common_ids)}")

Total IDs in main dataset: 16527
Total IDs with available images: 37414
Total matching IDs: 2601


In [22]:
# Find track IDs that are in df_main but not in df_sub (i.e., missing images)
missing_ids = unique_main_ids - available_image_ids

# Convert to a list for further processing
missing_ids_list = list(missing_ids)

print(f"Total missing IDs (need image fetching): {len(missing_ids_list)}")

Total missing IDs (need image fetching): 13926


In [26]:
filtered_df = df_main[df_main['track_name'].str.contains(r'\b100,000 People\b', regex=True, na=False)]
filtered_df

Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
90566,1CGFOwYJ4FIDZysCcVzaFS,2021-01-11 20:24:08,android,344021,"100,000 People",Kings of Leon,"The Bandit / 100,000 People",trackdone,trackdone,True,False
90806,1CGFOwYJ4FIDZysCcVzaFS,2021-01-14 00:42:58,android,214184,"100,000 People",Kings of Leon,"The Bandit / 100,000 People",trackdone,logout,False,False
91007,1CGFOwYJ4FIDZysCcVzaFS,2021-01-15 05:04:21,android,344021,"100,000 People",Kings of Leon,"The Bandit / 100,000 People",trackdone,trackdone,False,False
91066,1CGFOwYJ4FIDZysCcVzaFS,2021-01-15 20:26:17,android,344021,"100,000 People",Kings of Leon,"The Bandit / 100,000 People",trackdone,trackdone,False,False
91101,1CGFOwYJ4FIDZysCcVzaFS,2021-01-16 01:01:00,android,344021,"100,000 People",Kings of Leon,"The Bandit / 100,000 People",trackdone,trackdone,False,False
...,...,...,...,...,...,...,...,...,...,...,...
144776,2wuxCm6QikYr215FwRMF1I,2024-07-02 15:32:59,mac,344013,"100,000 People",Kings of Leon,When You See Yourself,trackdone,trackdone,False,False
144976,2wuxCm6QikYr215FwRMF1I,2024-07-20 23:52:09,android,344013,"100,000 People",Kings of Leon,When You See Yourself,trackerror,trackdone,True,False
145900,2wuxCm6QikYr215FwRMF1I,2024-08-20 03:51:28,android,184466,"100,000 People",Kings of Leon,When You See Yourself,fwdbtn,logout,True,False
145901,2wuxCm6QikYr215FwRMF1I,2024-08-20 06:12:49,android,160562,"100,000 People",Kings of Leon,When You See Yourself,appload,trackdone,True,False


In [79]:
import time
import requests

# Function to get Spotify Access Token
def get_spotify_token(client_id, client_secret):
    auth_url = 'https://accounts.spotify.com/api/token'
    auth_response = requests.post(auth_url, {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    })
    auth_data = auth_response.json()
    return auth_data['access_token']

"""# Function to get image URL for a track
def fetch_image_url(track_id, token):
    url = f"https://api.spotify.com/v1/tracks/{track_id}"
    response = requests.get(url, headers={'Authorization': f'Bearer {token}'})
    
    if response.status_code == 200:
        track_data = response.json()
        return track_data['album']['images'][0]['url']
    elif response.status_code == 401:  # Token expired
        return "TOKEN_EXPIRED"
    else:
        return None  # Track not found or other error """
# Function to get image URL for a track
def fetch_image_url(track_id, token):
    url = f"https://api.spotify.com/v1/tracks/{track_id}"
    response = requests.get(url, headers={'Authorization': f'Bearer {token}'})

    if response.status_code == 200:
        track_data = response.json()

        # Check if 'album' key exists and has 'images'
        if 'album' in track_data and 'images' in track_data['album'] and track_data['album']['images']:
            return track_data['album']['images'][0]['url']
        else:
            return "NO_IMAGE"  # Handle missing images gracefully

    elif response.status_code == 401:  # Token expired
        return "TOKEN_EXPIRED"
    else:
        return None  # Track not found or other error

# Spotify API credentials (Replace with your own)
CLIENT_ID = 'f7d77d6ae3f14dc8baebb37b9e758cb1'
CLIENT_SECRET = '29b46d3070974bf1a9837674c9a7d988'

# Get initial access token
token = get_spotify_token(CLIENT_ID, CLIENT_SECRET)

In [81]:
# Dictionary to store the new image URLs
new_images = {}

# Fetch images in batches of 10
for i, track_id in enumerate(missing_ids_list):
    image_url = fetch_image_url(track_id, token)

    # Handle token expiration
    if image_url == "TOKEN_EXPIRED":
        print("Refreshing token...")
        token = get_spotify_token(CLIENT_ID, CLIENT_SECRET)
        image_url = fetch_image_url(track_id, token)

    # Store the image URL
    if image_url:
        new_images[track_id] = image_url

    # Refresh token after every 10 requests
    if (i + 1) % 10 == 0:
        print(f"Fetched {i+1}/{len(missing_ids_list)} images. Refreshing token...")
        token = get_spotify_token(CLIENT_ID, CLIENT_SECRET)
        time.sleep(1)  # Sleep to avoid rate limits

print(f"Total fetched images: {len(new_images)}")

Fetched 10/13926 images. Refreshing token...
Fetched 20/13926 images. Refreshing token...
Fetched 30/13926 images. Refreshing token...
Fetched 40/13926 images. Refreshing token...
Fetched 50/13926 images. Refreshing token...
Fetched 60/13926 images. Refreshing token...
Fetched 70/13926 images. Refreshing token...
Fetched 80/13926 images. Refreshing token...
Fetched 90/13926 images. Refreshing token...
Fetched 100/13926 images. Refreshing token...
Fetched 110/13926 images. Refreshing token...
Fetched 120/13926 images. Refreshing token...
Fetched 130/13926 images. Refreshing token...
Fetched 140/13926 images. Refreshing token...
Fetched 150/13926 images. Refreshing token...
Fetched 160/13926 images. Refreshing token...
Fetched 170/13926 images. Refreshing token...
Fetched 180/13926 images. Refreshing token...
Fetched 190/13926 images. Refreshing token...
Fetched 200/13926 images. Refreshing token...
Fetched 210/13926 images. Refreshing token...
Fetched 220/13926 images. Refreshing token.

In [83]:
new_images

{'03Ntkzzjkz7nFJldcPbL90': 'https://i.scdn.co/image/ab67616d0000b273687f3027948e525c171d0440',
 '4sS9IzHSH6HLWeC6Ies0LX': 'https://i.scdn.co/image/ab67616d0000b273e0cbf625a33f6aa1accaeef4',
 '7xpraKSakDJ6WWgjQEtvgT': 'https://i.scdn.co/image/ab67616d0000b2730bd846ed7a25f0ed4e313c35',
 '3qwVqJyXKNiPZLz9VBMd6r': 'https://i.scdn.co/image/ab67616d0000b27381106b7ade8ee85b56545f71',
 '6gj08XDlv9Duc2fPOxUmVD': 'https://i.scdn.co/image/ab67616d0000b2733b11178cccd78ec77fc12dbc',
 '4SXqvKpxCmPBKrmPPkR7y5': 'https://i.scdn.co/image/ab67616d0000b273467f9e87cc0a3cb2ff705f93',
 '3nN5n70vM1tAMWBDZaRbxS': 'https://i.scdn.co/image/ab67616d0000b27315287ceace3d570a5491b834',
 '53PHRtl7ZP8gkiWiX4rm16': 'https://i.scdn.co/image/ab67616d0000b273c4821abf3351f0eedd6aff3b',
 '6fpIcWxRGGqa681gUHJonh': 'https://i.scdn.co/image/ab67616d0000b273d87dd9780854f07967cae8ae',
 '4IJovoHQkKgwmH9SzAKcB8': 'https://i.scdn.co/image/ab67616d0000b2736462162d227ec2be2ebee12d',
 '7y5nJToPWMjtJLcLEGxyGf': 'https://i.scdn.co/imag

In [87]:
df_new_images = pd.DataFrame(list(new_images.items()), columns=['track_id', 'image_url'])

In [91]:
df_new_images.columns

Index(['track_id', 'image_url'], dtype='object')

In [101]:
df_sub.columns

Index(['name', 'artist', 'spotify_id', 'preview', 'img', 'danceability',
       'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'acousticness_artist', 'danceability_artist',
       'energy_artist', 'instrumentalness_artist', 'liveness_artist',
       'speechiness_artist', 'valence_artist'],
      dtype='object')

In [109]:
# Filter out rows with null or 'no' values in the 'img' column in df_sub
df_sub_filtered = df_sub[df_sub['img'].notnull() & (df_sub['img'] != 'no')]

# Select the relevant columns from df_sub_filtered
df_sub_filtered = df_sub_filtered[['spotify_id', 'img']]

# Rename the columns to match df_new_images
df_sub_filtered = df_sub_filtered.rename(columns={'spotify_id': 'track_id', 'img': 'image_url'})

# Use pd.concat to append the filtered df_sub to df_new_images
df_final = pd.concat([df_new_images, df_sub_filtered], ignore_index=True)

# Display the final dataframe
df_final

                     track_id  \
0      03Ntkzzjkz7nFJldcPbL90   
1      4sS9IzHSH6HLWeC6Ies0LX   
2      7xpraKSakDJ6WWgjQEtvgT   
3      3qwVqJyXKNiPZLz9VBMd6r   
4      6gj08XDlv9Duc2fPOxUmVD   
...                       ...   
39176  5v4NKLZzmlPtmHH9NZy3Cn   
39177  3wOHgqLMIqHierF0D8VoMl   
39178  5QP8hImTI6nJFXIcewXM64   
39179  323bMKFBvCrQ7YSrdCps3W   
39180  6yOghbMYTTUdOGiExT0rIF   

                                               image_url  
0      https://i.scdn.co/image/ab67616d0000b273687f30...  
1      https://i.scdn.co/image/ab67616d0000b273e0cbf6...  
2      https://i.scdn.co/image/ab67616d0000b2730bd846...  
3      https://i.scdn.co/image/ab67616d0000b27381106b...  
4      https://i.scdn.co/image/ab67616d0000b2733b1117...  
...                                                  ...  
39176  https://i.scdn.co/image/ab67616d0000b27302d1c8...  
39177  https://i.scdn.co/image/ab67616d0000b273778ef2...  
39178  https://i.scdn.co/image/ab67616d0000b2733273a4...  
39179  https:

In [115]:
df_final[df_final.image_url=='no']

Unnamed: 0,track_id,image_url


In [117]:
df_final.nunique()

track_id     39181
image_url    20842
dtype: int64

In [119]:
df_final.to_csv('final_images.csv', index=False)

In [None]:
https://i.scdn.co/image/ab67616d0000b2739101f0c8469b9a840dcd9a58
https://i.scdn.co/image/ab67616d0000b273b19ec20d6f01971a89bbb1ac
https://i.scdn.co/image/ab67616d000048513781f893aec9251308dddfe5
https://i.scdn.co/image/ab67616d00004851c9e961be2679c41930a370cd
https://i.scdn.co/image/ab67616d0000b273c6bfaf942ed981d5c4c922e4
https://i.scdn.co/image/ab67616d0000b27341720ef0ae31e10d39e43ca2
https://i.scdn.co/image/ab67616d0000b27314ba3e999835ddf8012772de
https://i.scdn.co/image/ab67616d0000b273bfa99afb5ef0d26d5064b23b
https://i.scdn.co/image/ab67616d0000b27301003bf641243fcc56944428
https://i.scdn.co/image/ab67616d0000b2735400a55ed99557265d925dc4
https://i.scdn.co/image/ab67616d0000b273a03bcdbf45f9ed71cf0947bd
https://i.scdn.co/image/ab67616d0000b27301003bf641243fcc56944428

In [121]:
import pandas as pd

# List of track IDs and image URLs (replace with your actual data)
track_ids = [
    '2GAljG6WxV0XU7N88TWhb1', '36RlHKPmB8uiS422NQWsTi', '6nRwc5GgNvBMkKaynhQzrm', 
    '74X1epeRufHckhuX1KFD04', '77Y57qRJBvkGCUw9qs0qMg', '4n1ZGm3TxYmoYe1YR8cMus', 
    '0Pjsm4AaJGPOXHoB6xCgJB', '1BLOVHYYlH4JUHQGcpt75R', '0FQLQfuCKSlcVSOPscqCJ6', 
    '3LXFi1Xk9qJsYXYEjDcuVH', '714hERk9U1W8FMYkoC83CO', '0FQLQfuCKSlcVSOPscqCJ6'
]

image_urls = [
    'https://i.scdn.co/image/ab67616d0000b2739101f0c8469b9a840dcd9a58', 
    'https://i.scdn.co/image/ab67616d0000b273b19ec20d6f01971a89bbb1ac', 
    'https://i.scdn.co/image/ab67616d000048513781f893aec9251308dddfe5', 
    'https://i.scdn.co/image/ab67616d00004851c9e961be2679c41930a370cd', 
    'https://i.scdn.co/image/ab67616d0000b273c6bfaf942ed981d5c4c922e4', 
    'https://i.scdn.co/image/ab67616d0000b27341720ef0ae31e10d39e43ca2', 
    'https://i.scdn.co/image/ab67616d0000b27314ba3e999835ddf8012772de', 
    'https://i.scdn.co/image/ab67616d0000b273bfa99afb5ef0d26d5064b23b', 
    'https://i.scdn.co/image/ab67616d0000b27301003bf641243fcc56944428', 
    'https://i.scdn.co/image/ab67616d0000b2735400a55ed99557265d925dc4', 
    'https://i.scdn.co/image/ab67616d0000b273a03bcdbf45f9ed71cf0947bd', 
    'https://i.scdn.co/image/ab67616d0000b27301003bf641243fcc56944428'
]

# Create a DataFrame
df = pd.DataFrame({
    'track_id': track_ids,
    'image_url': image_urls
})

# Save DataFrame to CSV
df.to_csv('track_images.csv', index=False)

# Display the DataFrame
print(df)

                  track_id                                          image_url
0   2GAljG6WxV0XU7N88TWhb1  https://i.scdn.co/image/ab67616d0000b2739101f0...
1   36RlHKPmB8uiS422NQWsTi  https://i.scdn.co/image/ab67616d0000b273b19ec2...
2   6nRwc5GgNvBMkKaynhQzrm  https://i.scdn.co/image/ab67616d000048513781f8...
3   74X1epeRufHckhuX1KFD04  https://i.scdn.co/image/ab67616d00004851c9e961...
4   77Y57qRJBvkGCUw9qs0qMg  https://i.scdn.co/image/ab67616d0000b273c6bfaf...
5   4n1ZGm3TxYmoYe1YR8cMus  https://i.scdn.co/image/ab67616d0000b27341720e...
6   0Pjsm4AaJGPOXHoB6xCgJB  https://i.scdn.co/image/ab67616d0000b27314ba3e...
7   1BLOVHYYlH4JUHQGcpt75R  https://i.scdn.co/image/ab67616d0000b273bfa99a...
8   0FQLQfuCKSlcVSOPscqCJ6  https://i.scdn.co/image/ab67616d0000b27301003b...
9   3LXFi1Xk9qJsYXYEjDcuVH  https://i.scdn.co/image/ab67616d0000b2735400a5...
10  714hERk9U1W8FMYkoC83CO  https://i.scdn.co/image/ab67616d0000b273a03bcd...
11  0FQLQfuCKSlcVSOPscqCJ6  https://i.scdn.co/image/ab67616d0000

In [125]:
df_final2= pd.concat([df_final, df], ignore_index=True)

In [129]:
df_final.shape

(39181, 2)

In [131]:
df_final2.to_csv('final_images.csv', index=False)