### This notebook is to scrape the album cover of the 20k sampled songs

In [None]:
# === 0) Imports & setup ===
import os, io, time, math, requests
import pandas as pd
from pathlib import Path
from PIL import Image
from tqdm import tqdm

# Spotify
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Optional: set your credentials here or in your env
os.environ['SPOTIPY_CLIENT_ID'] = '####################'
os.environ['SPOTIPY_CLIENT_SECRET'] = '######################'

ALBUM_OUT_DIR = Path("image_data/album_cover")
ALBUM_OUT_DIR.mkdir(parents=True, exist_ok=True)

In [25]:
# Load the data 
spotify_train_20k = pd.read_csv('spotify_data/spotify_train_20k.csv')
spotify_test_20k = pd.read_csv('spotify_data/spotify_test_20k.csv')


spotify_sample_20k = pd.concat([spotify_train_20k, spotify_test_20k])

# try on the first 5 songs (comment this when trying to scale)
# spotify_sample_20k = spotify_sample_20k.head(5)


In [26]:
# === 1) Helpers ===
def init_spotify():
    return spotipy.Spotify(
        auth_manager=SpotifyClientCredentials(),
        requests_timeout=20,
        retries=3
    )

def pick_best_image(images, prefer_min_px=640):
    if not images:
        return None
    images_sorted = sorted(images, key=lambda im: min(im.get("height",0), im.get("width",0)))
    for im in images_sorted:
        if min(im.get("height",0), im.get("width",0)) >= prefer_min_px:
            return im
    return images_sorted[-1]

def download_image(url, out_path):
    try:
        r = requests.get(url, timeout=20)
        r.raise_for_status()
        img = Image.open(io.BytesIO(r.content)).convert("RGB")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        img.save(out_path, format="JPEG", quality=92)
        return img.size  # (w,h)
    except Exception:
        return None

def get_deezer_cover_from_isrc(isrc):
    try:
        r = requests.get("https://api.deezer.com/search", params={"q": f"isrc:{isrc}"}, timeout=15)
        r.raise_for_status()
        data = r.json().get("data", [])
        if not data:
            return None
        alb = data[0].get("album", {}) or {}
        return alb.get("cover_xl") or alb.get("cover_big") or alb.get("cover_medium") or alb.get("cover")
    except Exception:
        return None

def fetch_album_covers_spotify(track_ids, out_dir=ALBUM_OUT_DIR, batch_size=50, sleep_s=0.0):
    """Return DF with: track_id, track_name, artist_name, album_name, release_date,
       album_cover_url, album_cover_path, cover_width, cover_height, isrc
    """
    sp = init_spotify()
    rows = []
    # dedupe while preserving order
    seen = set()
    track_ids = [tid for tid in track_ids if (tid and not (tid in seen or seen.add(tid)))]

    for i in tqdm(range(0, len(track_ids), batch_size), desc="Spotify covers"):
        chunk = track_ids[i:i+batch_size]
        try:
            resp = sp.tracks(chunk) or {}
            tracks = resp.get("tracks", []) or []
        except Exception:
            tracks = []

        for t in tracks:
            if not t:
                continue
            tid   = t.get("id")
            tname = t.get("name")
            aname = ", ".join([a["name"] for a in t.get("artists", [])]) or None
            alb   = t.get("album", {}) or {}
            an    = alb.get("name")
            rd    = alb.get("release_date")
            images = alb.get("images", []) or []
            isrc  = (t.get("external_ids") or {}).get("isrc")

            chosen = pick_best_image(images, prefer_min_px=640)
            url    = chosen.get("url") if chosen else None

            cover_path = None
            w = h = None
            if url and tid:
                out_path = out_dir / f"{tid}.jpg"
                size = download_image(url, out_path)
                if size:
                    w, h = size
                    cover_path = str(out_path)

            rows.append({
                "track_id": tid,
                "track_name": tname,
                "artist_name": aname,
                "album_name": an,
                "release_date": rd,
                "album_cover_url": url,
                "album_cover_path": cover_path,
                "cover_width": w,
                "cover_height": h,
                "isrc": isrc,
            })

        if sleep_s > 0:
            time.sleep(sleep_s)

    return pd.DataFrame(rows)

def fetch_album_covers_deezer_isrc(isrc_list, out_dir=ALBUM_OUT_DIR):
    rows = []
    for isrc in tqdm(isrc_list, desc="Deezer fallback"):
        url = get_deezer_cover_from_isrc(isrc)
        w = h = None
        cover_path = None
        if url:
            out_path = out_dir / f"{isrc}.jpg"
            size = download_image(url, out_path)
            if size:
                w, h = size
                cover_path = str(out_path)
        rows.append({
            "isrc": isrc,
            "deezer_cover_url": url,
            "deezer_cover_path": cover_path,
            "deezer_cover_width": w,
            "deezer_cover_height": h,
        })
    return pd.DataFrame(rows)


In [27]:
# === 2) Run on your df_sample_10k ===
# Required column: 'track_id'
track_ids_all = (
    spotify_sample_20k["track_id"]
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)

covers_spotify = fetch_album_covers_spotify(track_ids_all, out_dir=ALBUM_OUT_DIR)

# Deezer fallback for rows still missing local image BUT having ISRC
need_fallback = covers_spotify[
    covers_spotify["album_cover_path"].isna() & covers_spotify["isrc"].notna()
]["isrc"].dropna().unique().tolist()

if len(need_fallback) > 0:
    deezer_df = fetch_album_covers_deezer_isrc(need_fallback, out_dir=ALBUM_OUT_DIR)
    # Join Deezer results back
    covers_spotify = covers_spotify.merge(deezer_df, on="isrc", how="left")
    # fill missing path/url/size from deezer
    covers_spotify["album_cover_path"] = covers_spotify["album_cover_path"].fillna(covers_spotify["deezer_cover_path"])
    covers_spotify["album_cover_url"]  = covers_spotify["album_cover_url"].fillna(covers_spotify["deezer_cover_url"])
    covers_spotify["cover_width"]  = covers_spotify["cover_width"].fillna(covers_spotify["deezer_cover_width"])
    covers_spotify["cover_height"] = covers_spotify["cover_height"].fillna(covers_spotify["deezer_cover_height"])
    covers_spotify.drop(columns=[c for c in covers_spotify.columns if c.startswith("deezer_")], inplace=True, errors="ignore")


Spotify covers: 100%|██████████| 430/430 [1:03:13<00:00,  8.82s/it]
Deezer fallback: 100%|██████████| 2/2 [00:00<00:00,  4.62it/s]
  covers_spotify["cover_width"]  = covers_spotify["cover_width"].fillna(covers_spotify["deezer_cover_width"])
  covers_spotify["cover_height"] = covers_spotify["cover_height"].fillna(covers_spotify["deezer_cover_height"])


In [28]:
# === 3) Merge back to your 10k sample & save a manifest ===
cols_keep = [
    "track_id","track_name","artist_name","album_name","release_date",
    "isrc","album_cover_url","album_cover_path","cover_width","cover_height"
]
covers_spotify = covers_spotify[cols_keep].drop_duplicates("track_id")

spotify_sample_20k_with_covers = spotify_sample_20k.merge(covers_spotify, on="track_id", how="left")

# Save a clean manifest (CSV)
MANIFEST_PATH = Path("image_data/spotify_album_cover_manifest.csv")
MANIFEST_PATH.parent.mkdir(parents=True, exist_ok=True)
spotify_sample_20k_with_covers.to_csv(MANIFEST_PATH, index=False)


In [29]:
# === 4) Quick summary ===
total = len(spotify_sample_20k_with_covers)
saved = spotify_sample_20k_with_covers["album_cover_path"].notna().sum()
print(f"Album covers saved locally: {saved}/{total} ({saved/total:.1%})")
print(f"Manifest: {MANIFEST_PATH}")
spotify_sample_20k_with_covers.head(5)

Album covers saved locally: 21444/21500 (99.7%)
Manifest: image_data\spotify_album_cover_manifest.csv


Unnamed: 0,track_id,artist_name_x,track_name_x,pop,artist_tier,year,danceability,energy,key,loudness,...,time_signature,track_name_y,artist_name_y,album_name,release_date,isrc,album_cover_url,album_cover_path,cover_width,cover_height
0,0EWF6VAofw5mnG0pUzcV3U,Thelem,False Imprint,flop,B,2013,0.672,0.567,0,-10.392,...,4,False Imprint,Thelem,Bring Me Down,2013-10-28,NZIN01200046,https://i.scdn.co/image/ab67616d0000b273315fdf...,image_data\album_cover\0EWF6VAofw5mnG0pUzcV3U.jpg,640.0,640.0
1,5i6oXQ079nFUszXyRZu4uY,Audiojack,6EQUJ5 - Original Mix,regular,A,2017,0.805,0.62,4,-8.551,...,4,6EQUJ5 - Original Mix,Audiojack,Implications,2017-06-09,GB7NR1718103,https://i.scdn.co/image/ab67616d0000b2731329eb...,image_data\album_cover\5i6oXQ079nFUszXyRZu4uY.jpg,640.0,640.0
2,76Nd1pesCxNehpUl4k63I4,The Little Willies,Easy As The Rain,regular,A,2006,0.576,0.163,2,-14.544,...,4,Easy As The Rain,The Little Willies,The Little Willies,2006-01-01,USBN20500480,https://i.scdn.co/image/ab67616d0000b2734d7a43...,image_data\album_cover\76Nd1pesCxNehpUl4k63I4.jpg,640.0,640.0
3,6ZthdsKjWtiCxnxbhs74vF,Gunna,Drip or Drown,hit,A,2017,0.952,0.478,5,-8.52,...,4,Drip or Drown,Gunna,Drip or Drown,2017-11-30,QZ5FA1751845,https://i.scdn.co/image/ab67616d0000b273340cf6...,image_data\album_cover\6ZthdsKjWtiCxnxbhs74vF.jpg,640.0,640.0
4,4FcknXDJ8yW2QYUl0cm6uJ,Imran Khan,Bewafa,hit,B,2009,0.704,0.563,4,-4.739,...,4,Bewafa,Imran Khan,Bewafa,2009-07-27,GBRCG0900013,https://i.scdn.co/image/ab67616d0000b273a9eec0...,image_data\album_cover\4FcknXDJ8yW2QYUl0cm6uJ.jpg,640.0,640.0


In [30]:
# find the rows in spotify_sample_20k_with_covers that have no album cover path
spotify_sample_20k_with_covers[spotify_sample_20k_with_covers["album_cover_path"].isna()]


Unnamed: 0,track_id,artist_name_x,track_name_x,pop,artist_tier,year,danceability,energy,key,loudness,...,time_signature,track_name_y,artist_name_y,album_name,release_date,isrc,album_cover_url,album_cover_path,cover_width,cover_height
16016,5W7DOVGQLTigu09afW7QMT,Sidhu Moose Wala,295,hit,C,2021,0.557,0.794,11,-5.698,...,4,,,,0000,,,,,
18609,7ldMHsWJkczg8QqaJcSVjE,puremind,Malo Tebya,regular,U,2022,0.57,0.92,1,-5.348,...,4,,,,0000,,,,,
18905,1m1apFo0NAaaw7h9NNoG6R,Lawrence,Gilbert [Mixed],regular,A,2023,0.631,0.392,1,-16.066,...,4,Gilbert [Mixed],Lawrence,Connecting The Dots (DJ Mix),2023-01-13,DEU672300078,https://i.scdn.co/image/ab67616d0000b2732992fb...,,,
19136,0QUxWlGwGeV9MZhwgBvTmV,Alberto Gomez,Uno - Orquesta,flop,U,2023,0.521,0.149,3,-8.607,...,3,Uno - Orquesta,Alberto Gomez,"Zorzales de Antaño, Vol. 1: Alberto Gomez",2023-01-25,QM4TW2301517,,,,
19488,3FUBAPKKO6OHkl2fexm0yn,Pxlish Beatz,Malo Tebya - Pxlish Beatz Remix,regular,U,2022,0.402,0.796,7,-6.887,...,4,,,,0000,,,,,
20200,0CCidxt43DTmHtvq465p1L,The Body,Conspiracy Privilege,flop,A,2022,0.334,0.886,2,-4.593,...,4,,,,,,,,,
20201,0nRFROGcDtNFntI8ZtHT5z,Solange,Twinkle Twinkle Little Star,flop,S,2023,0.811,0.626,11,-4.775,...,4,,,,,,,,,
20202,5HP9LwDRT5FGOokrN1HnZ3,Mark Roswell Trio,Blue Moon,regular,U,2023,0.473,0.132,3,-19.769,...,4,,,,,,,,,
20203,3RCTsZxDQZp6dz7QGJNr3a,Lata Mangeshkar,Mai Chali Mai Chali (Remix),flop,A,2023,0.645,0.724,9,-7.443,...,4,,,,,,,,,
20204,1MpCaOeUWhox2Fgigbe1cL,Beyoncé,I'M THAT GIRL,regular,S,2022,0.554,0.535,5,-8.959,...,4,,,,,,,,,
