# Read data from the Kaggle dataset, fetch preview URLs, and save them to a file

## This should be run only once

In [4]:
import os
import time

import pandas as pd
from tqdm import tqdm

from spotify_preview import get_spotify_preview_url

# --- 0) Load & dedupe your balanced track/genre file ---
# Rows with a missing track_id or genre are removed BEFORE de-duplicating:
# drop_duplicates keeps the first row per track_id, so if that first row had no
# genre the whole track would be lost even when a later duplicate row carries
# a valid genre. A missing track_id cannot be looked up, so it is dropped too.
df = (
    pd.read_csv('track_genres_balanced.csv')
      .dropna(subset=['track_id', 'genre'])
      .drop_duplicates(subset=['track_id'])
)

# --- 1) Prep output CSV (header only once) ---
output_csv = 'track_genre_balanced_url.csv'

# A zero-byte file has no header row and pd.read_csv would raise
# EmptyDataError on it, so it is handled like a missing file.
if os.path.exists(output_csv) and os.path.getsize(output_csv) > 0:
    # Resume: load the track_ids already saved and drop them from df.
    # Both sides are compared as strings so the dtype pandas inferred for the
    # column cannot cause spurious mismatches.
    done = pd.read_csv(output_csv, usecols=['track_id'])
    processed_ids = set(done['track_id'].astype(str))
    df = df[~df['track_id'].astype(str).isin(processed_ids)]
else:
    # New (or empty) file → write the header row once.
    pd.DataFrame(columns=['track_id', 'genre', 'preview']) \
      .to_csv(output_csv, index=False)

# The header is on disk in both cases, so later appends must never write it.
write_header = False

# --- 2) Parameters ---
BATCH_SIZE = 100   # how many tracks to process per “mini‐batch”
PAUSE      = 0.1   # seconds to sleep after each API call (0 disables the pause)

# --- 3) Loop with a single progress bar over all tracks ---
# Tracks are processed in batches of BATCH_SIZE; each batch's successful
# lookups are appended to output_csv so progress survives across runs.
with tqdm(total=len(df), desc="Fetching previews") as pbar:
    for start in range(0, len(df), BATCH_SIZE):
        chunk = df.iloc[start:start + BATCH_SIZE]
        rows = []

        try:
            # 4) Per‐track lookup (only the two needed columns are iterated)
            for track_id, genre in zip(chunk['track_id'], chunk['genre']):
                preview = get_spotify_preview_url(track_id)
                if preview:
                    rows.append({
                        'track_id': track_id,
                        'genre':    genre,
                        'preview':  preview,
                    })
                # Tracks without a preview are skipped and not recorded.

                pbar.update(1)
                if PAUSE:
                    time.sleep(PAUSE)
        finally:
            # 5) Append this batch’s hits to disk. Doing it in `finally` means
            # an exception or Ctrl-C in the middle of a batch does not throw
            # away the previews already fetched for that batch.
            if rows:
                pd.DataFrame(rows, columns=['track_id', 'genre', 'preview']).to_csv(
                    output_csv,
                    mode='a',
                    header=False,
                    index=False,
                )


Fetching previews:   6%|▋         | 76/1183 [00:35<33:39,  1.82s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/64ffsubBonytxZc5fQJhdO


Fetching previews:   9%|▊         | 102/1183 [00:55<34:18,  1.90s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2Iu5wxKFiEEQDQK1Pldsis


Fetching previews:   9%|▉         | 111/1183 [01:03<33:10,  1.86s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/6syvS9gZzjB8b9DdKVhAJH


Fetching previews:  15%|█▌        | 180/1183 [01:54<53:30,  3.20s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2qrVR11O44iJ0DVTNCExjA


Fetching previews:  19%|█▉        | 225/1183 [02:25<29:37,  1.86s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3njPW0vttbjt5j1Elt6sJI


Fetching previews:  32%|███▏      | 381/1183 [03:26<23:39,  1.77s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3T7zNYia3nk9d8uXhO9Xud


Fetching previews:  53%|█████▎    | 630/1183 [05:23<16:28,  1.79s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/41Sfs0E8hr8w2BvzUtof4O


Fetching previews:  54%|█████▎    | 633/1183 [05:29<20:57,  2.29s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3H9aA6IO5gfHW72m8YU8Iv


Fetching previews:  57%|█████▋    | 675/1183 [05:56<15:49,  1.87s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/0lvHnw9Exl8jLV3zuRsksJ


Fetching previews:  67%|██████▋   | 792/1183 [07:06<12:08,  1.86s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/17sSDGIRIkB0jOKb2cBURf


Fetching previews:  77%|███████▋  | 911/1183 [08:03<09:15,  2.04s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/5RcZ5jbBgKDdM6BuoSeh8P


Fetching previews:  77%|███████▋  | 912/1183 [08:08<13:32,  3.00s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/0YQrHOpi219lZA8SDly4iG


Fetching previews:  90%|█████████ | 1069/1183 [09:31<03:31,  1.85s/it]

Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2iql0ydkQX1hZ375EyRFFF


Fetching previews: 100%|██████████| 1183/1183 [10:19<00:00,  1.91it/s]
