# Read big csv file into pandas dataframe

In [2]:
import pandas as pd
import csv
import time
from tqdm import tqdm
from spotify_api import get_multiple_field_information
from auth import simple_authenticate

def first_or_none(val):
    if isinstance(val, (list, tuple)):
        print(val[0])
        return val[0] if val else None
    return val

songs_dataframe = pd.read_csv('1_2_m_songs_dataset_deduped.csv')
songs_dataframe = songs_dataframe.dropna(subset=['id'])
songs_dataframe['artist_ids'] = (
    songs_dataframe['artist_ids']
      .astype(str)                            # make sure it’s a string
      .str.extract(r"'([^']*)'")              # grab whatever is inside the first pair of single‐quotes
      .iloc[:,0]                              # get the extracted Series
      .where(songs_dataframe['artist_ids'].notna())        # put NaNs back where they were
)
songs_dataframe = songs_dataframe.dropna(subset=['artist_ids'])
songs_dataframe = songs_dataframe.drop_duplicates(subset=['id'])

bearer_token = simple_authenticate()

OUTPUT_CSV = "1_m_songs.csv"
BATCH_SIZE  = 50        # Spotify "artists" endpoint max
CHUNK_SIZE  = 10_000    # rows per chunk
PAUSE       = 0.1       # seconds between API calls (avoid rate-limit)


# Prepare output file (overwrites)
with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["track_id", "genre"])

# Cache for artist - genre lookups for no double lookups
artist_cache = {}

n = len(songs_dataframe)

# Iterate in row-chunks with a progress bar
for start in tqdm(range(0, n, CHUNK_SIZE), desc="Processing chunks"):

    chunk = songs_dataframe.iloc[start : start + CHUNK_SIZE]
    
    # Find which artist_ids we still need to look up
    to_lookup = [aid for aid in chunk["artist_ids"].unique() 
                 if aid not in artist_cache]
    
    # Call the API in batches of 50
    for i in range(0, len(to_lookup), BATCH_SIZE):
        batch = to_lookup[i : i + BATCH_SIZE]
        resp = get_multiple_field_information(
            bearer_token, 
            "artists", 
            BATCH_SIZE,
            *batch
        )
        # fill cache with missing artist genres combinations
        if resp and "artists" in resp:
            for artist in resp["artists"]:
                aid    = artist.get("id")
                genres = artist.get("genres") or []
                artist_cache[aid] = genres
        else:
            # on failure or empty, record as no-genres
            for aid in batch:
                artist_cache[aid] = []
        time.sleep(PAUSE)
    
    # Build output rows for this chunk
    rows = []
    for _, row in chunk.iterrows():
        aid    = row["artist_ids"]
        genres = artist_cache.get(aid, [])
        if genres:
            primary = genres[0]
            rows.append([row["id"], primary])
        # else: skip any track whose artist has no genres
    
    # Append to CSV
    if rows:
        with open(OUTPUT_CSV, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(rows)



Processing chunks: 100%|██████████| 120/120 [10:34<00:00,  5.29s/it]


# Do genre analyis to avoid unneccecary compute time

In [6]:
import pandas as pd

# 1) Your set of genres
popular_genres = {
    'country', 'metal', 'heavy-metal', 'hardstyle',
    'blues', 'rap', 'hip hop', 'classical', 'folk',
    'jazz', 'lo-fi', 'soul', 'punk', 'r&b', 'latin',
    'rock', 'rock-n-roll', 'techno', 'pop', 'house', 'phonk',
    'indie', 'idm', 'reggae', 'funk', 'acid',
    'death-metal', 'bluegrass', 'acoustic', 'electronic',
    'tango', 'forro', 'breakbeat', 'grindcore'
}

# 2) Load your file
df = pd.read_csv("1_m_songs.csv")  # columns: track_id, genre

# 3) Filter to only those popular genres
df = df[df["genre"].isin(popular_genres)]

# 4) For each genre, sample up to 2000 rows
balanced_parts = []
for genre, grp in df.groupby("genre"):
    if len(grp) > 2000:
        sampled = grp.sample(n=2000, random_state=42)
    else:
        sampled = grp
    balanced_parts.append(sampled)

balanced_df = pd.concat(balanced_parts, ignore_index=True)

# 5) Optionally shuffle the final DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 6) Write out the balanced CSV
balanced_df.to_csv("track_genres_balanced.csv", index=False)

print("Per-genre counts after balancing:")
print(balanced_df["genre"].value_counts())



Per-genre counts after balancing:
genre
reggae        2000
bluegrass     2000
jazz          2000
r&b           2000
idm           2000
folk          2000
classical     2000
tango         2000
grindcore     2000
blues         2000
punk          2000
country       2000
soul          2000
lo-fi         1303
breakbeat      967
metal          668
rap            612
rock           581
indie          526
pop            431
techno         291
hip hop        234
funk           214
house          205
hardstyle       85
latin           37
phonk           13
electronic       3
Name: count, dtype: int64


# Extract track features multithreaded

In [4]:
import os
import pickle
import shutil
import librosa
import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

## THIS ONLY ON MACOS ## DISABLE ON WINDOWS OR LINUX ##
import multiprocessing as mp
mp.set_start_method("fork", force=True)

def summarize_feature(feature_array):
    means = np.mean(feature_array)
    stds = np.std(feature_array)
    medians = np.median(feature_array)
    return [means, stds, medians] 

def flatten(track_features):
    return [feature for features in track_features for feature in features]

def extract_features_librosa(file_path):
    y, sr = librosa.load(file_path, sr=None)
    features = [
        summarize_feature(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)),
        summarize_feature(librosa.feature.chroma_stft(y=y, sr=sr)),
        summarize_feature(librosa.feature.chroma_cqt(y=y, sr=sr)),
        summarize_feature(librosa.feature.chroma_cens(y=y, sr=sr)),
        summarize_feature(librosa.feature.chroma_vqt(y=y, sr=sr, intervals='equal')),
        summarize_feature(librosa.feature.melspectrogram(y=y, sr=sr)),
        summarize_feature(librosa.feature.spectral_centroid(y=y, sr=sr)),
        summarize_feature(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        summarize_feature(librosa.feature.spectral_contrast(y=y, sr=sr)),
        summarize_feature(librosa.feature.spectral_flatness(y=y)),
        summarize_feature(librosa.feature.spectral_rolloff(y=y, sr=sr)),
        summarize_feature(librosa.feature.poly_features(y=y, sr=sr)),
        summarize_feature(librosa.feature.tonnetz(y=y, sr=sr)),
        summarize_feature(librosa.feature.zero_crossing_rate(y)),
        summarize_feature(librosa.feature.tempogram(y=y, sr=sr)),
        summarize_feature(librosa.feature.fourier_tempogram(y=y, sr=sr)),
        summarize_feature(librosa.feature.tempogram_ratio(y=y, sr=sr)),
        summarize_feature(librosa.feature.rms(y=y)),
        np.ravel(librosa.feature.tempo(y=y, sr=sr))
    ]
    return flatten(features)

def process_file(file, folder_path, id_to_genre):
    try:
        file_path = os.path.join(folder_path, file)
        file_id = os.path.splitext(file)[0]
        features = extract_features_librosa(file_path)
        genre = id_to_genre.get(file_id, None)
        return features, genre, file
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None, None, file

while True:

    # Load the CSV file with track IDs and genres
    tracks_info_df = pd.read_csv("track_genre_balanced_url.csv")

    # Load pickle
    results_file = 'audio_features.pkl'
    if os.path.exists(results_file):
        with open(results_file, 'rb') as file:
            saved_data = pickle.load(file)
        X = saved_data.get('X', [])
        y_labels = saved_data.get('y_labels', [])
        processed_files = set(saved_data.get('processed_files', []))
    else:
        X = []
        y_labels = []
        processed_files = set()

    # Get the list of all files in the folder and filter out the processed ones
    folder_path = './audio_previews'
    all_files = [f for f in os.listdir(folder_path) if f.endswith('.mp3')]
    files_to_process = [f for f in all_files if f not in processed_files]

    # Create a mapping from track_id to genre
    id_to_genre = dict(zip(tracks_info_df['track_id'], tracks_info_df['genre']))

    # Define some constants
    batch_size = os.cpu_count() or 4  # Use all available CPU cores but one or 4 if none available
    backup_interval = 100
    counter = 0

    with tqdm(total=len(files_to_process), desc="Processing files", unit="file") as pbar:
        with ProcessPoolExecutor(max_workers=batch_size) as executor:
            futures = []
            for i in range(0, len(files_to_process), batch_size):
                batch = files_to_process[i:i+batch_size]
                futures = [executor.submit(process_file, file, folder_path, id_to_genre) for file in batch]

                for future in futures:
                    features, genre, file = future.result()
                    if features is not None:
                        X.append(features)
                        y_labels.append(genre)
                        processed_files.add(file)
                        counter += 1
                    
                    pbar.update(1)

                # Save after each batch
                with open(results_file, 'wb') as f:
                    pickle.dump({
                        'X': X,
                        'y_labels': y_labels,
                        'processed_files': list(processed_files)
                    }, f)

                if counter % backup_interval == 0:
                    backup_file = results_file.replace(".pkl", "_backup.pkl")
                    shutil.copy(results_file, backup_file)
                    print(f"Backup created at {backup_file}")

    print(f"Done processing {counter} new tracks.")

    time.sleep(900) # Sleep for 15 minutes before the next iteration

Processing files:  50%|█████     | 194/387 [09:15<08:05,  2.52s/file]

Backup created at audio_features_backup.pkl


Processing files: 100%|██████████| 387/387 [17:49<00:00,  2.76s/file]


Done processing 387 new tracks.


Processing files:   7%|▋         | 194/2765 [09:37<2:04:26,  2.90s/file]

Backup created at audio_features_backup.pkl


Processing files:  14%|█▍        | 395/2765 [18:59<1:40:10,  2.54s/file]

Backup created at audio_features_backup.pkl


Processing files:  21%|██▏       | 593/2765 [27:58<1:47:04,  2.96s/file]

Backup created at audio_features_backup.pkl


Processing files:  29%|██▉       | 795/2765 [36:36<1:17:36,  2.36s/file]

Backup created at audio_features_backup.pkl


Processing files:  36%|███▌      | 993/2765 [45:09<1:17:01,  2.61s/file]

Backup created at audio_features_backup.pkl


Processing files:  43%|████▎     | 1193/2765 [53:41<1:11:39,  2.74s/file]

Backup created at audio_features_backup.pkl


Processing files:  51%|█████     | 1398/2765 [1:02:14<41:15,  1.81s/file]  

Backup created at audio_features_backup.pkl


Processing files:  58%|█████▊    | 1594/2765 [1:10:50<54:32,  2.79s/file]  

Backup created at audio_features_backup.pkl


Processing files:  65%|██████▍   | 1795/2765 [1:19:24<39:09,  2.42s/file]

Backup created at audio_features_backup.pkl


  return pitch_tuning(
Processing files:  72%|███████▏  | 1993/2765 [1:27:52<36:48,  2.86s/file]

Backup created at audio_features_backup.pkl


Processing files:  79%|███████▉  | 2193/2765 [1:38:17<27:24,  2.87s/file]

Backup created at audio_features_backup.pkl


Processing files:  87%|████████▋ | 2393/2765 [1:48:18<20:05,  3.24s/file]

Backup created at audio_features_backup.pkl


Processing files:  94%|█████████▍| 2594/2765 [1:59:21<10:49,  3.80s/file]

Backup created at audio_features_backup.pkl


Processing files: 100%|██████████| 2765/2765 [2:08:29<00:00,  2.79s/file]


Done processing 2765 new tracks.


Processing files:   2%|▏         | 196/11243 [11:19<9:26:11,  3.08s/file] 

Backup created at audio_features_backup.pkl


Processing files:   4%|▎         | 396/11243 [21:36<7:57:56,  2.64s/file] 

Backup created at audio_features_backup.pkl


Processing files:   5%|▌         | 594/11243 [32:09<10:57:08,  3.70s/file]

Backup created at audio_features_backup.pkl


Processing files:   7%|▋         | 797/11243 [43:51<6:32:49,  2.26s/file] 

Backup created at audio_features_backup.pkl


Processing files:   9%|▉         | 996/11243 [54:50<6:45:36,  2.37s/file] 

Backup created at audio_features_backup.pkl


Processing files:  11%|█         | 1200/11243 [1:06:45<10:18:25,  3.69s/file]

Backup created at audio_features_backup.pkl


  return pitch_tuning(
Processing files:  12%|█▏        | 1400/11243 [1:19:05<8:20:16,  3.05s/file] 

Backup created at audio_features_backup.pkl


Processing files:  14%|█▍        | 1600/11243 [1:33:21<7:07:27,  2.66s/file] 

Backup created at audio_features_backup.pkl


Processing files:  16%|█▌        | 1793/11243 [1:49:18<11:30:53,  4.39s/file]

Backup created at audio_features_backup.pkl


Processing files:  18%|█▊        | 1994/11243 [2:04:37<9:14:27,  3.60s/file] 

Backup created at audio_features_backup.pkl


Processing files:  20%|█▉        | 2200/11243 [2:14:26<4:58:28,  1.98s/file] 

Backup created at audio_features_backup.pkl


Processing files:  21%|██▏       | 2399/11243 [2:23:33<4:08:35,  1.69s/file] 

Backup created at audio_features_backup.pkl


Processing files:  23%|██▎       | 2597/11243 [2:34:03<6:20:01,  2.64s/file] 

Backup created at audio_features_backup.pkl


Processing files:  25%|██▍       | 2796/11243 [2:43:18<5:32:20,  2.36s/file]

Backup created at audio_features_backup.pkl


Processing files:  27%|██▋       | 2997/11243 [2:51:52<4:50:26,  2.11s/file]

Backup created at audio_features_backup.pkl


Processing files:  28%|██▊       | 3195/11243 [3:00:26<5:34:11,  2.49s/file]

Backup created at audio_features_backup.pkl


Processing files:  30%|██▉       | 3318/11243 [3:05:40<4:01:22,  1.83s/file]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x105e13950>>
Traceback (most recent call last):
  File "/Users/agres/Projects/seb/predictify/.venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Process ForkProcess-44:
Process ForkProcess-39:
Process ForkProcess-43:
Process ForkProcess-38:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Library/Frameworks

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [1]:
# Merge two pkl files

import pickle
import os

def merge_pickles(file1, file2, output_file):
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        data1 = pickle.load(f1)
        data2 = pickle.load(f2)

    # Merge the data
    merged_data = {
        'X': data1['X'] + data2['X'],
        'y_labels': data1['y_labels'] + data2['y_labels'],
        'processed_files': list(set(data1['processed_files'] + data2['processed_files']))
    }

    # Save the merged data
    with open(output_file, 'wb') as f_out:
        pickle.dump(merged_data, f_out)

# Usage
file1 = 'audio_features.pkl'
file2 = 'audio_features_chris.pkl'
output_file = 'audio_features_merged.pkl'

merge_pickles(file1, file2, output_file)