# Read data from the database, fetch preview URLs, and save them to a file
## This should be run only once

In [None]:
import sqlite3
from spotify_preview import get_spotify_preview_url
import pandas as pd


# -- Query the database for (track, artist, genres) combinations --
# The WHERE clause keeps only artists whose `genres` value is neither NULL nor empty.
database_suffix = "PRODUCTION" # Can be: TEST or PRODUCTION
conn = sqlite3.connect(f"../../data/spotify_scrape_{database_suffix}.db")
try:
    cursor = conn.cursor()
    query = """
    SELECT DISTINCT rp.track_id, rp.artist_id, ai.genres
    FROM recently_played rp
    JOIN artist_information ai ON rp.artist_id = ai.artist_id
    WHERE ai.genres IS NOT NULL AND ai.genres <> ''
"""
    cursor.execute(query)

    results = cursor.fetchall()
finally:
    # Release the database handle even when the query fails.
    conn.close()

# -- Build a list of dictionaries for valid tracks --
# DISTINCT applies to the whole (track_id, artist_id, genres) tuple, so the same
# track_id can come back more than once (e.g. when it is joined to several
# artists). Only the first occurrence is used, which avoids repeated preview
# lookups and duplicate 'id' rows in data.csv.
rows = []
seen_track_ids = set()
for entry in results:
    track_id = entry[0]
    if track_id in seen_track_ids:
        continue
    seen_track_ids.add(track_id)

    # NOTE(review): entry[2] is the raw `genres` column value; whether it holds
    # a single genre or several is not visible here - confirm against the schema.
    genre = entry[2]
    preview_url = get_spotify_preview_url(track_id)
    if preview_url:
        rows.append({
            'id': track_id,
            'genre': genre,
            'preview': preview_url
        })
    else:
        print(f"Preview URL not found for track ID: {track_id}")

# Persist the (id, genre, preview) rows without the DataFrame index.
tracks_info_df = pd.DataFrame(rows)
tracks_info_df.to_csv('data.csv', index=False)


# Read data from the Kaggle dataset, fetch preview URLs, and save them to a file

## This should be run only once

In [None]:
import pandas as pd
from spotify_preview import get_spotify_preview_url

# Load the Kaggle CSV, keep one row per track_id and drop rows without a genre.
tracks_info_df_kaggle_dataset = (
    pd.read_csv('./kaggle_data/dataset.csv')
    .drop_duplicates(subset=['track_id'])
    .dropna(subset=['track_genre'])
)

# Collect one record per track for which a preview URL could be resolved.
rows = []

for track_id, genre in zip(
    tracks_info_df_kaggle_dataset['track_id'],
    tracks_info_df_kaggle_dataset['track_genre'],
):
    preview_url = get_spotify_preview_url(track_id)
    if not preview_url:
        print(f"Preview URL not found for track ID: {track_id}")
        continue
    rows.append({'id': track_id, 'genre': genre, 'preview': preview_url})

# Write the (id, genre, preview) table without the DataFrame index.
tracks_info_df_kaggle_dataset_preview = pd.DataFrame(rows)
tracks_info_df_kaggle_dataset_preview.to_csv('./dataset.csv', index=False)

# Code to merge the datasets into one file

In [None]:
# Deliberate guard: this always raises, so nothing below runs until the line is
# removed or commented out. Per its message, it is a stop point for inspecting
# the two CSV files before merging them.
raise Exception("Stop here to check the dataset")

df_1 = pd.read_csv('./dataset.csv')
df_2 = pd.read_csv('./data.csv')

# Stack both tables; for an 'id' present in both, drop_duplicates keeps the
# first occurrence, i.e. the row coming from ./dataset.csv.
combined_df = pd.concat([df_1, df_2], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset=['id'])
# Renumber the rows after the duplicates were removed (the result was previously
# assigned to a misspelled name and therefore discarded).
combined_df = combined_df.reset_index(drop=True)
combined_df.to_csv('./combined_dataset.csv', index=False)

# Read File

In [2]:
import pandas as pd
# Load the merged track table; the later cells read its 'id', 'preview' and
# 'genre' columns.
tracks_info_df = pd.read_csv('./combined_dataset.csv')

# Download all preview mp3 files

In [None]:
import os
import requests

# -- Prepare Download Directory --
download_folder = "audio_previews"
os.makedirs(download_folder, exist_ok=True)


# Fetch the preview of every track in tracks_info_df into
# <download_folder>/<track id>.mp3. Failures are reported and skipped so that a
# single bad URL does not abort the whole batch.
for idx, row in tracks_info_df.iterrows():

    track_id = row['id']
    preview_url = row['preview']

    # Form the output filename for the preview
    output_filename = os.path.join(download_folder, f"{track_id}.mp3")

    # Skip if the file already exists
    if os.path.exists(output_filename):
        print(f"Preview for track {track_id} already exists. Skipping...")
        continue

    # The payload is first written to a ".part" file and only renamed to the
    # final name once it is complete. An interrupted write therefore never
    # leaves a truncated .mp3 behind that the existence check above would skip
    # forever; a stale ".part" file is simply overwritten on the next attempt.
    partial_filename = f"{output_filename}.part"

    # Download the preview
    try:
        response = requests.get(preview_url, timeout=30)  # you can adjust timeout
        if response.status_code != 200:
            print(f"Failed to download track {track_id}: HTTP {response.status_code}")
            continue
        with open(partial_filename, "wb") as f:
            f.write(response.content)
        os.replace(partial_filename, output_filename)
        print(f"Downloaded preview for track {track_id}")
    except Exception as e:
        # Best-effort batch job: report the problem and move on to the next track.
        print(f"Error downloading track {track_id}: {e}")


# Extract Track Features

In [3]:
import os
import pickle
import shutil
import librosa
import numpy as np


def summarize_feature(feature_array):
    """
    Reduce a feature array to three summary statistics.

    No axis is passed to NumPy, so each statistic is computed over all
    elements of ``feature_array`` at once (not separately per row).

    Returns: list ``[mean, std, median]`` of NumPy scalars.
    """
    statistics = (np.mean, np.std, np.median)
    return [statistic(feature_array) for statistic in statistics]

def extract_features_librosa(file_path):
    """
    Build a flat list of summary features for one audio file.

    The file is loaded with ``librosa.load(..., sr=None)``, a fixed set of
    librosa features is computed, each feature is reduced with
    ``summarize_feature`` and the tempo estimate is appended at the end.

    file_path: path handed to ``librosa.load``.
    Returns: flat list holding, in this order, the summaries of mfcc,
        chroma_stft, chroma_cqt, chroma_cens, chroma_vqt, mel spectrogram,
        spectral centroid, bandwidth, contrast, flatness, rolloff,
        poly_features, tonnetz, zero-crossing rate, tempogram,
        Fourier tempogram, tempogram ratio and rms, followed by the
        raveled tempo value(s).
    """
    # sr=None is passed so librosa does not resample to its default rate.
    signal, sample_rate = librosa.load(file_path, sr=None)

    # Chroma variants
    chroma_stft = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
    chroma_cqt = librosa.feature.chroma_cqt(y=signal, sr=sample_rate)
    chroma_cens = librosa.feature.chroma_cens(y=signal, sr=sample_rate)
    chroma_vqt = librosa.feature.chroma_vqt(y=signal, sr=sample_rate, intervals='equal')

    # Spectral and energy features
    mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate)
    # NOTE(review): 12 coefficients are requested; the original author left an
    # open question on whether a different count makes a difference.
    mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=12)
    rms = librosa.feature.rms(y=signal)
    spectral_centroid = librosa.feature.spectral_centroid(y=signal, sr=sample_rate)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate)
    spectral_contrast = librosa.feature.spectral_contrast(y=signal, sr=sample_rate)
    spectral_flatness = librosa.feature.spectral_flatness(y=signal)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sample_rate)
    poly_features = librosa.feature.poly_features(y=signal, sr=sample_rate)
    tonnetz = librosa.feature.tonnetz(y=signal, sr=sample_rate)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(signal)

    # Rhythm features
    tempo = librosa.feature.tempo(y=signal, sr=sample_rate)
    tempogram = librosa.feature.tempogram(y=signal, sr=sample_rate)
    # NOTE(review): librosa documents the Fourier tempogram as complex-valued,
    # so its summaries are presumably complex numbers - confirm this is intended.
    fourier_tempogram = librosa.feature.fourier_tempogram(y=signal, sr=sample_rate)
    tempogram_ratio = librosa.feature.tempogram_ratio(y=signal, sr=sample_rate)

    # The position of each summary in the output is fixed by this ordering.
    summarized_in_order = (
        mfcc,
        chroma_stft,
        chroma_cqt,
        chroma_cens,
        chroma_vqt,
        mel_spectrogram,
        spectral_centroid,
        spectral_bandwidth,
        spectral_contrast,
        spectral_flatness,
        spectral_rolloff,
        poly_features,
        tonnetz,
        zero_crossing_rate,
        tempogram,
        fourier_tempogram,
        tempogram_ratio,
        rms,
    )

    flat_features = []
    for feature_matrix in summarized_in_order:
        flat_features.extend(summarize_feature(feature_matrix))

    # Tempo is not summarized; its raveled values go last.
    flat_features.extend(np.ravel(tempo))

    return flat_features


folder_path = './audio_previews'
results_file = 'audio_features.pkl'

# Resume support: if a checkpoint exists, continue from the features, labels
# and processed file names stored in it.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# checkpoints that were produced by this notebook.
if os.path.exists(results_file):
    with open(results_file, 'rb') as checkpoint:
        saved_data = pickle.load(checkpoint)
    X = saved_data.get('X', [])
    y_labels = saved_data.get('y_labels', [])
    processed_files = set(saved_data.get('processed_files', []))
    print(f"Loaded {len(processed_files)} processed tracks")
else:
    X = []
    y_labels = []
    processed_files = set()

all_files = [file for file in os.listdir(folder_path) if file.endswith('.mp3')]
total_files_count = len(all_files)
counter = 0

# Loop-invariant paths: the backup copy and the scratch file used for
# atomic checkpoint writes.
base, ext = os.path.splitext(results_file)
backup_file = f"{base}_backup{ext}"
temp_results_file = f"{results_file}.tmp"

# counter is the 1-based position in all_files and also advances for files that
# are skipped because an earlier run already processed them.
for counter, file in enumerate(all_files, start=1):

    if file in processed_files:
        continue

    print(f'Processing file {counter}/{total_files_count}...')

    file_path = os.path.join(folder_path, file)
    # The file name without its extension is matched against the 'id' column.
    file_id = os.path.splitext(file)[0]

    features = extract_features_librosa(file_path)

    try:
        genre = tracks_info_df.loc[tracks_info_df['id'] == file_id, 'genre'].iloc[0]
    except IndexError:
        print("This should actually not happen :(, couldnt find genre in the pandas dataframe based on ID")
        genre = None

    # Append feature vector and label together so X and y_labels stay aligned.
    X.append(features)
    y_labels.append(genre)
    processed_files.add(file)

    # Write the checkpoint to a scratch file and swap it into place afterwards,
    # so interrupting the dump cannot leave a truncated results file behind.
    with open(temp_results_file, 'wb') as checkpoint:
        pickle.dump({
            'X': X,
            'y_labels': y_labels,
            'processed_files': list(processed_files)
        }, checkpoint)
    os.replace(temp_results_file, results_file)

    # A backup copy is made whenever a newly processed file sits at a position
    # that is a multiple of 10 (skipped files count towards the position).
    if counter % 10 == 0:
        shutil.copy(results_file, backup_file)


Loaded 14469 processed tracks
Processing file 1/66518...
Processing file 2/66518...
Processing file 3/66518...
Processing file 4/66518...
Processing file 5/66518...
Processing file 6/66518...
Processing file 7/66518...
Processing file 8/66518...
Processing file 11/66518...
Processing file 12/66518...
Processing file 13/66518...
Processing file 16/66518...
Processing file 17/66518...
Processing file 19/66518...
Processing file 22/66518...
Processing file 23/66518...
Processing file 24/66518...
Processing file 25/66518...
Processing file 26/66518...
Processing file 27/66518...
Processing file 30/66518...
Processing file 31/66518...
Processing file 33/66518...
Processing file 34/66518...
Processing file 35/66518...
Processing file 36/66518...
Processing file 37/66518...
Processing file 38/66518...
Processing file 42/66518...
Processing file 44/66518...
Processing file 46/66518...
Processing file 47/66518...
Processing file 50/66518...
Processing file 51/66518...
Processing file 52/66518..

KeyboardInterrupt: 