## Install/Import Dependencies

In [83]:
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import os
import numpy as np
import pandas as pd
import time
import multiprocessing as mp
import psutil
import tracemalloc
import threading
import json
import h5py
import tensorflow as tf
from essentia.standard import (
    MonoLoader,
    TensorflowPredictEffnetDiscogs,
    TensorflowPredictVGGish,
    TensorflowPredict2D,
)

#### Global Constants

In [84]:
# Pull settings from a local .env file into the process environment.
load_dotenv()
# Folder whose files are treated as downloaded songs; None when the variable is unset.
DOWNLOAD_FOLDER = os.getenv("DOWNLOAD_FOLDER")
# int(None) raises TypeError, so an unset or empty CPU_THREADS falls back to the
# machine's CPU count (or 1 when that cannot be determined).
CPU_THREADS = int(os.getenv("CPU_THREADS") or os.cpu_count() or 1)
# Directory holding the model graph (.pb) files.
MODELS_PATH = "./models"

#### Data

In [85]:
# Class labels of the mood/theme predictor, taken from its metadata file
with open("data/mtg_jamendo_moodtheme-discogs-effnet-1.json", "r") as jamendo_file:
    jamendo_metadata = json.loads(jamendo_file.read())
jamendo_classes = jamendo_metadata["classes"]

# Class labels of the instrument predictor, taken from its metadata file
with open("data/mtg_jamendo_instrument-discogs-effnet-1.json", "r") as jamendo_file:
    jamendo_instrument_metadata = json.loads(jamendo_file.read())
jamendo_instrument_classes = jamendo_instrument_metadata["classes"]

In [86]:
# Song table; the first CSV column becomes the row index.
songs_data = pd.read_csv(filepath_or_buffer="data/songs_data.csv", index_col=0)

#### Util Functions

In [87]:
def get_total_memory_usage(process):
    """Return the resident memory, in MiB, of a process and all its descendants.

    Args:
        process: object exposing ``pid``, ``memory_info().rss`` and
            ``children(recursive=True)`` (e.g. a ``psutil.Process``).

    Returns:
        dict mapping ``"Process <pid>"`` for ``process`` itself and
        ``"Child Process <pid>"`` for each descendant to its RSS in MiB.
    """
    bytes_per_mib = 1024 * 1024
    usage_by_process = {
        f"Process {process.pid}": process.memory_info().rss / bytes_per_mib
    }
    usage_by_process.update(
        (f"Child Process {child.pid}", child.memory_info().rss / bytes_per_mib)
        for child in process.children(recursive=True)
    )
    return usage_by_process


def print_memory_usage(process):
    """Print the per-process memory summary and the top tracemalloc entry.

    The second line printed is the first entry of a fresh tracemalloc snapshot
    grouped by source line.
    """
    usage_summary = get_total_memory_usage(process)
    print(usage_summary)
    top_stat = tracemalloc.take_snapshot().statistics("lineno")[0]
    print(f"Top Consumer of Process {process.pid}: {top_stat}")


def monitor_memory_usage(process, kill_thread, interval=120):
    """Report memory usage every ``interval`` seconds until asked to stop.

    Args:
        process: process handle forwarded to ``print_memory_usage``.
        kill_thread: object whose ``value`` attribute becomes truthy to request
            a stop; it is checked once per cycle, before each report.
        interval: seconds to sleep between cycles.

    Any exception raised during a cycle is printed and ends the loop.
    """
    running = True
    while running:
        try:
            if kill_thread.value:
                print("MONITOR THREAD KILLED")
                running = False
            else:
                print_memory_usage(process)
        except Exception as e:
            print(f"Thread ERROR: {e}")
            running = False
        if running:
            time.sleep(interval)

#### Extract Features Functions

In [88]:
def _run_predictor(model_filename, embeddings, output=None):
    """Run one TensorflowPredict2D model stored under MODELS_PATH on ``embeddings``.

    ``output`` names the graph node to read; when None the argument is omitted
    so the algorithm's own default output node is used.
    """
    predictor_kwargs = {"graphFilename": MODELS_PATH + "/" + model_filename}
    if output is not None:
        predictor_kwargs["output"] = output
    return TensorflowPredict2D(**predictor_kwargs)(embeddings)


def _median_prediction(model_filename, embeddings, output="model/Softmax"):
    """Return the median along axis 0 of one model's predictions."""
    return np.median(_run_predictor(model_filename, embeddings, output), axis=0)


def run_essentia_models(audio16k, audio44k):
    """Run the Essentia TensorFlow models on one song and collect the results.

    Args:
        audio16k: audio signal fed to both embedding extractors (the caller
            loads it at 16 kHz).
        audio44k: currently unused; kept so the signature stays stable.

    Returns:
        dict with the VGGish embeddings under "Embeddings", one aggregated
        value per regression/classification model, and two dicts
        ("Jamendo Labels", "Jamendo Instruments") mapping class name to the
        aggregated activation.
    """
    # Get embeddings
    discogs_embeddings = TensorflowPredictEffnetDiscogs(
        graphFilename=MODELS_PATH + "/discogs-effnet-bs64-1.pb",
        output="PartitionedCall:1",
    )(audio16k)
    vggish_embeddings = TensorflowPredictVGGish(
        graphFilename=MODELS_PATH + "/audioset-vggish-3.pb",
        output="model/vggish/embeddings",
    )(audio16k)

    # Approachability / Engagement: median over all (squeezed) predictions
    approachability = np.median(
        np.squeeze(
            _run_predictor(
                "approachability_regression-discogs-effnet-1.pb",
                discogs_embeddings,
                "model/Identity",
            )
        )
    )
    engagement = np.median(
        np.squeeze(
            _run_predictor(
                "engagement_regression-discogs-effnet-1.pb",
                discogs_embeddings,
                "model/Identity",
            )
        )
    )

    # Arousal/Valence
    arousal_valence_predictions = _median_prediction(
        "deam-audioset-vggish-2.pb", vggish_embeddings, "model/Identity"
    )
    valence = arousal_valence_predictions[0]
    arousal = arousal_valence_predictions[1]

    # Mood models: column 0 is taken as the named class.
    # NOTE(review): this assumes index 0 is the positive class for every mood
    # model; confirm against the "classes" order in each model's metadata JSON
    # (party, relaxed and sad in particular).
    aggressive = _median_prediction(
        "mood_aggressive-audioset-vggish-1.pb", vggish_embeddings
    )[0]
    happy = _median_prediction("mood_happy-audioset-vggish-1.pb", vggish_embeddings)[0]
    party = _median_prediction("mood_party-audioset-vggish-1.pb", vggish_embeddings)[0]
    relaxed = _median_prediction(
        "mood_relaxed-audioset-vggish-1.pb", vggish_embeddings
    )[0]
    sad = _median_prediction("mood_sad-audioset-vggish-1.pb", vggish_embeddings)[0]

    # Jamendo labels (no explicit output node: the algorithm default is used)
    jamendo_values = _median_prediction(
        "mtg_jamendo_moodtheme-discogs-effnet-1.pb", discogs_embeddings, output=None
    )
    jamendo_dict = dict(zip(jamendo_classes, jamendo_values))

    # Jamendo instrument labels
    jamendo_instrument_values = _median_prediction(
        "mtg_jamendo_instrument-discogs-effnet-1.pb", discogs_embeddings, output=None
    )
    jamendo_instrument_dict = dict(
        zip(jamendo_instrument_classes, jamendo_instrument_values)
    )

    # Acoustic
    acoustic = _median_prediction(
        "mood_acoustic-audioset-vggish-1.pb", vggish_embeddings
    )[0]

    # Electronic
    electronic = _median_prediction(
        "mood_electronic-audioset-vggish-1.pb", vggish_embeddings
    )[0]

    # Voice/Instrumental
    # The published class order for this model is (instrumental, voice). The
    # previous indexing had the two transposed, so results produced by earlier
    # runs carry swapped Voice/Instrumental values.
    voice_instrumental_predictions = _median_prediction(
        "voice_instrumental-audioset-vggish-1.pb", vggish_embeddings
    )
    instrumental = voice_instrumental_predictions[0]
    voice = voice_instrumental_predictions[1]

    # Gender (Female/Male)
    gender_predictions = _median_prediction(
        "gender-audioset-vggish-1.pb", vggish_embeddings
    )
    female = gender_predictions[0]
    male = gender_predictions[1]

    # Timbre (Bright/Dark)
    timbre_predictions = _median_prediction(
        "timbre-discogs-effnet-1.pb", discogs_embeddings
    )
    bright = timbre_predictions[0]
    dark = timbre_predictions[1]

    # Reverb (Dry/Wet)
    reverb_predictions = _median_prediction(
        "nsynth_reverb-discogs-effnet-1.pb", discogs_embeddings
    )
    dry = reverb_predictions[0]
    wet = reverb_predictions[1]

    # Return model results
    return {
        "Embeddings": vggish_embeddings,
        "Approachability": approachability,
        "Engagement": engagement,
        "Valence": valence,
        "Arousal": arousal,
        "Aggressive": aggressive,
        "Happy": happy,
        "Party": party,
        "Relaxed": relaxed,
        "Sad": sad,
        "Jamendo Labels": jamendo_dict,
        "Jamendo Instruments": jamendo_instrument_dict,
        "Acoustic": acoustic,
        "Electronic": electronic,
        "Voice": voice,
        "Instrumental": instrumental,
        "Male": male,
        "Female": female,
        "Bright": bright,
        "Dark": dark,
        "Dry": dry,
        "Wet": wet,
    }

In [89]:
def extract_audio_features(audio_file):
    """Load ``audio_file`` and return the Essentia model features for it.

    Args:
        audio_file: path of the audio file to analyse.

    Returns:
        The feature dict produced by ``run_essentia_models``.
    """
    # Load the audio file
    audio16k = MonoLoader(filename=audio_file, sampleRate=16000)()
    # No sampleRate given, so MonoLoader's default rate applies.
    audio44k = MonoLoader(filename=audio_file)()

    # Run algorithms. Keyword arguments pin each signal to its parameter: the
    # previous positional call passed them in the opposite order, so the
    # embedding models received the signal that was not resampled to 16 kHz.
    algorithm_features = run_essentia_models(audio16k=audio16k, audio44k=audio44k)

    # Merge results
    return algorithm_features

#### Main Code

In [90]:
# Class constructed from song path
# Song path must follow this format: /some/path/(int)^(video id)^(title).mp3
#                               e.g  /some/path/0^LlWGt_84jpg^Special Breed.mp3
class SongPath:
    """Parsed view of a song file path of the form ``(int)^(video id)^(title).ext``.

    Attributes set by the constructor:
        path: the path exactly as given.
        filename: base name of ``path``.
        index: integer parsed from the first ``^``-separated field.
        video_id: second field.
        title_with_extension: everything after the second ``^``.
        title: ``title_with_extension`` without its file extension.

    Raises:
        ValueError: if the file name has fewer than three ``^``-separated
            fields or the first field is not an integer.
    """

    def __init__(self, song_path: str):
        self.path = song_path
        self.filename = os.path.basename(song_path)

        # maxsplit=2: only the first two carets are separators, so a title that
        # itself contains "^" is kept whole instead of being rejected.
        song_filename_split = self.filename.split("^", 2)
        if len(song_filename_split) != 3:
            raise ValueError(
                "The song's filename doesn't follow the correct format: /some/path/(int)^(video id)^(title).mp3"
            )

        index_text, self.video_id, self.title_with_extension = song_filename_split

        try:
            self.index = int(index_text)
        except ValueError as error:
            # Same exception type as a bare int() failure, but with the
            # expected format spelled out.
            raise ValueError(
                "The song's filename doesn't follow the correct format: /some/path/(int)^(video id)^(title).mp3"
            ) from error
        self.title = os.path.splitext(self.title_with_extension)[0]

    def __str__(self):
        return f"Idx: {self.index},  videoID: {self.video_id}, title: {self.title_with_extension}"

In [91]:
def process_song(song_path):
    """Parse ``song_path`` and extract its audio features.

    Returns:
        Tuple ``(index, features)``: the integer index parsed from the file
        name and the feature dict returned by ``extract_audio_features``.
    """
    parsed_song = SongPath(song_path)
    return parsed_song.index, extract_audio_features(parsed_song.path)

In [92]:
def process_songs():
    """Extract features for a slice of the downloaded songs and collect them.

    Lists DOWNLOAD_FOLDER, processes the selected slice of files one by one
    while a background thread reports memory usage, writes each song's
    embeddings to ``data/song_embeddings.h5`` and stores every other feature
    in a copy of ``songs_data``.

    Returns:
        A deep copy of ``songs_data`` with the extracted features written to
        the rows of the processed songs.
    """
    tracemalloc.start()

    song_paths = np.array(
        [
            os.path.join(DOWNLOAD_FOLDER, song_filename)
            for song_filename in os.listdir(DOWNLOAD_FOLDER)
        ]
    )

    # songs_data_lower, songs_data_higher = [len(song_paths)//6*0, len(song_paths)//6*1]
    songs_data_lower, songs_data_higher = [0, 4]
    song_paths = song_paths[songs_data_lower:songs_data_higher]

    hdf5_file_path = "data/song_embeddings.h5"

    with h5py.File(hdf5_file_path, "w") as hdf5_file:
        with mp.Manager() as manager:
            kill_thread = manager.Value("b", False)

            main_process = psutil.Process(os.getpid())
            memory_thread = threading.Thread(
                target=monitor_memory_usage, args=(main_process, kill_thread)
            )
            memory_thread.start()

            song_results = []
            try:
                for song_path in tqdm(song_paths, desc="Processing Songs"):
                    song_index, song_features = process_song(song_path)
                    # Keep the path with the result: song_index is the number
                    # parsed from the file name, not a position in song_paths.
                    song_results.append((song_path, song_index, song_features))
            finally:
                # Ask the monitor thread to stop even when a song fails.
                kill_thread.value = True

        # Aggregate results in the pandas dataframe
        songs_data_full = songs_data.copy(deep=True)
        for song_path, song_index, song_features in song_results:
            for feature, value in song_features.items():
                if feature == "Embeddings":
                    # Name the dataset after the file that was actually
                    # processed. Looking it up as song_paths[song_index] used
                    # the file-name index as a list position, which picks the
                    # wrong file or raises IndexError.
                    song_name = os.path.splitext(os.path.basename(song_path))[0]
                    hdf5_file.create_dataset(song_name, data=value, compression="gzip")
                    continue

                # Container values need an object-dtype column created up front.
                if feature not in songs_data_full.columns and isinstance(
                    value, (tuple, set, list, np.ndarray, dict)
                ):
                    songs_data_full[feature] = np.nan
                    songs_data_full[feature] = songs_data_full[feature].astype(object)
                songs_data_full.at[song_index, feature] = value

    return songs_data_full

In [93]:
embeddings_filepath = "/mnt/f/Alex Stuff/Songs/Embeddings/song_embeddings_16_20.h5"
index = 0  # The index of the song you want to retrieve
# Reset on every run: without this, an out-of-range index leaves `embedding`
# undefined (NameError below) or, in a live kernel, still holding the value of
# a previous run.
embedding = None

with h5py.File(embeddings_filepath, "r") as hdf5_file:
    # List all datasets in the HDF5 file (these are the song names)
    song_names = list(hdf5_file.keys())
    print(f"There are {len(song_names)} embeddings")

    # Get the song name by index (assuming index corresponds to the order of the datasets)
    if index < len(song_names):
        song_name = song_names[index]

        # Retrieve the embeddings for the selected song
        embedding = hdf5_file[song_name][:]
        print("Song: ", song_name)
    else:
        print(
            f"Index {index} is out of range. The file contains {len(song_names)} embeddings."
        )


if embedding is not None:
    print(f"Embedding for song at index {index}:\n", embedding)

There are 8191 embeddings
Song:  --m5OWq9pco
Embedding for song at index 0:
 [[0.         0.340993   0.30700594 ... 0.         0.34747636 0.3625402 ]
 [0.         0.05454385 0.         ... 0.         0.2682284  0.39225656]
 [0.         0.         0.         ... 0.         0.         0.20174174]
 ...
 [0.         0.5392596  0.         ... 0.         0.91209984 0.69586444]
 [0.         0.10439862 0.07131344 ... 0.         0.27246368 0.16343527]
 [0.         0.         0.         ... 0.         0.19906269 0.17168629]]


In [94]:
# Load one result chunk and display only the rows that have a Valence value
# (the filtered frame is shown, not stored).
songs_data_full = pd.read_csv(
    filepath_or_buffer="data/songs_data_models_16_20.csv",
    index_col=0,
)
songs_data_full[songs_data_full["Valence"].notna()]

  songs_data_full = pd.read_csv('data/songs_data_models_16_20.csv', index_col=0)


Unnamed: 0,title,artist,views,videoID,duration,Approachability,Engagement,Valence,Arousal,Aggressive,...,Acoustic,Electronic,Voice,Instrumental,Female,Male,Bright,Dark,Dry,Wet
3,Justness,Generallykoi,1,J14sCvTWh3Q,86,0.259539,0.419092,3.964335,3.808446,0.033272,...,0.214573,0.605151,0.828168,0.171831,0.757837,0.242163,0.497528,0.502472,0.826551,0.173449
31,No Tags,Camball,73,Xz2eUFVAoWI,136,0.474574,0.839281,4.790616,4.789355,0.134486,...,0.056560,0.671124,0.044506,0.955494,0.344944,0.655056,0.467128,0.532872,0.972234,0.027766
32,Carunculate,Pharaon,4,a9hdEU5nCbk,306,0.042061,0.975427,6.506446,6.499392,0.422891,...,0.006702,0.993338,0.413812,0.586188,0.835302,0.164698,0.435189,0.564811,0.995104,0.004896
33,Clivia,Anneke Douma,56,ybfyy3aCvSQ,227,0.846595,0.680412,4.847172,4.741749,0.004813,...,0.828676,0.077119,0.020385,0.979615,0.359653,0.640347,0.493081,0.506919,0.780079,0.219921
34,Happy Re-Exported,,48,GOD0Sx_6e04,187,0.717724,0.663500,5.292960,5.293967,0.025865,...,0.107621,0.397837,0.006957,0.993043,0.521881,0.478119,0.498138,0.501862,0.972313,0.027687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46695,Hyla,Backwhen,21459,3pv6r8mtgLo,297,0.435786,0.560643,4.597107,4.827908,0.095924,...,0.039670,0.979222,0.882672,0.117328,0.808517,0.191483,0.498139,0.501861,0.991154,0.008846
46696,C.O.O.T.S (feat. Kosher),Phillip Benjamin,17861,lrPM8jDdHQM,217,0.647317,0.840738,5.076136,5.266734,0.156650,...,0.044810,0.372832,0.002122,0.997877,0.161152,0.838848,0.467357,0.532643,0.967917,0.032083
46697,Intervein (feat. Drego),Babys World,20096,nDjvCwNPH5s,144,0.492274,0.838436,4.826987,5.096818,0.224504,...,0.035214,0.327201,0.003674,0.996326,0.106562,0.893438,0.500878,0.499122,0.983175,0.016825
46698,Simoon,Shinichiro Yokota,12648,Wt4-enulOnE,394,0.775618,0.621184,5.240323,5.310703,0.013411,...,0.122406,0.897513,0.839397,0.160603,0.656249,0.343751,0.466337,0.533663,0.999892,0.000108


In [95]:
# Per-run result files, in run order.
_chunk_csv_paths = [
    "data/songs_data_models_0_1.csv",
    "data/songs_data_models_1_4.csv",
    "data/songs_data_models_4_8.csv",
    "data/songs_data_models_8_12.csv",
    "data/songs_data_models_12_16.csv",
    "data/songs_data_models_16_20.csv",
    "data/songs_data_models_20_24.csv",
    "data/songs_data_models_24_28.csv",
    "data/songs_data_models_28_32.csv",
    "data/songs_data_models_32_36.csv",
    "data/songs_data_models_36_40.csv",
    "data/songs_data_models_40_44.csv",
    "data/songs_data_models_44_48.csv",
]

# Read each chunk and keep only the rows that have a Valence value.
_chunk_frames = [
    pd.read_csv(chunk_csv_path, index_col=0).dropna(subset=["Valence"])
    for chunk_csv_path in _chunk_csv_paths
]

(
    songs_data_full1,
    songs_data_full2,
    songs_data_full3,
    songs_data_full4,
    songs_data_full5,
    songs_data_full6,
    songs_data_full7,
    songs_data_full8,
    songs_data_full9,
    songs_data_full10,
    songs_data_full11,
    songs_data_full12,
    songs_data_full13,
) = _chunk_frames

  songs_data_full1 = pd.read_csv('data/songs_data_models_0_1.csv', index_col=0)
  songs_data_full2 = pd.read_csv('data/songs_data_models_1_4.csv', index_col=0)
  songs_data_full3 = pd.read_csv('data/songs_data_models_4_8.csv', index_col=0)
  songs_data_full4 = pd.read_csv('data/songs_data_models_8_12.csv', index_col=0)
  songs_data_full5 = pd.read_csv('data/songs_data_models_12_16.csv', index_col=0)
  songs_data_full6 = pd.read_csv('data/songs_data_models_16_20.csv', index_col=0)
  songs_data_full7 = pd.read_csv('data/songs_data_models_20_24.csv', index_col=0)
  songs_data_full8 = pd.read_csv('data/songs_data_models_24_28.csv', index_col=0)
  songs_data_full9 = pd.read_csv('data/songs_data_models_28_32.csv', index_col=0)
  songs_data_full10 = pd.read_csv('data/songs_data_models_32_36.csv', index_col=0)
  songs_data_full11 = pd.read_csv('data/songs_data_models_36_40.csv', index_col=0)
  songs_data_full12 = pd.read_csv('data/songs_data_models_40_44.csv', index_col=0)
  songs_data_full13 

In [96]:
# Stack the per-run frames in run order; ignore_index relabels the rows 0..n-1.
songs_data_full_dfs = [
    songs_data_full1, songs_data_full2, songs_data_full3, songs_data_full4,
    songs_data_full5, songs_data_full6, songs_data_full7, songs_data_full8,
    songs_data_full9, songs_data_full10, songs_data_full11, songs_data_full12,
    songs_data_full13,
]
songs_data_full = pd.concat(objs=songs_data_full_dfs, ignore_index=True)

In [97]:
# Keep the first row for every (title, artist) pair.
songs_data_full = songs_data_full.loc[
    ~songs_data_full.duplicated(subset=["title", "artist"])
]
songs_data_full

Unnamed: 0,title,artist,views,videoID,duration,Approachability,Engagement,Valence,Arousal,Aggressive,...,Acoustic,Electronic,Voice,Instrumental,Male,Female,Bright,Dark,Dry,Wet
0,Special Breed,PolyCulture,34.0,LlWGt_84jpg,331.0,0.202973,0.574019,4.781359,4.823567,0.551342,...,0.040053,0.920894,0.398074,0.601926,0.537490,0.462510,0.447406,0.552594,0.998639,0.001361
1,jucunda,"Bebusk, Erling Tristan Ernestus",5.0,U51SnhAhAtg,219.0,0.317391,0.260441,4.103666,4.030465,0.114513,...,0.194453,0.626164,0.762992,0.237008,0.478789,0.521211,0.458629,0.541371,0.999184,0.000816
2,Affectively Schizo!/Brock Broccoli,Evan,6.0,IU3QIC9hk2g,135.0,0.235196,0.444718,4.662875,4.824446,0.646831,...,0.029849,0.859955,0.461610,0.538390,0.642676,0.357324,0.444068,0.555932,0.999659,0.000341
3,Impossible Caution,,0.0,8LozgS78Fho,129.0,0.453635,0.420180,5.060728,4.996343,0.335226,...,0.031721,0.918375,0.568636,0.431364,0.529053,0.470947,0.434089,0.565911,0.999636,0.000364
4,I Will Be There,Unshaken Music Ministry,17.0,OCFFd_VAGvA,156.0,0.128143,0.628931,3.937197,3.621713,0.395907,...,0.152694,0.688217,0.328019,0.671981,0.689942,0.310058,0.446472,0.553528,0.931398,0.068602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114683,Bust It Open,Lil Wil,11728736.0,rI0pCNGVe-w,257.0,0.755990,0.894633,5.481124,5.671177,0.392241,...,0.012435,0.425957,0.001941,0.998059,0.876373,0.123627,0.477655,0.522345,0.857615,0.142385
114684,Ek Bar Shri Bhole Bhandari,Niranjan Pandya,11727167.0,Ad-tRcGQ3Ko,545.0,0.775667,0.802195,6.131566,6.327144,0.006165,...,0.288515,0.644367,0.017816,0.982184,0.752065,0.247935,0.556903,0.443097,0.755816,0.244183
114685,For Those About To Rave,"Scooter, Timmy Trumpet",11724591.0,11Uo4kLkxII,163.0,0.163583,0.961063,6.137606,5.747474,0.782708,...,0.010487,0.931411,0.083510,0.916490,0.504522,0.495478,0.457339,0.542661,0.999606,0.000394
114686,What's Going On (Official Video 2019),Marvin Gaye,11723995.0,o5TmORitlKk,240.0,0.809481,0.755785,5.132675,5.161399,0.039281,...,0.388992,0.217666,0.037883,0.962117,0.570458,0.429542,0.495524,0.504476,0.480476,0.519524


In [98]:
# Persist the combined, de-duplicated model features (row index included).
songs_data_full.to_csv(path_or_buf="data/songs_data_models.csv")

#### Merge algorithm features with model features

In [99]:
# The two feature tables to be merged; the first CSV column is the row index.
songs_data_algo = pd.read_csv(
    filepath_or_buffer="data/songs_data_algo.csv", index_col=0
)
songs_data_models = pd.read_csv(
    filepath_or_buffer="data/songs_data_models.csv", index_col=0
)

In [100]:
# Join the two feature tables on (title, artist); columns present in both
# tables come back with "_x" (left) and "_y" (right) suffixes.
songs_data_full_merged = pd.merge(
    songs_data_algo, songs_data_models, on=["title", "artist"], how="inner"
)
songs_data_full_clean = songs_data_full_merged.dropna(subset=["title"])
# Remove columns with '_y' suffix
songs_data_full_clean = songs_data_full_clean.loc[
    :, ~songs_data_full_clean.columns.str.endswith("_y")
]
# Rename columns with '_x' suffix to remove the suffix.
# Anchored to the end of the name: a plain replace("_x", "") would also strip
# "_x" from the middle of a column name.
songs_data_full_clean.columns = songs_data_full_clean.columns.str.replace(
    r"_x$", "", regex=True
)
songs_data_full_clean

Unnamed: 0,title,artist,views,videoID,duration,Danceability,Loudness,BPM,Key,Key Scale,...,Acoustic,Electronic,Voice,Instrumental,Male,Female,Bright,Dark,Dry,Wet
0,Special Breed,PolyCulture,34,LlWGt_84jpg,331,1.365855,5662.711426,128.004791,F,minor,...,0.040053,0.920894,0.398074,0.601926,0.537490,0.462510,0.447406,0.552594,0.998639,0.001361
1,Unnoticeable,Lost Ambitions,2,TD3za_a4uWo,61,1.054391,606.596680,144.313995,G,major,...,0.947159,0.027575,0.503096,0.496905,0.774429,0.225571,0.518094,0.481906,0.617442,0.382558
2,Cephalon,Goatmilk,5,nop-58k9B9g,252,1.021395,6454.855957,118.336517,Ab,minor,...,0.077337,0.654210,0.712755,0.287245,0.502008,0.497992,0.423688,0.576312,0.999856,0.000144
3,Wursts,Les Jirzik,2,W1zYQOa0-hk,133,0.852451,236.874496,81.195984,C,minor,...,0.224416,0.765489,0.682662,0.317337,0.491312,0.508688,0.478077,0.521923,0.940689,0.059311
4,Enemies Noncommitment 2 Battle,Cobra,1,Oha70K1krcc,293,1.422449,1893.882202,117.295532,B,minor,...,0.131151,0.556826,0.392752,0.607248,0.571989,0.428011,0.456348,0.543652,0.978122,0.021878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98298,Bust It Open,Lil Wil,11728736,rI0pCNGVe-w,257,1.287168,4837.315918,150.225586,E,minor,...,0.012435,0.425957,0.001941,0.998059,0.876373,0.123627,0.477655,0.522345,0.857615,0.142385
98299,Ek Bar Shri Bhole Bhandari,Niranjan Pandya,11727167,Ad-tRcGQ3Ko,545,1.184014,3355.202148,154.861115,Ab,minor,...,0.288515,0.644367,0.017816,0.982184,0.752065,0.247935,0.556903,0.443097,0.755816,0.244183
98300,For Those About To Rave,"Scooter, Timmy Trumpet",11724591,11Uo4kLkxII,163,1.057154,4819.777832,142.054779,A,minor,...,0.010487,0.931411,0.083510,0.916490,0.504522,0.495478,0.457339,0.542661,0.999606,0.000394
98301,What's Going On (Official Video 2019),Marvin Gaye,11723995,o5TmORitlKk,240,1.091572,1205.145142,101.417412,E,major,...,0.388992,0.217666,0.037883,0.962117,0.570458,0.429542,0.495524,0.504476,0.480476,0.519524


In [102]:
# Write the merged table, keeping the row index as the first column.
songs_data_full_clean.to_csv(path_or_buf="data/songs_data_full.csv", index=True)

In [103]:
# Read the saved file back to check the round trip.
songs_data_full_imported = pd.read_csv(
    filepath_or_buffer="data/songs_data_full.csv",
    index_col=0,
)
songs_data_full_imported

Unnamed: 0,title,artist,views,videoID,duration,Danceability,Loudness,BPM,Key,Key Scale,...,Acoustic,Electronic,Voice,Instrumental,Male,Female,Bright,Dark,Dry,Wet
0,Special Breed,PolyCulture,34,LlWGt_84jpg,331,1.365855,5662.711426,128.004791,F,minor,...,0.040053,0.920894,0.398074,0.601926,0.537490,0.462510,0.447406,0.552594,0.998639,0.001361
1,Unnoticeable,Lost Ambitions,2,TD3za_a4uWo,61,1.054391,606.596680,144.313995,G,major,...,0.947159,0.027575,0.503096,0.496905,0.774429,0.225571,0.518094,0.481906,0.617442,0.382558
2,Cephalon,Goatmilk,5,nop-58k9B9g,252,1.021395,6454.855957,118.336517,Ab,minor,...,0.077337,0.654210,0.712755,0.287245,0.502008,0.497992,0.423688,0.576312,0.999856,0.000144
3,Wursts,Les Jirzik,2,W1zYQOa0-hk,133,0.852451,236.874496,81.195984,C,minor,...,0.224416,0.765489,0.682662,0.317337,0.491312,0.508688,0.478077,0.521923,0.940689,0.059311
4,Enemies Noncommitment 2 Battle,Cobra,1,Oha70K1krcc,293,1.422449,1893.882202,117.295532,B,minor,...,0.131151,0.556826,0.392752,0.607248,0.571989,0.428011,0.456348,0.543652,0.978122,0.021878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98298,Bust It Open,Lil Wil,11728736,rI0pCNGVe-w,257,1.287168,4837.315918,150.225586,E,minor,...,0.012435,0.425957,0.001941,0.998059,0.876373,0.123627,0.477655,0.522345,0.857615,0.142385
98299,Ek Bar Shri Bhole Bhandari,Niranjan Pandya,11727167,Ad-tRcGQ3Ko,545,1.184014,3355.202148,154.861115,Ab,minor,...,0.288515,0.644367,0.017816,0.982184,0.752065,0.247935,0.556903,0.443097,0.755816,0.244183
98300,For Those About To Rave,"Scooter, Timmy Trumpet",11724591,11Uo4kLkxII,163,1.057154,4819.777832,142.054779,A,minor,...,0.010487,0.931411,0.083510,0.916490,0.504522,0.495478,0.457339,0.542661,0.999606,0.000394
98301,What's Going On (Official Video 2019),Marvin Gaye,11723995,o5TmORitlKk,240,1.091572,1205.145142,101.417412,E,major,...,0.388992,0.217666,0.037883,0.962117,0.570458,0.429542,0.495524,0.504476,0.480476,0.519524


#### Convert columns to proper JSON format

In [1]:
import pandas as pd
import json

# Source and destination CSV files
input_file = "/mnt/d/Alex Stuff/songs_data_full.csv"  # Replace with your input CSV file name
output_file = "data/songs_data_full_formated.csv"  # Replace with the desired output CSV file name

# Columns to reformat
columns_to_jsonify = ["Chords Significance", "Jamendo Labels", "Jamendo Instruments"]

# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_file)

def convert_to_json(value):
    """Return ``value`` serialised as a JSON string.

    Strings are interpreted, in order, as:
      1. JSON text (re-serialised),
      2. a Python ``dict``/``list`` literal such as ``"{'a': 1}"`` (which is
         how pandas writes dict cells to CSV),
      3. a comma-separated list of items (each item stripped).

    Non-string values are serialised directly, except that a float NaN
    (a missing cell) becomes ``null`` because ``NaN`` is not valid JSON.
    """
    import ast  # local import: the notebook's import cell does not provide it

    if not isinstance(value, str):
        if isinstance(value, float) and value != value:  # NaN != NaN
            return json.dumps(None)
        return json.dumps(value)

    try:
        # Try to parse the value as JSON
        return json.dumps(json.loads(value))
    except json.JSONDecodeError:
        pass

    try:
        parsed_literal = ast.literal_eval(value)
        # Only accept explicit containers; anything else (e.g. "1, 2", which
        # evaluates to a tuple) keeps the comma-split behaviour below.
        if isinstance(parsed_literal, (dict, list)):
            return json.dumps(parsed_literal)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal, or a literal json cannot serialise.
        pass

    # If value is not valid JSON, return it as a JSON list of its comma-separated parts
    return json.dumps([v.strip() for v in value.split(",")])

# Reformat specified columns (those missing from the file are skipped)
for column in (name for name in columns_to_jsonify if name in df.columns):
    df[column] = df[column].apply(convert_to_json)

# Save the updated DataFrame to a new CSV file
df.to_csv(path_or_buf=output_file, index=False)
print(f"Reformatted CSV saved to {output_file}")


Reformatted CSV saved to data/songs_data_full_formated.csv
