In [1]:
import os
import sys
import warnings

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) log output and is
# only honoured if it is set BEFORE `import tensorflow`; setting it afterwards
# leaves INFO lines such as the cpu_feature_guard banner visible.
# 0 = show all, 1 = hide INFO, 2 = hide INFO + WARNING, 3 = hide INFO + WARNING + ERROR
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Python-side TensorFlow logger: only report errors.
tf.get_logger().setLevel('ERROR')

# Suppress specific Python warnings.
# NOTE(review): the second pattern already matches everything the third one
# does; both are kept so the filter list is unchanged.
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow')
warnings.filterwarnings('ignore', message='.*NUMA node.*')
warnings.filterwarnings('ignore', message='.*Could not identify NUMA node.*')

# Make the project's helper modules in ../scripts importable.
# Guarded so that re-running this cell does not keep appending the same entry.
SCRIPTS_DIR = os.path.abspath("../scripts")
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

2025-09-09 13:57:11.644667: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
from audio_pipeline import AudioProcessingPipeline

# Wire the pipeline to the source database and to the raw/processed folders.
pipeline = AudioProcessingPipeline(
    db_path="data/sql/clean.db",
    output_dir="data/audio/raw",
    processed_dir="data/processed",
)

# Options for the database run, collected in one place.
# Add a "limit" entry here to process only a handful of songs for a quick demo.
db_run_options = dict(
    table_name="all_awards",   # adjust to match your database schema
    force_reprocess=False,     # reuse cached results instead of reprocessing
    song_column="song",        # column holding song titles
    artist_column="artist",    # column holding artist names
)

try:
    print("Processing songs from database...")
    processed_songs = pipeline.process_songs_from_database(**db_run_options)

    song_count = len(processed_songs)
    print(f"\n✅ Successfully processed {song_count} unique songs")

    if processed_songs:
        # Persist the collected metadata as JSON and report where it went.
        output_file = pipeline.save_processed_metadata(
            processed_songs, "database_processed_songs.json"
        )
        print(f"📁 Saved metadata to: {output_file}")

        # One numbered line per processed song.
        print("\nProcessed songs summary:")
        for position, entry in enumerate(processed_songs, start=1):
            artist = entry['artist']
            title = entry['song_title']
            seconds = entry['duration_seconds']
            print(f"  {position}. {artist} - {title} ({seconds:.1f}s)")

    # Overall processing summary from the pipeline itself.
    pipeline.get_processing_summary()

except Exception as exc:
    # Report the failure and keep the notebook running.
    print(f"❌ Error processing database: {exc}")
    print("Make sure your database exists and has the correct table structure")

finally:
    # Runs whether or not processing succeeded.
    pipeline.cleanup()

INFO:feature_extractor:Feature extractor initialized. Models will be loaded on first use.


Initializing feature extractor...
Audio Processing Pipeline initialized
Database: data/sql/clean.db
Output directory: data/audio/raw
Processed directory: data/processed
Cache file: data/processed/processing_cache.json
Processing songs from database...
Found 457 unique song-artist combinations in database

Processing song 1/457
Artist-song combination 'G-Dragon - HOME SWEET HOME' already processed (file: G-Dragon - HOME SWEET HOME.wav), skipping...
Successfully processed: G-Dragon - HOME SWEET HOME

Processing song 2/457
Artist-song combination 'aespa - Whiplash' already processed (file: aespa - Whiplash.wav), skipping...
Successfully processed: aespa - Whiplash

Processing song 3/457
Artist-song combination 'ROSÉ - toxic till the end' already processed (file: ROSÉ - toxic till the end.wav), skipping...
Successfully processed: ROSÉ - toxic till the end

Processing song 4/457
Artist-song combination 'Taeyeon - Letter To Myself' already processed (file: Taeyeon - Letter To Myself.wav), sk

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=audio_file_path)
INFO:feature_extractor:Loading model: tempocnn
INFO:feature_extractor:Successfully loaded model: tempocnn
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/deepsquare-k16-3.pb`


Audio file: /home/ccaban/Coding/Projects/kpop-music-winners/data/audio/raw/NouerA - n (number of cases).wav
Sample rate: 16000 Hz
Duration: 185.85 seconds
Channels: 1
Data type: float32
✓ Sample rate is correct for Essentia models
True
Processing audio file: NouerA - n (number of cases).wav
Extracting audio features...
Extracting features from: /home/ccaban/Coding/Projects/kpop-music-winners/data/audio/raw/NouerA - n (number of cases).wav
Audio loaded successfully. Duration: 185.85 seconds


2025-09-09 14:02:41.499056: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Extracting tempo using TempoCNN...
TempoCNN failed: Operation timed out after 15 seconds
Using RhythmExtractor2013 fallback...


2025-09-09 14:02:41.527870: E tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
INFO:feature_extractor:Loading model: discogs_effnet
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/discogs-effnet-bs64-1.pb`
INFO:feature_extractor:Successfully loaded model: discogs_effnet


Extracting key/scale...
Extracting Discogs-EffNet embeddings...


INFO:feature_extractor:Loading model: danceability
INFO:feature_extractor:Successfully loaded model: danceability
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/danceability-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/danceability-discogs-effnet-1.pb`
INFO:feature_extractor:Loading model: mood_happy
INFO:feature_extractor:Successfully loaded model: mood_happy
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/mood_happy-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/mood_happy-discogs-effnet-1.pb`
INFO:feature_extractor:Loading model: voice_instrumental
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/voice_instrumental-discogs-effnet-1.pb`
INFO:feature_extractor:Successfully loaded model: voice_instrumental
[   INFO   ] TensorflowPredict: Successfully loaded graph 

Discogs-EffNet embeddings extracted successfully
Extracting EffNet-based features...
- Computing danceability (danceability)...
  danceability done
- Computing happiness (mood_happy)...
  happiness done
- Computing voice_instrumental (voice_instrumental)...
  voice_instrumental done
- Computing acoustic_electronic (mood_acoustic)...
  acoustic_electronic done
- Computing timbre (timbre)...
  timbre done
- Computing sad_mood (mood_sad)...
  sad_mood done
- Computing party_mood (mood_party)...
  party_mood done
- Computing relaxed_mood (mood_relaxed)...
  relaxed_mood done
- Computing engagement (engagement)...
  engagement done
- Computing approachability (approachability)...
  approachability done
Extracting multi-class features...
- Computing genre (genre_discogs400)...


INFO:feature_extractor:Successfully loaded model: genre_discogs400
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/genre_discogs400-discogs-effnet-1.pb`
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/genre_discogs400-discogs-effnet-1.pb`
INFO:feature_extractor:Loading model: mtg_jamendo_moodtheme
INFO:feature_extractor:Successfully loaded model: mtg_jamendo_moodtheme
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/mtg_jamendo_moodtheme-discogs-effnet-1.pb`
INFO:feature_extractor:Loading model: msd_musicnn
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/mtg_jamendo_moodtheme-discogs-effnet-1.pb`
INFO:feature_extractor:Successfully loaded model: msd_musicnn
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/msd-musicnn-1.pb`


  genre done - top class: 286 (prob: 0.428)
- Computing mood_theme (mtg_jamendo_moodtheme)...
  mood_theme done - top class: 18 (prob: 0.148)
All EffNet-based features extracted in 0.26 seconds
Extracting MusiCNN embeddings...
MusiCNN embeddings extracted successfully
Extracting arousal/valence...


INFO:feature_extractor:Loading model: deam
INFO:feature_extractor:Successfully loaded model: deam
[   INFO   ] TensorflowPredict: Successfully loaded graph file: `../essentia_models/deam-msd-musicnn-2.pb`
INFO:feature_extractor:Models cleaned up successfully
INFO:feature_extractor:Models cleaned up successfully


Successfully processed NouerA - n (number of cases).wav
Successfully processed: NouerA - n (number of cases)

Processing song 222/457
Artist-song combination 'ONEUS - X' already processed (file: ONEUS - X.wav), skipping...
Successfully processed: ONEUS - X

Processing song 223/457
Artist-song combination 'Cravity - SET NET G0?!' already processed (file: Cravity - SET NET G0?!.wav), skipping...
Successfully processed: Cravity - SET NET G0?!

Processing song 224/457
Artist-song combination 'QWER - Dear' already processed (file: QWER - Dear.wav), skipping...
Successfully processed: QWER - Dear

Processing song 225/457
Artist-song combination 'ATEEZ - Lemon Drop' already processed (file: ATEEZ - Lemon Drop.wav), skipping...
Successfully processed: ATEEZ - Lemon Drop

Processing song 226/457
Artist-song combination 'KISS OF LIFE - Lips Hips Kiss' already processed (file: KISS OF LIFE - Lips Hips Kiss.wav), skipping...
Successfully processed: KISS OF LIFE - Lips Hips Kiss

Processing song 22

In [9]:
import json

import pandas as pd

# The metadata contains non-ASCII text (e.g. the artist "ROSÉ" and Hangul
# video titles), so decode explicitly as UTF-8 rather than relying on the
# platform's default encoding.
with open("../data/processed/database_processed_songs.json", 'r', encoding="utf-8") as f:
    data = json.load(f)

# Flatten the nested per-song records into one row per song; nested keys are
# joined with "__" (e.g. audio_features__arousal_valence__valence).
df = pd.json_normalize(data, sep="__")
df

Unnamed: 0,artist,song_title,filename,duration_seconds,processing_timestamp,file_path,download_url,download_title,download_uploader,download_duration_sec,...,audio_features__mood_theme__top_3_class,audio_features__mood_theme__top_3_class_id,audio_features__mood_theme__top_3_probability,audio_features__mood_theme__most_likely_class,audio_features__mood_theme__most_likely_class_id,audio_features__mood_theme__most_likely_probability,audio_features__arousal_valence__valence,audio_features__arousal_valence__arousal,audio_features__arousal_valence__valence_normalized,audio_features__arousal_valence__arousal_normalized
0,G-Dragon,HOME SWEET HOME,G-Dragon - HOME SWEET HOME.wav,211.324813,2025-09-04T13:04:16.092009,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=fLi0EJfi_vg,G-DRAGON - HOME SWEET HOME (Official Audio) (f...,OfficialGDRAGON,211.0,...,happy,26,0.066164,energetic,18,0.176874,3.707301,3.782116,0.338413,0.347765
1,aespa,Whiplash,aespa - Whiplash.wav,190.542937,2025-09-04T13:04:27.040798,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=jWQx2f-CErU,aespa 에스파 'Whiplash' MV,SMTOWN,191.0,...,summer,51,0.121297,happy,26,0.149629,5.684407,5.393769,0.585551,0.549221
2,ROSÉ,toxic till the end,ROSÉ - toxic till the end.wav,233.755313,2025-09-04T13:04:38.235117,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=eA0lHNZ1KCA,ROSÉ - toxic till the end (OFFICIAL MUSIC VIDEO),ROSÉ,234.0,...,happy,26,0.062318,energetic,18,0.117204,3.519331,4.169734,0.314916,0.396217
3,Taeyeon,Letter To Myself,Taeyeon - Letter To Myself.wav,184.111000,2025-09-04T13:04:52.308510,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=MEwbIPJjs98,TAEYEON (태연) 'Letter To Myself' Official Audio,JXS_BP Official,184.0,...,happy,26,0.061053,energetic,18,0.156863,5.260572,5.985487,0.532571,0.623186
4,NCT DREAM,When I'm With You,NCT DREAM - When I'm With You.wav,211.440937,2025-09-04T13:05:03.029697,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=LwbDDjBhCvU,NCT DREAM (엔씨티 드림) 'When I'm With You' Officia...,JXS_BP Official,211.0,...,happy,26,0.112464,summer,51,0.220659,4.658324,4.951969,0.457290,0.493996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,Lee Chae-yeon,KNOCK,Lee Chae-yeon - KNOCK.wav,173.174438,2025-09-09T12:18:14.524200,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=joT0ZyB2GbQ,LEE CHAE YEON - KNOCK [Audio],BLISIT_OFFICIAL,173.0,...,love,31,0.052869,energetic,18,0.216202,4.880236,5.195324,0.485030,0.524415
453,TEMPEST,Dangerous,TEMPEST - Dangerous.wav,202.866938,2025-09-09T12:18:26.001786,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=ay7-dYjLhWk,Dangerous,TEMPEST,203.0,...,happy,26,0.049174,energetic,18,0.208617,5.269102,5.100383,0.533638,0.512548
454,EPEX,Sunshower,EPEX - Sunshower.wav,193.608000,2025-09-09T12:18:37.825435,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=VfSayX1awXY,EPEX (이펙스) 'Sunshower (여우가 시집가는 날)' Official A...,JXS_BP Official,194.0,...,love,31,0.078816,energetic,18,0.174473,4.692089,4.027515,0.461511,0.378439
455,DRIPPIN,SEVEN SINS,DRIPPIN - SEVEN SINS.wav,215.167750,2025-09-09T12:18:50.028589,/home/ccaban/Coding/Projects/kpop-music-winner...,https://www.youtube.com/watch?v=ye9XQnp8Ca0,DRIPPIN - SEVEN SINS [Audio],BLISIT_OFFICIAL,215.0,...,happy,26,0.060633,love,31,0.145197,4.494302,4.838457,0.436788,0.479807


In [10]:
# Show only the identifying columns of the processed-songs frame.
df.loc[:, ['artist', 'song_title']]

Unnamed: 0,artist,song_title
0,G-Dragon,HOME SWEET HOME
1,aespa,Whiplash
2,ROSÉ,toxic till the end
3,Taeyeon,Letter To Myself
4,NCT DREAM,When I'm With You
...,...,...
452,Lee Chae-yeon,KNOCK
453,TEMPEST,Dangerous
454,EPEX,Sunshower
455,DRIPPIN,SEVEN SINS


In [11]:
import sqlite3

# Fetch each distinct (song, artist) pair straight from SQLite; letting the
# database deduplicate avoids pulling every award row into pandas first.
query = "SELECT DISTINCT song, artist FROM all_awards"

conn = sqlite3.connect("../data/sql/clean.db")
try:
    df_artists_songs = pd.read_sql_query(query, conn)
finally:
    # Always release the database handle, even if the query fails.
    # (sqlite3's `with conn:` only manages transactions; it does not close.)
    conn.close()

In [12]:
# Add an "<artist> - <title>" key to both frames so the database contents and
# the processed metadata can be compared by a single string.
df_artists_songs['combined'] = df_artists_songs['artist'] + " - " + df_artists_songs['song']
df['combined'] = df['artist'] + " - " + df['song_title']

In [13]:
# Database song keys that have no counterpart in the processed metadata
# (an empty set means every database entry was processed).
set(df_artists_songs['combined']).difference(df['combined'])

set()