In [2]:
import pandas as pd
import os
import numpy as np
import librosa
import anthropic
import soundfile as sf
import csv
from mutagen.mp3 import MP3
from mutagen.id3 import ID3
import re

In [3]:
def sanitize_filename(filename):
    r"""Return ``filename`` rewritten so it is safe to use as a file name.

    Three steps are applied in order:

    1. each of the characters ``< > : " / \ | ? *`` becomes an underscore;
    2. every run of whitespace becomes a single underscore;
    3. leading and trailing underscores are stripped.
    """
    reserved_replaced = re.sub(r'[<>:"/\\|?*]', '_', filename)
    whitespace_collapsed = re.sub(r'\s+', '_', reserved_replaced)
    return whitespace_collapsed.strip('_')

def extract_piece_number(file_name):
    """Pull a ``<label>_<number>`` tag out of a file name.

    The first occurrence (case-insensitive) of one of the labels
    Symphony / Concerto / Sonata / Op / No / Piece followed by digits is used,
    e.g. ``"Symphony 1"`` gives ``"Symphony_1"``. The label is returned exactly
    as it appears in ``file_name`` (including a trailing dot on ``No.``).

    Returns an empty string when no such label/number pair is found.
    """
    found = re.search(
        r'\b(Symphony|Concerto|Sonata|Op|No\.?|No|Piece)\s*\.?\s*(\d+)',
        file_name,
        flags=re.IGNORECASE,
    )
    if found is None:
        return ""
    label, number = found.groups()
    return f"{label}_{number}"

def split_audio(file_path, output_folder, segment_duration=29):
    """Split an audio file into consecutive fixed-length WAV segments.

    The file is decoded with ``librosa.load`` (default arguments) and cut into
    back-to-back chunks of ``segment_duration`` seconds. Only whole segments
    are written: audio left over after the last full segment is dropped, so a
    file shorter than ``segment_duration`` produces no output files.

    Segments are named ``<piece>_Segment_<n>.wav`` with ``n`` starting at 1,
    where ``<piece>`` is what ``extract_piece_number`` finds in the file's base
    name, or the literal ``Piece`` when it finds nothing. Each name is passed
    through ``sanitize_filename`` before use.

    Args:
        file_path: Path of the audio file to split.
        output_folder: Directory that receives the segments; created if missing.
        segment_duration: Length of each segment in seconds.
    """
    samples, sample_rate = librosa.load(file_path)
    total_duration = librosa.get_duration(y=samples, sr=sample_rate)

    # Floor division: a trailing partial segment is intentionally not counted.
    segment_count = int(total_duration // segment_duration)

    # The prefix does not change per segment, so resolve it once.
    prefix = extract_piece_number(os.path.basename(file_path)) or "Piece"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for index in range(segment_count):
        start_time = index * segment_duration
        end_time = start_time + segment_duration
        segment = samples[int(start_time * sample_rate):int(end_time * sample_rate)]

        segment_file_name = sanitize_filename(f"{prefix}_Segment_{index + 1}.wav")
        sf.write(os.path.join(output_folder, segment_file_name), segment, sample_rate)

In [4]:
# Extract features from audio snippet function
def extract_features(file_path):
    """Summarise an audio snippet as one flat feature vector.

    At most the first 30 seconds of ``file_path`` are loaded. The returned 1-D
    array is the concatenation, in this order, of:

    * per-coefficient means of 13 MFCCs
    * per-bin means of the STFT chroma
    * per-band means of the spectral contrast
    * the tempo estimate from ``librosa.beat.beat_track``
    * the value of ``librosa.core.estimate_tuning`` (a tuning estimate;
      downstream code labels it "key")
    * per-row means of the tempogram (one value per tempogram row, so the
      length of this group depends on librosa's tempogram window size)
    * minimum and maximum of the positive ``piptrack`` pitch values
    * the mean RMS energy
    * per-bin means of the CQT chroma of the harmonic component

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        numpy.ndarray: 1-D feature vector laid out as described above.
    """
    y, sr = librosa.load(file_path, duration=30)

    # Frame-level features.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tuning = librosa.core.estimate_tuning(y=y, sr=sr)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    rhythm_patterns = librosa.feature.tempogram(onset_envelope=onset_env, sr=sr)
    pitches, _ = librosa.core.piptrack(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)
    y_harmonic, _ = librosa.effects.hpss(y)
    harmonic_progression = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)

    # A silent snippet has no positive pitch bins; min()/max() on an empty
    # selection would raise, so report a zero range instead.
    detected_pitches = pitches[pitches > 0]
    if detected_pitches.size:
        pitch_range = np.array([detected_pitches.min(), detected_pitches.max()])
    else:
        pitch_range = np.zeros(2)

    # np.concatenate rejects 0-d inputs. Depending on the librosa version the
    # tempo comes back as a plain scalar or as a length-1 array; atleast_1d
    # accepts both and leaves the vector layout unchanged.
    return np.concatenate([
        np.mean(mfccs, axis=1),
        np.mean(chroma, axis=1),
        np.mean(spectral_contrast, axis=1),
        np.atleast_1d(tempo),
        np.atleast_1d(tuning),
        np.mean(rhythm_patterns, axis=1),
        pitch_range,
        np.atleast_1d(np.mean(rms)),
        np.mean(harmonic_progression, axis=1),
    ])

In [5]:
def create_llm_prompt(features):
    """Render a feature vector as the text prompt sent to the language model.

    ``features`` is expected in the order produced by ``extract_features``:
    13 MFCC means, 12 chroma means, 7 spectral-contrast means, tempo, the
    tuning/"key" estimate, a variable-length block of rhythm (tempogram) means,
    the pitch minimum and maximum, the RMS mean and 12 harmonic chroma means.

    Because the rhythm block's length is not fixed, the groups that follow it
    are located relative to the END of the vector. For a 59-element vector this
    is identical to fixed offsets 34:44 / 44:46 / 46 / 47:59; for longer
    vectors it keeps pitch range, RMS and harmonic chroma from being read out
    of the rhythm data. Only the first 10 rhythm values go into the prompt.

    Args:
        features: 1-D array-like of numeric feature values.

    Returns:
        str: The prompt text.

    Raises:
        ValueError: If ``features`` is not 1-D or is too short to contain the
            fixed-size groups.
    """
    features = np.asarray(features)

    head_len = 13 + 12 + 7 + 1 + 1  # MFCC, chroma, contrast, tempo, tuning
    tail_len = 2 + 1 + 12           # pitch range, RMS, harmonic chroma
    if features.ndim != 1 or features.size < head_len + tail_len:
        raise ValueError(
            "features must be a 1-D vector with at least "
            f"{head_len + tail_len} values, got shape {features.shape}"
        )

    # Leading groups sit at fixed offsets from the start.
    mfccs_mean = features[:13]
    chroma_mean = features[13:25]
    spectral_contrast_mean = features[25:32]
    tempo = features[32]
    key = features[33]

    # Everything between the head and the tail is the rhythm block.
    rhythm_end = features.size - tail_len
    rhythm_mean = features[head_len:rhythm_end][:10]

    # Trailing groups are anchored to the end of the vector.
    pitch_range = features[rhythm_end:rhythm_end + 2]
    rms_mean = features[rhythm_end + 2]
    harmonic_mean = features[rhythm_end + 3:]

    prompt = (
        "Given the following audio features extracted from a music snippet:\n"
        f"- MFCCs Mean (13 coefficients): {mfccs_mean.tolist()}\n"
        f"- Chroma Mean (12 coefficients): {chroma_mean.tolist()}\n"
        f"- Spectral Contrast Mean (7 coefficients): {spectral_contrast_mean.tolist()}\n"
        f"- Tempo (bpm): {tempo}\n"
        f"- Estimated Key/Tonality: {key}\n"
        f"- Rhythm Patterns Mean (10 coefficients): {rhythm_mean.tolist()}\n"
        f"- Pitch Range (min, max): {pitch_range.tolist()}\n"
        f"- Dynamics (RMS Mean): {rms_mean}\n"
        f"- Harmonic Progression Mean (12 coefficients): {harmonic_mean.tolist()}\n\n"
        "Predict the emotions that are most likely to be felt by the listener in one to two words at most and no other text."
    )

    return prompt

In [6]:
# The API key is read from the ANTHROPIC_API_KEY environment variable rather
# than written into the notebook, so it cannot leak through a shared or
# committed copy. A missing variable raises KeyError here, before any audio
# is processed. (Original note: $5.00 worth of promotional credit from Anthropic.)
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

In [7]:
def get_llm_response(prompt):
    """Send ``prompt`` to the model as a single user message and return its reply.

    Uses the module-level ``client``. The return value is the ``text`` of the
    first content block in the response.
    """
    user_turn = {"role": "user", "content": prompt}
    request = dict(
        model="claude-3-sonnet-20240229",
        max_tokens=1000,
        temperature=0.7,
        messages=[user_turn],
    )
    reply = client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text

In [8]:
def save_to_csv(filename, composer, piece_name, predicted_emotions):
    """Append one prediction row to a CSV file, adding the header when needed.

    The header row ``Composer, Piece Name, Predicted Emotions`` is written
    first if ``filename`` does not exist yet or exists but is empty (for
    example a pre-created placeholder file). The file is opened in append
    mode, so existing rows are never rewritten.

    Args:
        filename: Path of the CSV file to append to.
        composer: Value for the "Composer" column.
        piece_name: Value for the "Piece Name" column.
        predicted_emotions: Value for the "Predicted Emotions" column.
    """
    # Decide before opening: opening in append mode creates the file.
    needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["Composer", "Piece Name", "Predicted Emotions"])
        writer.writerow([composer, piece_name, predicted_emotions])

def process_audio_segments(folder_path, composer, output_file):
    """Predict emotions for every WAV segment in a folder and log them to CSV.

    Files ending in ``.wav`` are processed in ascending order of the integer
    that follows the last underscore in their name (``..._<n>.wav``). For each
    one, features are extracted, turned into a prompt, sent to the model, and
    the reply is appended to ``output_file`` together with ``composer`` and the
    file name without ``.wav``.
    """
    def segment_index(name):
        # The text after the last "_" is "<n>.wav"; keep just the number.
        return int(name.split('_')[-1].replace('.wav', ''))

    wav_names = [entry for entry in os.listdir(folder_path) if entry.endswith(".wav")]
    wav_names.sort(key=segment_index)

    for wav_name in wav_names:
        segment_features = extract_features(os.path.join(folder_path, wav_name))
        emotions = get_llm_response(create_llm_prompt(segment_features))
        save_to_csv(output_file, composer, wav_name.replace(".wav", ""), emotions)

In [9]:
def get_piece_name_from_mp3(mp3_file_path):
    """Return a display name for an MP3: its ID3 title, else its file stem.

    The first text entry of the ``TIT2`` (title) frame is returned when the
    frame is present. If the frame is missing, or anything goes wrong while
    reading the tags (the error is printed), the file name without directory
    and extension is returned instead.
    """
    def name_from_path():
        # File name without directory or extension.
        stem, _extension = os.path.splitext(os.path.basename(mp3_file_path))
        return stem

    try:
        tags = MP3(mp3_file_path, ID3=ID3).tags
        title_frame = tags.get('TIT2')  # TIT2 is the ID3 frame for the title
        if not title_frame:
            return name_from_path()
        return title_frame.text[0]
    except Exception as e:
        print(f"Error extracting title from {mp3_file_path}: {e}")
        return name_from_path()

In [10]:
def automate_audio_processing(input_folder, output_folder, output_file, composer, segment_duration=29):
    """Run the split -> analyse -> predict pipeline for every MP3 in a folder.

    Each ``.mp3`` in ``input_folder`` (processed in sorted name order, so the
    CSV row order is reproducible) is split into segments inside its own
    sub-folder of ``output_folder``; every segment is then analysed and the
    prediction appended to ``output_file``.

    The sub-folder is named after the piece title, passed through
    ``sanitize_filename``. Titles often contain characters such as ``:``, ``/``
    or ``?`` that are invalid in, or change the meaning of, a path. If the
    sanitised title is empty, the MP3's own file stem is used instead.

    Args:
        input_folder: Directory containing the source ``.mp3`` files.
        output_folder: Directory under which per-piece segment folders go.
        output_file: CSV file that receives the predictions.
        composer: Composer name recorded with every prediction.
        segment_duration: Segment length in seconds, forwarded to ``split_audio``.
    """
    mp3_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".mp3"))

    for mp3_file in mp3_files:
        mp3_file_path = os.path.join(input_folder, mp3_file)
        piece_name = get_piece_name_from_mp3(mp3_file_path)

        # Never let raw tag text decide the directory structure.
        folder_name = (
            sanitize_filename(str(piece_name))
            or sanitize_filename(os.path.splitext(mp3_file)[0])
            or "untitled"
        )
        piece_output_folder = os.path.join(output_folder, folder_name)

        split_audio(mp3_file_path, piece_output_folder, segment_duration)
        process_audio_segments(piece_output_folder, composer, output_file)

In [None]:
# Run configuration: replace every "specify path" placeholder before executing.
input_folder = r"specify path"   # Folder containing audio files from audio conversion process
output_folder = r"specify path"  # Folder to save the split audio segments
output_file = r"specify path"    # Output CSV containing preds
composer = "Ludwig van Beethoven"

# Kick off the full pipeline with the default segment length.
automate_audio_processing(
    input_folder=input_folder,
    output_folder=output_folder,
    output_file=output_file,
    composer=composer,
)