In [1]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the Excel workbook into `df`, parsing it with the openpyxl engine.
df = pd.read_excel(
    io='/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data_w_topics_w_features.xlsx',
    engine='openpyxl',
)

In [3]:
# Bare last expression: the notebook renders `df` as this cell's output.
df

In [5]:
# Print the (rows, columns) shape of the east-coast subset, then the west-coast
# subset, one per line.
print(
    *(df.loc[df['Coast'] == coast].shape for coast in ('east_coast', 'west_coast')),
    sep='\n',
)

In [13]:
# Artist and Album of every row whose 'Release Year' equals 1996
# (row mask and column selection done in a single .loc lookup).
df.loc[df['Release Year'] == 1996, ['Artist', 'Album']]

In [33]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm import tqdm

def process_music_files(df, output_folder, pickle_filename, target_sr=44100):
    """
    Detect onsets for every audio file listed in ``df``, save a resampled copy
    of each file, and pickle the annotated DataFrame.

    Parameters:
    - df: pandas DataFrame with a column 'Path' containing paths to music files.
    - output_folder: string, path to the folder where resampled audio files will be
      saved (created if it does not exist). Each file is written under its base
      name, so inputs from different folders that share a base name overwrite
      each other.
    - pickle_filename: string, filename for the output pickle file. Its parent
      folder is created if it does not exist.
    - target_sr: int, sampling rate in Hz of the saved audio (default 44100).

    Returns:
    - df_result: copy of ``df`` with two added columns: 'onsets' (list of onset
      times in seconds) and 'num_onsets' (length of that list). A row whose file
      fails at any step (load, onset detection, resample, write) gets ``[]`` and
      ``0``; the error is printed and processing continues with the next row.
    """
    # Create both output locations before the (potentially long) loop, so a
    # missing folder cannot make the final pickle write fail after all the
    # audio has already been processed.
    os.makedirs(output_folder, exist_ok=True)
    if isinstance(pickle_filename, (str, os.PathLike)):
        pickle_dir = os.path.dirname(os.fspath(pickle_filename))
        if pickle_dir:
            os.makedirs(pickle_dir, exist_ok=True)

    # One entry per row of df, in row order.
    onsets_list = []
    num_onsets_list = []

    for music_path in tqdm(df['Path'], total=df.shape[0]):
        try:
            # Load audio at its original sampling rate.
            # NOTE(review): librosa.load is called with its default channel
            # handling - confirm that this is the intended output layout.
            y, sr = librosa.load(music_path, sr=None)

            # Onset positions, converted from frame indices to seconds.
            onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
            onset_times = librosa.frames_to_time(onset_frames, sr=sr).tolist()

            # Resample only when the file is not already at the target rate.
            if sr != target_sr:
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

            # Save the (possibly resampled) audio under the original base name.
            resampled_path = os.path.join(output_folder, os.path.basename(music_path))
            sf.write(resampled_path, y, target_sr)

        except Exception as e:
            # Best-effort batch: report the failure and record an empty result.
            print(f"Error processing {music_path}: {e}")
            onset_times = []

        # Append exactly once per row, after the try/except. Appending inside
        # the try block and again in the handler would add two entries when a
        # late step (resample / write) fails, and the column assignment below
        # would then raise on the length mismatch.
        onsets_list.append(onset_times)
        num_onsets_list.append(len(onset_times))

    # Attach the results to a copy so the caller's DataFrame is left untouched.
    df_result = df.copy()
    df_result['onsets'] = onsets_list
    df_result['num_onsets'] = num_onsets_list

    # Save dataframe to pickle file
    df_result.to_pickle(pickle_filename)

    return df_result

In [34]:
# Run onset detection and resampling for every track listed in `df`; resampled
# audio goes to the music2 folder and the annotated frame is pickled.
df_result = process_music_files(
    df,
    output_folder='/Users/borosabel/Documents/Uni/Thesis/PopMusicInformationRetrieval/Data/music2',
    pickle_filename='pkl_data/dataset_with_onset.pkl',
)

In [20]:
# Rebind `df` to the contents of ./Out_19.csv.
# NOTE(review): this replaces the frame loaded from the Excel workbook earlier
# in the notebook - confirm that reusing the name is intended.
df = pd.read_csv(filepath_or_buffer='./Out_19.csv')

In [27]:
# Keep every row and pick three columns by position: index 10, then the last
# column, then the second-to-last.
# NOTE(review): positional indices depend on the CSV's column order - confirm
# which columns these are.
df_processed = df.iloc[slice(None), [10, -1, -2]]

In [28]:
# Bare last expression: the notebook renders `df_processed` as this cell's output.
df_processed