In [1]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob

In [10]:
# --- Feature-extraction settings (passed to the extract_* functions below) ---
SAMPLE_RATE = 16000  # sampling rate requested from librosa.load
N_MEL = 128          # number of mel bands in the mel spectrogram
N_MFCC = 40          # number of MFCC coefficients per frame
N_FFT = 2048         # FFT size for the mel spectrogram
HOP_LENGTH = 512     # hop between successive frames, in samples

# --- Input locations ---
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
# Despite its name, ORIGINAL_LABEL_CSV points at an .xlsx workbook.
ORIGINAL_LABEL_CSV = r"E:\数据集\ShipEar\data_preprocessing\shipsEar.xlsx"
IN_DIR = r"E:\数据集\ShipEar\data_preprocessing\4_Frame_Windows_2s_50%"

# Ship type -> integer class ID. The types are listed per class; the position of
# each group in the outer tuple is the class ID assigned to its members.
type_mapping = {
    ship_type: class_id
    for class_id, ship_types in enumerate((
        ('Fishboat', 'Trawler', 'Mussel boat', 'Tugboat', 'Dredger'),  # class 0
        ('Motorboat', 'Pilot ship', 'Sailboat'),                       # class 1
        ('Passengers',),                                               # class 2
        ('Ocean liner', 'RORO'),                                       # class 3
        ('Natural ambient noise',),                                    # class 4
    ))
    for ship_type in ship_types
}

In [13]:
def extract_mel_npy(in_dir, out_dir, samplerate, n_mels, n_fft, hop_length):
    """Write a dB-scaled mel spectrogram ``.npy`` file for every ``*.wav`` in ``in_dir``.

    Args:
        in_dir: Directory searched (non-recursively) for ``*.wav`` files.
        out_dir: Directory receiving one ``<wav stem>.npy`` per input; created if missing.
        samplerate: Sampling rate passed to ``librosa.load``.
        n_mels: Number of mel bands.
        n_fft: FFT size.
        hop_length: Hop between frames, in samples.

    Returns:
        None. The only effect is the files written to ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)

    wav_pattern = os.path.join(in_dir, '*.wav')
    for wav_path in glob.glob(wav_pattern):
        signal, rate = librosa.load(wav_path, sr=samplerate)
        power_spec = librosa.feature.melspectrogram(
            y=signal,
            sr=rate,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length,
        )
        # ref=np.max scales each clip against its own peak, so 0 dB is that
        # clip's maximum rather than a level shared across files.
        spec_db = librosa.power_to_db(power_spec, ref=np.max)

        stem, _ = os.path.splitext(os.path.basename(wav_path))
        np.save(os.path.join(out_dir, f"{stem}.npy"), spec_db)

In [14]:
def extract_mfcc_npy(in_dir, out_dir, samplerate, n_mfcc, n_fft=2048, hop_length=512):
    """Write an MFCC ``.npy`` file for every ``*.wav`` in ``in_dir``.

    Args:
        in_dir: Directory searched (non-recursively) for ``*.wav`` files.
        out_dir: Directory receiving one ``<wav stem>.npy`` per input; created if missing.
        samplerate: Sampling rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT size forwarded to the underlying mel spectrogram. Defaults
            to 2048, librosa's own default, so four-argument calls are unchanged.
        hop_length: Hop between frames, in samples. Defaults to 512, librosa's
            own default.

    Returns:
        None. The only effect is the files written to ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)

    for wav_path in glob.glob(os.path.join(in_dir, '*.wav')):
        signal, rate = librosa.load(wav_path, sr=samplerate)
        # Framing is passed explicitly so it can be kept in step with
        # extract_mel_npy instead of relying on librosa's implicit defaults.
        mfcc = librosa.feature.mfcc(
            y=signal,
            sr=rate,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length,
        )
        stem = os.path.splitext(os.path.basename(wav_path))[0]
        np.save(os.path.join(out_dir, f"{stem}.npy"), mfcc)

In [15]:
def create_segmented_label_csv(feature_dir, original_label_csv, output_csv, ext,
                               class_mapping=None):
    """Build a per-segment label CSV by joining feature files to the recording labels.

    Each feature file is expected to be named ``<recording stem>_<segment>.<ext>``.
    The text before the last underscore is looked up among the ``Filename``
    values of the label table (with ``.wav`` removed) to recover the recording
    ID, ship type and class ID.

    Args:
        feature_dir: Directory containing the per-segment feature files.
        original_label_csv: Label table with ``Filename`` and ``Type`` columns.
            A path ending in ``.csv`` is read with ``pandas.read_csv``; anything
            else is read with ``pandas.read_excel``.
        output_csv: Destination CSV path. Its parent directory is created if
            the path has one.
        ext: Feature file extension without the dot (e.g. ``"npy"``).
        class_mapping: Optional ``{ship type: class id}`` dict. Defaults to the
            notebook-level ``type_mapping``.

    Returns:
        None. Writes ``output_csv`` with columns ``ID``, ``Filename``,
        ``SegmentedFilename``, ``Type``, ``ClassID`` sorted by ``ID`` then
        ``SegmentedFilename``, and prints the number of segments processed.

    Raises:
        ValueError: If any feature file has no matching row in the label table.
            Nothing is written in that case.
    """
    if class_mapping is None:
        class_mapping = type_mapping

    # The parameter is named "*_csv" but the notebook passes an .xlsx workbook,
    # so dispatch on the extension rather than assuming Excel.
    if str(original_label_csv).lower().endswith('.csv'):
        original_df = pd.read_csv(original_label_csv)
    else:
        original_df = pd.read_excel(original_label_csv)

    filename_to_info = {}
    for filename, ship_type in zip(original_df['Filename'], original_df['Type']):
        base_filename = filename.replace('.wav', '')
        filename_to_info[base_filename] = {
            # The recording ID is the integer before the first double underscore.
            'original_id': int(base_filename.split('__')[0]),
            'ship_type': ship_type,
            # NOTE(review): a type missing from the mapping yields None here
            # (an empty ClassID cell in the CSV) -- confirm that is intended.
            'class_id': class_mapping.get(ship_type),
        }

    feature_files = sorted(glob.glob(os.path.join(feature_dir, f'*.{ext}')))

    rows = []
    unmatched = []
    for feature_file in feature_files:
        segment_filename = os.path.splitext(os.path.basename(feature_file))[0]
        # Drop the trailing "_<segment>" suffix to recover the recording stem.
        original_filename = segment_filename.rpartition('_')[0]

        file_info = filename_to_info.get(original_filename)
        if file_info is None:
            unmatched.append(segment_filename)
            continue

        rows.append({
            'ID': file_info['original_id'],
            'Filename': original_filename,
            'SegmentedFilename': segment_filename,
            'Type': file_info['ship_type'],
            'ClassID': file_info['class_id'],
        })

    if unmatched:
        preview = ', '.join(unmatched[:5])
        raise ValueError(
            f"{len(unmatched)} feature file(s) have no matching row in "
            f"{original_label_csv!r} (first few: {preview})"
        )

    # Explicit columns keep sort_values working when no feature files were found.
    columns = ['ID', 'Filename', 'SegmentedFilename', 'Type', 'ClassID']
    result_df = pd.DataFrame(rows, columns=columns).sort_values(['ID', 'SegmentedFilename'])

    output_dir = os.path.dirname(output_csv)
    if output_dir:  # os.makedirs('') raises when output_csv is a bare filename
        os.makedirs(output_dir, exist_ok=True)
    result_df.to_csv(output_csv, index=False)
    print(f"生成标注文件完成！共处理 {len(rows)} 个文件。")

In [16]:
# Extract a dB-scaled mel spectrogram .npy for every windowed .wav in IN_DIR.
MEL_OUT_DIR = r"E:\数据集\ShipEar\data_preprocessing\5_Frame_Windows_2s_50%_mel_feature"
extract_mel_npy(
    in_dir=IN_DIR,
    out_dir=MEL_OUT_DIR,
    samplerate=SAMPLE_RATE,
    n_mels=N_MEL,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)

In [17]:
# Write the per-segment label file (MEL.csv) next to the mel feature files.
# MEL_OUT_DIR repeats the extraction cell's path, presumably so this cell can
# be run without re-running the extraction.
MEL_OUT_DIR = r"E:\数据集\ShipEar\data_preprocessing\5_Frame_Windows_2s_50%_mel_feature"
MEL_OUT_CSV = os.path.join(MEL_OUT_DIR, "MEL.csv")
create_segmented_label_csv(
    feature_dir=MEL_OUT_DIR,
    original_label_csv=ORIGINAL_LABEL_CSV,
    output_csv=MEL_OUT_CSV,
    ext="npy",
)

生成标注文件完成！共处理 11210 个文件。


In [18]:
# Extract an MFCC .npy for every windowed .wav in IN_DIR.
MFCC_OUT_DIR = r"E:\数据集\ShipEar\data_preprocessing\6_Frame_Windows_2s_50%_mfcc_feature"
extract_mfcc_npy(
    in_dir=IN_DIR,
    out_dir=MFCC_OUT_DIR,
    samplerate=SAMPLE_RATE,
    n_mfcc=N_MFCC,
)

In [19]:
# Write the per-segment label file (MFCC.csv) next to the MFCC feature files.
# MFCC_OUT_DIR repeats the extraction cell's path, presumably so this cell can
# be run without re-running the extraction.
MFCC_OUT_DIR = r"E:\数据集\ShipEar\data_preprocessing\6_Frame_Windows_2s_50%_mfcc_feature"
MFCC_OUT_CSV = os.path.join(MFCC_OUT_DIR, "MFCC.csv")
create_segmented_label_csv(
    feature_dir=MFCC_OUT_DIR,
    original_label_csv=ORIGINAL_LABEL_CSV,
    output_csv=MFCC_OUT_CSV,
    ext="npy",
)

生成标注文件完成！共处理 11210 个文件。
