# 流程：先把原始音频切成 3 s 片段并加窗，再按 70%/15%/15% 生成训练集/验证集/测试集的标注文件，最后对划分好的样本加噪声（训练集随机 SNR，验证集和测试集使用固定 SNR）。

# 3s分帧加窗，直接在原始音频上处理

In [28]:
from pathlib import Path
import os
import librosa
import numpy as np
import glob
import soundfile as sf
import noisereduce as nr
import scipy.signal as signal
from scipy.io import wavfile
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import shutil
import hashlib

In [6]:
# Sanity check: load one recording and print its basic properties.
wav_path = r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename\Cargo\0_1.wav"

# sr=None asks librosa to keep the file's native sampling rate (no resampling).
waveform, sr = librosa.load(wav_path, sr=None)
print(f"Sample rate: {sr} Hz")
print(f"Waveform shape: {waveform.shape}")
print(f"Duration: {len(waveform) / sr:.2f} s")

Sample rate: 32000 Hz
Waveform shape: (14624000,)
Duration: 457.00 s


In [7]:
frame_duration = 3  # segment (frame) duration in seconds
# overlap_rate = 0.5  # 50% overlap between consecutive frames
overlap_rate = 0.0  # 0% overlap between consecutive frames

# Root folder of the source recordings and root folder for the framed segments.
# NOTE(review): the output folder name says "16kHz" — confirm the segments
# written there are really at 16 kHz and not at the recordings' native rate.
INPUT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename")
OUTPUT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz")

# Windowing helper
def apply_window(frame):
    """Return *frame* multiplied element-wise by a Hann window of equal length."""
    return np.hanning(len(frame)) * frame

In [8]:
# Cut every recording of each ship class into fixed-length, Hann-windowed
# segments and write them to OUTPUT_ROOT/<class>/<recording stem>_<index>.wav.
for category in ["Cargo", "Passengership", "Tanker", "Tug"]:
    in_dir = INPUT_ROOT / category
    out_dir = OUTPUT_ROOT / category
    out_dir.mkdir(parents=True, exist_ok=True)

    wav_paths = sorted(glob.glob(str(in_dir / "*.wav")))
    if not wav_paths:
        print(f"{category}: no wav files found, skip.")
        continue

    for wav_path in wav_paths:
        file_name = Path(wav_path).name
        file_stem = Path(wav_path).stem

        # sr=None keeps the recording's native sampling rate (no resampling here).
        audio, sr = librosa.load(wav_path, sr=None)
        print(f"Processing file: {file_name}, Sample Rate: {sr}, Total Samples: {len(audio)}")

        frame_length = int(frame_duration * sr)
        hop_length = int(frame_length * (1.0 - overlap_rate))
        if frame_length < 1 or hop_length < 1:
            # An overlap of 100% (or a zero duration) would otherwise surface
            # below as an opaque ZeroDivisionError in the padding arithmetic.
            raise ValueError(
                f"frame_duration={frame_duration} and overlap_rate={overlap_rate} "
                f"give frame_length={frame_length}, hop_length={hop_length}; "
                "both must be at least 1 sample"
            )

        # Zero-pad the tail so the last (partial) frame is kept instead of dropped.
        total_samples = len(audio)
        if total_samples < frame_length:
            pad_len = frame_length - total_samples
        else:
            remainder = (total_samples - frame_length) % hop_length
            pad_len = 0 if remainder == 0 else hop_length - remainder

        audio_padded = (
            np.pad(audio, (0, pad_len), mode="constant") if pad_len > 0 else audio
        )

        # frames has shape (frame_length, n_frames): one segment per column.
        frames = librosa.util.frame(
            audio_padded, frame_length=frame_length, hop_length=hop_length
        )

        # Build the Hann window once and broadcast it over all columns.
        # np.hanning returns float64, which would promote the product and
        # double the size of every written file, so cast back to float32.
        window = np.hanning(frame_length)
        frames_windowed = (frames * window[:, np.newaxis]).astype(np.float32)

        for i, frame_windowed in enumerate(frames_windowed.T, start=1):
            out_path = out_dir / f"{file_stem}_{i}.wav"
            wavfile.write(out_path, sr, frame_windowed)
            print(f"{category}: processed {file_name} -> {out_path.name}")
print("All files processed.")

Processing file: 0_1.wav, Sample Rate: 32000, Total Samples: 14624000
Cargo: processed 0_1.wav -> 0_1_1.wav
Cargo: processed 0_1.wav -> 0_1_2.wav
Cargo: processed 0_1.wav -> 0_1_3.wav
Cargo: processed 0_1.wav -> 0_1_4.wav
Cargo: processed 0_1.wav -> 0_1_5.wav
Cargo: processed 0_1.wav -> 0_1_6.wav
Cargo: processed 0_1.wav -> 0_1_7.wav
Cargo: processed 0_1.wav -> 0_1_8.wav
Cargo: processed 0_1.wav -> 0_1_9.wav
Cargo: processed 0_1.wav -> 0_1_10.wav
Cargo: processed 0_1.wav -> 0_1_11.wav
Cargo: processed 0_1.wav -> 0_1_12.wav
Cargo: processed 0_1.wav -> 0_1_13.wav
Cargo: processed 0_1.wav -> 0_1_14.wav
Cargo: processed 0_1.wav -> 0_1_15.wav
Cargo: processed 0_1.wav -> 0_1_16.wav
Cargo: processed 0_1.wav -> 0_1_17.wav
Cargo: processed 0_1.wav -> 0_1_18.wav
Cargo: processed 0_1.wav -> 0_1_19.wav
Cargo: processed 0_1.wav -> 0_1_20.wav
Cargo: processed 0_1.wav -> 0_1_21.wav
Cargo: processed 0_1.wav -> 0_1_22.wav
Cargo: processed 0_1.wav -> 0_1_23.wav
Cargo: processed 0_1.wav -> 0_1_24.wav
Car

# 处理标注文件，70/15/15 划分

In [14]:
# Expand the per-recording annotation table into one row per segment file on disk.
ANNOT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv")
SEG_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz")

CLASS_ID_TO_NAME = {0: "Cargo", 1: "Passengership", 2: "Tanker", 3: "Tug"}

# Column layout of the written CSV; columns absent from the data come out empty.
extended_columns = [
    "ID", "class ID", "Recording ID", "Ship Name",
    "Date & Time", "Duration(sec)", "Distances(m)",
    "class_id", "folder_name",
    "segment_id", "prompt_en",
]

annotation = pd.read_csv(ANNOT_PATH)
extended_rows = []

for _, row in annotation.iterrows():
    cls_id = int(row["class_id"])
    cls_name = CLASS_ID_TO_NAME[cls_id]
    base_id = int(row["ID"])

    # Segment files are named <class_id>_<ID>_<segment index>.wav.
    segment_files = sorted((SEG_ROOT / cls_name).glob(f"{cls_id}_{base_id}_*.wav"))
    if not segment_files:
        print(f"缺少特征: {cls_name} ID {base_id}")
        continue

    for segment_file in segment_files:
        # The segment index is the part of the stem after the last underscore.
        seg_idx = int(segment_file.stem.rsplit("_", 1)[-1])
        extended_rows.append(
            {**row.to_dict(), "segment_id": f"{cls_id}_{base_id}_{seg_idx}"}
        )

extended_df = pd.DataFrame(extended_rows).reindex(columns=extended_columns)
out_path = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR.csv")
extended_df.to_csv(out_path, index=False)
print(f"写出 {len(extended_df)} 条记录 -> {out_path}")

缺少特征: Cargo ID 23
写出 56864 条记录 -> X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR.csv


In [15]:
# Class-stratified 70/15/15 train/val/test split of the segment annotation table.
# NOTE(review): rows appear to be individual segments, so segments cut from the
# same recording can fall into different splits — confirm this is acceptable
# for evaluation, or split at recording level instead.
ANNOT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR.csv")
OUTPUT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR_split.csv")

SPLIT_SEED = 42
# 15% of the full set, expressed as a fraction of the 85% left after the test cut.
VAL_FRACTION = 0.15 / 0.85

segmented_df = pd.read_csv(ANNOT_PATH)

# Stage 1: hold out 15% of all rows as the test set.
remain_df, test_df = train_test_split(
    segmented_df,
    stratify=segmented_df["class_id"],
    test_size=0.15,
    random_state=SPLIT_SEED,
)

# Stage 2: divide the remaining 85% into train (~70%) and val (~15%).
train_df, val_df = train_test_split(
    remain_df,
    stratify=remain_df["class_id"],
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

split_map = {"train": 0, "val": 1, "test": 2}


def add_split_cols(df, split_name):
    """Return a copy of *df* with `split` (name) and `split_id` (int) columns added."""
    tagged = df.copy()
    tagged["split"] = split_name
    tagged["split_id"] = split_map[split_name]
    return tagged


train_df, val_df, test_df = (
    add_split_cols(part, label)
    for part, label in ((train_df, "train"), (val_df, "val"), (test_df, "test"))
)

split_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
split_df.to_csv(OUTPUT_PATH, index=False)

print(
    f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}; saved to {OUTPUT_PATH}"
)

Train: 39804, Val: 8530, Test: 8530; saved to X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR_split.csv


# 把训练集、验证集、测试集处理出来用三个文件夹，然后三个文件夹放在一个大文件夹下

In [19]:
# Copy every annotated segment into OUTPUT_ROOT/<split>/ according to the split CSV.
ANNOTATION_CSV = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR_split.csv")
SEGMENT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz")
OUTPUT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test")
SPLIT_DIRS = {split_key: OUTPUT_ROOT / split_key for split_key in ("train", "val", "test")}

df = pd.read_csv(ANNOTATION_CSV)

for path in SPLIT_DIRS.values():
    path.mkdir(parents=True, exist_ok=True)

CLASS_ID_TO_NAME = {
    0: "Cargo",
    1: "Passengership",
    2: "Tanker",
    3: "Tug",
}

for _, row in tqdm(df.iterrows(), total=len(df)):
    split_name = row["split"]

    # Prefer an explicit file-name column when present, else fall back to segment_id.
    fname = str(row.get("segmented_filename", row["segment_id"]))
    if not fname.endswith(".wav"):
        fname = fname + ".wav"

    # Sources are grouped by class folder; destinations are flat per split.
    src = SEGMENT_ROOT / CLASS_ID_TO_NAME[int(row["class_id"])] / fname
    dst = SPLIT_DIRS[split_name] / fname

    if src.exists():
        shutil.copy2(src, dst)
    else:
        print(f"[WARN] Missing file: {src}")

print("Done! Files are in", OUTPUT_ROOT)

100%|██████████| 56864/56864 [07:32<00:00, 125.77it/s]

Done! Files are in X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test





# 训练集随机加噪，验证集、测试集固定噪声分贝数

In [29]:
# Clean split folders (input), the split annotation CSV, and the root for the noisy copies.
SOURCE_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test")
ANNOTATION_CSV = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz_SNR_split.csv")
DEST_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset")

# (low, high) SNR bounds in dB from which one value is drawn per training file.
TRAIN_RANGE = (-12, 6)
# Fixed SNR levels in dB; each val/test file is rendered once per level.
EVAL_SNRS = np.array([-12, -9, -6, -3, 0, 3, 6], dtype=np.int32)

DEST_ROOT.mkdir(parents=True, exist_ok=True)
df_split = pd.read_csv(ANNOTATION_CSV)

def to_wav_name(row):
    """Return the WAV file name for an annotation row.

    Uses `segmented_filename` when the row has it, otherwise `segment_id`,
    and appends ".wav" unless the name already ends with it (case-insensitive).
    """
    name = str(row.get("segmented_filename", row["segment_id"]))
    if name.lower().endswith(".wav"):
        return name
    return f"{name}.wav"

def add_noise_with_snr(signal, target_snr_db, rng):
    """Return *signal* plus zero-mean Gaussian noise scaled to a target SNR.

    Args:
        signal: array of samples.
        target_snr_db: desired signal-to-noise ratio in dB.
        rng: object providing `normal(loc, scale, size=...)`, e.g. a
            `numpy.random.Generator`.
    """
    # The small constant keeps the noise scale non-zero for an all-zero signal.
    clean_power = np.mean(np.square(signal)) + 1e-12
    noise_std = np.sqrt(clean_power / 10 ** (target_snr_db / 10.0))
    return signal + rng.normal(0.0, noise_std, size=signal.shape)

def load_waveform(split_name, file_name):
    """Read SOURCE_ROOT/<split_name>/<file_name>; return (float32 samples, sample rate)."""
    samples, sample_rate = sf.read(SOURCE_ROOT / split_name / file_name)
    return samples.astype(np.float32), sample_rate

def _stable_seed(key):
    """Map a string to a 32-bit seed that is the same in every Python process.

    The builtin hash() of a str is salted per interpreter process, so it
    cannot be used to regenerate the same "fixed" noise after a restart.
    """
    digest = hashlib.sha256(key.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], "little")


# One seeded generator drives both the SNR choice and the per-file noise seed
# for the training set, so the run is reproducible for a given row order.
train_rng = np.random.default_rng(42)

for _, row in tqdm(df_split.iterrows(), total=len(df_split)):
    split = row["split"]
    fname = to_wav_name(row)
    audio, sr = load_waveform(split, fname)

    if split == "train":
        # One random integer SNR per training file; both range ends are included.
        snr = train_rng.integers(TRAIN_RANGE[0], TRAIN_RANGE[1] + 1)
        seed = train_rng.integers(0, 2**32 - 1)
        noisy = add_noise_with_snr(audio, snr, np.random.default_rng(seed))
        out_dir = DEST_ROOT / "train"
        out_dir.mkdir(parents=True, exist_ok=True)
        # Without an explicit subtype soundfile stores WAV as PCM_16, which
        # clips samples outside [-1, 1] and quantises; keep floating point.
        sf.write(out_dir / fname, noisy, sr, subtype="FLOAT")

    else:  # val/test
        # Every evaluation file is rendered once per fixed SNR level, with a
        # seed derived only from the file name and the SNR value.
        for snr in EVAL_SNRS:
            seed = _stable_seed(f"{fname}-{snr}")
            noisy = add_noise_with_snr(audio, snr, np.random.default_rng(seed))
            snr_dir = DEST_ROOT / split / f"SNR_{snr:+d}dB"
            snr_dir.mkdir(parents=True, exist_ok=True)
            sf.write(snr_dir / fname, noisy, sr, subtype="FLOAT")

print("Done! Noisy dataset saved under", DEST_ROOT)

100%|██████████| 56864/56864 [27:02<00:00, 35.04it/s]

Done! Noisy dataset saved under X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset





# 测试噪声添加是否正确

In [36]:
def measure_snr_db(clean_path, noisy_path):
    """Return the SNR in dB of a noisy file measured against its clean reference.

    The noise is taken as (noisy - clean) over the common length of the two files.

    Raises:
        ValueError: if the two files have different sample rates.
    """
    clean, sr_clean = sf.read(clean_path, dtype="float32")
    noisy, sr_noisy = sf.read(noisy_path, dtype="float32")
    if sr_clean != sr_noisy:
        raise ValueError(f"Sample rate mismatch: {sr_clean} vs {sr_noisy}")

    # Compare only the overlapping part so a length difference cannot skew the result.
    common_len = min(len(clean), len(noisy))
    reference = clean[:common_len]
    residual = noisy[:common_len] - reference

    # The small constants guard against division by zero and log of zero.
    power_ratio = (np.mean(reference ** 2) + 1e-12) / (np.mean(residual ** 2) + 1e-12)
    return 10.0 * np.log10(power_ratio)

# Spot check: the same validation segment, clean and from the SNR_+6dB folder.
clean_path = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test\val\0_1_2.wav")
noisy_path = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset\val\SNR_+6dB\0_1_2.wav")

print(f"SNR = {measure_snr_db(clean_path, noisy_path):.2f} dB")

SNR = 5.99 dB


In [38]:
def peak_amplitude(path):
    """Return the largest absolute sample value of the audio file at *path*."""
    samples, _sample_rate = sf.read(path, dtype="float32")
    return float(np.abs(samples).max())

def compare_peaks(clean_path, noisy_path):
    """Print the peak absolute amplitude of the clean file and of the noisy file."""
    clean_peak, noisy_peak = (
        peak_amplitude(audio_path) for audio_path in (clean_path, noisy_path)
    )
    print(f"Clean peak: {clean_peak:.6f}")
    print(f"Noisy peak: {noisy_peak:.6f}")

# Spot check: compare peak levels of one validation segment before and after adding noise.
clean = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test\val\0_1_2.wav")
noisy = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset\val\SNR_+6dB\0_1_2.wav")

compare_peaks(clean, noisy)

Clean peak: 0.042813
Noisy peak: 0.068176


# 提取特征

In [39]:
# Root of the noisy WAV dataset (input) and root for the extracted feature arrays (output).
INPUT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset")
OUTPUT_ROOT = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename_frame_and_window_3s_0%_16kHz_train_val_test_noisy_dataset_features")
# Sub-folders of INPUT_ROOT to process.
SPLITS = ["train", "val", "test"]  

# Feature extraction parameters.
SAMPLE_RATE = 16000  # rate in Hz that audio is loaded at
N_MEL = 128  # number of mel bands
N_MFCC = 40  # number of MFCC coefficients
N_FFT = 2048  # FFT size in samples
HOP_LENGTH = 512  # hop between analysis frames in samples

In [None]:
def save_array(target_path: Path, array):
    """Save *array* with `np.save` at *target_path*, creating missing parent folders."""
    parent_dir = target_path.parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    np.save(target_path, array)

# Extract a log-mel spectrogram, MFCCs and the raw waveform for every WAV under
# INPUT_ROOT/<split>/ and save them as .npy files that mirror the input layout:
# OUTPUT_ROOT/<split>/{mel,mfcc,waveform}/<relative path>.npy
for split in SPLITS:
    split_in = INPUT_ROOT / split
    if not split_in.exists():
        continue

    split_out = OUTPUT_ROOT / split

    for wav_path in split_in.rglob("*.wav"):
        # Audio is resampled to SAMPLE_RATE on load.
        # NOTE(review): if the noisy WAVs were written at a higher rate, this
        # resampling removes part of the added white noise, so the in-band SNR
        # of the features may differ from the folder's nominal SNR — confirm.
        y, sr = librosa.load(wav_path, sr=SAMPLE_RATE)

        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=N_MEL, n_fft=N_FFT, hop_length=HOP_LENGTH)
        # Stored mel feature is in dB relative to each file's own maximum.
        mel_db = librosa.power_to_db(mel, ref=np.max)

        # Derive the MFCCs from the mel spectrogram computed above, so they
        # follow N_FFT / HOP_LENGTH / N_MEL instead of librosa's defaults and
        # the STFT is not computed a second time. mfcc() expects a log-power
        # mel spectrogram, hence power_to_db with its default reference.
        mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=N_MFCC)
        waveform = y.astype(np.float32)

        rel = wav_path.relative_to(split_in).with_suffix(".npy")
        save_array(split_out / "mel" / rel, mel_db)
        save_array(split_out / "mfcc" / rel, mfcc)
        save_array(split_out / "waveform" / rel, waveform)

        print(f"[{split}] {wav_path} -> mel/mfcc/waveform saved")