In [1]:
import sys, re, csv
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Header row of the generated metafile CSV, in output column order.
COLUMNS = [
    "ID",
    "class ID",
    "Recording ID",
    "Ship Name",
    "Date & Time",
    "Duration(sec)",
    "Distances(m)",
]
           
def clean_line(s):
    """Normalize whitespace and dash variants in one raw metafile line.

    Non-breaking spaces become ordinary spaces; en dashes, em dashes and the
    Unicode minus sign become ASCII hyphens. Leading and trailing whitespace
    is stripped after the substitutions.
    """
    substitutions = {
        "\u00A0": " ",  # no-break space
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2212": "-",  # minus sign
    }
    return s.translate(str.maketrans(substitutions)).strip()

def parse_file(path, encoding=None):
    """Parse a raw comma-separated metafile into CSV-ready rows.

    Every non-blank line is cleaned with ``clean_line`` and split on commas.
    The first seven fields are read as: ID, class ID, ship name, date, time,
    duration, distances. Date and time are joined as ``"<date>:<time>"`` and
    the Recording ID is assumed to be equal to the ID.

    Lines with fewer than seven fields are reported on stderr and skipped
    instead of aborting the whole parse with an IndexError.

    Args:
        path: Location of the raw metafile.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list of rows, each
        ``[id, class_id, recording_id, ship_name, date_time, duration, distances]``,
        with every value kept as a stripped string.
    """
    expected_fields = 7
    rows, bad = [], 0
    with open(path, "r", encoding=encoding) as f:
        for line_num, raw in enumerate(f, 1):
            line = clean_line(raw)

            # Skip blank lines.
            if not line:
                continue

            # NOTE(review): a field that itself contains a comma (e.g. a ship
            # name) would shift the columns - confirm the raw files never do this.
            parts = [part.strip() for part in line.split(",")]

            if len(parts) < expected_fields:
                bad += 1
                print(
                    f"{path}:{line_num}: expected {expected_fields} fields, "
                    f"got {len(parts)}; line skipped",
                    file=sys.stderr,
                )
                continue

            (sample_id, class_id, ship_name,
             date_part, time_part, duration, distances) = parts[:expected_fields]

            # Merge date and time into a single "<date>:<time>" value.
            date_time = f"{date_part}:{time_part}"

            # Recording ID is assumed to be identical to ID; adjust here if needed.
            recording_id = sample_id

            rows.append([sample_id, class_id, recording_id, ship_name,
                         date_time, duration, distances])

    if bad:
        print(f"{path}: skipped {bad} malformed line(s)", file=sys.stderr)

    return rows

In [3]:
# Convert the raw tug metafile into a CSV with a header row.
in_path = Path(r"X:\数据集\DeepShip\annotation_original\tug-metafile")
out_path = in_path.with_suffix(".csv")
rows = parse_file(in_path)
# Write UTF-8 explicitly: the file is read back with pd.read_csv, whose
# default encoding is UTF-8, while open() defaults to the platform locale.
with open(out_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(COLUMNS)
    writer.writerows(rows)
# The message previously ended with a stray ")" after the path.
print(f"Saved: {out_path}")

Saved: X:\数据集\DeepShip\annotation_original\tug-metafile.csv)


In [4]:
# 1. Merge the four per-class annotation files into one frame.
annotation_original_dir = Path(r"X:\数据集\DeepShip\annotation_original")
data_dir = annotation_original_dir
files = {
    "cargo": data_dir / "cargo-metafile.csv",
    "passengership": data_dir / "passengership-metafile.csv",
    "tanker": data_dir / "tanker-metafile.csv",
    "tug": data_dir / "tug-metafile.csv",
}

dfs = [pd.read_csv(path) for path in files.values()]

merged = pd.concat(dfs, ignore_index=True)
merged["ID"] = merged["ID"].astype(int)

# 2. Add class_id: map the "class ID" column onto the 0-3 class index.
class_id = merged["class ID"].astype(int)
merged["class_id"] = np.select(
    [
        class_id.between(70, 79),  # Cargo
        class_id.between(60, 69),  # Passengership
        class_id.between(80, 89),  # Tanker
        class_id == 52,            # Tug
    ],
    [0, 1, 2, 3],
    default=-1
)

# Rows matching none of the conditions get -1, which is not a key of the
# 0-3 class lookup tables used in this notebook. Report them here instead of
# letting them pass silently.
unmapped = merged.loc[merged["class_id"] == -1, ["ID", "class ID"]]
if not unmapped.empty:
    print(f"Warning: {len(unmapped)} row(s) have a class ID outside the known ranges:")
    print(unmapped.to_string(index=False))

# 3. Inputs for the folder_name mapping: audio root and class index -> folder name.
data_root = Path(r"X:\数据集\DeepShip\data_preprocessing\data")
class_dirs = {0: "Cargo", 1: "Passengership", 2: "Tanker", 3: "Tug"}

def parse_id(folder_name: str):
    parts = folder_name.rsplit("-", 1)
    if len(parts) != 2 or not parts[1].isdigit():
        return None
    return int(parts[1])

# Map (class_id, sample ID) -> folder name for every sample folder found on disk.
# Class directories that do not exist are skipped; a later folder with the same
# key replaces an earlier one.
folder_map = {}
for cid, dirname in class_dirs.items():
    cls_dir = data_root / dirname
    if not cls_dir.exists():
        continue
    subdir_names = (entry.name for entry in cls_dir.iterdir() if entry.is_dir())
    parsed = ((parse_id(name), name) for name in subdir_names)
    folder_map.update({(cid, sid): name for sid, name in parsed if sid is not None})


def _folder_for(row):
    """Return the folder name recorded for this row's (class_id, ID), or pd.NA."""
    key = (int(row["class_id"]), int(row["ID"]))
    return folder_map.get(key, pd.NA)


merged["folder_name"] = merged.apply(_folder_for, axis=1)

# 4. Write the merged annotation table.
merged.to_csv(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv", index=False)

In [4]:
def bucket_distance(value):
    """Parse a distance annotation and assign each number a range band.

    Args:
        value: Raw string or number, e.g. '1583-1981' or '1600 – 1905'.
            Any run of characters other than digits and '.' acts as a separator.

    Returns:
        ``(labels, distances)``: ``distances`` holds every token that parses
        as a float, in input order; ``labels`` holds the matching band for
        each: 'near' (< 300), 'mid' (300 to 1000 inclusive) or 'far' (> 1000).
        Tokens that do not parse as a float are dropped.
    """
    tokens = filter(None, re.split(r'[^0-9.]+', str(value).strip()))
    labels, distances = [], []
    for token in tokens:
        try:
            meters = float(token)
        except ValueError:
            continue
        if meters < 300:
            band = "near"
        elif meters <= 1000:
            band = "mid"
        else:
            band = "far"
        distances.append(meters)
        labels.append(band)
    return labels, distances

def make_prompt(distance_str):
    """Build a one-sentence English prompt describing the vessel's range.

    The annotation is parsed with ``bucket_distance``; only the first two
    parsed distances are used.

    Args:
        distance_str: Raw distance annotation, e.g. '1583-1981'.

    Returns:
        - no parsable distance: a generic sentence without range information;
        - one distance: a sentence naming its band;
        - two distances in the same band: the band plus "getting farther" or
          "getting closer" when the relative change exceeds 10 %, otherwise
          "remains at ... range";
        - two distances in different bands: both bands joined by "and then".
    """
    rel_change_threshold = 0.10

    labels, distances = bucket_distance(distance_str)

    # Parsing failed: fall back to the safest wording.
    if not distances:
        return "Hydrophone recording of a marine vessel."

    # A single value: use its band directly.
    if len(distances) == 1:
        return f"Hydrophone recording of a marine vessel at {labels[0]} range."

    # Use the first two readings only; later values are ignored so the
    # sentence stays short.
    d0, d1 = distances[0], distances[1]
    l0, l1 = labels[0], labels[1]

    # Same band both times: decide between farther / closer / unchanged.
    if l0 == l1:
        # The floor on the denominator avoids dividing by zero when d0 == 0.
        rel = (d1 - d0) / max(abs(d0), 1e-6)
        if rel > rel_change_threshold:
            return f"Hydrophone recording of a marine vessel at {l0} range and getting farther."
        elif rel < -rel_change_threshold:
            return f"Hydrophone recording of a marine vessel at {l0} range and getting closer."
        else:
            return f"Hydrophone recording of a marine vessel that remains at {l0} range."

    # Different bands: express the order with "then" rather than a hyphen.
    # The wording was "at a {l0} range and then at {l1} range"; the stray
    # article is dropped so it matches the "at <band> range" form used above.
    return f"Hydrophone recording of a marine vessel at {l0} range and then at {l1} range."

# Add (or refresh) the English prompt column and write the annotation file back in place.
file_path = r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv"
df = pd.read_csv(file_path)
df = df.assign(prompt_en=df["Distances(m)"].apply(make_prompt))
df.to_csv(file_path, index=False)

In [2]:
# Root of the per-class audio folders, and of the per-class annotation CSVs.
data_root = Path(r"X:\数据集\DeepShip\data_preprocessing\data")
annotation_root = Path(r"X:\数据集\DeepShip\annotation_original")

# Class folder name -> file name of that class's annotation CSV.
_meta_filenames = {
    "Cargo": "cargo-metafile.csv",
    "Passengership": "passengership-metafile.csv",
    "Tanker": "tanker-metafile.csv",
    "Tug": "tug-metafile.csv",
}
meta_files = {
    cls_name: annotation_root / filename
    for cls_name, filename in _meta_filenames.items()
}

def parse_id(folder_name: str) -> int | None:
    parts = folder_name.rsplit("-", 1)
    if len(parts) != 2 or not parts[1].isdigit():
        return None
    return int(parts[1])

# Per-class audit result, keyed by class folder name. Each value is either a
# report dict (see the bottom of the loop) or {"error": ...} when the class
# directory is missing.
summary = {}

# For every class: compare annotated IDs with the sample folders on disk and
# make sure each folder's audio file is named "<ID>.wav".
# NOTE: this loop is not read-only - it renames wav files on disk.
for cls, meta_path in meta_files.items():
    annotation = pd.read_csv(meta_path)
    # IDs the annotation file says should exist for this class.
    expected_ids = set(annotation["ID"].astype(int))

    cls_dir = data_root / cls
    if not cls_dir.exists():
        summary[cls] = {"error": f"目录不存在: {cls_dir}"}
        continue

    # sample ID -> first folder found carrying that ID
    folder_ids: dict[int, Path] = {}
    # folder names from which parse_id could not extract an ID
    invalid_names: list[str] = []
    # folder names whose ID was already claimed by an earlier folder
    duplicate_names: list[str] = []

    for subdir in cls_dir.iterdir():
        if not subdir.is_dir():
            continue
        sid = parse_id(subdir.name)
        if sid is None:
            invalid_names.append(subdir.name)
            continue
        # If several folders share one ID, keep the first and record the others as duplicates.
        if sid in folder_ids:
            duplicate_names.append(subdir.name)
        else:
            folder_ids[sid] = subdir

    observed_ids = set(folder_ids)
    # Annotated but no folder on disk / folder on disk but not annotated.
    missing_ids = sorted(expected_ids - observed_ids)
    extra_ids = sorted(observed_ids - expected_ids)

    # Folders where "<ID>.wav" is absent and could not be produced by renaming.
    missing_audio = []
    # "<old name> -> <new name>" for every rename performed.
    renamed_audio = []

    for sid, folder in folder_ids.items():
        expected_wav = folder / f"{sid}.wav"
        if expected_wav.exists():
            # The file already has the expected name.
            continue

        # No file with the standard name: try to rename the existing wav.
        wav_files = list(folder.glob("*.wav"))

        if len(wav_files) == 1:
            # Exactly one candidate: rename it to "<ID>.wav".
            src = wav_files[0]
            src.rename(expected_wav)
            renamed_audio.append(f"{src.name} -> {expected_wav.name}")
        elif len(wav_files) > 1:
            # Ambiguous: several wav files, so nothing is renamed.
            missing_audio.append(
                f"{folder}（发现多个 wav：{', '.join(f.name for f in wav_files)}）"
            )
        else:
            # The folder contains no wav file at all.
            missing_audio.append(f"{expected_wav}（文件夹没有 wav）")

    summary[cls] = {
        "annotated_count": len(expected_ids),
        "folder_count": len(observed_ids),
        "missing_ids": missing_ids,
        "extra_ids": extra_ids,
        "invalid_folder_names": invalid_names,
        "duplicate_folder_names": duplicate_names,
        "missing_wav_files": missing_audio,
        # Unlike the other entries, an empty rename list is stored as the string "OK".
        "renamed_wav_files": renamed_audio if renamed_audio else "OK",
    }

# Print one report per class; empty lists are shown as "OK".
for cls, info in summary.items():
    print(cls)
    for key, value in info.items():
        shown = "OK" if isinstance(value, list) and not value else value
        print(f"  {key}: {shown}")
    print("-" * 40)

Cargo
  annotated_count: 110
  folder_count: 109
  missing_ids: [23]
  extra_ids: OK
  invalid_folder_names: OK
  duplicate_folder_names: OK
  missing_wav_files: OK
  renamed_wav_files: ['002044.wav -> 22.wav', '045829.wav -> 24.wav', '055206.wav -> 25.wav', '225044.wav -> 26.wav', '074918.wav -> 28.wav', '151043.wav -> 29.wav', '214753.wav -> 30.wav', '031902.wav -> 31.wav', '111158.wav -> 32.wav', '202039.wav -> 33.wav', '210343.wav -> 34.wav', '060651.wav -> 35.wav', '075414.wav -> 36.wav', '113017.wav -> 37.wav', '004508.wav -> 39.wav', '135730.wav -> 40.wav', '134001.wav -> 42.wav', '161634.wav -> 43.wav', '134833.wav -> 46.wav', '214505.wav -> 47.wav', '234924.wav -> 48.wav', '023314.wav -> 49.wav', '035027.wav -> 50.wav', '044753.wav -> 51.wav', '063408.wav -> 52.wav', '231436.wav -> 53.wav', '080742.wav -> 54.wav', '000645.wav -> 55.wav', '075459.wav -> 56.wav', '165031.wav -> 57.wav', '231720.wav -> 58.wav', '080357.wav -> 59.wav', '111402.wav -> 60.wav', '054923.wav -> 61.wav

In [3]:
annotations = pd.read_csv(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv")
data_root = Path(r"X:\数据集\DeepShip\data_preprocessing\data")

# class_id -> class folder name under data_root
class_dirs = {
    0: "Cargo",
    1: "Passengership",
    2: "Tanker",
    3: "Tug",
}

# Rename each sample's "<ID>.wav" to "<class_id>_<ID>.wav" inside its own folder.
# Every problem is reported and skipped so one bad row cannot stop the run.
for _, row in annotations.iterrows():
    cls_id = int(row["class_id"])
    sample_id = int(row["ID"])
    folder_name = row["folder_name"]

    # Rows with no folder_name have no folder to operate on.
    if pd.isna(folder_name):
        print(f"ID:{sample_id}缺少folder_name，跳过ID:{sample_id}, class_id:{cls_id}")
        continue

    subdir = class_dirs.get(cls_id)
    if subdir is None:
        print(f"未知 class_id={cls_id}，跳过")
        continue

    src_dir = data_root / subdir / str(folder_name)
    if not src_dir.exists():
        print(f"目录不存在: {src_dir}")
        continue

    src_file = src_dir / f"{sample_id}.wav"
    dst_file = src_dir / f"{cls_id}_{sample_id}.wav"

    # Never overwrite an existing target; this also makes a re-run harmless.
    if dst_file.exists():
        print(f"目标名已存在，跳过: {dst_file}")
        continue

    # A missing source used to raise FileNotFoundError from rename() and abort
    # all remaining renames; report it and move on like the other checks do.
    if not src_file.exists():
        print(f"源文件不存在，跳过: {src_file}")
        continue

    src_file.rename(dst_file)

ID:23缺少folder_name，跳过ID:23, class_id:0


In [4]:
from pathlib import Path
import shutil

In [6]:
# Source tree: per-class sample folders live under <root>/data/<category>.
root = Path(r"X:\数据集\DeepShip\data_preprocessing")
categories = ["Cargo", "Passengership", "Tanker", "Tug"]
# Destination: one flat folder per category under data_audio_rename.
dest_root = Path(r"X:\数据集\DeepShip\data_preprocessing\data_audio_rename")

# Audio suffixes to copy (compared in lower case); extend as needed.
audio_suffixes = {".wav"}

for category in categories:
    src_dir = root / "data" / category
    dst_dir = dest_root / category
    dst_dir.mkdir(parents=True, exist_ok=True)

    # Walk the category tree recursively and keep only audio files.
    audio_files = (
        candidate
        for candidate in src_dir.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in audio_suffixes
    )
    for audio in audio_files:
        target = dst_dir / audio.name
        if not target.exists():
            shutil.copy2(audio, target)
            print(f"[复制] {audio.relative_to(root)} -> {target.relative_to(root)}")
        else:
            # Never overwrite a file that is already in the destination.
            print(f"[跳过] {category}/{audio.name} 已存在，未覆盖。")



[复制] data\Cargo\20171104-1\0_1.wav -> data_audio_rename\Cargo\0_1.wav
[复制] data\Cargo\20171104a-2\0_2.wav -> data_audio_rename\Cargo\0_2.wav
[复制] data\Cargo\20171105a-3\0_3.wav -> data_audio_rename\Cargo\0_3.wav
[复制] data\Cargo\20171106-4\0_4.wav -> data_audio_rename\Cargo\0_4.wav
[复制] data\Cargo\20171107-5\0_5.wav -> data_audio_rename\Cargo\0_5.wav
[复制] data\Cargo\20171107b-6\0_6.wav -> data_audio_rename\Cargo\0_6.wav
[复制] data\Cargo\20171110-7\0_7.wav -> data_audio_rename\Cargo\0_7.wav
[复制] data\Cargo\20171111-8\0_8.wav -> data_audio_rename\Cargo\0_8.wav
[复制] data\Cargo\20171111e-9\0_9.wav -> data_audio_rename\Cargo\0_9.wav
[复制] data\Cargo\20171111f-10\0_10.wav -> data_audio_rename\Cargo\0_10.wav
[复制] data\Cargo\20171111g-11\0_11.wav -> data_audio_rename\Cargo\0_11.wav
[复制] data\Cargo\20171112-12\0_12.wav -> data_audio_rename\Cargo\0_12.wav
[复制] data\Cargo\20171113-13\0_13.wav -> data_audio_rename\Cargo\0_13.wav
[复制] data\Cargo\20171114a-14\0_14.wav -> data_audio_rename\Cargo\0_14.wa

# 根据切割后的音频片段，生成新的标注文件

In [3]:
ANNOT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv")
# 2 s variant: Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_2s_0%\mel_2s_0%")
MEL_ROOT   = Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mel_3s_0%")

# class_id -> class folder name under MEL_ROOT
CLASS_ID_TO_NAME = {
    0: "Cargo",
    1: "Passengership",
    2: "Tanker",
    3: "Tug",
}

# Column order of the per-segment annotation file.
extended_columns = [
    "ID", "class ID", "Recording ID", "Ship Name",
    "Date & Time", "Duration(sec)", "Distances(m)",
    "class_id", "folder_name",
    "segment_id", "prompt_en",
]

annotation = pd.read_csv(ANNOT_PATH)
extended_rows = []

# Emit one output row per "<class_id>_<ID>_<segment>.npy" feature file,
# repeating the recording-level annotation for each segment.
for _, row in annotation.iterrows():
    cls_id = int(row["class_id"])
    cls_name = CLASS_ID_TO_NAME[cls_id]
    base_id = int(row["ID"])

    feature_paths = sorted((MEL_ROOT / cls_name).glob(f"{cls_id}_{base_id}_*.npy"))
    if len(feature_paths) == 0:
        print(f"缺少特征: {cls_name} ID {base_id}")
        continue

    recording_fields = row.to_dict()
    for feat_path in feature_paths:
        # The segment index is the last "_"-separated part of the file stem.
        seg_idx = int(feat_path.stem.split("_")[-1])
        extended_rows.append(
            {**recording_fields, "segment_id": f"{cls_id}_{base_id}_{seg_idx}"}
        )

extended_df = pd.DataFrame(extended_rows).reindex(columns=extended_columns)
# 2 s variant: Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_2s_0%\mel_2s_0%\DeepShip_segments_mel.csv")
out_path =  Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mel_3s_0%\DeepShip_segments_mel.csv")
extended_df.to_csv(out_path, index=False)
print(f"写出 {len(extended_df)} 条记录 -> {out_path}")

缺少特征: Cargo ID 23
写出 56864 条记录 -> X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mel_3s_0%\DeepShip_segments_mel.csv


In [4]:
ANNOT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv")
# 2 s variant: Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_2s_0%\mfcc_2s_0%")
MFCC_ROOT   = Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mfcc_3s_0%")

# class_id -> class folder name under MFCC_ROOT
CLASS_ID_TO_NAME = {
    0: "Cargo",
    1: "Passengership",
    2: "Tanker",
    3: "Tug",
}

annotation = pd.read_csv(ANNOT_PATH)
extended_rows = []

# Emit one output row per "<class_id>_<ID>_<segment>.npy" feature file,
# repeating the recording-level annotation for each segment.
for _, row in annotation.iterrows():
    cls_id = int(row["class_id"])
    cls_name = CLASS_ID_TO_NAME[cls_id]
    base_id = int(row["ID"])

    pattern = f"{cls_id}_{base_id}_*.npy"
    # This lookup previously used MEL_ROOT, which is not defined in this cell:
    # a NameError on a fresh kernel, or a silent listing of the mel features
    # when the mel cell had been run before. Search the MFCC directory.
    feature_paths = sorted((MFCC_ROOT / cls_name).glob(pattern))
    if not feature_paths:
        print(f"缺少特征: {cls_name} ID {base_id}")
        continue

    for feat_path in feature_paths:
        # The segment index is the last "_"-separated part of the file stem.
        seg_idx = int(feat_path.stem.split("_")[-1])
        entry = row.to_dict()
        entry["segment_id"] = f"{cls_id}_{base_id}_{seg_idx}"
        extended_rows.append(entry)

extended_df = pd.DataFrame(extended_rows)
# Column order of the per-segment annotation file.
extended_columns = [
    "ID", "class ID", "Recording ID", "Ship Name",
    "Date & Time", "Duration(sec)", "Distances(m)",
    "class_id", "folder_name",
    "segment_id", "prompt_en",
]

extended_df = extended_df.reindex(columns=extended_columns)
# 2 s variant: Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_2s_0%\mfcc_2s_0%\DeepShip_segments_mfcc.csv")
out_path =  Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mfcc_3s_0%\DeepShip_segments_mfcc.csv")
extended_df.to_csv(out_path, index=False)
print(f"写出 {len(extended_df)} 条记录 -> {out_path}")

缺少特征: Cargo ID 23
写出 56864 条记录 -> X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mfcc_3s_0%\DeepShip_segments_mfcc.csv


In [5]:
ANNOT_PATH = Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip.csv")
# Other feature roots used with this cell:
#   Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_2s_0%\mfcc_2s_0%")
#   Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%\mfcc_3s_0%")
SEG_ROOT   = Path(r"X:\数据集\DeepShip\data_preprocessing\data_new_extract_3s_0%_16kHz\mel_3s_0%_16kHz")

# class_id -> class folder name under SEG_ROOT
CLASS_ID_TO_NAME = {
    0: "Cargo",
    1: "Passengership",
    2: "Tanker",
    3: "Tug",
}

# Column order of the per-segment annotation file.
extended_columns = [
    "ID", "class ID", "Recording ID", "Ship Name",
    "Date & Time", "Duration(sec)", "Distances(m)",
    "class_id", "folder_name",
    "segment_id", "prompt_en",
]

annotation = pd.read_csv(ANNOT_PATH)
extended_rows = []

# Emit one output row per "<class_id>_<ID>_<segment>.npy" feature file,
# repeating the recording-level annotation for each segment.
for _, row in annotation.iterrows():
    cls_id = int(row["class_id"])
    cls_name = CLASS_ID_TO_NAME[cls_id]
    base_id = int(row["ID"])

    feature_paths = sorted((SEG_ROOT / cls_name).glob(f"{cls_id}_{base_id}_*.npy"))
    if len(feature_paths) == 0:
        print(f"缺少特征: {cls_name} ID {base_id}")
        continue

    recording_fields = row.to_dict()
    for feat_path in feature_paths:
        # The segment index is the last "_"-separated part of the file stem.
        seg_idx = int(feat_path.stem.split("_")[-1])
        extended_rows.append(
            {**recording_fields, "segment_id": f"{cls_id}_{base_id}_{seg_idx}"}
        )

extended_df = pd.DataFrame(extended_rows).reindex(columns=extended_columns)
out_path =  Path(r"X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz.csv")
extended_df.to_csv(out_path, index=False)
print(f"写出 {len(extended_df)} 条记录 -> {out_path}")

缺少特征: Cargo ID 23
写出 56864 条记录 -> X:\数据集\DeepShip\data_preprocessing\annotation\DeepShip_segments_3s_0%_16kHz.csv
