# DeepShip 数据集加环境噪声类后的标注文件处理

In [22]:
import pandas as pd
import os
import glob

In [10]:
# Input / output file locations
csv_path = r'X:\\数据集\\DeepShip\\data_preprocessing\\annotation\\DeepShip.csv'
train_txt_path = r'X:\\数据集\\DeepShip\\annotation_original\\training_and_testing\\train.txt'
test_txt_path = r'X:\\数据集\\DeepShip\\annotation_original\\training_and_testing\\test.txt'
output_csv_path = r'X:\\数据集\\DeepShip\\data_preprocessing\\annotation\\DeepShip_No_Overlap_Metadata.csv'

# Load the original annotation table; stop immediately when it is missing
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"未找到文件: {csv_path}")

df = pd.read_csv(csv_path)
print(f"成功读取 CSV文件，共 {len(df)} 条记录。")

成功读取 CSV文件，共 610 条记录。


In [11]:
# Build the per-recording file name "<class_id>_<ID>.wav".
# Column-wise string concatenation is used instead of a row-wise apply():
# apply(axis=1) turns every row into a Series with one common dtype, so integer
# IDs could be rendered as floats (e.g. "0.0_1.0.wav") if the frame only held
# numeric columns with at least one float column. Converting each column on its
# own keeps the column's own formatting and avoids the per-row Python call.
df['new_filename'] = df['class_id'].astype(str) + '_' + df['ID'].astype(str) + '.wav'

print("新文件名示例:")
print(df[['ID', 'class_id', 'new_filename']].head())

新文件名示例:
   ID  class_id new_filename
0   1         0      0_1.wav
1   2         0      0_2.wav
2   3         0      0_3.wav
3   4         0      0_4.wav
4   5         0      0_5.wav


In [None]:
def get_split_keys(file_path):
    """Collect the (class_name, folder_name) pairs listed in a split file.

    Every non-empty line is treated as a path. The third-from-last path
    component is taken as the class name and the second-from-last as the
    folder name; lines with fewer than three components are ignored.
    Both forward slashes and backslashes are accepted as separators, so a
    split file written with Windows-style paths yields the same keys.

    Args:
        file_path: Path of the text file listing one audio path per line.

    Returns:
        A set of (class_name, folder_name) tuples. When the file does not
        exist a warning is printed and an empty set is returned.
    """
    keys = set()
    if not os.path.exists(file_path):
        print(f"警告: 文件不存在 {file_path}")
        return keys
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Normalise backslashes first so both separator styles split alike.
            parts = line.replace('\\', '/').split('/')
            if len(parts) >= 3:
                class_name = parts[-3]
                folder_name = parts[-2]
                keys.add((class_name, folder_name))
    return keys

In [14]:
# Parse both split files into sets of (class_name, folder_name) keys
train_keys, test_keys = map(get_split_keys, (train_txt_path, test_txt_path))

print(f"从 {train_txt_path} 解析出 {len(train_keys)} 个唯一文件夹 (Train)")
print(f"从 {test_txt_path} 解析出 {len(test_keys)} 个唯一文件夹 (Test)")

从 X:\\数据集\\DeepShip\\annotation_original\\training_and_testing\\train.txt 解析出 398 个唯一文件夹 (Train)
从 X:\\数据集\\DeepShip\\annotation_original\\training_and_testing\\test.txt 解析出 211 个唯一文件夹 (Test)


In [15]:
# class_id -> class name; the position in the tuple is the numeric class_id
class_mapping = dict(enumerate(('Cargo', 'Passengership', 'Tanker', 'Tug')))

def assign_split_label(row):
    """Return 'train', 'test' or 'unassigned' for one metadata row.

    The row's class_id is translated through class_mapping and paired with
    its folder_name; the pair is looked up in train_keys first and then in
    test_keys. Rows whose folder_name is missing, or whose pair is in
    neither set, are labelled 'unassigned'.
    """
    folder = row['folder_name']
    if pd.isna(folder):
        return 'unassigned'

    lookup = (class_mapping.get(row['class_id']), folder)
    if lookup in train_keys:
        return 'train'
    if lookup in test_keys:
        return 'test'
    return 'unassigned'

# Label every row with the split it belongs to
df['dataset_split'] = df.apply(assign_split_label, axis=1)

# Report how many rows landed in each split
split_counts = df['dataset_split'].value_counts()
print("\n数据集划分统计:")
print(split_counts)

# Surface rows that matched neither split (typically ID 23, which has no folder_name)
unassigned = df.loc[df['dataset_split'] == 'unassigned']
if len(unassigned) != 0:
    print("\n注意：以下数据未被分配 (可能缺失元数据):")
    print(unassigned[['ID', 'Ship Name', 'folder_name']])


数据集划分统计:
dataset_split
train         398
test          211
unassigned      1
Name: count, dtype: int64

注意：以下数据未被分配 (可能缺失元数据):
    ID Ship Name folder_name
21  23   GALLEON         NaN


In [21]:
# Keep only the rows that were matched to a split
df_clean = df.loc[df['dataset_split'] != 'unassigned'].copy()
print(f"已剔除未分配数据 {len(df) - len(df_clean)} 条。")

# Columns written to the output file, in output order
target_columns = [
    'ID', 'new_filename', 'class_id', 'class ID',
    'Ship Name', 'folder_name', 'Date & Time',
    'Duration(sec)', 'Distances(m)', 'prompt_en', 'dataset_split',
]

# Selecting by label raises KeyError if any of these columns is absent
df_final = df_clean.loc[:, target_columns]

# Split sizes after filtering
print("\n最终划分统计结果:")
print(df_final['dataset_split'].value_counts())

# Write the ship-only metadata
df_final.to_csv(output_csv_path, index=False)
print(f"\n处理完成！文件已保存为: {output_csv_path}")

已剔除未分配数据 1 条。

最终划分统计结果:
dataset_split
train    398
test     211
Name: count, dtype: int64

处理完成！文件已保存为: X:\\数据集\\DeepShip\\data_preprocessing\\annotation\\DeepShip_No_Overlap_Metadata.csv


# 处理海洋环境噪声的标注信息

In [26]:
# Ship-only metadata CSV (same path the earlier section saved to); input of this section
original_csv_path = r"X:\\数据集\\DeepShip\\data_preprocessing\\annotation\\DeepShip_No_Overlap_Metadata.csv"
# Destination of the merged ship + noise metadata.
# NOTE(review): this rebinds output_csv_path, which the earlier section used for the
# ship-only CSV. Re-running that earlier save cell after this one would write to this path.
output_csv_path = r"X:\数据集\DeepShip\\data_preprocessing\\annotation\\DeepShip_No_Overlap_Metadata_add_Environmental_Noise.csv"

# Root directory of the noise data
NOISE_ROOT_PATH = r'X:\数据集\DeepShip\data_original\background'

In [27]:
# Load the ship metadata saved by the earlier section and report its row count
df_ship = pd.read_csv(original_csv_path)
print(f"原始 4 类数据加载完成: {len(df_ship)} 条记录")

原始 4 类数据加载完成: 609 条记录


In [None]:
print("=== 开始执行文件重命名 ===")


def _noise_target_name(old_path):
    """Derive the ``4_{FolderID}_{FileID}.wav`` name for one noise file.

    FileID is the integer value of the file stem plus one (``00000`` -> 1).
    FolderID is the text after the last underscore of the grandparent
    directory name when that name contains an underscore (``3_0`` -> ``0``);
    otherwise the parent directory name is used the same way, falling back to
    the whole name when it has no underscore.

    Returns:
        The new file name, or None when the stem is not an integer.

    Raises:
        IndexError: when the path has fewer than three components.
    """
    parts = os.path.normpath(old_path).split(os.sep)
    stem = os.path.splitext(parts[-1])[0]
    try:
        file_num_id = int(stem) + 1  # 00000 -> 1
    except ValueError:
        return None

    parent, grandparent = parts[-2], parts[-3]
    target_folder = grandparent if '_' in grandparent else parent
    # split()[-1] yields the whole name when there is no underscore at all.
    folder_id_part = target_folder.split('_')[-1]
    return f"4_{folder_id_part}_{file_num_id}.wav"


def rename_noise_files(root_path):
    """Rename every ``.wav`` below ``root_path/train`` and ``root_path/test`` in place.

    Files are renamed inside their own directory to ``4_{FolderID}_{FileID}.wav``.
    Files that already start with ``4_`` and contain at least two underscores
    are skipped, so the function can be run repeatedly. Files whose stem is not
    an integer are reported and skipped. An existing destination is never
    overwritten: it is counted and reported as a failure instead (os.rename
    would silently replace it on POSIX systems).

    Args:
        root_path: Directory containing the ``train`` and ``test`` sub-directories.

    Returns:
        None. Progress and a final summary are printed.
    """
    count = 0
    errors = 0

    for split in ['train', 'test']:
        split_dir = os.path.join(root_path, split)
        if not os.path.exists(split_dir):
            print(f"警告: 找不到目录 {split_dir}")
            continue

        print(f"正在扫描目录: {split_dir}")

        for root, dirs, files in os.walk(split_dir):
            for file in files:
                if not file.endswith('.wav'):
                    continue
                # Already in the target format: skip so re-runs are harmless.
                if file.startswith('4_') and file.count('_') >= 2:
                    continue

                old_path = os.path.join(root, file)

                try:
                    new_filename = _noise_target_name(old_path)
                    if new_filename is None:
                        print(f"跳过非数字文件名: {file}")
                        continue

                    new_path = os.path.join(root, new_filename)
                    if old_path == new_path:
                        continue

                    # Refuse to clobber an existing file on any platform.
                    if os.path.exists(new_path):
                        raise FileExistsError(f"目标文件已存在: {new_path}")

                    os.rename(old_path, new_path)
                    count += 1
                    if count % 100 == 0:
                        print(f"已重命名 {count} 个文件...", end='\r')

                except Exception as e:
                    # Best effort: report the failure and carry on with the next file.
                    print(f"重命名失败: {old_path}, 错误: {e}")
                    errors += 1

    print(f"\n重命名完成! 共修改 {count} 个文件, 失败 {errors} 个。")
    
# Run the rename over the noise data root (renames files on disk in place)
rename_noise_files(NOISE_ROOT_PATH)

=== 开始执行文件重命名 ===
正在扫描目录: X:\数据集\DeepShip\data_original\background\train
正在扫描目录: X:\数据集\DeepShip\data_original\background\test
已重命名 27700 个文件...
重命名完成! 共修改 27768 个文件, 失败 0 个。


In [37]:
print("\n=== 开始生成标注信息 ===")

# Re-load the ship metadata; stop when the file is missing
if not os.path.exists(original_csv_path):
    raise FileNotFoundError(f"未找到原始文件: {original_csv_path}")

df_ship = pd.read_csv(original_csv_path)
print("成功读取原始数据。")

# Scan the renamed noise files and build their metadata rows
def scan_renamed_files(root_path):
    """Build one metadata row per renamed noise file under ``root_path``.

    The ``train`` and ``test`` sub-directories are walked in that order, and
    every file whose name starts with ``4_`` and ends with ``.wav`` becomes a
    row. Directories and files are visited in sorted (lexicographic) order so
    the sequential ``ID`` values are reproducible across runs and filesystems;
    plain os.walk order depends on the underlying directory listing.

    Row contents:
        * ``ID`` counts from 1 independently of any other table.
        * ``folder_name`` is the grandparent-of-file directory name when it
          contains an underscore, otherwise the file's own directory name.
        * ``dataset_split`` is the sub-directory (``train``/``test``) scanned.
        * ``Duration(sec)`` is the constant 3.
          NOTE(review): presumably the fixed clip length; confirm.
        * The remaining fields are fixed placeholder values.

    Args:
        root_path: Directory containing the ``train`` and ``test`` sub-directories.

    Returns:
        A pandas DataFrame with one row per file; empty when nothing matched.
    """
    noise_rows = []
    # IDs start at 1 and are counted independently for the noise class.
    current_id = 1

    for split in ['train', 'test']:
        split_dir = os.path.join(root_path, split)
        if not os.path.exists(split_dir):
            continue

        for root, dirs, files in os.walk(split_dir):
            # In-place sort makes os.walk descend into sub-directories in a fixed order.
            dirs.sort()

            wav_files = sorted(
                f for f in files if f.endswith('.wav') and f.startswith('4_')
            )
            if not wav_files:
                continue

            # The folder name is the same for every file of this directory.
            path_parts = os.path.normpath(root).split(os.sep)
            parent = path_parts[-1]
            grandparent = path_parts[-2]
            raw_folder_name = grandparent if '_' in grandparent else parent

            for file in wav_files:
                noise_rows.append({
                    'ID': current_id,
                    'new_filename': file,
                    'class_id': 4,
                    'class ID': None,
                    'Ship Name': 'Marine Environmental Noise',
                    'folder_name': raw_folder_name,
                    'Date & Time': 'Unknown',
                    'Duration(sec)': 3,
                    'Distances(m)': None,
                    'prompt_en': 'Hydrophone recording of marine environmental noise.',
                    'dataset_split': split,
                })
                current_id += 1

    return pd.DataFrame(noise_rows)

# Run the scan; df_noise holds one row per renamed noise file
df_noise = scan_renamed_files(NOISE_ROOT_PATH)


=== 开始生成标注信息 ===
成功读取原始数据。


In [38]:
if len(df_noise) == 0:
    print("未找到任何噪声文件。")
else:
    print(f"扫描到 {len(df_noise)} 条噪声数据。")
    print(f"噪声数据的 ID 范围: {df_noise['ID'].min()} - {df_noise['ID'].max()}")

    # Columns shared by the ship and noise tables, in output order
    target_columns = [
        'ID',
        'new_filename',
        'class_id',
        'class ID',
        'Ship Name',
        'folder_name',
        'Date & Time',
        'Duration(sec)',
        'Distances(m)',
        'prompt_en',
        'dataset_split',
    ]

    # Project both tables onto the same columns (KeyError if one is missing)
    df_ship_aligned = df_ship.loc[:, target_columns]
    df_noise_aligned = df_noise.loc[:, target_columns]

    # Stack ship rows first, then noise rows, with a fresh 0..n-1 index
    frames = [df_ship_aligned, df_noise_aligned]
    df_final = pd.concat(frames, ignore_index=True)

    # Write the merged metadata
    df_final.to_csv(output_csv_path, index=False)
    print(f"\n最终处理完成！文件已保存至: {output_csv_path}")

扫描到 27768 条噪声数据。
噪声数据的 ID 范围: 1 - 27768

最终处理完成！文件已保存至: X:\数据集\DeepShip\\data_preprocessing\\annotation\\DeepShip_No_Overlap_Metadata_add_Environmental_Noise.csv
