In [4]:
import os
import shutil

# 创建目标文件夹
target_folder = '/mnt/hdd/jiazy/skin-cancer/imgs'
os.makedirs(target_folder, exist_ok=True)

# 要移动的文件夹列表
source_folders = [
    '/mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1',
    '/mnt/hdd/jiazy/skin-cancer/imgs_part_2/imgs_part_2',
    '/mnt/hdd/jiazy/skin-cancer/imgs_part_3/imgs_part_3'
]

# 遍历所有源文件夹，移动文件
for source_folder in source_folders:
    for filename in os.listdir(source_folder):
        source_file = os.path.join(source_folder, filename)
        if os.path.isfile(source_file):  # 确保是文件而不是文件夹
            target_file = os.path.join(target_folder, filename)
            shutil.move(source_file, target_file)
            print(f'已移动: {source_file} -> {target_file}')


已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_537_1014_452.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_537_1014_452.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_256_394_449.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_256_394_449.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_535_1010_303.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_535_1010_303.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_104_1754_276.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_104_1754_276.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_306_1646_903.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_306_1646_903.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_90_219_648.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_90_219_648.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_90_138_605.png -> /mnt/hdd/jiazy/skin-cancer/imgs/PAT_90_138_605.png
已移动: /mnt/hdd/jiazy/skin-cancer/imgs_part_1/imgs_part_1/PAT_251_382_576.png -> 

In [4]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from PIL import Image
from tqdm import tqdm

# 确保 PIL 版本兼容 (用于 resize)
try:
    # PIL 9.0.0+
    LANCZOS_RESAMPLE = Image.Resampling.LANCZOS
except AttributeError:
    # Older PIL
    LANCZOS_RESAMPLE = Image.LANCZOS

# --- 1. 定义常量和路径 ---
# !! 请根据您的环境修改这里的根路径 !!
DATA_ROOT = "/mnt/hdd/jiazy/skin-cancer"
METADATA_FILE = os.path.join(DATA_ROOT, "metadata.csv")
IMAGE_DIR = os.path.join(DATA_ROOT, "imgs")

# 输出目录
OUTPUT_DIR = "./features"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# --- 2. 根据您提供的表格定义特征 ---
# 连续特征
CONTINUOUS_COLS = [
    'age', 
    'diameter_1', 
    'diameter_2'
]

# 类别特征
CATEGORICAL_COLS = [
    'smoke', 
    'drink', 
    'background_father', 
    'background_mother', 
    'pesticide', 
    'gender', 
    'skin_cancer_history', 
    'cancer_history', 
    'has_piped_water', 
    'has_sewage_system', 
    'fitspatrick', 
    'region', 
    'itch', 
    'grew', 
    'hurt', 
    'changed', 
    'bleed', 
    'elevation', 
    'biopsed'
]

# 标签列
LABEL_COL = 'diagnostic'

# 非特征列 (将被移除)
NON_FEATURE_COLS = [
    'patient_id', 
    'lesion_id', 
    'img_id',
    LABEL_COL # 标签也不是特征
]

# --- 3. 图像处理辅助函数 ---
def process_and_save_image(png_path):
    """
    加载 PNG 图像, resize 到 224x224, 并另存为 .npy 文件。
    如果 .npy 文件已存在，则跳过。
    """
    if not os.path.exists(png_path):
        print(f"警告：图像文件不存在 {png_path}")
        return None # 返回 None 以便后续过滤

    # 将 .png 替换为 .npy
    npy_path = png_path.replace(".png", ".npy")

    if os.path.exists(npy_path):
        return npy_path # 文件已存在

    try:
        # 打开图像
        img = Image.open(png_path)
        
        # Resize 到 224x224
        img_resized = img.resize((224, 224), resample=LANCZOS_RESAMPLE)
        
        # 转换为 NumPy 数组
        # 如果图像是 RGBA，转为 RGB
        if img_resized.mode == 'RGBA':
            img_resized = img_resized.convert('RGB')
            
        np_img = np.array(img_resized)
        
        # 保存为 .npy
        np.save(npy_path, np_img)
        
        return npy_path
    except Exception as e:
        print(f"处理图像失败 {png_path}: {e}")
        return None


def main():
    print("--- 步骤 1: 加载数据 ---")
    df = pd.read_csv(METADATA_FILE)
    print(f"原始数据加载: {len(df)} 行")

    print("--- 步骤 2: 图像路径处理 (新) ---")
    # 拼接绝对路径
    df['image_path_final'] = df['img_id'].apply(
        lambda x: os.path.join(IMAGE_DIR, x)
    )

    print("--- 步骤 3: 数据清洗 (过滤缺失标签、路径和过多缺失特征) ---")
    initial_rows = len(df)
    
    # 1. (原) 过滤缺失标签和路径
    df = df.dropna(subset=['image_path_final', LABEL_COL])
    rows_after_label_drop = len(df)
    print(f"因标签或路径缺失而丢弃 {initial_rows - rows_after_label_drop} 行。")

    print("--- 步骤 4: 数据集划分 (8:1:1) (新) ---")
    # (对应原流程 Step 8, 但逻辑不同)
    
    # 首先，分层划分出 80% 的训练集
    train_df, temp_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df[LABEL_COL],
        random_state=42
    )
    
    # 然后，将剩余的 20% 平分为 10% 验证集和 10% 测试集
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,
        stratify=temp_df[LABEL_COL],
        random_state=42
    )
    
    print(f"划分结果: 训练集 {len(train_df)}, 验证集 {len(val_df)}, 测试集 {len(test_df)}")

    print("--- 步骤 5: 移除恒定列 ---")
    # (对应原流程 Step 3, 但基于新的 train_df)
    # 我们只检查训练集，如果在训练集中是恒定的，则该特征无效
    
    final_continuous_cols = []
    for col in CONTINUOUS_COLS:
        if train_df[col].nunique() > 1:
            final_continuous_cols.append(col)
        else:
            print(f"移除恒定连续特征: {col}")
            
    final_categorical_cols = []
    for col in CATEGORICAL_COLS:
        if train_df[col].nunique() > 1:
            final_categorical_cols.append(col)
        else:
            print(f"移除恒定类别特征: {col}")

    print("--- 步骤 6: 填充缺失值 ---")
    # (对应原流程 Step 6)
    print("    -> 正在为 _TIP (无填补) 版本创建数据备份...")
    train_df_tip = train_df.copy()
    val_df_tip = val_df.copy()
    test_df_tip = test_df.copy()
    
    # 连续特征：使用训练集的均值填充
    for col in final_continuous_cols:
        mean = train_df[col].mean()
        train_df.loc[:, col] = train_df[col].fillna(mean)
        val_df.loc[:, col] = val_df[col].fillna(mean)
        test_df.loc[:, col] = test_df[col].fillna(mean)
        
    # 类别特征：使用 "MISSING" 字符串填充 [填补版]
    for col in final_categorical_cols:
        train_df.loc[:, col] = train_df[col].fillna("MISSING")
        val_df.loc[:, col] = val_df[col].fillna("MISSING")
        test_df.loc[:, col] = test_df[col].fillna("MISSING")

    print("--- 步骤 7: 类别特征编码 (和标签编码) ---")

    field_lengths = []
    category_mappers = {} # <-- 用于存储统一的映射

    # 1. 从 [填补版] DataFrame 创建统一的类别映射
    print("    -> 7.A: 正在从 [填补版] 数据中构建统一的类别映射...")
    full_df_imputed = pd.concat([train_df, val_df, test_df])

    for col in final_categorical_cols:
        # 将列转换为 category 类型
        full_df_imputed[col] = full_df_imputed[col].astype('category')
        
        # 记录基数 (唯一类别数，例如 'A', 'B', 'MISSING')
        cardinality = len(full_df_imputed[col].cat.categories)
        field_lengths.append(cardinality)
        
        # 【【【 修复 1 】】】
        # 保存这个“真相”映射 (保存 Dtype, 而不是 CatAccessor)
        category_mappers[col] = full_df_imputed[col].dtype

        # 将编码应用回 [填补版] df
        # (这部分逻辑是正确的，因为 full_df_imputed 已经包含了正确的 codes)
        train_df[col] = full_df_imputed.loc[train_df.index, col].cat.codes
        val_df[col] = full_df_imputed.loc[val_df.index, col].cat.codes
        test_df[col] = full_df_imputed.loc[test_df.index, col].cat.codes

    # 2. 将 *相同的* 映射应用到 [_TIP 版] (包含 NaN)
    print("    -> 7.B: 正在应用统一映射到 [_TIP 版] 特征...")

    for col in final_categorical_cols:
        # 使用我们之前保存的 category_mappers
        mapper_dtype = category_mappers[col] # <-- mapper 现在是一个 dtype
        
        # 【【【 修复 2 】】】
        # 使用 .astype(mapper_dtype) 来应用统一的映射
        # 任何不在 'mapper_dtype.categories' 中的值将变为 NaN
        # .cat.codes 随后会将所有 NaN 转换为 -1
        train_df_tip[col] = train_df_tip[col].astype(mapper_dtype).cat.codes
        val_df_tip[col] = val_df_tip[col].astype(mapper_dtype).cat.codes
        test_df_tip[col] = test_df_tip[col].astype(mapper_dtype).cat.codes
        
    # 3. 编码标签列 (LABEL_COL) - (这部分逻辑保持不变)
    full_df_imputed[LABEL_COL] = full_df_imputed[LABEL_COL].astype('category')
    print(f"标签类别: {list(full_df_imputed[LABEL_COL].cat.categories)}")
    train_df[LABEL_COL] = full_df_imputed.loc[train_df.index, LABEL_COL].cat.codes
    val_df[LABEL_COL] = full_df_imputed.loc[val_df.index, LABEL_COL].cat.codes
    test_df[LABEL_COL] = full_df_imputed.loc[test_df.index, LABEL_COL].cat.codes

    print("--- 步骤 8: 连续特征标准化 ---")
    # (对应原流程 Step 9)
    scaler = StandardScaler()
    
    # 仅在训练集上 fit
    if final_continuous_cols: # 确保列表不为空
        scaler.fit(train_df[final_continuous_cols])
    
        # 2. 在 [填补版] 上 transform
        train_df[final_continuous_cols] = scaler.transform(train_df[final_continuous_cols])
        val_df[final_continuous_cols] = scaler.transform(val_df[final_continuous_cols])
        test_df[final_continuous_cols] = scaler.transform(test_df[final_continuous_cols])
    
        # 3. 在 [_TIP 版] 上 transform (重用 scaler)
        # (StandardScaler 会自动传播 NaN)
        print("    -> 正在应用 scaler 到 _TIP 版本...")
        train_df_tip[final_continuous_cols] = scaler.transform(train_df_tip[final_continuous_cols])
        val_df_tip[final_continuous_cols] = scaler.transform(val_df_tip[final_continuous_cols])
        test_df_tip[final_continuous_cols] = scaler.transform(test_df_tip[final_continuous_cols])
    else:
        print("    (跳过：未找到连续特征)")

    print("--- 步骤 9: 图像处理 (Resize 和 NPY 转换) ---")
    # (对应原流程 Step 12, 但增加了 Resize)
    
    # 启用 tqdm (进度条)
    tqdm.pandas(desc="处理训练集图像")
    train_df['npy_path'] = train_df['image_path_final'].progress_apply(process_and_save_image)
    
    tqdm.pandas(desc="处理验证集图像")
    val_df['npy_path'] = val_df['image_path_final'].progress_apply(process_and_save_image)
    
    tqdm.pandas(desc="处理测试集图像")
    test_df['npy_path'] = test_df['image_path_final'].progress_apply(process_and_save_image)

    # 再次清洗，以防有图像处理失败
    print("再次清洗，移除处理失败(None)的图像...")
    train_df = train_df.dropna(subset=['npy_path'])
    val_df = val_df.dropna(subset=['npy_path'])
    test_df = test_df.dropna(subset=['npy_path'])

    # --- [新增] 同步 _TIP 版本的行和 npy_path ---
    print("    -> 正在同步 _TIP 版本的行...")
    # 1. 过滤 _TIP 版本，使其索引与清洗后的版本一致
    train_df_tip = train_df_tip.loc[train_df.index]
    val_df_tip = val_df_tip.loc[val_df.index]
    test_df_tip = test_df_tip.loc[test_df.index]
    
    # 2. 复制 npy_path 列
    train_df_tip['npy_path'] = train_df['npy_path']
    val_df_tip['npy_path'] = val_df['npy_path']
    test_df_tip['npy_path'] = test_df['npy_path']
    # --- [新增结束] ---
    print(f"最终数据: 训练集 {len(train_df)}, 验证集 {len(val_df)}, 测试集 {len(test_df)}")

    print("--- 步骤 10: 保存最终输出 ---")
    
    # 1. 保存字段长度 (tabular_lengths.pt)
    # (对应原流程 Step 10)
    # 连续特征的长度为 1
    tabular_lengths = [1] * len(final_continuous_cols) + field_lengths
    torch.save(tabular_lengths, os.path.join(OUTPUT_DIR, "tabular_lengths.pt"))
    print(f"已保存: tabular_lengths.pt (共 {len(tabular_lengths)} 个特征)")

    # 准备最终的特征列顺序
    final_cols_ordered = final_continuous_cols + final_categorical_cols

    # 2. 保存表格特征 (features.csv)
    # (对应原流程 Step 10)
    train_df[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "train_features.csv"), 
        index=False, header=False
    )
    val_df[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "val_features.csv"), 
        index=False, header=False
    )
    test_df[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "test_features.csv"), 
        index=False, header=False
    )
    print("已保存: {train/val/test}_features.csv")

    
    # --- [新增] 2.B 保存表格特征 [_TIP 版] ---
    print("    -> 正在保存 [_TIP 版] 特征...")
    train_df_tip[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "train_features_TIP.csv"), 
        index=False, header=False
    )
    val_df_tip[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "val_features_TIP.csv"), 
        index=False, header=False
    )
    test_df_tip[final_cols_ordered].to_csv(
        os.path.join(OUTPUT_DIR, "test_features_TIP.csv"), 
        index=False, header=False
    )
    print("已保存: {train/val/test}_features_TIP.csv [_TIP 版]")

    # 3. 保存标签 (labels.pt)
    # (对应原流程 Step 11)
    torch.save(
        torch.tensor(train_df[LABEL_COL].values, dtype=torch.long),
        os.path.join(OUTPUT_DIR, "train_labels.pt")
    )
    torch.save(
        torch.tensor(val_df[LABEL_COL].values, dtype=torch.long),
        os.path.join(OUTPUT_DIR, "val_labels.pt")
    )
    torch.save(
        torch.tensor(test_df[LABEL_COL].values, dtype=torch.long),
        os.path.join(OUTPUT_DIR, "test_labels.pt")
    )
    print("已保存: {train/val/test}_labels.pt")



    # 4. 保存 NPY 图像路径 (paths.pt)
    # (对应原流程 Step 12)
    torch.save(
        list(train_df['npy_path']), 
        os.path.join(OUTPUT_DIR, "train_paths.pt")
    )
    torch.save(
        list(val_df['npy_path']), 
        os.path.join(OUTPUT_DIR, "val_paths.pt")
    )
    torch.save(
        list(test_df['npy_path']), 
        os.path.join(OUTPUT_DIR, "test_paths.pt")
    )
    print("已保存: {train/val/test}_paths.pt")
    
    print("\n--- 预处理完成！ ---")
    print(f"所有文件已保存到: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()

--- 步骤 1: 加载数据 ---


  train_df.loc[:, col] = train_df[col].fillna("MISSING")
 'MISSING' 'MISSING' 2.0 2.0 3.0 2.0 'MISSING' 4.0 'MISSING' 'MISSING' 3.0
 2.0 'MISSING' 2.0 1.0 1.0 'MISSING' 2.0 'MISSING' 2.0 'MISSING' 2.0 3.0
 2.0 'MISSING' 3.0 1.0 2.0 3.0 'MISSING' 2.0 2.0 'MISSING' 2.0 'MISSING'
 'MISSING' 2.0 3.0 2.0 'MISSING' 'MISSING' 'MISSING' 'MISSING' 'MISSING'
 2.0 2.0 2.0 3.0 2.0 2.0 2.0 2.0 2.0 2.0 'MISSING' 4.0 3.0 'MISSING'
 'MISSING' 2.0 2.0 2.0 1.0 3.0 'MISSING' 2.0 'MISSING' 'MISSING' 'MISSING'
 'MISSING' 3.0 2.0 2.0 3.0 'MISSING' 3.0 3.0 2.0 'MISSING' 3.0 'MISSING'
 'MISSING' 'MISSING' 3.0 'MISSING' 1.0 2.0 2.0 2.0 4.0 2.0 'MISSING' 3.0
 'MISSING' 2.0 2.0 'MISSING' 2.0 3.0 3.0 'MISSING' 'MISSING' 2.0 2.0 3.0
 3.0 'MISSING' 2.0 3.0 2.0 3.0 2.0 3.0 'MISSING' 3.0 3.0 3.0 'MISSING' 2.0
 'MISSING' 2.0 3.0 'MISSING' 'MISSING' 3.0 2.0 2.0 1.0 'MISSING' 'MISSING'
 'MISSING' 2.0 2.0 'MISSING' 'MISSING' 2.0 'MISSING' 2.0 'MISSING'
 'MISSING' 1.0 'MISSING' 'MISSING' 2.0 3.0 4.0 3.0 1.0 4.0 'MISSING' 

原始数据加载: 2298 行
--- 步骤 2: 图像路径处理 (新) ---
--- 步骤 3: 数据清洗 (过滤缺失标签、路径和过多缺失特征) ---
因标签或路径缺失而丢弃 0 行。
--- 步骤 4: 数据集划分 (8:1:1) (新) ---
划分结果: 训练集 1838, 验证集 230, 测试集 230
--- 步骤 5: 移除恒定列 ---
--- 步骤 6: 填充缺失值 ---
    -> 正在为 _TIP (无填补) 版本创建数据备份...
--- 步骤 7: 类别特征编码 (和标签编码) ---
    -> 7.A: 正在从 [填补版] 数据中构建统一的类别映射...
    -> 7.B: 正在应用统一映射到 [_TIP 版] 特征...
标签类别: ['ACK', 'BCC', 'MEL', 'NEV', 'SCC', 'SEK']
--- 步骤 8: 连续特征标准化 ---
    -> 正在应用 scaler 到 _TIP 版本...
--- 步骤 9: 图像处理 (Resize 和 NPY 转换) ---


处理训练集图像: 100%|██████████| 1838/1838 [00:00<00:00, 146475.10it/s]
处理验证集图像: 100%|██████████| 230/230 [00:00<00:00, 125938.63it/s]
处理测试集图像: 100%|██████████| 230/230 [00:00<00:00, 127419.09it/s]

再次清洗，移除处理失败(None)的图像...
    -> 正在同步 _TIP 版本的行...
最终数据: 训练集 1838, 验证集 230, 测试集 230
--- 步骤 10: 保存最终输出 ---
已保存: tabular_lengths.pt (共 22 个特征)
已保存: {train/val/test}_features.csv
    -> 正在保存 [_TIP 版] 特征...
已保存: {train/val/test}_features_TIP.csv [_TIP 版]
已保存: {train/val/test}_labels.pt
已保存: {train/val/test}_paths.pt

--- 预处理完成！ ---
所有文件已保存到: ./features





In [1]:
import os
import pandas as pd
from PIL import Image

# --- 1. 配置 (从您的 skin-cancer 脚本中复制) ---

# !! 确保这些路径与您的环境一致 !!
DATA_ROOT = "/mnt/hdd/jiazy/skin-cancer" # <-- [修改]
METADATA_FILE = os.path.join(DATA_ROOT, "metadata.csv") # <-- [修改]
IMAGE_DIR = os.path.join(DATA_ROOT, "imgs") # <-- [修改]
image_col = "img_id" # <-- [修改]

# --- 3. 主检查逻辑 ---
def check_first_image_mode():
    print(f"--- 正在检查图像模式 (Skin Cancer) ---")
    
    # 1. 加载 CSV
    try:
        df = pd.read_csv(METADATA_FILE)
        print(f"成功加载: {METADATA_FILE}")
    except FileNotFoundError:
        print(f"❌ 错误：找不到 CSV 文件。请检查路径：")
        print(f"{METADATA_FILE}")
        return
    except Exception as e:
        print(f"❌ 错误：加载 CSV 时出错: {e}")
        return

    # 3. 过滤掉无效路径
    df = df.dropna(subset=[image_col])
    if len(df) == 0:
        print("❌ 错误：在 CSV 中找不到任何有效的图像路径。")
        return

    # 4. 获取第一张图片的路径
    # [修改] 构建路径
    first_image_path = os.path.join(IMAGE_DIR, df[image_col].iloc[0])
    
    if not os.path.exists(first_image_path):
        print(f"❌ 错误：找到路径但文件不存在: {first_image_path}")
        print("请确保 IMAGE_DIR 和 METADATA_FILE 设置正确。")
        return

    # 5. 打开图像并检查模式
    try:
        with Image.open(first_image_path) as img:
            mode = img.mode
            size = img.size
        
        print("\n" + "="*30)
        print(" 📊 图像诊断报告 (第一张图)")
        print("="*30)
        print(f" 文件路径: {first_image_path}")
        print(f" 图像尺寸: {size}")
        print(f" 图像模式: {mode}")
        print("="*30)

        # 6. 解释结果
        if mode == 'L':
            print("✅ 结论：这是 'L' 模式，即**灰度图 (Grayscale)**。")
            print("   (您应该在 transform 中使用 A.ToRGB() 将其转为 3 通道)")
        elif mode == 'RGB':
            print("✅ 结论：这是 'RGB' 模式，即**彩色图 (3 通道)**。")
            print("   (您的脚本已处理 RGBA，A.ToRGB() 不是必需的，但是安全的)")
        else:
            print(f"⚠️ 结论：这是一个不常见的模式 ('{mode}')。")
            print("   (强烈建议使用 A.ToRGB())")

    except Exception as e:
        print(f"❌ 错误：打开或检查图像时出错: {e}")


if __name__ == "__main__":
    check_first_image_mode()

--- 正在检查图像模式 (Skin Cancer) ---
成功加载: /mnt/hdd/jiazy/skin-cancer/metadata.csv

 📊 图像诊断报告 (第一张图)
 文件路径: /mnt/hdd/jiazy/skin-cancer/imgs/PAT_1516_1765_530.png
 图像尺寸: (1089, 1089)
 图像模式: RGB
✅ 结论：这是 'RGB' 模式，即**彩色图 (3 通道)**。
   (您的脚本已处理 RGBA，A.ToRGB() 不是必需的，但是安全的)
