In [1]:
import os
import sys

# 设置CUDA_HOME环境变量
conda_prefix = os.environ.get('CONDA_PREFIX')
if conda_prefix:
    os.environ['CUDA_HOME'] = conda_prefix
    os.environ['LD_LIBRARY_PATH'] = f"{conda_prefix}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}"
    
    print(f"已设置环境变量:")
    print(f"  CUDA_HOME: {os.environ['CUDA_HOME']}")
    print(f"  CONDA_PREFIX: {conda_prefix}")
    
    # 验证libcuda.so是否存在
    libcuda_path = f"{conda_prefix}/lib/libcuda.so"
    if os.path.exists(libcuda_path):
        print(f"  ✓ libcuda.so: 找到 ({libcuda_path})")
    else:
        print(f"  ✗ libcuda.so: 在conda环境中未找到")
        
        # 创建符号链接到系统libcuda
        system_libcuda = "/usr/lib/x86_64-linux-gnu/libcuda.so"
        if os.path.exists(system_libcuda):
            os.system(f"ln -sf {system_libcuda} {conda_prefix}/lib/libcuda.so")
            print(f"  ✓ 已创建符号链接: {conda_prefix}/lib/libcuda.so -> {system_libcuda}")
        else:
            print(f"  ⚠ 系统libcuda.so也未找到")
else:
    print("CONDA_PREFIX未设置，请先激活conda环境")

已设置环境变量:
  CUDA_HOME: /home/y/anaconda3/envs/mindspore
  CONDA_PREFIX: /home/y/anaconda3/envs/mindspore
  ✓ libcuda.so: 找到 (/home/y/anaconda3/envs/mindspore/lib/libcuda.so)


In [2]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import warnings
from mindspore import Parameter
warnings.filterwarnings('ignore')

# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 检查MindSpore环境
print("="*60)
print("MindSpore环境检查")
print("="*60)

try:
    import mindspore as ms
    from mindspore import context, Tensor, Model, load_checkpoint, save_checkpoint
    import mindspore.nn as nn
    import mindspore.ops as ops
    from mindspore.train.callback import LossMonitor, TimeMonitor, ModelCheckpoint, CheckpointConfig
    from mindspore import dataset as ds
    import mindspore.dataset.vision as vision
    import mindspore.dataset.transforms as transforms
    
    # 设置GPU
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')
    print(f"✓ MindSpore版本: {ms.__version__}")
    print(f"✓ 使用设备: GPU")
    print(f"✓ 运行模式: GRAPH_MODE")
    
except Exception as e:
    print(f"✗ MindSpore导入失败: {e}")
    # 尝试使用CPU
    try:
        context.set_context(device_target='CPU')
        print(f"✓ 使用设备: CPU (备用)")
    except:
        print("✗ MindSpore环境异常")

MindSpore环境检查
✓ MindSpore版本: 2.2.0
✓ 使用设备: GPU
✓ 运行模式: GRAPH_MODE


In [3]:
class Config:
    # 路径配置
    data_dir = "flowers"  # 数据目录
    model_path = "best_flower_model.ckpt"  # 模型保存路径
    metrics_path = "training_metrics.npy"  # 指标保存路径
    
    # 类别信息
    class_names = ["daisy", "dandelion", "rose", "sunflower", "tulip"]
    num_classes = len(class_names)
    
    # 训练参数
    batch_size = 32
    epochs = 20
    learning_rate = 1e-4
    image_size = (224, 224)
    
    # 数据集分割
    train_ratio = 0.8  # 训练集比例
    
config = Config()

In [4]:
def prepare_dataset():
    """准备和预处理数据集"""
    print("="*60)
    print("数据准备")
    print("="*60)
    
    # 检查数据目录
    if not os.path.exists(config.data_dir):
        print(f"✗ 数据目录不存在: {config.data_dir}")
        print("请确保在项目根目录下创建 'flowers' 文件夹，并包含以下子目录：")
        print("  flowers/train/  - 训练图像")
        print("  flowers/test/   - 测试图像")
        print("每个子目录下应有5个文件夹: daisy, dandelion, rose, sunflower, tulip")
        return None, None, None, None
    
    # 数据集统计
    def count_images(folder):
        count = 0
        for root, dirs, files in os.walk(folder):
            count += len([f for f in files if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
        return count
    
    train_dir = os.path.join(config.data_dir, "train")
    test_dir = os.path.join(config.data_dir, "test")
    
    if not os.path.exists(train_dir) or not os.path.exists(test_dir):
        print("✗ 未找到 train/ 或 test/ 子目录")
        return None, None, None, None
    
    # 统计数据
    train_count = count_images(train_dir)
    test_count = count_images(test_dir)
    
    print(f"✓ 训练集图像: {train_count} 张")
    print(f"✓ 测试集图像: {test_count} 张")
    print(f"✓ 类别数量: {config.num_classes}")
    print(f"✓ 类别名称: {config.class_names}")
    
    # 定义数据变换 - 修复：添加 Decode() 操作
    # 训练集变换（数据增强）
    train_transform = [
        vision.Decode(),  # 关键修复：必须先解码图像
        vision.Resize(config.image_size),
        vision.RandomHorizontalFlip(prob=0.5),
        vision.RandomColorAdjust(brightness=0.2, contrast=0.2, saturation=0.2),
        vision.RandomRotation(degrees=15),
        vision.ToTensor(),
        vision.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], is_hwc=False)
    ]
    
    # 测试集变换（无数据增强）
    test_transform = [
        vision.Decode(),  # 关键修复：必须先解码图像
        vision.Resize(config.image_size),
        vision.ToTensor(),
        vision.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], is_hwc=False)
    ]
    
    # 创建MindSpore数据集
    def create_mindspore_dataset(data_path, transform, shuffle=True):
        dataset = ds.ImageFolderDataset(
            data_path,
            shuffle=shuffle,
            extensions=[".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]
        )
        
        # 应用变换
        dataset = dataset.map(
            operations=transform,
            input_columns="image"
        )
        
        # 类型转换
        dataset = dataset.map(
            operations=transforms.TypeCast(ms.int32),
            input_columns="label"
        )
        
        # 批处理
        dataset = dataset.batch(config.batch_size, drop_remainder=False)
        
        return dataset
    
    # 创建数据集
    print("创建训练数据集...")
    train_dataset = create_mindspore_dataset(train_dir, train_transform, shuffle=True)
    print("创建测试数据集...")
    test_dataset = create_mindspore_dataset(test_dir, test_transform, shuffle=False)
    
    # 计算数据集大小
    train_size = train_dataset.get_dataset_size() * config.batch_size
    test_size = test_dataset.get_dataset_size() * config.batch_size
    
    print(f"✓ 训练批次数: {train_dataset.get_dataset_size()}")
    print(f"✓ 测试批次数: {test_dataset.get_dataset_size()}")
    
    # 测试数据集是否能正常读取
    print("\n测试数据读取...")
    try:
        test_iter = train_dataset.create_tuple_iterator()
        test_images, test_labels = next(test_iter)
        print(f"✓ 数据读取测试通过")
        print(f"  图像形状: {test_images.shape}")
        print(f"  标签形状: {test_labels.shape}")
    except Exception as e:
        print(f"✗ 数据读取测试失败: {e}")
        return None, None, None, None
    
    return train_dataset, test_dataset, train_size, test_size

# 准备数据
train_dataset, test_dataset, train_size, test_size = prepare_dataset()
# 准备数据
train_dataset, test_dataset, train_size, test_size = prepare_dataset()

数据准备
✓ 训练集图像: 3665 张
✓ 测试集图像: 652 张
✓ 类别数量: 5
✓ 类别名称: ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
创建训练数据集...
创建测试数据集...
✓ 训练批次数: 115
✓ 测试批次数: 21

测试数据读取...
✓ 数据读取测试通过
  图像形状: (32, 3, 224, 224)
  标签形状: (32,)
数据准备
✓ 训练集图像: 3665 张
✓ 测试集图像: 652 张
✓ 类别数量: 5
✓ 类别名称: ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
创建训练数据集...
创建测试数据集...
✓ 训练批次数: 115
✓ 测试批次数: 21

测试数据读取...
✓ 数据读取测试通过
  图像形状: (32, 3, 224, 224)
  标签形状: (32,)


In [17]:
def build_resnet50_model():
    """构建ResNet50迁移学习模型"""
    print("="*60)
    print("模型构建")
    print("="*60)
    
    # 方法1: 使用MindSpore内置的ResNet50
    try:
        from mindcv.models import create_model
        
        print("从MindSpore Hub加载预训练ResNet50...")
        # 从Hub加载预训练模型
        model = create_model("BiT_resnet50", pretrained=True, num_classes=1000)
        
        if hasattr(model, 'classifier'):
            in_channels = model.classifier.in_channels
            model.classifier = nn.Dense(in_channels, 5)
            print(f"✓ 替换classifier: {in_channels} -> 5")
        elif hasattr(model, 'head'):
            in_channels = model.head.in_channels
            model.head = nn.Dense(in_channels, 5)
            print(f"✓ 替换head: {in_channels} -> 5")
        elif hasattr(model, 'fc'):
            in_features = model.fc.in_features
            model.fc = nn.Dense(in_features, 5)
            print(f"✓ 替换fc: {in_features} -> 5")
        else:
            # 尝试找到最后一个Dense层
            for name, cell in model.cells_and_names():
                if isinstance(cell, nn.Dense):
                    in_features = cell.in_features
                    # 这里需要实际替换，但要知道层的位置
                    print(f"找到Dense层: {name}, 输入: {in_features}")
                    # 你可能需要直接修改模型结构
                    break

        # 冻结前面层
        print("\n冻结参数...")
        for name, param in model.parameters_and_names():
            if 'classifier' in name or 'head' in name or 'fc' in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

        trainable = sum(p.size for p in model.trainable_params())
        total = sum(p.size for p in model.get_parameters())
        print(f"可训练参数: {trainable:,}/{total:,}")

        print("✓ 模型准备完成！")
        return model
        
    except Exception as e:
        print(f"从Hub加载失败: {e}")
        print("使用自定义ResNet50实现...")
        
        # 方法2: 自定义ResNet50
        class ResNet50(nn.Cell):
            def __init__(self, num_classes=config.num_classes):
                super(ResNet50, self).__init__()
                
                # 简化的ResNet50结构
                self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, pad_mode='pad', padding=3)
                self.bn1 = nn.BatchNorm2d(64)
                self.relu = nn.ReLU()
                self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode='same')
                
                # 残差块（简化版本）
                self.layer1 = self._make_layer(64, 64, 3, stride=1)
                self.layer2 = self._make_layer(64, 128, 4, stride=2)
                self.layer3 = self._make_layer(128, 256, 6, stride=2)
                self.layer4 = self._make_layer(256, 512, 3, stride=2)
                
                self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1)
                self.flatten = nn.Flatten()
                self.fc = nn.Dense(512, num_classes)
                
            def _make_layer(self, in_channels, out_channels, blocks, stride):
                layers = []
                # 第一个残差块可能需要下采样
                layers.append(self._residual_block(in_channels, out_channels, stride))
                
                for _ in range(1, blocks):
                    layers.append(self._residual_block(out_channels, out_channels, 1))
                    
                return nn.SequentialCell(layers)
            
            def _residual_block(self, in_channels, out_channels, stride):
                return nn.SequentialCell([
                    nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, 
                             pad_mode='pad', padding=1, has_bias=False),
                    nn.BatchNorm2d(out_channels),
                    nn.ReLU(),
                    nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1,
                             pad_mode='pad', padding=1, has_bias=False),
                    nn.BatchNorm2d(out_channels)
                ])
            
            def construct(self, x):
                x = self.conv1(x)
                x = self.bn1(x)
                x = self.relu(x)
                x = self.maxpool(x)
                
                x = self.layer1(x)
                x = self.layer2(x)
                x = self.layer3(x)
                x = self.layer4(x)
                
                x = self.avgpool(x)
                x = self.flatten(x)
                x = self.fc(x)
                
                return x
        
        model = ResNet50()
        print(f"✓ 自定义ResNet50构建完成")
        print(f"✓ 参数量: {sum(p.size for p in model.trainable_params()):,}")
        
        return model
model=build_resnet50_model()

模型构建
从MindSpore Hub加载预训练ResNet50...
✓ 替换classifier: 2048 -> 5

冻结参数...
可训练参数: 10,245/23,510,597
✓ 模型准备完成！


In [21]:
def fix_bit_model_simple(model):
    """简单修复BiT模型维度问题"""
    print("="*60)
    print("简单修复BiT模型维度")
    print("="*60)

    import mindspore.nn as nn
    import mindspore.ops as ops

    print(f"模型类型: {type(model)}")

    # 方法：直接封装模型
    class FixedBitModel(nn.Cell):
        def __init__(self, backbone, num_classes=5):
            super().__init__()
            self.backbone = backbone

            # 移除原分类头
            if hasattr(backbone, 'classifier'):
                backbone.classifier = nn.Identity()
            if hasattr(backbone, 'head'):
                backbone.head = nn.Identity()
            if hasattr(backbone, 'fc'):
                backbone.fc = nn.Identity()

            # BiT-ResNet50输出2048维特征
            self.global_pool = ops.ReduceMean(keep_dims=False)
            self.classifier = nn.Dense(2048, num_classes)

            # 冻结特征提取层
            for param in self.backbone.get_parameters():
                param.requires_grad = False

        def construct(self, x):
            # 提取特征
            x = self.backbone(x)

            # 处理维度
            if len(x.shape) == 4:
                # (B, C, H, W) -> (B, C)
                x = self.global_pool(x, (2, 3))
            elif len(x.shape) == 2:
                # 已经展平
                pass
            else:
                # 其他情况，简单展平
                x = x.reshape(x.shape[0], -1)
                if x.shape[1] != 2048:
                    # 调整到2048维
                    x = x[:, :2048] if x.shape[1] > 2048 else ops.pad(x, ((0, 0), (0, 2048 - x.shape[1])))

            # 分类
            x = self.classifier(x)
            return x

    # 创建修复后的模型
    fixed_model = FixedBitModel(model, config.num_classes)

    print(f"✓ 模型修复完成")
    print(f"  输入: 3x224x224")
    print(f"  输出: {config.num_classes}类")

    # 测试
    import numpy as np
    test_input = ms.Tensor(np.random.randn(2, 3, 224, 224).astype(np.float32))
    output = fixed_model(test_input)
    print(f"  测试输入: {test_input.shape}")
    print(f"  测试输出: {output.shape}")

    return fixed_model

# 使用简单修复
print("修复模型维度...")
model = fix_bit_model_simple(model)

修复模型维度...
简单修复BiT模型维度
模型类型: <class 'mindcv.models.bit.BiT_ResNet'>
✓ 模型修复完成
  输入: 3x224x224
  输出: 5类


[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:18:07.989.298 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]


  测试输入: (2, 3, 224, 224)
  测试输出: (2, 5)


In [22]:
def setup_training(model):
    """设置训练组件"""
    print("="*60)
    print("训练配置")
    print("="*60)
    
    # 损失函数
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    
    # 优化器 - 直接使用学习率，不在这里设置调度器
    optimizer = nn.Adam(
        model.trainable_params(),
        learning_rate=config.learning_rate,
        weight_decay=1e-4
    )
    
    # 注意：piecewise_constant_lr 需要里程碑数量 = 学习率数量 - 1
    # 但我们将在训练循环中手动实现学习率调度
    
    # 模型编译
    net_with_loss = nn.WithLossCell(model, loss_fn)
    train_net = nn.TrainOneStepCell(net_with_loss, optimizer)
    
    # 评估指标
    metrics = {
        'accuracy': nn.Accuracy(),
        'loss': nn.Loss()
    }
    
    print(f"✓ 损失函数: SoftmaxCrossEntropyWithLogits")
    print(f"✓ 优化器: Adam (lr={config.learning_rate})")
    print(f"✓ 学习率调度: 手动调整（每7个epoch减少10倍）")
    
    return train_net, loss_fn, optimizer, metrics

train_net, loss_fn, optimizer, metrics = setup_training(model)

训练配置
✓ 损失函数: SoftmaxCrossEntropyWithLogits
✓ 优化器: Adam (lr=0.0001)
✓ 学习率调度: 手动调整（每7个epoch减少10倍）


In [23]:
def train_model(model, train_dataset, test_dataset):
    """训练模型"""
    print("="*60)
    print("开始训练")
    print("="*60)
    
    # 初始化记录
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []
    best_accuracy = 0.0
    
    # 创建评估模型
    eval_net = nn.WithEvalCell(model, loss_fn, add_cast_fp32=False)
    
    # 创建 argmax 操作
    argmax_op = ops.Argmax(output_type=ms.int32)
    
    # 训练循环
    for epoch in range(config.epochs):
        print(f"\nEpoch {epoch+1}/{config.epochs}")
        print("-" * 40)
        
        # 学习率调整（每7个epoch减少10倍）
        if epoch in [7, 14]:  # 第8和15个epoch调整学习率
            current_lr = config.learning_rate / (10 ** (epoch // 7))
            optimizer.learning_rate = Parameter(Tensor(current_lr, ms.float32), name='learning_rate')
            print(f"学习率调整为: {current_lr}")
        
        # 训练阶段
        model.set_train()
        epoch_loss = 0
        correct = 0
        total = 0
        
        batch_iterator = train_dataset.create_tuple_iterator()
        
        for batch_idx, (images, labels) in enumerate(batch_iterator):
            # 前向传播和反向传播
            loss = train_net(images, labels)
            
            # 计算准确率
            outputs = model(images)
            predictions = argmax_op(outputs)
            
            correct += (predictions.asnumpy() == labels.asnumpy()).sum()
            total += labels.shape[0]
            epoch_loss += loss.asnumpy()
            
            # 进度显示
            if (batch_idx + 1) % 10 == 0:
                current_acc = correct / total if total > 0 else 0
                current_loss = epoch_loss / (batch_idx + 1)
                print(f"  Batch {batch_idx+1}/{train_dataset.get_dataset_size()}, "
                      f"Loss: {current_loss:.4f}, Acc: {current_acc:.4f}")
        
        # 计算epoch指标
        train_loss = epoch_loss / max(1, train_dataset.get_dataset_size())
        train_acc = correct / total if total > 0 else 0
        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        
        # 评估阶段 - 修复：正确处理 eval_net 的输出
        model.set_train(False)
        test_loss = 0
        test_correct = 0
        test_total = 0
        
        for images, labels in test_dataset.create_tuple_iterator():
            # 计算损失 - 修复：eval_net 可能返回元组
            eval_outputs = eval_net(images, labels)
            
            # 提取损失值
            if isinstance(eval_outputs, tuple):
                # 第一个元素通常是损失值
                loss_value = eval_outputs[0]
            else:
                loss_value = eval_outputs
            
            test_loss += loss_value.asnumpy()
            
            # 计算准确率
            outputs = model(images)
            predictions = argmax_op(outputs)
            test_correct += (predictions.asnumpy() == labels.asnumpy()).sum()
            test_total += labels.shape[0]
        
        test_loss = test_loss / max(1, test_dataset.get_dataset_size())
        test_acc = test_correct / test_total if test_total > 0 else 0
        test_losses.append(test_loss)
        test_accuracies.append(test_acc)
        
        # 输出epoch结果
        print(f"训练结果: 损失={train_loss:.4f}, 准确率={train_acc:.4f}")
        print(f"测试结果: 损失={test_loss:.4f}, 准确率={test_acc:.4f}")
        
        # 保存最佳模型
        if test_acc > best_accuracy:
            best_accuracy = test_acc
            save_checkpoint(model, config.model_path)
            print(f"✓ 保存最佳模型，准确率: {best_accuracy:.4f}")
    
    # 保存训练指标
    metrics_dict = {
        'train_losses': train_losses,
        'train_accuracies': train_accuracies,
        'test_losses': test_losses,
        'test_accuracies': test_accuracies
    }
    np.save(config.metrics_path, metrics_dict)
    
    print("\n" + "="*60)
    print(f"训练完成！最佳测试准确率: {best_accuracy:.4f}")
    print("="*60)
    
    return train_losses, train_accuracies, test_losses, test_accuracies
# 开始训练
if train_dataset is not None:
    print(f"开始训练，共 {config.epochs} 个epoch")
    train_losses, train_accuracies, test_losses, test_accuracies = train_model(
        model, train_dataset, test_dataset
    )
else:
    print("✗ 数据集不可用，跳过训练")

开始训练，共 20 个epoch
开始训练

Epoch 1/20
----------------------------------------


[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:18:18.221.033 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]


  Batch 10/115, Loss: 1.5823, Acc: 0.3156
  Batch 20/115, Loss: 1.5274, Acc: 0.4062
  Batch 30/115, Loss: 1.4952, Acc: 0.4313
  Batch 40/115, Loss: 1.4703, Acc: 0.4547
  Batch 50/115, Loss: 1.4422, Acc: 0.4838
  Batch 60/115, Loss: 1.4116, Acc: 0.5135
  Batch 70/115, Loss: 1.3867, Acc: 0.5339
  Batch 80/115, Loss: 1.3610, Acc: 0.5496
  Batch 90/115, Loss: 1.3389, Acc: 0.5646
  Batch 100/115, Loss: 1.3167, Acc: 0.5747
  Batch 110/115, Loss: 1.2926, Acc: 0.5901


[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:19:11.649.573 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]
[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:19:15.127.054 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]
[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:19:26.403.436 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]


训练结果: 损失=1.2831, 准确率=0.5951
测试结果: 损失=1.1099, 准确率=0.6472
✓ 保存最佳模型，准确率: 0.6472

Epoch 2/20
----------------------------------------
  Batch 10/115, Loss: 1.0095, Acc: 0.7469
  Batch 20/115, Loss: 1.0060, Acc: 0.7422
  Batch 30/115, Loss: 1.0206, Acc: 0.7250
  Batch 40/115, Loss: 1.0147, Acc: 0.7234
  Batch 50/115, Loss: 1.0002, Acc: 0.7306
  Batch 60/115, Loss: 0.9891, Acc: 0.7349
  Batch 70/115, Loss: 0.9808, Acc: 0.7344
  Batch 80/115, Loss: 0.9698, Acc: 0.7383
  Batch 90/115, Loss: 0.9603, Acc: 0.7441
  Batch 100/115, Loss: 0.9481, Acc: 0.7481
  Batch 110/115, Loss: 0.9406, Acc: 0.7497
训练结果: 损失=0.9394, 准确率=0.7490
测试结果: 损失=0.9242, 准确率=0.6687
✓ 保存最佳模型，准确率: 0.6687

Epoch 3/20
----------------------------------------
  Batch 10/115, Loss: 0.7919, Acc: 0.7500
  Batch 20/115, Loss: 0.8236, Acc: 0.7406
  Batch 30/115, Loss: 0.8258, Acc: 0.7438
  Batch 40/115, Loss: 0.8104, Acc: 0.7594
  Batch 50/115, Loss: 0.8064, Acc: 0.7675
  Batch 60/115, Loss: 0.8041, Acc: 0.7682
  Batch 70/115, Loss: 0.

In [29]:
class FocalLoss(nn.Cell):
    """修正的Focal Loss实现"""
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.softmax = nn.Softmax(axis=1)
        self.onehot = nn.OneHot(depth=config.num_classes)
        self.reduce_sum = ops.ReduceSum()
        self.log = ops.Log()
        self.pow = ops.Pow()

    def construct(self, inputs, targets):
        # 计算softmax概率
        probs = self.softmax(inputs)

        # 将targets转换为one-hot编码
        targets_onehot = self.onehot(targets)

        # 计算每个样本的类别概率 (B, num_classes) * (B, num_classes) -> (B,)
        class_probs = self.reduce_sum(probs * targets_onehot, 1)

        # 计算调制因子 (1 - pt)^gamma
        modulating_factor = self.pow(1 - class_probs, self.gamma)

        # 计算交叉熵
        ce_loss = -self.log(class_probs + 1e-8)

        # 计算focal loss
        focal_loss = modulating_factor * ce_loss

        # 应用类别权重（如果提供）
        if self.alpha is not None:
            alpha_factor = ops.gather(self.alpha, targets, 0)
            focal_loss = alpha_factor * focal_loss

        # 根据reduction参数返回结果
        if self.reduction == 'mean':
            return ops.mean(focal_loss)
        elif self.reduction == 'sum':
            return ops.sum(focal_loss)
        else:
            return focal_loss

In [30]:
# 新增单元格 - 直接加载检查点继续训练
print("="*60)
print("加载检查点并继续训练5个epoch")
print("="*60)

from mindspore import load_checkpoint, load_param_into_net

# 1. 加载检查点
if os.path.exists(config.model_path):
    try:
        param_dict = load_checkpoint(config.model_path)
        load_param_into_net(model, param_dict)
        print(f"✓ 检查点加载成功: {config.model_path}")
    except Exception as e:
        print(f"✗ 加载失败: {e}")
else:
    print(f"⚠ 检查点不存在: {config.model_path}")

# 2. 使用FocalLoss
print("\n使用FocalLoss训练...")
loss_fn = FocalLoss(gamma=2.0, reduction='mean')

# 3. 直接调用原有train_model函数（只训练5个epoch）
print("\n开始训练5个epoch...")

# 保存原始epochs
original_epochs = config.epochs
config.epochs = 5

# 运行训练
if train_dataset is not None:
    print(f"开始训练，共 {config.epochs} 个epoch")
    train_losses, train_accuracies, test_losses, test_accuracies = train_model(
        model, train_dataset, test_dataset
    )
else:
    print("✗ 数据集不可用")

# 恢复原始epochs（可选）
config.epochs = original_epochs

加载检查点并继续训练5个epoch
✓ 检查点加载成功: best_flower_model.ckpt

使用FocalLoss训练...

开始训练5个epoch...
开始训练，共 5 个epoch
开始训练

Epoch 1/5
----------------------------------------
  Batch 10/115, Loss: 0.3790, Acc: 0.8656
  Batch 20/115, Loss: 0.4158, Acc: 0.8531
  Batch 30/115, Loss: 0.4079, Acc: 0.8583
  Batch 40/115, Loss: 0.4033, Acc: 0.8641
  Batch 50/115, Loss: 0.4049, Acc: 0.8662
  Batch 60/115, Loss: 0.4094, Acc: 0.8635
  Batch 70/115, Loss: 0.4102, Acc: 0.8603
  Batch 80/115, Loss: 0.4061, Acc: 0.8641
  Batch 90/115, Loss: 0.4065, Acc: 0.8663
  Batch 100/115, Loss: 0.4084, Acc: 0.8662
  Batch 110/115, Loss: 0.4088, Acc: 0.8662
训练结果: 损失=0.4078, 准确率=0.8677
测试结果: 损失=0.3222, 准确率=0.7853
✓ 保存最佳模型，准确率: 0.7853

Epoch 2/5
----------------------------------------
  Batch 10/115, Loss: 0.3857, Acc: 0.8719
  Batch 20/115, Loss: 0.4227, Acc: 0.8453
  Batch 30/115, Loss: 0.4097, Acc: 0.8552
  Batch 40/115, Loss: 0.4141, Acc: 0.8602
  Batch 50/115, Loss: 0.4105, Acc: 0.8619
  Batch 60/115, Loss: 0.4074, Acc: 0.8

In [24]:
def visualize_results():
    """可视化训练结果"""
    print("="*60)
    print("结果可视化")
    print("="*60)
    import matplotlib
    matplotlib.use('Agg')
    # 检查是否有训练指标
    if not os.path.exists(config.metrics_path):
        print(f"✗ 未找到训练指标文件: {config.metrics_path}")
        print("请先完成训练")
        return
    
    try:
        # 加载训练指标
        metrics = np.load(config.metrics_path, allow_pickle=True).item()
        train_losses = metrics['train_losses']
        train_accuracies = metrics['train_accuracies']
        test_losses = metrics['test_losses']
        test_accuracies = metrics['test_accuracies']
        
        epochs = range(1, len(train_losses) + 1)
        
        # 创建图形 - 修复：使用更简单的方法设置中文字体
        plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS', 'SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # 1. 损失曲线
        axes[0, 0].plot(epochs, train_losses, 'b-', label='Train Loss', marker='o', linewidth=2)
        axes[0, 0].plot(epochs, test_losses, 'r-', label='Test Loss', marker='s', linewidth=2)
        axes[0, 0].set_title('Training and Testing Loss', fontsize=14, fontweight='bold')
        axes[0, 0].set_xlabel('Epoch')
        axes[0, 0].set_ylabel('Loss')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)
        
        # 2. 准确率曲线
        axes[0, 1].plot(epochs, train_accuracies, 'g-', label='Train Accuracy', marker='o', linewidth=2)
        axes[0, 1].plot(epochs, test_accuracies, 'orange', label='Test Accuracy', marker='s', linewidth=2)
        axes[0, 1].set_title('Training and Testing Accuracy', fontsize=14, fontweight='bold')
        axes[0, 1].set_xlabel('Epoch')
        axes[0, 1].set_ylabel('Accuracy')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)
        axes[0, 1].set_ylim([0, 1.0])
        
        # 3. 最终准确率对比
        final_train_acc = train_accuracies[-1] if train_accuracies else 0
        final_test_acc = test_accuracies[-1] if test_accuracies else 0
        
        axes[1, 0].bar(['Train', 'Test'], [final_train_acc, final_test_acc], 
                      color=['green', 'red'], alpha=0.7, width=0.5)
        axes[1, 0].set_title('Final Accuracy Comparison', fontsize=14, fontweight='bold')
        axes[1, 0].set_ylabel('Accuracy')
        axes[1, 0].set_ylim([0, 1.0])
        axes[1, 0].text(0, final_train_acc + 0.02, f'{final_train_acc:.4f}', 
                       ha='center', fontweight='bold')
        axes[1, 0].text(1, final_test_acc + 0.02, f'{final_test_acc:.4f}', 
                       ha='center', fontweight='bold')
        
        # 4. 类别分布
        try:
            # 统计训练集类别分布
            train_counts = {}
            train_dir = os.path.join(config.data_dir, "train")
            for class_name in config.class_names:
                class_dir = os.path.join(train_dir, class_name)
                if os.path.exists(class_dir):
                    count = len([f for f in os.listdir(class_dir) 
                               if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
                    train_counts[class_name] = count
                else:
                    train_counts[class_name] = 0
            
            # 显示类别分布
            class_values = [train_counts[cls] for cls in config.class_names]
            colors = plt.cm.Set3(np.arange(len(config.class_names)) / len(config.class_names))
            axes[1, 1].bar(range(len(config.class_names)), class_values, color=colors)
            axes[1, 1].set_title('Training Set Class Distribution', fontsize=14, fontweight='bold')
            axes[1, 1].set_xlabel('Flower Class')
            axes[1, 1].set_ylabel('Number of Samples')
            axes[1, 1].set_xticks(range(len(config.class_names)))
            axes[1, 1].set_xticklabels(config.class_names, rotation=15)
            
            # 添加数值标签
            for i, count in enumerate(class_values):
                axes[1, 1].text(i, count + max(class_values)*0.02, f'{count}', 
                               ha='center', fontweight='bold')
                
        except Exception as e:
            print(f"Warning: Class distribution failed: {e}")
            axes[1, 1].text(0.5, 0.5, 'Class distribution data\nnot available', 
                          ha='center', va='center', transform=axes[1, 1].transAxes)
        
        plt.tight_layout()
        plt.savefig('training_results_2.png', dpi=300, bbox_inches='tight')
        plt.show()
        
        print("✓ Visualization saved as 'training_results_2.png'")
        
        # 显示关键指标
        print("\nKey Metrics Summary:")
        if train_accuracies:
            print(f"  Best Train Accuracy: {max(train_accuracies):.4f}")
        if test_accuracies:
            print(f"  Best Test Accuracy: {max(test_accuracies):.4f}")
            print(f"  Final Test Accuracy: {final_test_acc:.4f}")
        
    except Exception as e:
        print(f"✗ Visualization failed: {e}")
        import traceback
        traceback.print_exc()
# 执行可视化
visualize_results()

结果可视化
✓ Visualization saved as 'training_results_1.png'

Key Metrics Summary:
  Best Train Accuracy: 0.8723
  Best Test Accuracy: 0.7899
  Final Test Accuracy: 0.7899


In [25]:
def evaluate_model():
    """评估模型性能"""
    print("="*60)
    print("模型评估")
    print("="*60)
    
    # 检查模型文件
    if not os.path.exists(config.model_path):
        print(f"✗ 模型文件不存在: {config.model_path}")
        print("请先训练模型或确保模型文件存在")
        return
    
    # 检查测试集
    if test_dataset is None:
        print("✗ 测试集不可用")
        return
    
    # 导入必要的函数
    from mindspore import load_param_into_net
    
    # 加载模型
    try:
        model.set_train(False)
        param_dict = ms.load_checkpoint(config.model_path)  # 使用 ms.load_checkpoint
        
        # 检查模型参数是否匹配
        model_params = {p.name: p for p in model.get_parameters()}
        load_success = True
        for name, param in param_dict.items():
            if name in model_params:
                model_params[name].set_data(param)
            else:
                print(f"Warning: Parameter {name} not in current model")
        
        print(f"✓ Model loaded successfully: {config.model_path}")
    except Exception as e:
        print(f"✗ Model loading failed: {e}")
        import traceback
        traceback.print_exc()
        return
    
    # 评估
    total_correct = 0
    total_samples = 0
    class_correct = [0] * config.num_classes
    class_total = [0] * config.num_classes
    
    print("\n开始评估...")
    batch_count = 0
    
    # 创建 argmax 操作
    argmax_op = ops.Argmax(output_type=ms.int32)
    
    for batch_idx, (images, labels) in enumerate(test_dataset.create_tuple_iterator()):
        outputs = model(images)
        predictions = argmax_op(outputs)
        
        # 整体统计
        batch_correct = (predictions.asnumpy() == labels.asnumpy()).sum()
        total_correct += batch_correct
        total_samples += labels.shape[0]
        
        # 按类别统计
        for i in range(labels.shape[0]):
            label = labels.asnumpy()[i]
            pred = predictions.asnumpy()[i]
            class_total[label] += 1
            if label == pred:
                class_correct[label] += 1
        
        batch_count += 1
        # 进度显示
        if (batch_idx + 1) % 5 == 0:
            current_acc = total_correct / total_samples if total_samples > 0 else 0
            print(f"  Batch {batch_idx+1}/{test_dataset.get_dataset_size()}, "
                  f"Current Accuracy: {current_acc:.4f}")
    
    # 计算最终指标
    final_accuracy = total_correct / total_samples if total_samples > 0 else 0
    
    print("\n" + "="*40)
    print("Evaluation Results Summary")
    print("="*40)
    print(f"Total Test Samples: {total_samples}")
    print(f"Correct Predictions: {total_correct}")
    print(f"Overall Accuracy: {final_accuracy:.4f}")
    
    print("\nPer-Class Accuracy:")
    for i in range(config.num_classes):
        if class_total[i] > 0:
            class_acc = class_correct[i] / class_total[i]
            print(f"  {config.class_names[i]}: {class_acc:.4f} "
                  f"({class_correct[i]}/{class_total[i]})")
        else:
            print(f"  {config.class_names[i]}: No test samples")
# 执行评估
evaluate_model()

模型评估
✓ Model loaded successfully: best_flower_model.ckpt

开始评估...
  Batch 5/21, Current Accuracy: 0.8250
  Batch 10/21, Current Accuracy: 0.8281
  Batch 15/21, Current Accuracy: 0.7896
  Batch 20/21, Current Accuracy: 0.7859

Evaluation Results Summary
Total Test Samples: 652
Correct Predictions: 515
Overall Accuracy: 0.7899

Per-Class Accuracy:
  daisy: 0.7907 (102/129)
  dandelion: 0.8502 (193/227)
  rose: 0.6264 (57/91)
  sunflower: 0.7561 (62/82)
  tulip: 0.8211 (101/123)


In [26]:
def show_predictions():
    """显示预测示例"""
    print("="*60)
    print("预测示例")
    print("="*60)
    
    if test_dataset is None:
        print("✗ 测试集不可用")
        return
    
    # 导入必要的函数
    from mindspore import load_param_into_net
    
    # 加载模型
    if not os.path.exists(config.model_path):
        print(f"✗ 模型文件不存在: {config.model_path}")
        return
    
    try:
        model.set_train(False)
        param_dict = ms.load_checkpoint(config.model_path)
        
        # 检查模型参数是否匹配
        model_params = {p.name: p for p in model.get_parameters()}
        for name, param in param_dict.items():
            if name in model_params:
                model_params[name].set_data(param)
        
        print(f"✓ Model loaded successfully: {config.model_path}")
    except Exception as e:
        print(f"✗ Model loading failed: {e}")
        return
    
    # 获取一批测试数据
    test_iter = test_dataset.create_tuple_iterator()
    images, labels = next(test_iter)
    
    # 预测
    outputs = model(images[:10])  # 只取前10个
    argmax_op = ops.Argmax(output_type=ms.int32)
    predictions = argmax_op(outputs)
    probabilities = ops.softmax(outputs, axis=1)
    
    # 创建可视化
    plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS']
    plt.rcParams['axes.unicode_minus'] = False
    
    fig, axes = plt.subplots(2, 5, figsize=(15, 7))
    axes = axes.ravel()
    
    for i in range(10):
        # 反标准化图像显示
        img = images[i].asnumpy().transpose(1, 2, 0)
        img = img * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])
        img = np.clip(img, 0, 1)
        
        true_label = labels.asnumpy()[i]
        pred_label = predictions.asnumpy()[i]
        confidence = probabilities.asnumpy()[i][pred_label]
        
        axes[i].imshow(img)
        axes[i].axis('off')
        
        # 颜色标注
        if true_label == pred_label:
            color = 'green'
            title = f'✓ {config.class_names[pred_label]}\n({confidence:.2f})'
        else:
            color = 'red'
            title = f'{config.class_names[true_label]}→{config.class_names[pred_label]}\n({confidence:.2f})'
        
        axes[i].set_title(title, color=color, fontsize=10)
        
        # 错误预测加红框
        if true_label != pred_label:
            for spine in axes[i].spines.values():
                spine.set_edgecolor('red')
                spine.set_linewidth(3)
    
    plt.suptitle('Flower Classification Examples (Green=Correct, Red=Wrong)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('prediction_examples_1.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("✓ Prediction examples saved as 'prediction_examples_2.png'")

show_predictions()

预测示例
✓ Model loaded successfully: best_flower_model.ckpt


[ERROR] CORE(280126,79ff5f932600,python):2025-12-29-14:40:37.770.934 [mindspore/core/utils/file_utils.cc:253] GetRealPath] Get realpath failed, path[/tmp/ipykernel_280126/3613542768.py]


✓ Prediction examples saved as 'prediction_examples_1.png'
