In [1]:
import os
import sys

# 设置CUDA_HOME环境变量
conda_prefix = os.environ.get('CONDA_PREFIX')
if conda_prefix:
    os.environ['CUDA_HOME'] = conda_prefix
    os.environ['LD_LIBRARY_PATH'] = f"{conda_prefix}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}"
    
    print(f"已设置环境变量:")
    print(f"  CUDA_HOME: {os.environ['CUDA_HOME']}")
    print(f"  CONDA_PREFIX: {conda_prefix}")
    
    # 验证libcuda.so是否存在
    libcuda_path = f"{conda_prefix}/lib/libcuda.so"
    if os.path.exists(libcuda_path):
        print(f"  ✓ libcuda.so: 找到 ({libcuda_path})")
    else:
        print(f"  ✗ libcuda.so: 在conda环境中未找到")
        
        # 创建符号链接到系统libcuda
        system_libcuda = "/usr/lib/x86_64-linux-gnu/libcuda.so"
        if os.path.exists(system_libcuda):
            os.system(f"ln -sf {system_libcuda} {conda_prefix}/lib/libcuda.so")
            print(f"  ✓ 已创建符号链接: {conda_prefix}/lib/libcuda.so -> {system_libcuda}")
        else:
            print(f"  ⚠ 系统libcuda.so也未找到")
else:
    print("CONDA_PREFIX未设置，请先激活conda环境")

已设置环境变量:
  CUDA_HOME: /home/y/anaconda3/envs/mindspore
  CONDA_PREFIX: /home/y/anaconda3/envs/mindspore
  ✓ libcuda.so: 找到 (/home/y/anaconda3/envs/mindspore/lib/libcuda.so)


In [2]:
import os
import sys
import subprocess
import shutil

def check_and_fix():
    """检查和修复MindSpore GPU问题"""
    print("="*60)
    print("MindSpore GPU修复工具")
    print("="*60)
    
    # 1. 检查conda环境
    conda_prefix = os.environ.get('CONDA_PREFIX')
    if not conda_prefix:
        print("错误: 未在conda环境中！")
        print("请先运行: conda activate mindspore")
        return False
    
    print(f"1. Conda环境: {conda_prefix}")
    
    # 2. 设置环境变量
    print("\n2. 设置环境变量...")
    os.environ['CUDA_HOME'] = conda_prefix
    os.environ['LD_LIBRARY_PATH'] = f"{conda_prefix}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}"
    
    print(f"   CUDA_HOME: {os.environ['CUDA_HOME']}")
    
    # 3. 检查libcuda.so
    print("\n3. 检查CUDA库...")
    libcuda_path = f"{conda_prefix}/lib/libcuda.so"
    
    if os.path.exists(libcuda_path):
        print(f"   ✓ libcuda.so: 存在 ({libcuda_path})")
    else:
        print(f"   ✗ libcuda.so: 在conda环境中未找到")
        
        # 寻找系统libcuda
        possible_paths = [
            "/usr/lib/x86_64-linux-gnu/libcuda.so",
            "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
            "/usr/local/cuda/lib64/libcuda.so",
            "/usr/lib64/libcuda.so",
        ]
        
        found = False
        for path in possible_paths:
            if os.path.exists(path):
                # 创建符号链接
                os.system(f"ln -sf {path} {libcuda_path}")
                print(f"   ✓ 创建符号链接: {libcuda_path} -> {path}")
                found = True
                break
        
        if not found:
            print("   ⚠ 未找到系统libcuda.so")
            print("   可能需要安装NVIDIA驱动: sudo apt install nvidia-driver-550")
    
    # 4. 检查libcudnn.so（应该在conda环境中）
    libcudnn_path = f"{conda_prefix}/lib/libcudnn.so"
    if os.path.exists(libcudnn_path):
        print(f"   ✓ libcudnn.so: 存在 ({libcudnn_path})")
    else:
        # 寻找libcudnn.so.x
        cudnn_files = []
        for root, dirs, files in os.walk(f"{conda_prefix}/lib"):
            for file in files:
                if 'libcudnn.so' in file:
                    cudnn_files.append(os.path.join(root, file))
        
        if cudnn_files:
            # 创建符号链接到libcudnn.so
            target = cudnn_files[0]
            os.system(f"ln -sf {target} {libcudnn_path}")
            print(f"   ✓ 创建符号链接: {libcudnn_path} -> {target}")
        else:
            print("   ✗ libcudnn.so: 未找到")
            print("   请安装: conda install cudnn=8.5 -c conda-forge")
    
    # 5. 验证环境变量生效
    print("\n4. 验证环境变量...")
    env_vars = ['CUDA_HOME', 'LD_LIBRARY_PATH', 'CONDA_PREFIX']
    for var in env_vars:
        value = os.environ.get(var, '未设置')
        print(f"   {var}: {value[:80]}...")
    
    # 6. 测试MindSpore GPU
    print("\n5. 测试MindSpore GPU...")
    try:
        import mindspore as ms
        from mindspore import context
        
        print(f"   MindSpore版本: {ms.__version__}")
        
        try:
            context.set_context(device_target='GPU')
            print("   ✓ GPU上下文设置成功")
            
            # 测试简单计算
            import numpy as np
            x = ms.Tensor(np.ones([2, 2], dtype=np.float32))
            y = x * 2
            print(f"   ✓ GPU计算测试: {x.shape} -> {y.shape}")
            
            return True
            
        except Exception as e:
            print(f"   ✗ GPU设置失败: {str(e)[:100]}")
            
            # 尝试CPU作为备选
            context.set_context(device_target='CPU')
            print("   ✓ CPU上下文设置成功（备用）")
            return True
            
    except Exception as e:
        print(f"   ✗ MindSpore导入失败: {e}")
        return False

def create_launch_script():
    """创建启动脚本"""
    script_content = '''#!/bin/bash
# MindSpore GPU启动脚本

# 激活conda环境
conda activate mindspore

# 设置环境变量
export CUDA_HOME=$CONDA_PREFIX
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH

# 创建必要的符号链接
if [ ! -f $CONDA_PREFIX/lib/libcuda.so ]; then
    # 尝试找到系统libcuda
    if [ -f /usr/lib/x86_64-linux-gnu/libcuda.so ]; then
        ln -sf /usr/lib/x86_64-linux-gnu/libcuda.so $CONDA_PREFIX/lib/libcuda.so
    elif [ -f /usr/lib/x86_64-linux-gnu/libcuda.so.1 ]; then
        ln -sf /usr/lib/x86_64-linux-gnu/libcuda.so.1 $CONDA_PREFIX/lib/libcuda.so
    fi
fi

# 启动Python
echo "环境已设置:"
echo "  CUDA_HOME: $CUDA_HOME"
echo "  Python: $(which python)"
echo ""
python "$@"
'''
    
    with open("launch_mindspore.sh", "w") as f:
        f.write(script_content)
    
    os.system("chmod +x launch_mindspore.sh")
    print("启动脚本已创建: launch_mindspore.sh")
    print("使用: ./launch_mindspore.sh your_script.py")

if __name__ == "__main__":
    success = check_and_fix()
    
    print("\n" + "="*60)
    if success:
        print("✅ 修复完成！")
        print("\n接下来运行:")
        print("1. 对于当前会话，环境变量已设置")
        print("2. 对于新终端，运行创建的启动脚本")
    else:
        print("⚠ 部分问题未解决")
        print("\n建议:")
        print("1. 重启终端后重新激活环境")
        print("2. 手动设置环境变量")
        print("3. 如果GPU仍然不行，使用CPU版本完成作业")
    
    # 创建启动脚本
    create_launch_script()
    
    print("\n快速测试命令:")
    print("python -c \"import mindspore as ms; ms.set_context(device_target='GPU'); print('GPU可用')\"")
    print("="*60)

MindSpore GPU修复工具
1. Conda环境: /home/y/anaconda3/envs/mindspore

2. 设置环境变量...
   CUDA_HOME: /home/y/anaconda3/envs/mindspore

3. 检查CUDA库...
   ✓ libcuda.so: 存在 (/home/y/anaconda3/envs/mindspore/lib/libcuda.so)
   ✓ libcudnn.so: 存在 (/home/y/anaconda3/envs/mindspore/lib/libcudnn.so)

4. 验证环境变量...
   CUDA_HOME: /home/y/anaconda3/envs/mindspore...
   LD_LIBRARY_PATH: /home/y/anaconda3/envs/mindspore/lib:/home/y/anaconda3/envs/mindspore/lib:...
   CONDA_PREFIX: /home/y/anaconda3/envs/mindspore...

5. 测试MindSpore GPU...
   MindSpore版本: 2.2.0
   ✓ GPU上下文设置成功
   ✓ GPU计算测试: (2, 2) -> (2, 2)

✅ 修复完成！

接下来运行:
1. 对于当前会话，环境变量已设置
2. 对于新终端，运行创建的启动脚本
启动脚本已创建: launch_mindspore.sh
使用: ./launch_mindspore.sh your_script.py

快速测试命令:
python -c "import mindspore as ms; ms.set_context(device_target='GPU'); print('GPU可用')"


In [3]:
import mindspore as ms; ms.set_context(device_target='GPU'); print('GPU可用')

GPU可用
