In [None]:
将二进制代码文件转换为灰度图像数据

In [2]:
import os
from PIL import Image
import numpy as np
import re

def numerical_sort(value):
    # 提取文件名中的数字
    numbers = re.findall(r'\d+', value)
    return int(numbers[0]) if numbers else float('inf')  # 不符合的文件排序到最后

def batch_convert_to_images(source_folder, target_folder):
    # 确保目标文件夹存在
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # 获取源文件夹中所有文件并过滤
    file_names = [f for f in os.listdir(source_folder) if re.match(r'二进制文件\d+', f)]
    # 使用自定义排序函数按数字顺序排序
    file_names.sort(key=numerical_sort)

    # 遍历所有文件并进行处理
    for index, file_name in enumerate(file_names):
        file_path = os.path.join(source_folder, file_name)
        image_path = os.path.join(target_folder, f"{index + 1}.bmp")
        
        # 读取文件内容
        with open(file_path, 'rb') as file:
            file_bytes = file.read()

        # 创建一个256x256的灰度图像的数组，初始化为0（全黑）
        image_data = np.zeros((256, 256), dtype=np.uint8)

        # 填充图像数组
        for i in range(65536):  # 总是循环65536次，对应64KB
            if i < len(file_bytes):
                row = i // 256
                col = i % 256
                image_data[row, col] = file_bytes[i]
            else:
                break  # 如果file_bytes用完了，后面的image_data保持为0，无需显式填充

        # 创建一个Pillow图像
        img = Image.fromarray(image_data, 'L')
        
        # 保存图像
        img.save(image_path)

# 调用函数，处理文件
batch_convert_to_images('VirusShare_x86-64_WinEXE_20130711', 'output_images_顺序正确_数量正确')

In [None]:
图像归一化处理

In [2]:
import os
import numpy as np
from PIL import Image

# 定义图像归一化函数
def normalize_image(image_path, output_folder):
    # 读取图像
    img = Image.open(image_path).convert('L')  # 确保图像为灰度
    # 归一化图像
    normalized_img = np.array(img) / 255.0
    # 从原始文件名中提取编号
    file_name = os.path.basename(image_path)
    base_name, _ = os.path.splitext(file_name)
    # 保存图像到输出文件夹，使用.npy格式以保存浮点数像素值
    save_path = os.path.join(output_folder, f"{base_name}.npy")
    np.save(save_path, normalized_img)

# 设置输入和输出文件夹路径
input_folder = "output_images_顺序正确_数量正确"  # 替换为输入文件夹路径
output_folder = "归一化处理后_改变像素值特征"  # 替换为输出文件夹路径

# 创建输出文件夹
os.makedirs(output_folder, exist_ok=True)

# 获取所有图像文件
image_files = [f for f in os.listdir(input_folder) if f.endswith('.bmp')]

# 对每个图像文件应用归一化预处理
for image_file in image_files:
    image_path = os.path.join(input_folder, image_file)
    normalize_image(image_path, output_folder)

print("预处理完成。")


预处理完成。


In [2]:
import os
import numpy as np
import re
from PIL import Image
from torchvision import transforms
import torch

def numerical_sort(value):
    # 使用正则表达式提取文件名中的数字，确保文件按数字顺序处理
    numbers = re.findall(r'\d+', value)
    return int(numbers[0]) if numbers else float('inf')

def convert_and_preprocess_images(source_folder, target_folder):
    # 确保目标文件夹存在
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # 获取源文件夹中所有文件并过滤
    file_names = [f for f in os.listdir(source_folder) if re.match(r'二进制文件\d+', f)]
    # 按数字顺序排序
    file_names.sort(key=numerical_sort)

    # 定义PyTorch图像预处理转换
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),  # 转为灰度图像
        transforms.Resize((224, 224)),  # 调整图像大小到224x224
        transforms.ToTensor(),  # 转为Tensor
        transforms.Normalize(mean=[0.5], std=[0.5])  # 归一化
    ])

    # 遍历所有文件并进行处理
    for index, file_name in enumerate(file_names):
        file_path = os.path.join(source_folder, file_name)
        with open(file_path, 'rb') as file:
            file_bytes = file.read()

        # 创建256x256的灰度图像数组，初始化为0（全黑）
        image_data = np.zeros((256, 256), dtype=np.uint8)
        for i in range(65536):  # 总是循环65536次，对应64KB
            if i < len(file_bytes):
                row = i // 256
                col = i % 256
                image_data[row, col] = file_bytes[i]

        # 使用Pillow创建图像
        img = Image.fromarray(image_data, 'L')

        # 应用PyTorch预处理
        img_tensor = transform(img)
        
        # 保存处理后的Tensor，此处为示例保存为.pt文件
        tensor_path = os.path.join(target_folder, f"{index + 1}.pt")
        torch.save(img_tensor, tensor_path)

# 调用函数，处理文件
source_folder = 'VirusShare_x86-64_WinEXE_20130711'
target_folder = 'processed_images'
convert_and_preprocess_images(source_folder, target_folder)
print("所有文件已处理并保存为Tensor格式。")


所有文件已处理并保存为Tensor格式。


接下来的目标是提取特征

In [None]:
# 导入 PyTorch 库,这是一个用于机器学习和深度学习的开源框架。
import torch

# 导入 PyTorch 的神经网络模块 nn
import torch.nn as nn

# 导入 PyTorch 的优化器模块 optim
import torch.optim as optim

# 导入 torchvision 库，它提供了常用的数据集和图像转换工具
from torchvision import datasets, transforms

# 从 torchvision.models 导入 vgg16 模型，vgg16 是一种常用的图像识别模型
from torchvision.models import vgg16

# 导入 DataLoader，它提供了对 Dataset 的迭代访问
from torch.utils.data import DataLoader


# 设定基本参数
batch_size = 32  # 批处理大小
learning_rate = 0.001  # 学习率
num_epochs = 20  # 训练周期
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # 设定使用GPU或CPU

# 数据预处理，这里只使用灰度图像，所以只有一个通道
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),  # 转为灰度图像
    transforms.Resize((224, 224)),  # 调整图像大小到224x224
    transforms.ToTensor(),  # 转为Tensor
    transforms.Normalize(mean=[0.5], std=[0.5])  # 归一化
])

# 加载数据集
train_dataset = datasets.ImageFolder(root='path_to_train_dataset', transform=transform)
test_dataset = datasets.ImageFolder(root='path_to_test_dataset', transform=transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# 修改VGG16模型以接受1通道输入并进行一些调整
class VGG16Gray(nn.Module):
    def __init__(self, num_classes=2):  # 默认为二分类
        super(VGG16Gray, self).__init__()
        original_vgg = vgg16(pretrained=True)
        # 替换原始VGG的第一层，从3通道改为1通道
        self.features = original_vgg.features
        self.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.classifier = original_vgg.classifier
        self.classifier[6] = nn.Linear(4096, num_classes)  # 替换分类器的最后一层为目标类别数

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, start_dim=1)
        x = self.classifier(x)
        return x

# 实例化模型
model = VGG16Gray().to(device)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 训练模型
def train_model():
    model.train()
    for epoch in range(num_epochs):
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 测试模型
def test_model():
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Accuracy of the model on the test images: {100 * correct / total:.2f}%')

# 执行训练和测试
train_model()
test_model()


整合

修改了保存为npy格式，改为保存为张量格式，不保存为npy格式，能够更好的适应pytorch的需求

具体如下

上述整合代码与PyTorch中的数据预处理代码实现的功能基本相似，但存在几个关键的区别和特点：

1. **技术栈和应用场景**：
   - **PyTorch 数据预处理**：适用于在PyTorch框架中处理图像数据，主要用于训练机器学习模型。它利用PyTorch的库和函数来转换图像，包括调整大小、转换为张量并归一化。
   - **整合代码**：处理的是二进制文件，将其直接转换为灰度图像的numpy数组，并进行归一化处理。这种处理通常是为了准备从非标准图像源（如二进制文件）导入的数据，适用于特定应用，比如处理二进制恶意软件样本。

2. **输出格式**：
   - **PyTorch 数据预处理**：输出是归一化后的Tensor，直接适用于PyTorch模型。
   - **整合代码**：输出是.npy格式的文件，这是一个包含归一化后的图像数据的numpy数组，可用于多种不同的数据科学和机器学习应用，不限于PyTorch。

3. **处理流程**：
   - **PyTorch 数据预处理**：不涉及从原始二进制文件生成图像的过程，假设输入已经是图像格式。
   - **整合代码**：包含从原始二进制文件读取数据并转换为图像的步骤，这是一个更底层的数据处理，特别是在处理非图像数据文件时非常有用。

4. **实用性和灵活性**：
   - **PyTorch 数据预处理**：是标准化流程，用于图像数据，并且严重依赖PyTorch环境。
   - **整合代码**：更通用，可以在没有PyTorch环境的情况下运行，因为它使用Python标准库和Pillow进行图像处理，适合需要从基础数据格式开始的预处理任务。

总的来说，两者虽然在最终目标（即为机器学习模型准备图像数据）上相似，但处理的起点、依赖的库、输出格式以及在实际应用中的灵活性上有所不同。如果你的工作流程需要从非标准图像源（例如二进制文件）处理数据，整合代码提供了一个很好的起点。如果是标准图像文件，并且使用PyTorch进行深度学习，那么PyTorch的预处理流程更直接有效。

In [1]:
import os
import re
import ssl
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader
import urllib.request

# 创建一个不验证SSL证书的上下文
context = ssl._create_unverified_context()

# 使用这个上下文发起请求
response = urllib.request.urlopen('https://example.com', context=context)
print(response.read().decode())


def numerical_sort(value):
    # 使用正则表达式提取文件名中的数字，确保文件按数字顺序处理
    numbers = re.findall(r'\d+', value)
    return int(numbers[0]) if numbers else float('inf')

def convert_and_preprocess_images(source_folder, target_folder):
    # 确保目标文件夹存在
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    # 获取源文件夹中所有文件并过滤
    file_names = [f for f in os.listdir(source_folder) if re.match(r'二进制文件\d+', f)]
    # 按数字顺序排序
    file_names.sort(key=numerical_sort)

    # 定义图像预处理转换
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),  # 转为灰度图像
        transforms.Resize((224, 224)),  # 调整图像大小到224x224
        transforms.ToTensor(),  # 转为Tensor
        transforms.Normalize(mean=[0.5], std=[0.5])  # 归一化
    ])

    # 遍历所有文件并进行处理
    for index, file_name in enumerate(file_names):
        file_path = os.path.join(source_folder, file_name)
        with open(file_path, 'rb') as file:
            file_bytes = file.read()

        # 创建256x256的灰度图像数组，初始化为0（全黑）
        image_data = np.zeros((256, 256), dtype=np.uint8)
        for i in range(65536):
            if i < len(file_bytes):
                row = i // 256
                col = i % 256
                image_data[row, col] = file_bytes[i]

        # 使用Pillow创建图像
        img = Image.fromarray(image_data, 'L')
        img_tensor = transform(img)
        tensor_path = os.path.join(target_folder, f"{index + 1}.pt")
        torch.save(img_tensor, tensor_path)

# 调用函数，处理文件
source_folder = 'VirusShare_x86-64_WinEXE_20130711'
target_folder = 'processed_images'
convert_and_preprocess_images(source_folder, target_folder)
print("所有文件已处理并保存为Tensor格式。")

# 模型训练与测试
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 修改VGG16模型以接受1通道输入
class VGG16Gray(nn.Module):
    def __init__(self, num_classes=2):
        super(VGG16Gray, self).__init__()
        original_vgg = models.vgg16(pretrained=True)
        self.features = original_vgg.features
        self.features[0] = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.classifier = original_vgg.classifier
        self.classifier[6] = nn.Linear(4096, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, start_dim=1)
        x = self.classifier(x)
        return x

model = VGG16Gray().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# 训练模型
def train_model():
    model.train()
    for epoch in range(num_epochs):
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 测试模型
def test_model():
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f'Accuracy on test set: {(correct / total) * 100:.2f}%')

# 加载数据
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

train_dataset = datasets.ImageFolder(root='processed_images/train', transform=transform)
test_dataset = datasets.ImageFolder(root='processed_images/test', transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 设置超参数
num_epochs = 10
batch_size = 32

# 训练并测试模型
train_model()
test_model()


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /Users/guo/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100.0%


FileNotFoundError: [Errno 2] No such file or directory: 'processed_images/train'

#收集所有的良性文件
从本地电脑爬取，然后保存下来，下面是代码
目录是c盘
限定大小为大于2kb小于100mb
处理了错误，跳过访问权限错误和找不到的错误


In [None]:
import os
import shutil

def copy_exe_files(src_folder, dst_folder):
    # 确保目标目录存在
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

    file_count = 0  # 初始化文件计数器
    for root, dirs, files in os.walk(src_folder):
        dirs[:] = [d for d in dirs if os.access(os.path.join(root, d), os.R_OK)]  # 检查目录读取权限
        for file in files:
            if file.endswith('.exe'):
                src_file_path = os.path.join(root, file)
                try:
                    file_size = os.path.getsize(src_file_path)
                    if file_size < 2048 or file_size > 104857600:  # 跳过小于2KB或大于100MB的文件
                        continue
                except OSError:
                    continue  # 如果无法获取文件大小，则跳过此文件

                dst_file_path = os.path.join(dst_folder, file)  # 直接在目标文件夹下保存文件
                if not os.path.exists(dst_file_path):  # 检查是否已存在同名文件
                    try:
                        shutil.copy(src_file_path, dst_file_path)
                        file_count += 1  # 文件计数增加
                        print(f"Copied: {src_file_path} to {dst_file_path}")
                    except (PermissionError, FileNotFoundError):
                        continue  # 如果没有权限复制文件或文件不存在，则跳过

    print(f"Total .exe files copied: {file_count}")

dir_src = "C:\\"  # C盘根目录
dir_dst = "C:\\BenignFiles"  # 目标目录
copy_exe_files(dir_src, dir_dst)

In [12]:
import ssl
print(ssl.get_default_verify_paths())



DefaultVerifyPaths(cafile='/Library/Frameworks/Python.framework/Versions/3.12/etc/openssl/cert.pem', capath=None, openssl_cafile_env='SSL_CERT_FILE', openssl_cafile='/Library/Frameworks/Python.framework/Versions/3.12/etc/openssl/cert.pem', openssl_capath_env='SSL_CERT_DIR', openssl_capath='/Library/Frameworks/Python.framework/Versions/3.12/etc/openssl/certs')


In [10]:
import requests
from requests.exceptions import SSLError

try:
    response = requests.get('https://example.com', verify='/Library/Frameworks/Python.framework/Versions/3.12/etc/openssl/cert.pem')
    print(response.text)
except SSLError as e:
    print("SSL Error:", e)


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

仅仅作为特征提取器
修改


In [4]:
import os
import re
import ssl
import numpy as np
from PIL import Image
import torch
from torchvision import transforms, datasets, models
from torch.utils.data import DataLoader
import urllib.request

# 创建一个验证SSL证书的上下文
context = ssl.create_default_context()

# 使用这个上下文发起请求
response = urllib.request.urlopen('https://example.com', context=context)
print(response.read().decode())

def numerical_sort(value):
    numbers = re.findall(r'\d+', value)
    return int(numbers[0]) if numbers else float('inf')

def convert_and_preprocess_images(source_folder, target_folder):
    if not os.path.exists(target_folder):
        os.makedirs(target_folder)
    file_names = [f for f in os.listdir(source_folder) if re.match(r'文件\d+', f)]
    file_names.sort(key=numerical_sort)
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])
    for index, file_name in enumerate(file_names):
        file_path = os.path.join(source_folder, file_name)
        with open(file_path, 'rb') as file:
            file_bytes = file.read()
        image_data = np.zeros((256, 256), dtype=np.uint8)
        for i in range(min(65536, len(file_bytes))):
            row = i // 256
            col = i % 256
            image_data[row, col] = file_bytes[i]
        img = Image.fromarray(image_data, 'L')
        img_tensor = transform(img)
        tensor_path = os.path.join(target_folder, f"{index + 1}.pt")
        torch.save(img_tensor, tensor_path)
    print("所有文件已处理并保存为Tensor格式。")

source_folder = 'VirusShare_x86-64_WinEXE_20130711'
target_folder = 'processed_images'
convert_and_preprocess_images(source_folder, target_folder)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载预训练的 VGG16 模型，但只使用卷积层作为特征提取器
model = models.vgg16(pretrained=True).features.to(device)
model.eval()

def extract_features(data_loader):
    features = []
    with torch.no_grad():
        for images, _ in data_loader:
            images = images.to(device)
            outputs = model(images)
            features.extend(outputs.cpu().numpy())
    return features

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

dataset = datasets.ImageFolder(root='processed_images', transform=transform)
loader = DataLoader(dataset, batch_size=32, shuffle=False)

# 提取特征
features = extract_features(loader)
print("特征提取完成，特征维度：", features[0].shape)

# features 现在包含了每个图像通过 VGG 最后一个卷积层的输出


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

FileNotFoundError: Couldn't find any class folder in processed_images.

In [7]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载预训练的 VGG16 模型
model = models.vgg16(pretrained=True)

# 修改第一个卷积层以接受1通道输入
model.features[0] = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1).to(device)

# 使用卷积层部分作为特征提取器
model.features = model.features.to(device)
model.eval()

class TensorDataset(Dataset):
    def __init__(self, directory):
        super(TensorDataset, self).__init__()
        self.file_paths = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pt')]
        self.file_paths.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        tensor = torch.load(self.file_paths[idx])
        return tensor, 0  # 返回0作为伪标签，因为不进行分类

def extract_features(data_loader):
    features = []
    with torch.no_grad():
        for tensors, _ in data_loader:
            tensors = tensors.to(device)
            outputs = model(tensors)
            features.extend(outputs.cpu().numpy())
    return features

# 初始化数据集和数据加载器
dataset = TensorDataset('processed_images')
loader = DataLoader(dataset, batch_size=32, shuffle=False)

# 提取特征
features = extract_features(loader)
print("特征提取完成，特征维度：", features[0].shape if features else 'No features extracted')

# 输出信息，确保特征已被提取
if features:
    print(f"总共提取了 {len(features)} 组特征。")
else:
    print("未提取任何特征，请检查数据文件。")


特征提取完成，特征维度： (1000,)
总共提取了 997 组特征。


In [8]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import os
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载预训练的 VGG16 模型
model = models.vgg16(pretrained=True)
# 修改第一个卷积层以接受1通道输入
model.features[0] = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1).to(device)
# 使用卷积层部分作为特征提取器
model.features = model.features.to(device)
model.eval()

class TensorDataset(Dataset):
    def __init__(self, directory):
        super(TensorDataset, self).__init__()
        self.file_paths = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pt')]
        self.file_paths.sort(key=lambda x: int(x.split('/')[-1].split('.')[0]))

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        tensor = torch.load(self.file_paths[idx])
        return tensor, 0  # 返回0作为伪标签，因为不进行分类

def extract_features(data_loader):
    features = []
    with torch.no_grad():
        for tensors, _ in data_loader:
            tensors = tensors.to(device)
            outputs = model(tensors)
            features.extend(outputs.cpu().numpy())
    return features

# 初始化数据集和数据加载器
dataset = TensorDataset('processed_images')
loader = DataLoader(dataset, batch_size=32, shuffle=False)

# 提取特征
features = extract_features(loader)

# 添加的部分：保存提取的特征
if features:
    np.save('extracted_features.npy', np.array(features))
    print("特征已成功保存至 'extracted_features.npy'")
    print("特征提取完成，特征维度：", features[0].shape)
    print(f"总共提取了 {len(features)} 组特征。")
else:
    print("未提取任何特征，请检查数据文件。")


特征已成功保存至 'extracted_features.npy'
特征提取完成，特征维度： (1000,)
总共提取了 997 组特征。
