In [33]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [34]:
class NPYDataset(Dataset):
    """Lazily loaded dataset of ``.npy`` samples stored in per-label folders.

    ``root_dir`` is expected to contain sub-folders named ``0`` ... ``9``; every
    ``.npy`` file found in folder ``k`` becomes one sample with integer label ``k``.
    Only the file paths are kept in memory; arrays are read from disk on access.
    """

    def __init__(self, root_dir):
        """
        Index every ``.npy`` file below ``root_dir``.

        :param root_dir: root directory containing the label sub-folders (0-9).
            Label folders that do not exist are skipped.
        """
        self.data_files = []
        self.labels = []

        for label in range(10):  # labels are assumed to be 0..9
            folder_path = os.path.join(root_dir, str(label))
            if not os.path.isdir(folder_path):
                continue
            # os.listdir() returns entries in arbitrary order; sorting makes the
            # sample index -> file mapping reproducible across runs and machines.
            for file in sorted(os.listdir(folder_path)):
                if file.endswith('.npy'):
                    # Store the path only; the array is loaded in __getitem__.
                    self.data_files.append(os.path.join(folder_path, file))
                    # The folder name is the class label.
                    self.labels.append(label)

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.data_files)

    def __getitem__(self, idx):
        """
        Load sample ``idx`` from disk.

        The stored array is summed over its first axis, then a leading
        dimension of size 1 is added and the result is cast to float32.
        NOTE(review): presumably the first axis is a stack of frames/channels
        that is collapsed into one image - confirm against the preprocessing.

        :return: ``(tensor, label)`` where ``label`` is a Python int.
        """
        data = np.load(self.data_files[idx])
        data = np.sum(data, axis=0)
        label = self.labels[idx]
        return torch.from_numpy(data).unsqueeze(0).float(), label

    def delete_items(self, indices):
        """
        Remove the samples at the given positions (in place).

        :param indices: any iterable of integer positions; duplicates are fine.
        """
        # A set gives O(1) membership tests (the list scan was O(n*m)) and lets
        # one-shot iterables such as generators be used for both lists below.
        drop = set(indices)
        self.data_files = [d for i, d in enumerate(self.data_files) if i not in drop]
        self.labels = [l for i, l in enumerate(self.labels) if i not in drop]

In [35]:
# Root directory that holds one sub-folder per class label; point this at your own dataset.
root_dir = './processed_data/train-modify'
dataset = NPYDataset(root_dir=root_dir)


In [36]:
# Shuffled mini-batch loader over the whole dataset. Because shuffle=True, the position of a
# sample in the batch stream does not correspond to its index inside `dataset`.
trainloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

In [37]:

from CNN import CNN
batch_size = 64

# Build the network and restore the weights of the previously trained model.
cnn = CNN()
# map_location='cpu' lets a checkpoint that was saved from a GPU be restored on a CPU-only
# machine; the freshly constructed model lives on the CPU anyway.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
cnn.load_state_dict(torch.load('cnn2.pkl', map_location='cpu'))
# Switch to evaluation mode (the returned module is displayed as the cell output).
cnn.eval()

CNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (out): Linear(in_features=1568, out_features=10, bias=True)
)

In [38]:
from tqdm import tqdm

# Collect the dataset indices of every sample the pretrained model misclassifies.
#
# The scan must run over a NON-shuffled loader: the index is reconstructed as
# batch_number * batch_size + offset, which only equals the dataset index when batches are
# drawn in order. Iterating the shuffled `trainloader` would flag arbitrary samples.
scan_loader = DataLoader(dataset, batch_size=64, shuffle=False)

delect_index = []
with torch.no_grad():  # inference only - no autograd graph needed
    for index, (data, label) in enumerate(tqdm(scan_loader)):
        # Predicted class = arg-max over the model outputs.
        output = cnn(data)
        _, predicted = torch.max(output, 1)
        # Offsets (within this batch) of the wrong predictions, shifted to dataset indices.
        batch_start_index = index * scan_loader.batch_size
        wrong_offsets = (predicted != label).nonzero(as_tuple=True)[0]
        delect_index.extend((wrong_offsets + batch_start_index).tolist())
print(len(delect_index))

100%|██████████| 938/938 [00:21<00:00, 43.67it/s]

15313





In [54]:

from torch.utils.data import Subset

# Keep only the samples that were NOT flagged in `delect_index`.
# A Subset view is used instead of deleting from `dataset` in place: in-place deletion shifts
# the remaining indices, so re-running the cell would silently remove a different set of
# samples each time. Building a view is idempotent and leaves `dataset` intact.
flagged = set(delect_index)
kept_indices = [i for i in range(len(dataset)) if i not in flagged]
filtered_dataset = Subset(dataset, kept_indices)

# New DataLoader used for training on the filtered data.
new_dataloader = DataLoader(filtered_dataset, batch_size=64, shuffle=True)


In [69]:
# Sanity check: display the first (tensor, label) pair of the dataset behind new_dataloader.
new_dataloader.dataset[0]

(tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 

In [65]:

# Seed PyTorch's RNG so that the network initialisation is identical on every run.
torch.manual_seed(1)

# Hyperparameters:
#   EPOCH      - number of full passes over the training data
#   BATCH_SIZE - mini-batch size
#   LR         - learning rate
EPOCH, BATCH_SIZE, LR = 5, 64, 0.001


In [49]:
import torch.nn as nn

# Fresh, untrained CNN to be fitted on the filtered data.
# Pipeline: conv -> ReLU -> max-pool, conv -> ReLU -> max-pool,
#           flatten the feature maps -> fully connected layer -> class scores.
cnn2 = CNN()

# Training objective: cross-entropy between the class scores and the targets. The targets
# passed in the training loop are integer class indices, not one-hot vectors.
loss_func = nn.CrossEntropyLoss()

# Adam optimiser over all parameters of the new model.
optimizer = torch.optim.Adam(params=cnn2.parameters(), lr=LR)

In [72]:
# test_x=torch.unsqueeze(new_dataloader.dataset, dim=1).type(torch.FloatTensor)[:2000] / 255
# test_y=new_dataloader.dataset.labels[:2000] 

TypeError: unsqueeze(): argument 'input' (position 1) must be Tensor, not NPYDataset

In [73]:

# Training: EPOCH passes over the filtered data, one optimiser update per mini-batch.
for epoch in range(EPOCH):
    for step, (batch_inputs, batch_labels) in enumerate(new_dataloader):
        optimizer.zero_grad()  # drop gradients left over from the previous step
        logits = cnn2(batch_inputs)  # forward pass
        loss = loss_func(logits, batch_labels)  # argument order matters: (prediction, target)
        loss.backward()  # back-propagate to compute gradients
        optimizer.step()  # apply the update

        # Disabled periodic evaluation (needs test_x / test_y, which are not defined):
        # if step % 50 == 0:
        #     test_output = cnn2(test_x)
        #     pred_y = torch.max(test_output, 1)[1].data.numpy()
        #     accuracy = float((pred_y == test_y.data.numpy()).astype(int).sum()) / float(test_y.size(0))
        #     print('Epoch: ', int(epoch), '| train loss: %.4f' % loss.data.numpy(), '| test accuracy: %.2f' % accuracy)

# Persist the trained weights.
torch.save(cnn2.state_dict(), 'cnnLast.pkl')


In [80]:
from testDataSet import testDataSet
import matplotlib.pyplot as plt
import pandas as pd
batch_size = 64

# Rebuild the network and restore the weights written by the training cell.
cnn2 = CNN()
# map_location='cpu' keeps the load working on CPU-only machines even if the checkpoint
# was produced on a GPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
cnn2.load_state_dict(torch.load('cnnLast.pkl', map_location='cpu'))
# Evaluation mode for inference.
cnn2.eval()

# Test data location, and the folder used by the (disabled) image dump in the inference cell.
folder_path = './processed_data/test/'
target_folder = './png_images/new/'
test_dataset = testDataSet(folder_path)
# shuffle=False so that predictions come out in dataset order.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
results = []

In [82]:

# Start from an empty list every time this cell runs; otherwise a re-run appends a second
# copy of the predictions and the export step no longer lines up with the file names.
results = []
with torch.no_grad():  # inference only
    for batch_idx, data in enumerate(tqdm(test_loader)):
        # Predicted class = arg-max over the model outputs.
        outputs = cnn2(data)
        _, predicted = torch.max(outputs, 1)

        # Disabled debugging aid: dump every image with its prediction in the file name.
        # for i, (image, pred) in enumerate(zip(data, predicted)):
        #     image_np = image.squeeze().numpy()  # assumes single-channel images
        #     file_name = f"batch{batch_idx}_img{i}_pred{pred.item()}.png"
        #     plt.imsave(os.path.join(target_folder, file_name), image_np, cmap='gray')

        results.extend(predicted.cpu().numpy())


100%|██████████| 155/155 [01:03<00:00,  2.45it/s]


In [84]:

# Write the predictions to CSV, one row per test file.
# The number of rows is taken from the dataset instead of a hard-coded 9900, and a length
# mismatch (e.g. predictions accumulated twice) is reported explicitly.
# NOTE(review): assumes testDataSet yields samples in the order 0.npy, 1.npy, ... - confirm
# against the testDataSet implementation.
num_samples = len(test_dataset)
if len(results) != num_samples:
    raise ValueError(
        f"expected {num_samples} predictions (one per test sample), got {len(results)}"
    )
file_names = np.array([f"{i}.npy" for i in range(num_samples)])
predictions = pd.DataFrame({'fileNames': file_names, 'predication': results})
predictions.to_csv('predictionsLast.csv', index=False)