In [1]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from CNN import CNN
from tqdm import tqdm
import torch.nn as nn
from trainDataSet import trainDataSet
import matplotlib.pyplot as plt
import pandas as pd
torch.manual_seed(1)  # 使用随机化种子使神经网络的初始化每次都相同
from testDataSet import testDataSet
# 超参数
EPOCH = 5 # 训练整批数据的次数
BATCH_SIZE = 64
LR = 0.001  # 学习率

In [2]:
# correct_file_label=[]
def delete_error_img():
    """
    由于train数据集中图片噪音过多，现用官方数据训练完成的模型对一些标签错误的图片进行筛选，模型准确率已经在98%左右；
    如果模型预测的标签与实际标签不一致，则删除该图片
    """

    root_dir = './processed_data/train'  # 替换为数据集根目录路径
    dataset = trainDataSet(root_dir)
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    # 加载模型

    cnn = CNN()
    # 如果模型已经训练过，确保加载模型权重
    #该模型是用mnist官方数据集训练完成的
    cnn.load_state_dict(torch.load('cnn2.pkl'))
    # 将模型设置为评估模式
    cnn.eval()
    correct_index=[]#这是正确的绝对index
    for index, (data, label) in enumerate(tqdm(train_loader)):
        # 模型预测
        output = cnn(data)
        _, predicted = torch.max(output.data, 1)
        # 检查预测是否正确
        batch_start_index = index * train_loader.batch_size
        for idx, pred in enumerate(predicted):
            absolute_idx = batch_start_index + idx  # 计算在整个数据集中的索引
            if pred.item() == label[idx].item():
                correct_index.append(absolute_idx)
                # correct_file_label.append(pred.item())
              
    print("删除噪音数据中........大概2min")
    dataset.get_correct_data(correct_index)
    # dataset.set_labels(correct_file_label)
    # 创建一个新的 dataset
    new_dataset = train_loader.dataset
    return new_dataset


def train_for_new_model(train_loader):
    """
    dataloader是要训练的数据集
    通过cnn训练已经清理完成的数据集，来得到一个针对于该数据集的新模型。
    """
    cnn2 = CNN()
    # 优化器选择Adam
    optimizer = torch.optim.Adam(cnn2.parameters(), lr=LR)
    # 损失函数
    loss_func = nn.CrossEntropyLoss()  # 目标标签是one-hotted
    # 开始训练
    for epoch in range(EPOCH):
        for step, (b_x, b_y) in enumerate(tqdm(train_loader)):  # 分配batch data
            output = cnn2(b_x)  # 先将数据放到cnn中计算output
            loss = loss_func(output, b_y)  # 输出和真实标签的loss，二者位置不可颠倒
            optimizer.zero_grad()  # 清除之前学到的梯度的参数
            loss.backward()  # 反向传播，计算梯度
            optimizer.step()  # 应用梯度
    torch.save(cnn2.state_dict(), 'cnnLast.pkl')#保存新模型

In [3]:

newDataSet=delete_error_img()

100%|██████████| 938/938 [00:27<00:00, 34.41it/s]


删除噪音数据中........大概2min


In [None]:
len(newDataSet)

In [4]:
# newDataSet.set_labels(correct_file_label)
filename=newDataSet.data_files
labels=newDataSet.labels
correctFile=pd.DataFrame({"name":filename,"labels":labels}).to_csv("1.csv")

In [None]:
# root_dir = './processed_data/train'  # 替换为数据集根目录路径
# dataset1 = trainDataSet(root_dir)
# train_loader = DataLoader(dataset1, batch_size=BATCH_SIZE, shuffle=True)
# print(len(dataset1))
# old_filename=train_loader.dataset.data_files
# old_labels=train_loader.dataset.labels
# pd.DataFrame({"name":old_filename,"labels":old_labels}).to_csv("hahah1.csv")

In [6]:
cnn = CNN()
# 如果模型已经训练过，确保加载模型权重
#该模型是用mnist官方数据集训练完成的
cnn.load_state_dict(torch.load('cnn2.pkl'))
# 将模型设置为评估模式
cnn.eval()


CNN(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Dropout(p=0.5, inplace=False)
  )
  (out): Linear(in_features=1568, out_features=10, bias=True)
)

In [16]:
#看留下的数据集在原来的模型中准确率如何
from PIL import Image
correct_predictions = 0
total_samples = 0
root_dir = './processed_data/val'  # 替换为数据集根目录路径
valSet = trainDataSet(root_dir)
val_loader = DataLoader(valSet, batch_size=BATCH_SIZE, shuffle=False)
with torch.no_grad():
    # k=0
    for index,(data, labels) in enumerate(tqdm(val_loader)):
        outputs = cnn(data)
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        
        image = Image.fromarray(data.numpy().astype('uint8'), 'L')
        image.save('1.png')  # 图片将保存在 img 文件夹下
accuracy = correct_predictions / total_samples
print(f'模型在数据集上的准确率：{accuracy * 100:.2f}%')


100%|██████████| 2/2 [00:00<00:00,  2.81it/s]

模型在数据集上的准确率：0.00%





In [8]:
cnn2 = CNN()
# 如果模型已经训练过，确保加载模型权重
cnn2.load_state_dict(torch.load('cnn2.pkl'))
# 将模型设置为评估模式
cnn2.eval()
#将一个文件夹中的所有文件名写入到一个numpy数组中
folder_path = './processed_data/test/'
# target_folder = './png_images/new/'
test_dataset = testDataSet(folder_path)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
results =[]
with torch.no_grad():
    for index,data in enumerate(tqdm(test_loader)):
        # 获取预测结果
        outputs = cnn2(data)
        _, predicted = torch.max(outputs, 1)
        results.extend(predicted.cpu().numpy())
# 处理或保存测试结果
file_names=np.array([str(f)+".npy" for f in range(9900)])
predictions=pd.DataFrame({'fileNames':file_names,'predication':results})
predictions.to_csv('predictions.csv',index=False)

100%|██████████| 155/155 [00:05<00:00, 27.55it/s]


In [None]:
correct_predictions = 0
total_samples = 0
with torch.no_grad():
    for data, labels in tqdm(train_loader):
        outputs = cnn(data)
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()
accuracy = correct_predictions / total_samples
print(f'模型在数据集上的准确率：{accuracy * 100:.2f}%')

In [18]:
def delete_error_img(root_dir):
    """
    由于train数据集中图片噪音过多，现用官方数据训练完成的模型对一些标签错误的图片进行筛选，模型准确率已经在98%左右；
    如果模型预测的标签与实际标签不一致，则删除该图片
    """

    # root_dir = './processed_data/train'  # 替换为数据集根目录路径
    dataset = trainDataSet(root_dir)
    train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    # 加载模型

    cnn = CNN()
    # 如果模型已经训练过，确保加载模型权重
    # 该模型是用mnist官方数据集训练完成的
    cnn.load_state_dict(torch.load('cnn2.pkl'))
    # 将模型设置为评估模式
    cnn.eval()
    correct_index = []  # 这是正确的绝对index
    for index, (data, label) in enumerate(tqdm(train_loader)):
        # 模型预测
        output = cnn(data)
        _, predicted = torch.max(output.data, 1)
        # 检查预测是否正确
        batch_start_index = index * train_loader.batch_size
        for idx, pred in enumerate(predicted):
            absolute_idx = batch_start_index + idx  # 计算在整个数据集中的索引
            if pred.item() == label[idx].item():
                correct_index.append(absolute_idx)
                # correct_file_label.append(pred.item())

    print("删除噪音数据中........大概2min")
    dataset.get_correct_data(correct_index)
    # dataset.set_labels(correct_file_label)
    # 创建一个新的 dataset
    new_dataset = train_loader.dataset
    return new_dataset


44702

In [23]:
root_dir = './processed_data/val'  # 替换为数据集根目录路径
val_set=delete_error_img(root_dir)

100%|██████████| 2/2 [00:00<00:00, 33.49it/s]

删除噪音数据中........大概2min





In [24]:
len(val_set)

77

In [28]:
correct_predictions = 0
total_samples = 0
cnn3 = CNN()
# 如果模型已经训练过，确保加载模型权重
cnn3.load_state_dict(torch.load('cnnLast.pkl'))
# 将模型设置为评估模式
cnn3.eval()
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
with torch.no_grad():
    for data, labels in tqdm(val_loader):
        outputs = cnn3(data)
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()
accuracy = correct_predictions / total_samples
print(f'模型在val集上的准确率：{accuracy * 100:.2f}%')

100%|██████████| 2/2 [00:00<00:00, 26.30it/s]

模型在val集上的准确率：100.00%



