# Train on noisy dataset

## Import the necessary packages

In [28]:
import copy
import csv
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
from torchvision.models import resnet50
from tqdm import tqdm

import utils
print("env is done")

env is done


## data preprocessing

In [6]:
# Per-channel mean / std shared by the train and test pipelines. These are the
# ImageNet statistics listed in the torchvision docs; Normalize standardises
# each channel with them (it does not map pixel values to [-1, 1]).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: tensor conversion and normalisation first, then the
# random augmentations (which therefore operate on normalised tensors).
train_transforms = transforms.Compose(
    [
        transforms.ToTensor(),  # PIL image -> float tensor
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
        transforms.RandomHorizontalFlip(),  # random horizontal mirror
        transforms.RandomErasing(scale=(0.04, 0.2), ratio=(0.5, 2)),  # random occlusion
        transforms.RandomCrop(32, padding=4),  # random crop after 4-pixel padding
    ]
)

# Evaluation pipeline: no augmentation, only conversion and normalisation.
test_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Directory where the dataset is stored (change to any directory you like).
data_path = '../data'

# Download (if necessary) and load CIFAR-10 through torchvision.
train_dataset = torchvision.datasets.CIFAR10(
    root=data_path,
    train=True,
    download=True,
    transform=train_transforms,
)
test_dataset = torchvision.datasets.CIFAR10(
    root=data_path,
    train=False,
    download=True,
    transform=test_transforms,
)

# Load the alternative label set; the 'worse_label' entry is used as the noisy
# training labels further down.
# NOTE(review): torch.load unpickles the file, so only load it from a trusted
# source.
noise_file = torch.load('../data/CIFAR-10_human.pt')
noisy_labels = noise_file['worse_label']


Files already downloaded and verified
Files already downloaded and verified


In [8]:
batch_size = 64

dataset = train_dataset

# Loader over the training set with its original labels.
train_loader_clean = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4)

# Build the noisy-label variant on a *shallow copy* of the dataset object.
# Assigning `.targets` on an alias of `train_dataset` would replace the labels
# of the clean dataset as well (DataLoader reads them lazily), so both loaders
# would end up serving the noisy labels. The copy shares the image data with
# the original; only the `targets` attribute is rebound on the copy.
train_dataset_noisy = copy.copy(dataset)
train_dataset_noisy.targets = noisy_labels
train_loader_noisy = DataLoader(train_dataset_noisy, batch_size, shuffle=True, num_workers=4)

# Evaluation loader: fixed order, no shuffling.
test_loader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=4)

### 不均衡数据集

In [None]:
# flag = True  -> draw a fresh class-imbalanced subset and save its indices.
# flag = False -> reuse the indices previously saved in selected_indices.npy.
flag = False
if flag:
    # Fraction of samples to keep for each class id (list position = class id).
    ration_classes = [0.1,
                    0.2,
                    0.3,
                    0.4,
                    0.5,
                    0.6,
                    0.7,
                    0.8,
                    0.9,
                    1]

    # Group sample indices by class. Labels are read directly from the dataset
    # rather than from a DataLoader: the loader yields shuffled *batches*, so
    # its enumeration index is a batch number (not a sample index) and
    # `label.item()` fails on a multi-element label tensor.
    # NOTE: this uses whatever labels `train_dataset.targets` holds right now.
    class_indices = defaultdict(list)
    for idx, label in enumerate(train_dataset.targets):
        class_indices[int(label)].append(idx)

    # Randomly keep the configured fraction of each class.
    new_indices = []
    for class_id, indices in class_indices.items():
        retain_count = int(len(indices) * ration_classes[class_id])
        retain_indices = np.random.choice(indices, retain_count, replace=False)
        new_indices.extend(retain_indices.tolist())
    # Persist the selection so later runs can reuse the same subset.
    np.save("selected_indices.npy", new_indices)
else:
    loaded_indices = np.load("selected_indices.npy").tolist()
    # The Subset construction below reads `new_indices`; without this binding
    # the flag=False path raises NameError.
    new_indices = loaded_indices

# Build the imbalanced training subsets from the selected indices.
inb_train_clean = Subset(train_dataset, new_indices)
inb_train_noisy = Subset(train_dataset_noisy, new_indices)

inb_train_clean_loader = DataLoader(inb_train_clean,batch_size=batch_size,shuffle=True,num_workers=4)
inb_train_noisy_loader = DataLoader(inb_train_noisy,batch_size=batch_size,shuffle=True,num_workers=4)

## Model Preprocessing

In [22]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def initial_model(device):
    """Create a pretrained ResNet-50 with a 10-way output layer on ``device``.

    The network is built with ``resnet50(pretrained=True)``; its final fully
    connected layer is replaced by a new ``nn.Linear`` with the same input
    width and 10 outputs, and the result is moved to ``device``.
    """
    net = resnet50(pretrained=True)
    in_width = net.fc.in_features
    net.fc = nn.Linear(in_width, 10)
    return net.to(device)


model = initial_model(device)



## Training strategy

In [23]:
# Loss function.
criterion = nn.CrossEntropyLoss()

# SGD with momentum and L2 weight decay over all model parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)

# Multiply the learning rate by 0.1 every 30 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

### train basic model

In [29]:
def start_train(num_epochs, save_name, model, train_loader, test_loader, criterion, optimizer, scheduler):
    """Train ``model`` with ``utils.train_base`` and log metrics after every epoch.

    Each epoch runs ``utils.train_base`` on ``train_loader`` and ``utils.test``
    on ``test_loader``, steps ``scheduler``, appends one row to
    ``log/{save_name}.csv`` and prints a summary line. Training stops early
    once the reported train accuracy exceeds 99.999. When the loop ends, the
    model's ``state_dict`` is saved to ``weight/{save_name}``.

    Args:
        num_epochs: Maximum number of epochs to run.
        save_name: Base name for the CSV log and the weight file.
        model: Network to train.
        train_loader: Loader passed to ``utils.train_base``.
        test_loader: Loader passed to ``utils.test``.
        criterion: Loss passed to both utility functions.
        optimizer: Optimizer passed to ``utils.train_base``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
    """
    os.makedirs("log",exist_ok=True)
    os.makedirs("weight",exist_ok=True)

    csv_file = f'log/{save_name}.csv'

    # Create (or truncate) the CSV file and write the header row.
    with open(csv_file, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["Epoch", "Train Loss", "Train Accuracy", "Test Loss", "Test Accuracy"])

    for epoch in range(num_epochs):
        train_loss, train_accuracy = utils.train_base(model, train_loader, criterion, optimizer, epoch)
        test_loss, test_accuracy = utils.test(model, test_loader, criterion)
        scheduler.step()

        # Append this epoch's row right away so the log survives an
        # interrupted run. The delimiter must match the header's (','),
        # otherwise the file is not a consistent CSV.
        with open(csv_file, "a", newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([epoch + 1, train_loss, train_accuracy, test_loss, test_accuracy])

        # Print the train / test results for this epoch.
        print(f'Epoch {epoch + 1}/{num_epochs} - '
            f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
            f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')

        # Stop once the training set is (practically) fit perfectly;
        # accuracy is presumably reported in percent - confirm in utils.
        if train_accuracy > 99.999:
            break

    torch.save(model.state_dict(), f"weight/{save_name}")
    print(f"训练过程已保存到 '{csv_file}' 文件中。模型权重文件已保存")

In [None]:
# Baseline run: up to 50 epochs on train_loader_clean, logged and saved under
# the name "base".
start_train(
    num_epochs=50,
    save_name="base",
    model=model,
    train_loader=train_loader_clean,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
)

### train ITLM

In [None]:
def start_train(num_epochs, save_name, model, train_loader, test_loader, criterion, optimizer, scheduler):
    """Train ``model`` with ``utils.train_ITLM`` and log metrics after every epoch.

    Each epoch runs ``utils.train_ITLM`` on ``train_loader`` and ``utils.test``
    on ``test_loader``, steps ``scheduler``, appends one row to
    ``log/{save_name}.csv`` and prints a summary line. Training stops early
    once the reported train accuracy exceeds 99.999. When the loop ends, the
    model's ``state_dict`` is saved to ``weight/{save_name}``.

    Args:
        num_epochs: Maximum number of epochs to run.
        save_name: Base name for the CSV log and the weight file.
        model: Network to train.
        train_loader: Loader passed to ``utils.train_ITLM``.
        test_loader: Loader passed to ``utils.test``.
        criterion: Loss passed to both utility functions.
        optimizer: Optimizer passed to ``utils.train_ITLM``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
    """
    os.makedirs("log",exist_ok=True)
    os.makedirs("weight",exist_ok=True)

    csv_file = f'log/{save_name}.csv'

    # Create (or truncate) the CSV file and write the header row.
    with open(csv_file, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["Epoch", "Train Loss", "Train Accuracy", "Test Loss", "Test Accuracy"])

    for epoch in range(num_epochs):
        train_loss, train_accuracy = utils.train_ITLM(model, train_loader, criterion, optimizer, epoch)
        test_loss, test_accuracy = utils.test(model, test_loader, criterion)
        scheduler.step()

        # Append this epoch's row right away so the log survives an
        # interrupted run. The delimiter must match the header's (','),
        # otherwise the file is not a consistent CSV.
        with open(csv_file, "a", newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([epoch + 1, train_loss, train_accuracy, test_loss, test_accuracy])

        # Print the train / test results for this epoch.
        print(f'Epoch {epoch + 1}/{num_epochs} - '
            f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
            f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')

        # Stop once the training set is (practically) fit perfectly;
        # accuracy is presumably reported in percent - confirm in utils.
        if train_accuracy > 99.999:
            break

    torch.save(model.state_dict(), f"weight/{save_name}")
    print(f"训练过程已保存到 '{csv_file}' 文件中。模型权重文件已保存")

In [None]:
# Fresh loss, optimizer and LR schedule for the ITLM run.
# NOTE(review): `model` itself is not re-created here, so this run appears to
# continue from whatever weights the previous cells left in it - confirm that
# is intended (initial_model(device) would give a fresh network).
# NOTE(review): the run uses train_loader_clean, not train_loader_noisy -
# verify this is the intended data for the ITLM experiment.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    momentum=0.9,
    weight_decay=5e-4,
)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

start_train(
    num_epochs=50,
    save_name="ITLM",
    model=model,
    train_loader=train_loader_clean,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
)

## 解决数据不平衡问题

In [None]:
def start_train(num_epochs, save_name, model, train_loader, test_loader, criterion, optimizer, scheduler):
    """Train ``model`` with ``utils.train_FL`` and log metrics after every epoch.

    Each epoch runs ``utils.train_FL`` on ``train_loader`` and ``utils.test``
    on ``test_loader``, steps ``scheduler``, appends one row to
    ``log/{save_name}.csv`` and prints a summary line. Training stops early
    once the reported train accuracy exceeds 99.999. When the loop ends, the
    model's ``state_dict`` is saved to ``weight/{save_name}``.

    Args:
        num_epochs: Maximum number of epochs to run.
        save_name: Base name for the CSV log and the weight file.
        model: Network to train.
        train_loader: Loader passed to ``utils.train_FL``.
        test_loader: Loader passed to ``utils.test``.
        criterion: Loss passed to both utility functions.
        optimizer: Optimizer passed to ``utils.train_FL``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
    """
    os.makedirs("log",exist_ok=True)
    os.makedirs("weight",exist_ok=True)

    csv_file = f'log/{save_name}.csv'

    # Create (or truncate) the CSV file and write the header row.
    with open(csv_file, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["Epoch", "Train Loss", "Train Accuracy", "Test Loss", "Test Accuracy"])

    for epoch in range(num_epochs):
        train_loss, train_accuracy = utils.train_FL(model, train_loader, criterion, optimizer, epoch)
        test_loss, test_accuracy = utils.test(model, test_loader, criterion)
        scheduler.step()

        # Append this epoch's row right away so the log survives an
        # interrupted run. The delimiter must match the header's (','),
        # otherwise the file is not a consistent CSV.
        with open(csv_file, "a", newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([epoch + 1, train_loss, train_accuracy, test_loss, test_accuracy])

        # Print the train / test results for this epoch.
        print(f'Epoch {epoch + 1}/{num_epochs} - '
            f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
            f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')

        # Stop once the training set is (practically) fit perfectly;
        # accuracy is presumably reported in percent - confirm in utils.
        if train_accuracy > 99.999:
            break

    torch.save(model.state_dict(), f"weight/{save_name}")
    print(f"训练过程已保存到 '{csv_file}' 文件中。模型权重文件已保存")