In [107]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [108]:
# python 3.9 
import random
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split
from sklearn.mixture import GaussianMixture
from torch.utils.data import DataLoader, TensorDataset
from torchvision.transforms import ToTensor

数据集划分处理

In [109]:
# Seed value used to initialise the random number generators
seed = 42
def seed_torch(seed=0):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed passed to ``random``, ``numpy.random`` and
            ``torch`` (CPU and every CUDA device). Its string form is also
            written to the ``PYTHONHASHSEED`` environment variable.
    """
    # Set with the aim of disabling hash randomisation for reproducibility.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each library-level generator;
    # manual_seed_all covers the multi-GPU case.
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Give up cuDNN autotuning in exchange for repeatable kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_torch(seed)

# Every image is converted to a tensor when it is read from the dataset
transform = ToTensor()

# Load (downloading if necessary) the MNIST training and test sets
mnist_train, mnist_test = (
    MNIST(root='./', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Seeded split of the training set into 50,000 training / 10,000 validation samples
generator = torch.Generator().manual_seed(seed)
split_sizes = [50000, 10000]
train_dataset, val_dataset = random_split(mnist_train, split_sizes, generator=generator)



用kmeans划分簇

In [110]:
# Flatten every training image into a 1-D NumPy vector
train_images = [img.view(-1).numpy() for img, _ in train_dataset]

# Project the flattened images onto 2 principal components.
# random_state pins PCA's randomized SVD solver (used when that solver is
# selected), so re-running this cell gives the same embedding instead of
# depending on the current state of the global NumPy RNG.
pca = PCA(n_components=2, random_state=seed)
embedded_train_images = pca.fit_transform(train_images)

# Cluster the 2-D embedding into 10 groups
kmeans = KMeans(n_clusters=10, random_state=seed)
cluster_labels = kmeans.fit_predict(embedded_train_images)

# Positions (within train_dataset) of the images to look up.
# NOTE(review): a later cell uses these same numbers to index the full
# `mnist_train`, while here they index the 50,000-sample `train_dataset`
# split, so the two cells may refer to different images. Confirm which
# index space is intended.
target_indices = [1173, 3336, 12529, 12785, 12979, 17351, 27048, 40579, 43128, 46498]

# Cluster id assigned to each target image
target_clusters = [cluster_labels[idx] for idx in target_indices]

# Map cluster id -> positions of all training images assigned to that cluster,
# restricted to clusters that contain at least one target image.
# A set gives O(1) membership checks across the 50,000 samples.
target_cluster_set = set(target_clusters)
pseudo_labels = {}
for i, cluster_label in enumerate(cluster_labels):
    if cluster_label in target_cluster_set:
        pseudo_labels.setdefault(cluster_label, []).append(i)




置信度筛选

In [111]:
# Filter threshold: below 1.0 it is compared against the distance to the
# nearest centroid; at 1.0 or above it is used as a keep-fraction for ranking.
threshold = 0.7

# Positions (within train_dataset) of the samples that pass the filter
filtered_indices = []

for cluster_label, indices in pseudo_labels.items():
    cluster_images = [train_images[idx] for idx in indices]
    # kmeans.transform returns, per sample, the distance to every centroid;
    # the row minimum is the distance to the sample's nearest centroid.
    cluster_distances = kmeans.transform(pca.transform(cluster_images))
    nearest_distance = np.min(cluster_distances, axis=1)

    if threshold < 1.0:
        # These are KMeans distances, not Gaussian-mixture probabilities.
        # NOTE(review): the original comparison is preserved. It keeps
        # samples that lie FARTHER than `threshold` from their nearest
        # centroid, which looks inverted for a confidence filter. Confirm
        # whether `<` (or a real probability) was intended.
        kept = [i for i, dist in enumerate(nearest_distance) if dist > threshold]
    else:
        # Rank the samples of this cluster by distance to the nearest centroid
        # and keep the closest ones. The original argsorted along the centroid
        # axis and used those column numbers as sample positions, which
        # selected the wrong samples. With threshold >= 1.0 the cap below
        # means the whole cluster is kept.
        num_filtered = min(int(len(indices) * threshold), len(indices))
        kept = np.argsort(nearest_distance)[:num_filtered]

    filtered_indices.extend(indices[i] for i in kept)

# Always include the target images; sort so the order is explicit and stable
final_indices = sorted(set(target_indices).union(filtered_indices))

# New training set built from the selected positions.
# NOTE(review): every sample keeps the label returned by `train_dataset`
# (the ground-truth label), not a label derived from its cluster. Confirm
# that using true labels here is intended.
new_train_dataset = []
for idx in final_indices:
    img, label = train_dataset[idx]
    new_train_dataset.append((img, label))

    

先处理小数据集

In [112]:
# Positions in mnist_train of the images whose labels may be used.
# NOTE(review): the clustering cell applies these same numbers to the
# 50,000-sample `train_dataset` split rather than to `mnist_train`, so the two
# cells may refer to different images. Confirm which index space is intended.
allowed_indices = [1173, 3336, 12529, 12785, 12979, 17351, 27048, 40579, 43128, 46498]

# Batch size for the three loaders below. It was not assigned anywhere before
# this cell, so a fresh "Restart & Run All" raised NameError; 64 matches the
# batch size used by the later training cell.
batch_size = 64

# Fetch each allowed (image, label) pair once and pack them into a TensorDataset
filtered_data = [mnist_train[i] for i in allowed_indices]
filtered_images, filtered_labels = zip(*filtered_data)
filtered_dataset = torch.utils.data.TensorDataset(torch.stack(filtered_images),
                                                  torch.tensor(filtered_labels))

# 80/20 train/validation split. A dedicated seeded generator makes the split
# identical every time this cell is re-run.
# NOTE(review): this rebinds `train_dataset`/`val_dataset`, replacing the
# earlier 50,000/10,000 split; cells above must not be re-run after this one.
train_size = int(0.8 * len(filtered_dataset))
val_size = len(filtered_dataset) - train_size
train_dataset, val_dataset = random_split(filtered_dataset, [train_size, val_size],
                                          generator=torch.Generator().manual_seed(seed))

# Loaders for the training, validation and test sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=8, persistent_workers=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(mnist_test, batch_size=batch_size, shuffle=False)



建立CNN模型


In [113]:

class CNN(nn.Module):
    """Small convolutional classifier with 10 output logits.

    Two conv -> ReLU -> 2x2 max-pool stages (1 -> 16 -> 32 channels) feed a
    128-unit hidden layer and a 10-way output layer. ``fc1`` expects
    ``32 * 7 * 7`` features, i.e. a 7x7 map after the two poolings
    (e.g. 28x28 single-channel inputs).
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 3x3 convolution with padding 1, then halve the resolution
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16,
                               kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        # Stage 2: same layout with 16 -> 32 channels
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32,
                               kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        # Classifier head
        self.fc1 = nn.Linear(in_features=32 * 7 * 7, out_features=128)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return the raw class logits for a batch of images ``x``."""
        features = self.pool1(self.relu1(self.conv1(x)))
        features = self.pool2(self.relu2(self.conv2(features)))
        # Collapse everything except the batch dimension
        flat = features.view(features.size(0), -1)
        hidden = self.relu3(self.fc1(flat))
        return self.fc2(hidden)

小数据集的训练和测试

In [114]:
# Fresh network, Adam optimiser and cross-entropy loss
model = CNN()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
total_epochs = 10

for epoch in range(total_epochs):
    # ---- optimisation pass over the training loader ----
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.max(1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Loss is averaged over batches, accuracy over samples
    train_loss = running_loss / len(train_loader)
    train_acc = 100.0 * correct / total

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = outputs.max(1)[1]
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss = val_loss / len(val_loader)
    val_acc = 100.0 * correct / total

    print(f'Epoch {epoch+1}/{total_epochs} - Training Loss: {train_loss:.4f} - Training Acc: {train_acc:.2f}% - '
          f'Validation Loss: {val_loss:.4f} - Validation Acc: {val_acc:.2f}%')

    # ---- test pass, executed at the end of every epoch ----
    model.eval()
    test_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            predicted = outputs.max(1)[1]
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_loss = test_loss / len(test_loader)
    test_acc = 100.0 * correct / total

    print(f'Test Loss: {test_loss:.4f} - Test Acc: {test_acc:.2f}%')

Epoch 1/10 - Training Loss: 2.2942 - Training Acc: 0.00% - Validation Loss: 2.3231 - Validation Acc: 0.00%
Test Loss: 2.3061 - Test Acc: 10.09%
Epoch 2/10 - Training Loss: 2.2272 - Training Acc: 25.00% - Validation Loss: 2.3520 - Validation Acc: 0.00%
Test Loss: 2.3087 - Test Acc: 10.09%
Epoch 3/10 - Training Loss: 2.1688 - Training Acc: 25.00% - Validation Loss: 2.3917 - Validation Acc: 0.00%
Test Loss: 2.3183 - Test Acc: 10.09%
Epoch 4/10 - Training Loss: 2.0977 - Training Acc: 25.00% - Validation Loss: 2.4539 - Validation Acc: 0.00%
Test Loss: 2.3402 - Test Acc: 10.09%
Epoch 5/10 - Training Loss: 2.0195 - Training Acc: 25.00% - Validation Loss: 2.5374 - Validation Acc: 0.00%
Test Loss: 2.3805 - Test Acc: 10.09%
Epoch 6/10 - Training Loss: 1.9360 - Training Acc: 25.00% - Validation Loss: 2.6530 - Validation Acc: 0.00%
Test Loss: 2.4452 - Test Acc: 10.09%
Epoch 7/10 - Training Loss: 1.8526 - Training Acc: 25.00% - Validation Loss: 2.8053 - Validation Acc: 0.00%
Test Loss: 2.5379 - Tes

新数据集的训练和测试

In [115]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader``.

    Args:
        model: Network to train or evaluate.
        loader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None``, the model is put in eval mode
            and no gradients are tracked.

    Returns:
        Tuple ``(loss, accuracy)``: loss averaged over batches and accuracy in
        percent over samples.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, total = 0.0, 0, 0

    with torch.set_grad_enabled(training):
        for images, labels in loader:
            images = images.to(device)
            # Cast once here so train, validation and test all feed the loss
            # the same label dtype.
            labels = labels.to(device).long()

            if training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = outputs.max(1)[1]
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    return loss_sum / len(loader), 100.0 * correct / total


# Split the filtered set: 10,000 samples for training, the rest for validation
train_size = 10000
if len(new_train_dataset) <= train_size:
    # Fail early with a clear message instead of an obscure split or
    # division-by-zero error further down.
    raise ValueError(
        f'new_train_dataset has {len(new_train_dataset)} samples; more than '
        f'{train_size} are required for the train/validation split')
train_datas, val_datas = random_split(new_train_dataset,
                                      [train_size, len(new_train_dataset) - train_size],
                                      generator=generator)
train_loader = DataLoader(train_datas, batch_size=64, shuffle=True, num_workers=8,
                          persistent_workers=True)
val_loader = DataLoader(val_datas, batch_size=64, shuffle=False)
test_loader = DataLoader(mnist_test, batch_size=64, shuffle=False)

# Fresh network, Adam optimiser and cross-entropy loss
model = CNN()
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion = nn.CrossEntropyLoss()

# Train on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
total_epochs = 40

# The original moved `images`/`labels` to the device before the batch loop,
# which only worked because those names were left over from the previous
# cell. That stale statement is removed.
for epoch in range(total_epochs):
    train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

    print(f'Epoch {epoch+1}/{total_epochs} - Training Loss: {train_loss:.4f} - Training Acc: {train_acc:.2f}% - '
          f'Validation Loss: {val_loss:.4f} - Validation Acc: {val_acc:.2f}%')

    # Evaluate on the test set at the end of every epoch
    test_loss, test_acc = _run_epoch(model, test_loader, criterion, device)

    print(f'Test Loss: {test_loss:.4f} - Test Acc: {test_acc:.2f}%')

Epoch 1/40 - Training Loss: 0.4600 - Training Acc: 85.08% - Validation Loss: 0.1519 - Validation Acc: 95.47%
Test Loss: 0.1771 - Test Acc: 94.33%
Epoch 2/40 - Training Loss: 0.1126 - Training Acc: 96.74% - Validation Loss: 0.0973 - Validation Acc: 97.06%
Test Loss: 0.1006 - Test Acc: 96.83%
Epoch 3/40 - Training Loss: 0.0682 - Training Acc: 97.95% - Validation Loss: 0.0796 - Validation Acc: 97.73%
Test Loss: 0.0910 - Test Acc: 96.96%
Epoch 4/40 - Training Loss: 0.0524 - Training Acc: 98.50% - Validation Loss: 0.0783 - Validation Acc: 97.61%
Test Loss: 0.0831 - Test Acc: 97.18%
Epoch 5/40 - Training Loss: 0.0377 - Training Acc: 98.83% - Validation Loss: 0.0630 - Validation Acc: 98.28%
Test Loss: 0.0675 - Test Acc: 97.76%
Epoch 6/40 - Training Loss: 0.0289 - Training Acc: 99.18% - Validation Loss: 0.0588 - Validation Acc: 98.21%
Test Loss: 0.0622 - Test Acc: 97.95%
Epoch 7/40 - Training Loss: 0.0252 - Training Acc: 99.15% - Validation Loss: 0.0729 - Validation Acc: 97.95%
Test Loss: 0.07