In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

class MLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

data = pd.read_excel('Dry_Bean_Dataset.xlsx')
data = data.dropna()

data = pd.read_excel('Dry_Bean_Dataset.xlsx')
print(data['Class'].value_counts())

# 1. 缺失值处理
data = data.dropna()

# 2. 异常值处理
# 只选取数值型列进行异常值处理
numeric_cols = data.select_dtypes(include=[np.number]).columns

Q1 = data[numeric_cols].quantile(0.25)
Q3 = data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# 对数值型列进行过滤，非数值型列保持原值
mask = ~((data[numeric_cols] < (Q1 - 1.5 * IQR)) | (data[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
data = data[mask]

# 定义特征和目标变量
X = data.drop('Class', axis=1)
y = data['Class']

# 对目标变量进行编码
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 3. 特征缩放 - 先使用 StandardScaler 进行标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. 特征缩放 - 使用 MinMaxScaler 进行归一化
min_max_scaler = MinMaxScaler()
X_train_normalized = min_max_scaler.fit_transform(X_train_scaled)
X_test_normalized = min_max_scaler.transform(X_test_scaled)

# 创建 PyTorch 数据集和数据加载器
X_train_tensor = torch.FloatTensor(X_train_normalized)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test_normalized)
y_test_tensor = torch.LongTensor(y_test)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

input_size = X_train_normalized.shape[1]
num_classes = len(set(y))

# 创建模型、损失函数和优化器
model = MLP(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 100
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# 测试模型
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.tolist())
        y_true.extend(targets.tolist())

# 输出结果
print("MLP - Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("MLP - Classification Report:")
print(classification_report(y_true, y_pred))


Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64
MLP - Confusion Matrix:
[[296  22   0   1   2   6]
 [  8 342   0   3   1   5]
 [  0   0 950   3  10  36]
 [  0  10   3 302   0  15]
 [  3   0  11   0 345   6]
 [  2   0 102   4  14 677]]
MLP - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.91      0.93       327
           1       0.91      0.95      0.93       359
           2       0.89      0.95      0.92       999
           3       0.96      0.92      0.94       330
           4       0.93      0.95      0.94       365
           5       0.91      0.85      0.88       799

    accuracy                           0.92      3179
   macro avg       0.93      0.92      0.92      3179
weighted avg       0.92      0.92      0.92      3179

