# 准备数据

In [1]:
import numpy as np
import pandas as pd
import torch

# 检查是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
from sklearn.model_selection import train_test_split

# 读取数据
data = pd.read_csv("../../data/dataset_reduced.csv")
data['target_class'] = pd.qcut(data['Cs'], q=10, labels=False)
X = data.drop(['Cs', 'target_class'], axis=1)
y = data['Cs']
stratify_column = data['target_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=stratify_column)

X_train_categ = X_train.iloc[:, 7].values  # 第九列为类别特征
X_train_cont = np.delete(X_train, 7, axis=1)  # 删除第九列，其他为连续特征

In [3]:
# 将 NumPy 数组转换为 PyTorch 张量
X_train_categ_tensor = torch.tensor(X_train_categ, dtype=torch.long)  # 类别特征需要使用长整型
X_train_categ_tensor = X_train_categ_tensor.unsqueeze(1).to(device)  # 在最后一个维度添加1
X_train_cont_tensor = torch.tensor(X_train_cont, dtype=torch.float).to(device)  # 连续特征使用浮点型
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float)  # 对于回归问题，通常使用浮点数
y_train_tensor = y_train_tensor.unsqueeze(1).to(device)

# 计算连续特征的均值和标准差
mean = X_train_cont_tensor.mean(dim=0)
std = X_train_cont_tensor.std(dim=0)
continuous_mean_std = torch.stack([mean, std], dim=1).to(device)

# 处理测试集
X_test_categ = X_test.iloc[:, 7].values
X_test_cont = np.delete(X_test, 7, axis=1)
X_test_categ_tensor = torch.tensor(X_test_categ, dtype=torch.long)
X_test_categ_tensor = X_test_categ_tensor.unsqueeze(1).to(device)
X_test_cont_tensor = torch.tensor(X_test_cont, dtype=torch.float).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float)
y_test_tensor = y_test_tensor.unsqueeze(1).to(device)

# 定义模型

In [4]:
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer
from torch_function import MAE_Loss

# 我们有9个特征，其中有1个类别特征，8个连续值特征
# 类别特征每个有2个唯一值
categories = (2,)
num_continuous = 8

# 初始化 TabTransformer 模型
model = TabTransformer(
    categories=categories,
    num_continuous=num_continuous,
    dim=18,  # 默认维度为32
    dim_out=1,  # 回归问题的输出维度为1
    depth=8,  # 默认深度为6
    heads=8,  # 注意力机制的头数
    attn_dropout=0.01,  # 注意力机制的dropout
    ff_dropout=0.01,  # 前馈网络的的dropout
    mlp_hidden_mults=(1, 4, 12, 1),  # MLP隐藏层的倍数
    mlp_act=nn.ReLU(),  # MLP的激活函数, 默认为ReLU
    continuous_mean_std=continuous_mean_std,  # 连续值的均值和标准差
)

# 将模型移动到 GPU
model.to(device)

# 初始化损失函数
criterion = MAE_Loss().to(device)

# 定义优化器
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练模型

In [5]:
# 训练循环
num_epochs = 3000
patience = 100  # 允许的最大连续未改进 epoch 数
epochs_without_improvement = 0  # 连续未改进的 epoch 数
best_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    model.zero_grad()
    outputs = model(X_train_categ_tensor, X_train_cont_tensor)
    loss = criterion(outputs, y_train_tensor)  # 使用MSE损失函数
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}, Loss: {running_loss:.4f}')

    # 计算验证损失
    model.eval()
    with torch.no_grad():
        y_val_pred = model(X_train_categ_tensor, X_train_cont_tensor).to(device)
        y_train_tensor = y_train_tensor.to(device)
        val_loss = criterion(y_val_pred, y_train_tensor).item()

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), "tab_transformer_best_model_hidden1241.pth")
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        break

Epoch 10, Loss: 88.8217
Epoch 20, Loss: 71.4939
Epoch 30, Loss: 43.1803
Epoch 40, Loss: 35.2271
Epoch 50, Loss: 32.3522
Epoch 60, Loss: 31.0799
Epoch 70, Loss: 30.2905
Epoch 80, Loss: 30.2007
Epoch 90, Loss: 30.1266
Epoch 100, Loss: 29.9553
Epoch 110, Loss: 29.9256
Epoch 120, Loss: 29.7320
Epoch 130, Loss: 29.5943
Epoch 140, Loss: 29.5612
Epoch 150, Loss: 29.2606
Epoch 160, Loss: 29.1201
Epoch 170, Loss: 28.9964
Epoch 180, Loss: 28.7370
Epoch 190, Loss: 28.4926
Epoch 200, Loss: 28.1645
Epoch 210, Loss: 27.6468
Epoch 220, Loss: 27.1129
Epoch 230, Loss: 26.4438
Epoch 240, Loss: 25.5785
Epoch 250, Loss: 24.6552
Epoch 260, Loss: 23.8551
Epoch 270, Loss: 23.2414
Epoch 280, Loss: 22.8040
Epoch 290, Loss: 22.6523
Epoch 300, Loss: 22.0220
Epoch 310, Loss: 22.1165
Epoch 320, Loss: 21.7318
Epoch 330, Loss: 21.6486
Epoch 340, Loss: 21.5430
Epoch 350, Loss: 21.5381
Epoch 360, Loss: 21.5023
Epoch 370, Loss: 21.3251
Epoch 380, Loss: 21.2013
Epoch 390, Loss: 21.0922
Epoch 400, Loss: 20.9923
Epoch 410

In [6]:
from function import metrics_to_dataframe

# 加载最佳模型的状态字典
model.load_state_dict(torch.load("tab_transformer_best_model_hidden1241.pth", weights_only=True))

# 将模型设置为评估模式
model.eval()
model.to(device)

with torch.no_grad():
    # 对训练集进行预测
    predictions = model(X_train_categ_tensor, X_train_cont_tensor)
    # 对测试集进行预测
    test_predictions = model(X_test_categ_tensor, X_test_cont_tensor)

    # 将结果转换为DataFrame
    tab_transformer_metrics = metrics_to_dataframe(
        y_train_tensor.cpu().numpy(), predictions.cpu().numpy(),
        y_test_tensor.cpu().numpy(), test_predictions.cpu().numpy(), "TabTransformer").round(3)
    tab_transformer_metrics.to_csv('TabTransformer_metrics.csv', index=False)

tab_transformer_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,TabTransformer,0.923,5.652,5.822,11.144,0.84,10.888,15.56,16.617001


In [7]:
# 保存训练集和测试集的预测结果（包含真实值）
tab_transformer_train = pd.DataFrame({'Actual': y_train_tensor.cpu().numpy().squeeze(), 'Predicted': predictions.cpu().numpy().squeeze()})
tab_transformer_test = pd.DataFrame({'Actual': y_test_tensor.cpu().numpy().squeeze(), 'Predicted': test_predictions.cpu().numpy().squeeze()})

tab_transformer_train.to_csv('tab_transformer_train.csv', index=False)
tab_transformer_test.to_csv('tab_transformer_test.csv', index=False)