# 准备数据

In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 检查是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# 读取数据
data = pd.read_csv("../data/dataset.csv")

# 数据分割
data['target_class'] = pd.qcut(data['Cs'], q=10, labels=False)
X = data.drop(['Cs', 'target_class'], axis=1).values
y = data['Cs'].values
stratify_column = data['target_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=stratify_column)

X_train_categ = X_train[:, 8]  # 第九列为类别特征
X_train_cont = np.delete(X_train, 8, axis=1)  # 删除第九列，其他为连续特征

# 将 NumPy 数组转换为 PyTorch 张量
X_train_categ_torch = torch.tensor(X_train_categ, dtype=torch.long)  # 类别特征需要使用长整型
X_train_categ_torch = X_train_categ_torch.unsqueeze(1).to(device)  # 在最后一个维度添加1
X_train_cont_torch = torch.tensor(X_train_cont, dtype=torch.float).to(device)  # 连续特征使用浮点型
y_train_torch = torch.tensor(y_train, dtype=torch.float)  # 对于回归问题，通常使用浮点数
y_train_torch = y_train_torch.unsqueeze(1).to(device)

# 计算连续特征的均值和标准差
mean = X_train_cont_torch.mean(dim=0)
std = X_train_cont_torch.std(dim=0)
continuous_mean_std = torch.stack([mean, std], dim=1).to(device)

# 处理测试集
X_test_categ = X_test[:, 8]
X_test_cont = np.delete(X_test, 8, axis=1)
X_test_categ_torch = torch.tensor(X_test_categ, dtype=torch.long)
X_test_categ_torch = X_test_categ_torch.unsqueeze(1).to(device)
X_test_cont_torch = torch.tensor(X_test_cont, dtype=torch.float).to(device)
y_test_torch = torch.tensor(y_test, dtype=torch.float)
y_test_torch = y_test_torch.unsqueeze(1).to(device)

# 自定义 MAPE 损失函数

In [3]:
class MAPELoss(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        epsilon = 1e-8  # 避免除以零
        mape = torch.mean(torch.abs((targets - predictions) / (targets + epsilon))) * 100
        return mape

# 定义模型

In [4]:
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

# 我们有12个特征，其中有1个类别特征，11个连续值特征
# 类别特征每个有2个唯一值
categories = (2,)
num_continuous = 11

# 初始化 TabTransformer 模型
model = TabTransformer(
    categories=categories,
    num_continuous=num_continuous,
    dim=16,  # 默认维度为32
    dim_out=1,  # 回归问题的输出维度为1
    depth=6,  # 默认深度为6
    heads=8,  # 注意力机制的头数
    attn_dropout=0.01,  # 注意力机制的dropout
    ff_dropout=0.01,  # 前馈网络的的dropout
    mlp_hidden_mults=(1, 2, 4, 1),  # MLP隐藏层的倍数
    mlp_act=nn.ReLU(),  # MLP的激活函数, 默认为ReLU
    continuous_mean_std=continuous_mean_std,  # 连续值的均值和标准差
)

# 将模型移动到 GPU
model.to(device)

# 初始化损失函数
# mse_loss = nn.MSELoss()
mape_loss = MAPELoss().to(device)

# 定义优化器
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练模型

In [5]:
from function import metrics_to_dataframe, calculate_metrics

# 训练循环
num_epochs = 3000
best_loss = float('inf')
cumulative_loss = 0.0
model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_categ_torch, X_train_cont_torch)
    loss = mape_loss(outputs, y_train_torch)  # 使用MSE损失函数
    loss.backward()
    optimizer.step()
    cumulative_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        average_loss = cumulative_loss / 10
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')
        cumulative_loss = 0.0  # 重置累积损失

    if loss.item() < best_loss:
        best_loss = loss.item()
        torch.save(model.state_dict(), 'best_model.pth')  # 保存最佳模型

Epoch 10, Average Loss: 99.66233291625977
Epoch 20, Average Loss: 96.31603393554687
Epoch 30, Average Loss: 75.8355499267578
Epoch 40, Average Loss: 45.06487274169922
Epoch 50, Average Loss: 41.079642105102536
Epoch 60, Average Loss: 39.854240798950194
Epoch 70, Average Loss: 39.140023422241214
Epoch 80, Average Loss: 38.6615104675293
Epoch 90, Average Loss: 38.17184333801269
Epoch 100, Average Loss: 37.78788795471191
Epoch 110, Average Loss: 37.610953903198244
Epoch 120, Average Loss: 37.502573013305664
Epoch 130, Average Loss: 37.39039840698242
Epoch 140, Average Loss: 37.30012092590332
Epoch 150, Average Loss: 37.19655647277832
Epoch 160, Average Loss: 37.064575576782225
Epoch 170, Average Loss: 36.871530532836914
Epoch 180, Average Loss: 36.77138175964355
Epoch 190, Average Loss: 36.535007858276366
Epoch 200, Average Loss: 36.35811195373535
Epoch 210, Average Loss: 36.12175025939941
Epoch 220, Average Loss: 35.83407440185547
Epoch 230, Average Loss: 35.472109985351565
Epoch 240, Av

In [6]:
# 加载保存的最佳模型状态字典
model_state_dict = torch.load('best_model.pth', weights_only=True)
model.load_state_dict(model_state_dict)
# 评估模式
model.eval()
with torch.no_grad():
    predictions = model(X_train_categ_torch, X_train_cont_torch)
    print("Training Predictions:")
    print(predictions)

    # 计算训练集的指标
    train_metrics = calculate_metrics(y_train_torch.cpu().numpy(), predictions.cpu().numpy())
    print("Training Metrics:", train_metrics)

    # 对测试集进行预测
    test_predictions = model(X_test_categ_torch, X_test_cont_torch)
    test_metrics = calculate_metrics(y_test_torch.cpu().numpy(), test_predictions.cpu().numpy())
    print("Test Metrics:", test_metrics)

    # 将结果转换为DataFrame
    tab_transformer_metrics = metrics_to_dataframe(y_train_torch.cpu().numpy(), predictions.cpu().numpy(),
                                      y_test_torch.cpu().numpy(), test_predictions.cpu().numpy(), "TabTransformer").round(3)
    tab_transformer_metrics.to_csv('ann_metrics.csv', index=False)
    print(tab_transformer_metrics)

Training Predictions:
tensor([[ 43.6939],
        [165.9922],
        [ 69.9694],
        [ 35.4476],
        [ 73.6449],
        [166.1232],
        [ 90.8249],
        [142.6071],
        [162.4902],
        [ 77.6557],
        [ 51.6398],
        [ 77.3661],
        [142.5352],
        [ 72.0646],
        [ 50.6148],
        [ 48.9761],
        [110.5851],
        [ 45.4163],
        [110.9482],
        [121.6952],
        [ 66.2022],
        [146.0184],
        [ 91.6008],
        [ 41.5884],
        [104.5343],
        [ 81.2105],
        [ 61.7363],
        [ 49.0624],
        [103.0496],
        [ 51.4770],
        [ 99.0285],
        [ 60.5295],
        [128.6222],
        [ 74.3125],
        [102.8935],
        [ 23.8201],
        [ 83.6700],
        [108.1507],
        [119.4769],
        [109.2993],
        [ 84.2494],
        [ 54.6366],
        [ 44.7992],
        [102.3680],
        [ 53.1823],
        [ 58.6458],
        [ 49.3005],
        [122.1956],
        [ 14.7462]

In [8]:
# 保存训练集和测试集的预测结果（包含真实值）
tab_transformer_train = pd.DataFrame({'Actual': y_train_torch.cpu().numpy().squeeze(), 'Predicted': predictions.cpu().numpy().squeeze()})
tab_transformer_test = pd.DataFrame({'Actual': y_test_torch.cpu().numpy().squeeze(), 'Predicted': test_predictions.cpu().numpy().squeeze()})

tab_transformer_train.to_csv('tab_transformer_train.csv', index=False)
tab_transformer_test.to_csv('tab_transformer_test.csv', index=False)