In [1]:
import torch

# 检查是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# 读取数据

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# 读取数据
data = pd.read_csv("../data/dataset.csv")

# 数据分割
data['target_class'] = pd.qcut(data['Cs'], q=10, labels=False)
X = data.drop(['Cs', 'target_class'], axis=1).values
y = data['Cs'].values
stratify_column = data['target_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=stratify_column)

# 数据标准化
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 构建图数据对象, 转换数据为图数据结构

In [3]:
import numpy as np
from torch_geometric.data import Data

# 材料特性的索引和测试条件的索引
material_indices = list(range(7))  # 前七个特性
test_indices = list(range(7, 12))  # 后五个条件

# 构建边：仅为材料特性之间构建边
edges = []
for i in material_indices:
    for j in material_indices:
        if i != j:
            edges.append([i, j])

edges = np.array(edges).T  # 转置以匹配PyTorch Geometric的edge_index格式

# 转换为PyTorch张量
edge_index = torch.tensor(edges, dtype=torch.long)
X_train_torch = torch.tensor(X_train_scaled, dtype=torch.float)
y_train_torch = torch.tensor(y_train, dtype=torch.float).view(-1, 1)  # 确保y是列向量

# 创建图数据对象
train_data = Data(x=X_train_torch, edge_index=edge_index, y=y_train_torch).to(device)

# 打印数据对象信息，确认构建是否成功
print(train_data)

Data(x=[480, 12], edge_index=[2, 42], y=[480, 1])


## 定义模型

In [4]:
from torch import nn
from torch_geometric.nn import GCNConv

class MAPELoss(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        epsilon = 1e-8  # 避免除以零
        mape = torch.mean(torch.abs((targets - predictions) / (targets + epsilon))) * 100
        return mape

class GNN4TDL(nn.Module):
    def __init__(self, input_dim):
        super(GNN4TDL, self).__init__()
        self.conv1 = GCNConv(input_dim, 12)
        self.conv2 = GCNConv(12, 90)
        self.conv3 = GCNConv(90, 90)
        self.conv4 = GCNConv(90, 60)
        self.conv5 = GCNConv(60, 70)
        self.conv6 = GCNConv(70, 30)
        self.conv7 = GCNConv(30, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.conv3(x, edge_index)
        x = torch.relu(x)
        x = self.conv4(x, edge_index)
        x = torch.relu(x)
        x = self.conv5(x, edge_index)
        x = torch.relu(x)
        x = self.conv6(x, edge_index)
        x = torch.relu(x)
        x = self.conv7(x, edge_index)
        return x


In [5]:
from torch import optim

model = GNN4TDL(
    input_dim=X_train_scaled.shape[1]
)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# mse_loss = nn.MSELoss().to(device)
mape_loss = MAPELoss().to(device)

## 训练模型

In [6]:
num_epochs = 3000
best_loss = float('inf')
cumulative_loss = 0.0
model.train()

for epoch in range(num_epochs):
    model.zero_grad()
    out = model(train_data)
    loss = mape_loss(out, train_data.y)  # Modify as per your loss function, e.g., mape_loss
    loss.backward()
    optimizer.step()
    cumulative_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        average_loss = cumulative_loss / 10
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')
        cumulative_loss = 0.0  # Reset cumulative loss

    # Save the best model
    if loss.item() < best_loss:
        best_loss = loss.item()
        torch.save(model.state_dict(), 'gnn_best_model.pth')

Epoch 10, Average Loss: 99.34335861206054
Epoch 20, Average Loss: 95.8920280456543
Epoch 30, Average Loss: 83.20887985229493
Epoch 40, Average Loss: 59.976660537719724
Epoch 50, Average Loss: 54.005150985717776
Epoch 60, Average Loss: 47.51027717590332
Epoch 70, Average Loss: 42.405466079711914
Epoch 80, Average Loss: 38.30121841430664
Epoch 90, Average Loss: 34.87344970703125
Epoch 100, Average Loss: 32.11855621337891
Epoch 110, Average Loss: 29.76716556549072
Epoch 120, Average Loss: 27.826988983154298
Epoch 130, Average Loss: 26.03719367980957
Epoch 140, Average Loss: 24.484006118774413
Epoch 150, Average Loss: 23.12535228729248
Epoch 160, Average Loss: 21.841281700134278
Epoch 170, Average Loss: 20.718341255187987
Epoch 180, Average Loss: 19.788359451293946
Epoch 190, Average Loss: 18.85236930847168
Epoch 200, Average Loss: 17.870597076416015
Epoch 210, Average Loss: 16.9438720703125
Epoch 220, Average Loss: 16.117920303344725
Epoch 230, Average Loss: 15.415686511993409
Epoch 240, 

## 6.构建测试集图数据对象

In [7]:
from function import metrics_to_dataframe, calculate_metrics

# 加载最佳模型的状态字典
model.load_state_dict(torch.load('gnn_best_model.pth', weights_only=True))

# 将模型设置为评估模式
model.eval()
model.to(device)

# 转换测试数据为张量
X_test_torch = torch.tensor(X_test_scaled, dtype=torch.float)
y_test_torch = torch.tensor(y_test, dtype=torch.float).view(-1, 1)  # 确保y是列向量

# 创建测试集图数据对象
test_data = Data(x=X_test_torch, edge_index=edge_index, y=y_test_torch).to(device)

with torch.no_grad():
    # 对训练集进行预测
    out = model(train_data)
    print("训练集预测结果:")
    print(out)

    # 计算训练集的指标
    train_metrics = calculate_metrics(train_data.y.cpu().numpy(), out.cpu().numpy())
    print("训练集指标:", train_metrics)

    # 对测试集进行预测
    test_out = model(test_data)
    print("测试集预测结果:")
    print(test_out)

    # 计算测试集的指标
    test_metrics = calculate_metrics(test_data.y.cpu().numpy(), test_out.cpu().numpy())
    print("测试集指标:", test_metrics)

    # 保存指标到CSV文件
    metrics_df = metrics_to_dataframe(train_data.y.cpu().numpy(), out.cpu().numpy(),
                                      test_data.y.cpu().numpy(), test_out.cpu().numpy(), 'GNN')
    metrics_df.to_csv('gnn_metrics.csv', index=False)

    print(metrics_df)


训练集预测结果:
tensor([[ 66.8181],
        [ 66.8181],
        [ 66.8181],
        [ 66.8181],
        [ 66.8181],
        [ 66.8181],
        [ 66.8181],
        [139.8364],
        [155.8848],
        [ 80.1820],
        [ 51.9934],
        [ 78.3552],
        [146.4905],
        [ 71.2654],
        [ 55.6744],
        [ 49.4663],
        [114.6858],
        [ 45.2909],
        [113.2356],
        [121.6429],
        [ 66.9843],
        [139.6713],
        [ 91.2711],
        [ 41.2199],
        [103.9537],
        [ 81.0079],
        [ 61.6311],
        [ 54.3573],
        [102.4572],
        [ 51.7242],
        [ 93.5381],
        [ 60.6645],
        [124.4267],
        [ 74.9495],
        [103.5839],
        [ 24.1408],
        [ 82.8756],
        [106.7130],
        [119.7432],
        [114.1906],
        [ 82.1467],
        [ 63.7557],
        [ 48.1355],
        [100.3587],
        [ 53.2747],
        [ 58.9528],
        [ 49.2889],
        [129.6292],
        [ 14.7635],
        [12

In [9]:
# 保存训练集和测试集的预测结果（包含真实值）
train_predictions = pd.DataFrame({'Actual': train_data.y.cpu().detach().numpy().flatten(),
                                  'Predicted': model(train_data).cpu().detach().numpy().flatten()})
test_predictions = pd.DataFrame({'Actual': test_data.y.cpu().detach().numpy().flatten(),
                                 'Predicted': model(test_data).cpu().detach().numpy().flatten()})

train_predictions.to_csv('gnn_train_predictions.csv', index=False)
test_predictions.to_csv('gnn_test_predictions.csv', index=False)