#### 加载数据

transformers==4.40.0

torch==1.11.0+cu113

torch-geometric==2.5.2

torch-scatter==2.0.9

torch-sparse==0.6.13

#### 训练模型

In [1]:
import pickle
# Read the pre-built HC3 training split from disk.
with open("/root/autodl-tmp/graph_data/hc3_train.pkl", "rb") as train_fh:
    hc3_train = pickle.load(train_fh)

# Read the pre-built HC3 validation split from disk.
with open("/root/autodl-tmp/graph_data/hc3_val.pkl", "rb") as val_fh:
    hc3_val = pickle.load(val_fh)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from datetime import datetime
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from tqdm import tqdm
import time
import math

# GCN model definitions
class GCN2(nn.Module):
    """Two-layer graph convolutional classifier that emits one probability per graph.

    Node features pass through two ``GCNConv`` layers, each followed by ReLU
    and dropout (p=0.5). A linear layer then maps every node to one logit, the
    node logits are averaged over dimension 0, and a sigmoid is applied.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the first graph convolution.
        output_dim: Output size of the second graph convolution.
    """

    def __init__(self,  input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, data):
        """Return the sigmoid of the node-averaged logit for one graph.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with a leading dimension of 1 (the node axis is averaged
            with ``keepdim=True``); values lie in [0, 1].
        """
        node_feats, edge_index = data.x, data.edge_index
        hidden = self.dropout(F.relu(self.conv1(node_feats, edge_index)))
        hidden = self.dropout(F.relu(self.conv2(hidden, edge_index)))
        node_logits = self.fc(hidden)
        graph_logit = node_logits.mean(dim=0, keepdim=True)
        return torch.sigmoid(graph_logit)

class GCN4(nn.Module):
    """Four-layer graph convolutional classifier that emits one probability per graph.

    Four ``GCNConv`` layers are applied in sequence, each followed by ReLU and
    dropout (p=0.5). A linear layer maps every node to one logit, the node
    logits are averaged over dimension 0, and a sigmoid is applied.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of the first graph convolution.
        hidden_dim2: Output size of the second graph convolution.
        hidden_dim3: Output size of the third graph convolution.
        output_dim: Output size of the fourth graph convolution.
    """

    def __init__(self,  input_dim, hidden_dim, hidden_dim2, hidden_dim3, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim2)
        self.conv3 = GCNConv(hidden_dim2, hidden_dim3)
        self.conv4 = GCNConv(hidden_dim3, output_dim)
        self.fc = nn.Linear(output_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, data):
        """Return the sigmoid of the node-averaged logit for one graph.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor with a leading dimension of 1 (the node axis is averaged
            with ``keepdim=True``); values lie in [0, 1].
        """
        hidden, edge_index = data.x, data.edge_index
        # Every stage is conv -> ReLU -> dropout, applied in layer order.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            hidden = self.dropout(F.relu(conv(hidden, edge_index)))
        node_logits = self.fc(hidden)
        return torch.sigmoid(node_logits.mean(dim=0, keepdim=True))

class GCN_Transformer(nn.Module):
    """Graph classifier that fuses a GCN branch with a Transformer-encoder branch.

    Both branches read the same node features. The GCN branch applies one
    ``GCNConv`` + ReLU. The Transformer branch adds positional encodings and
    runs a ``TransformerEncoder``. Per-node outputs of the two branches are
    concatenated, averaged over the node axis, and passed through a small MLP
    followed by a sigmoid.

    Args:
        gcn_features: Size of each input node feature vector (also the
            Transformer model dimension).
        hidden_dim: Output size of the graph convolution.
        num_classes: Output size of the final linear layer.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, gcn_features=768, hidden_dim=256, num_classes=1, nhead=4, num_layers=2):
        super().__init__()

        # GCN branch
        self.gcn_conv1 = GCNConv(gcn_features, hidden_dim)

        # Transformer branch
        self.pos_encoder = PositionalEncoding(gcn_features, max_len=10000)  # positional encoding
        encoder_layers = TransformerEncoderLayer(gcn_features, nhead)
        self.transformer = TransformerEncoder(encoder_layers, num_layers)

        # Fusion head applied to the pooled, concatenated branch outputs
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim + gcn_features, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, graph_data, text_data=None):
        """Return the sigmoid output of the fused model for one graph.

        Args:
            graph_data: Object exposing ``x`` (node features) and ``edge_index``.
            text_data: Accepted for interface compatibility; not used.

        Returns:
            Tensor of shape ``[1, num_classes]`` with values in [0, 1].
        """
        x_gcn = F.relu(self.gcn_conv1(graph_data.x, graph_data.edge_index))

        # Insert a size-1 axis at dim 1 for the encoder and drop it afterwards;
        # with the default (non-batch-first) encoder layout that axis is the
        # batch, so the nodes form a single sequence.
        x_trans = self.pos_encoder(graph_data.x)
        x_trans = self.transformer(x_trans.unsqueeze(1)).squeeze(1)

        fused = torch.cat([x_gcn, x_trans], dim=1)
        # Global mean pooling over nodes -> [1, hidden_dim + gcn_features].
        # The head runs only on this pooled vector; an earlier per-node pass
        # through ``self.fc`` produced a value that was never used.
        graph_embedding = fused.mean(dim=0, keepdim=True)

        return torch.sigmoid(self.fc(graph_embedding))

class PositionalEncoding(nn.Module):
    """Additive sinusoidal positional encoding.

    A ``[max_len, d_model]`` table is precomputed: even columns hold sines and
    odd columns hold cosines of ``position * div_term``. The table is stored
    as the non-trainable buffer ``pe``.

    Args:
        d_model: Feature dimension of the inputs. Odd values are supported;
            the last sine column then has no cosine partner.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model: int, max_len: int = 10000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angle table to match the cosine slice.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the encoding for positions ``0 .. x.size(0) - 1`` to ``x``.

        Args:
            x: Tensor whose first dimension indexes positions and whose last
                dimension matches ``d_model``.

        Returns:
            ``x`` plus the first ``x.size(0)`` rows of the encoding table.

        Raises:
            ValueError: If ``x`` has more positions than were precomputed.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        return x + self.pe[:seq_len]
In [2]:
# Experiment identifiers; both are reused in log and checkpoint names.
seed = 2024
dataset_name = 'hc3'

# Seed the default generator, then the current CUDA device, then all CUDA devices.
for _seed_rng in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [3]:
# Layer widths. Only input_dim feeds the model built below; the remaining
# names match the constructor arguments of GCN2 / GCN4.
input_dim = 768    # input node feature size
hidden_dim = 512   # first hidden width
hidden_dim2 = 256  # second hidden width
hidden_dim3 = 128  # third hidden width
output_dim = 64    # width of the last graph convolution

# Binary cross-entropy on the model's sigmoid output.
criterion = nn.BCELoss()

# Build the fused GCN + Transformer model and its optimizer.
gcnmodel = GCN_Transformer(gcn_features=input_dim)
gcnmodel = gcnmodel.to(device)
optimizer = optim.Adam(gcnmodel.parameters(), lr=1e-3)

In [5]:
# Number of graphs in each split.
train_len = len(hc3_train['y'])
val_len = len(hc3_val['y'])
epochs = 20
# Graphs whose embedding matrix has this many rows or more are skipped;
# GCN_Transformer builds its positional table with max_len=10000.
max_nodes = 10000
# Per-epoch history of the metrics that are also sent to TensorBoard.
train_loss = []
val_loss = []
train_acc = []
val_acc = []
val_max_acc = -1
# NOTE(review): there is no separator between the seed and the timestamp in the log dir name.
writer = SummaryWriter(f'logs/{dataset_name}_{seed}'+ datetime.now().strftime("%Y%m%d-%H%M%S"))
start_time = time.time()
try:
    for epoch in range(epochs):
        # ---- training pass: one optimizer step per graph ----
        gcnmodel.train()
        epoch_loss = 0.0
        correct_predictions = 0
        processed = 0  # graphs actually used this epoch (skipped ones excluded)
        for i in tqdm(range(train_len),  f"epoch: {epoch+1}, Training"):
            if hc3_train['all_token_embeddings'][i].shape[0] >= max_nodes:
                continue
            data = Data(x=hc3_train['all_token_embeddings'][i], edge_index=hc3_train['all_edge_index'][i], y=hc3_train['y'][i]).to(device)
            optimizer.zero_grad()
            outputs = gcnmodel(data)
            loss = criterion(outputs, data.y.float().view(-1, 1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            predictions = (outputs >= 0.5).long()
            correct_predictions += (predictions == data.y.view(-1, 1)).sum().item()
            processed += 1
        # Average over the graphs that were processed, not the raw split size,
        # so skipped graphs do not deflate the loss and accuracy.
        processed = max(processed, 1)
        epoch_loss /= processed
        writer.add_scalar('Loss/train', epoch_loss, epoch)
        epoch_acc = correct_predictions / processed
        writer.add_scalar('Acc/train', epoch_acc, epoch)
        print(f"epoch: {epoch+1}, train_loss: {epoch_loss}, train_acc: {epoch_acc}")
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)

        # ---- validation pass: no gradients, no parameter updates ----
        gcnmodel.eval()
        epoch_loss = 0.0
        correct_predictions = 0
        processed = 0
        all_predictions = []
        with torch.no_grad():
            for i in tqdm(range(val_len),  f"epoch: {epoch+1}, Validation"):
                if hc3_val['all_token_embeddings'][i].shape[0] >= max_nodes:
                    continue
                data = Data(x=hc3_val['all_token_embeddings'][i], edge_index=hc3_val['all_edge_index'][i], y=hc3_val['y'][i]).to(device)
                outputs = gcnmodel(data)
                loss = criterion(outputs, data.y.float().view(-1, 1))
                epoch_loss += loss.item()
                predictions = (outputs >= 0.5).long()
                all_predictions.append(predictions)
                correct_predictions += (predictions == data.y.view(-1, 1)).sum().item()
                processed += 1
        processed = max(processed, 1)
        epoch_loss /= processed
        writer.add_scalar('Loss/val', epoch_loss, epoch)
        epoch_acc = correct_predictions / processed
        writer.add_scalar('Acc/val', epoch_acc, epoch)
        print(f"epoch: {epoch+1}, val_loss: {epoch_loss}, val_acc: {epoch_acc}")
        val_loss.append(epoch_loss)
        val_acc.append(epoch_acc)

        # Track the best validation accuracy seen so far.
        if epoch_acc >= val_max_acc:
            val_max_acc = epoch_acc
            # NOTE(review): checkpointing is disabled, and the file name below differs
            # from the one the test cell loads (..._gcn_transformer_model_{seed}.pth).
            # torch.save(gcnmodel.state_dict(), f'./model/{dataset_name}_gcn_transformer_model_only_gcn_{seed}.pth')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"运行时间: {elapsed_time} 秒")
finally:
    # Close the TensorBoard writer even when training aborts with an exception.
    writer.close()

epoch: 1, Training:   0%|          | 0/8000 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (68x256 and 768x256)

In [8]:
import pickle
# Name (without extension) of the pickled test split to evaluate.
test_file = "hc3_test_translate"

# Load the test split and record how many graphs it holds.
with open(f"/root/autodl-tmp/graph_data/{test_file}.pkl", "rb") as test_fh:
    hc3_test = pickle.load(test_fh)

test_len = len(hc3_test['y'])

In [9]:
from sklearn.metrics import roc_auc_score, f1_score

# To evaluate the in-memory model instead of a checkpoint, use:
# test_gcnmodel = gcnmodel
test_gcnmodel = GCN_Transformer(gcn_features=768).to(device)
# map_location lets a checkpoint saved on another device load onto `device`.
test_gcnmodel.load_state_dict(
    torch.load(f'./model/{dataset_name}_gcn_transformer_model_{seed}.pth', map_location=device)
)
test_gcnmodel.eval()
test_loss = 0.0
correct_predictions = 0
test_pres = []  # predicted probability for each evaluated graph
y_true = []     # label for each evaluated graph, in the same order as test_pres
start_time = time.time()
with torch.no_grad():
    for i in tqdm(range(test_len),  "Test"):
        # Skip graphs too large for the model's positional table (max_len=10000).
        if hc3_test['all_token_embeddings'][i].shape[0] >= 10000:
            continue
        data = Data(x=hc3_test['all_token_embeddings'][i], edge_index=hc3_test['all_edge_index'][i], y=hc3_test['y'][i]).to(device)
        outputs = test_gcnmodel(data)
        test_pres.append(outputs.item())
        y_true.append(int(data.y.item()))
        loss = criterion(outputs, data.y.float().view(-1, 1))
        test_loss += loss.item()
        predictions = (outputs >= 0.5).long()
        correct_predictions += (predictions == data.y.view(-1, 1)).sum().item()
end_time = time.time()
elapsed_time = end_time - start_time
print(f"运行时间: {elapsed_time} 秒")
y_pred = [1 if prob >= 0.5 else 0 for prob in test_pres]
# Normalise by the number of graphs evaluated so skipped graphs do not count
# as errors, and so y_true and y_pred stay the same length for f1_score.
num_evaluated = max(len(test_pres), 1)
test_loss /= num_evaluated
test_acc = correct_predictions / num_evaluated
test_f1 = f1_score(y_true, y_pred)
print(f"test_loss: {test_loss}, test_acc: {test_acc}, test_f1: {test_f1}")

Test: 100%|██████████| 998/998 [00:02<00:00, 454.76it/s]

运行时间: 2.199662923812866 秒
test_loss: 0.9917000444925045, test_acc: 0.9078156312625251, test_f1: 0.9157509157509158





In [10]:
# test_pres only holds scores for graphs that passed the size filter in the
# evaluation loop, so build the label list with the same filter to keep the
# two sequences aligned.
evaluated_labels = [
    int(hc3_test['y'][i])
    for i in range(test_len)
    if hc3_test['all_token_embeddings'][i].shape[0] < 10000
]
auc = roc_auc_score(evaluated_labels, test_pres)
auc

0.9861881005173688

In [11]:
# Append one tab-separated summary line for this evaluation run.
with open("test_result.txt", mode="a", encoding="utf-8") as results_fh:
    summary_line = f"{test_file}\t acc: {test_acc}\t auc: {auc}\t f1: {test_f1}\t seed: {seed}\t model: {dataset_name}\t{datetime.now()}\n"
    results_fh.write(summary_line)