In [1]:
import os
import csv
import pickle
import torch
import torch.nn as nn
from torch_geometric.data import Dataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, global_max_pool
from transformers import BertTokenizer, BertModel
from torch.utils.tensorboard import SummaryWriter
# BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('/home/jiangzhengqun/ml/task2/bert_model')

class GraphDataset(Dataset):
    def __init__(self, csv_file, root_dir):
        super().__init__()
        self.labels = []
        self.file_paths = []
        self.inputs = []
        self.root_dir = root_dir
        
        # 读取 CSV 文件，获取文件名、标签和输入序列
        with open(csv_file, mode='r') as file:
            csv_reader = csv.reader(file)
            next(csv_reader)  # 跳过表头
            for row in csv_reader:
                file_name, input_sequence, label = row
                self.file_paths.append(os.path.join(root_dir, file_name))
                self.labels.append(float(label))  # 根据需要调整数据类型
                tokenized_inputs = tokenizer(input_sequence, padding='max_length', truncation=True, return_tensors='pt', max_length=10)
                for key in tokenized_inputs:
                    tokenized_inputs[key] = tokenized_inputs[key].squeeze(0)
                self.inputs.append(tokenized_inputs) # 读取输入序列
    @property
    def raw_file_names(self):
        return self.file_paths
    
    @property
    def processed_file_names(self):
        return self.file_paths
    
    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        with open(self.file_paths[idx], 'rb') as f:
            graph = pickle.load(f)

        # 创建图数据对象
        node_type = torch.tensor(graph['node_type'], dtype=torch.float)
        num_inverted_predecessors = torch.tensor(graph['num_inverted_predecessors'], dtype=torch.float).unsqueeze(1)

        # 拼接两个特征矩阵
        x = torch.cat([node_type, num_inverted_predecessors], dim=1)
        data = Data(
            x=x,
            edge_index=torch.tensor(graph['edge_index'], dtype=torch.long),
            num_nodes=int(graph['nodes'])
        )
        # 添加标签
        data.y = torch.tensor([self.labels[idx]], dtype=torch.float)
        
        return data, self.inputs[idx]


class GCNBERTModel(nn.Module):
    def __init__(self, num_features):
        super(GCNBERTModel, self).__init__()
        self.gcn1 = GCNConv(num_features, 16)
        self.gcn2 = GCNConv(16, 32)
        self.bert_fc = nn.Linear(768, 32)
        self.fc_combined = nn.Sequential(
            nn.Linear(32 * 2, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(),
            nn.Linear(32, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(),
            nn.Linear(32, 1)
        )
        self.bert_model = BertModel.from_pretrained('/home/jiangzhengqun/ml/task2/bert_model')

    def forward(self, graph_data, tokenized_inputs):
        # GCN processing
        x, edge_index, batch = graph_data.x, graph_data.edge_index, graph_data.batch
        x = self.gcn1(x, edge_index)
        x = torch.relu(x)
        x = self.gcn2(x, edge_index)
        x = torch.relu(x)
        x = global_mean_pool(x, batch)

        # BERT processing
        bert_outputs = self.bert_model(**tokenized_inputs)
        sequence_output = bert_outputs.last_hidden_state
        sequence_emb = torch.mean(sequence_output, dim=1)
        sequence_emb = self.bert_fc(sequence_emb)

        # Combine GCN and BERT outputs
        combined = torch.cat([x, sequence_emb], dim=1)
        out = self.fc_combined(combined)
        return out

In [2]:
# Usage
train_file = '/home/jiangzhengqun/ml/dataset2/task2_train_data.csv'
train_dir = '/home/jiangzhengqun/ml/dataset2/task2_train_data'
train_set = GraphDataset(csv_file=train_file, root_dir=train_dir)
train_loader = DataLoader(train_set, batch_size=128)

test_file = '/home/jiangzhengqun/ml/dataset2/task2_test_data.csv'
test_dir = '/home/jiangzhengqun/ml/dataset2/task2_test_data'
test_set = GraphDataset(csv_file=test_file, root_dir=test_dir)
test_loader = DataLoader(test_set, batch_size=128)

In [3]:
# Example usage
model = GCNBERTModel(num_features=2)
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.MSELoss().to(device)

In [4]:
# writer = SummaryWriter('./runs')
writer = SummaryWriter('./run_2024_6_7')
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    n_loss = 0
    for data, tokenized_inputs in train_loader:
        data.to(device)
        for key in tokenized_inputs:
            tokenized_inputs[key] = tokenized_inputs[key].to(device)
        optimizer.zero_grad()
        out = model(data, tokenized_inputs).squeeze(1)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        n_loss += 1
    train_loss = epoch_loss / n_loss
    print(f'after epoch {epoch + 1} the train_loss is {train_loss}')
    model.eval()
    epoch_loss = 0
    n_loss = 0
    with torch.no_grad():
        for data, tokenized_inputs in test_loader:
            data.to(device)
            for key in tokenized_inputs:
                tokenized_inputs[key] = tokenized_inputs[key].to(device)
            optimizer.zero_grad()
            out = model(data, tokenized_inputs).squeeze(1)
            loss = criterion(out, data.y)
            epoch_loss += loss.item()
            n_loss += 1
    eval_loss = epoch_loss / n_loss
    writer.add_scalars('Loss', {'train': train_loss, 'valid': eval_loss}, epoch + 1)
    print(f'after epoch {epoch + 1} the eval_loss is {eval_loss}')
    # 保存模型参数
    if (epoch + 1) % 100 == 0 or (epoch + 1) == num_epochs:
        torch.save(model.state_dict(), f'./model1/model_epoch_{epoch + 1}.pth')
writer.close()
# 在训练结束时保存最终的模型参数
torch.save(model.state_dict(), './model1/model_final.pth')


  node_type = torch.tensor(graph['node_type'], dtype=torch.float)
  num_inverted_predecessors = torch.tensor(graph['num_inverted_predecessors'], dtype=torch.float).unsqueeze(1)
  edge_index=torch.tensor(graph['edge_index'], dtype=torch.long),


after epoch 1 the train_loss is 0.07283311248432066
after epoch 1 the eval_loss is 0.08024843511256305
after epoch 2 the train_loss is 0.05655975732140157
after epoch 2 the eval_loss is 0.07324079378039725
after epoch 3 the train_loss is 0.05512824948460167
after epoch 3 the eval_loss is 0.05598431573757394
after epoch 4 the train_loss is 0.05445767522296803
after epoch 4 the eval_loss is 0.05579523812048137
after epoch 5 the train_loss is 0.054217128901401276
after epoch 5 the eval_loss is 0.06195736669575457
after epoch 6 the train_loss is 0.05424709628717866
after epoch 6 the eval_loss is 0.054949718998448756
after epoch 7 the train_loss is 0.053776135719597856
after epoch 7 the eval_loss is 0.06177992922973565
after epoch 8 the train_loss is 0.054321070408805947
after epoch 8 the eval_loss is 0.05716705598487434
after epoch 9 the train_loss is 0.05415920694331573
after epoch 9 the eval_loss is 0.05318928686690263
after epoch 10 the train_loss is 0.05390553713402559
after epoch 10 t