In [1]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.autograd import Variable, Function
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
import re
import networkx as nx
import pandas as pd
import time, os
import numpy as np

In [2]:
num_epochs = 200
batch_size = 32
learning_rate = 0.001
hidden_dim = 16

In [3]:
import numpy as np
import networkx as nx
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import pandas as pd

def get_graph(filepath):
    dataGraph = pd.read_csv(filepath, sep=' ', skiprows=0)
    graph = nx.Graph()
    edges = np.array(dataGraph)
    for edge in edges:
        print('leftNode:', edge[0], 'rightNode:', edge[1])
        graph.add_edge(edge[0], edge[1])
    return graph

def get_edge_embeddings(graph, savepath):  # node2vec生成边向量表征(64维)
    node2vec = Node2Vec(graph, dimensions=64, walk_length=5, num_walks=10, workers=1)
    model = node2vec.fit(window=10, min_count=1, batch_words=1)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    edges_kv = edges_embs.as_keyed_vectors()
    edges_kv.save_word2vec_format(savepath)

def pro_data_main(dataset):
    print("Input dataset:", dataset)
    pathReal = 'Datasets/' + dataset + '/realData.csv'
    #only real graph to be embedded
    # pathFake = 'Data/' + dataset + '/fakeData.csv'
    graph = get_graph(pathReal) #real graph
    savepath = 'Datasets/node2vecFeature/' + dataset.lower()+ 'Feature.txt'  # 最终数据
    get_edge_embeddings(graph, savepath)

if __name__ == '__main__':
    datasets = ['Facebook']
    for dataset in datasets:
        pro_data_main(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Input dataset: YELP
leftNode: 0 rightNode: 3924
leftNode: 1 rightNode: 3925
leftNode: 2 rightNode: 3926
leftNode: 3 rightNode: 3924
leftNode: 4 rightNode: 3924
leftNode: 5 rightNode: 3927
leftNode: 6 rightNode: 3924
leftNode: 7 rightNode: 3926
leftNode: 8 rightNode: 3928
leftNode: 9 rightNode: 3929
leftNode: 10 rightNode: 3930
leftNode: 11 rightNode: 3931
leftNode: 12 rightNode: 3929
leftNode: 13 rightNode: 3927
leftNode: 14 rightNode: 3932
leftNode: 15 rightNode: 3924
leftNode: 16 rightNode: 3933
leftNode: 17 rightNode: 3927
leftNode: 18 rightNode: 3934
leftNode: 19 rightNode: 3927
leftNode: 20 rightNode: 3934
leftNode: 21 rightNode: 3924
leftNode: 22 rightNode: 3924
leftNode: 23 rightNode: 3925
leftNode: 24 rightNode: 3924
leftNode: 25 rightNode: 3925
leftNode: 26 rightNode: 3924
leftNode: 27 rightNode: 3924
leftNode: 28 rightNode: 3924
leftNode: 29 rightNode: 3924
leftNode: 30 rightNode: 3924
leftNode: 31 rightNode: 3924
leftNode: 32 rightNode: 3927
leftNode: 33 rightNode: 3924
left

Computing transition probabilities: 100%|█████████████████████████████████████████████████████████████████████████████| 6011/6011 [00:09<00:00, 652.95it/s]
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:07<00:00,  1.39it/s]
Generating edge features: 100%|███████████████████████████████████████████████████████████████████████████| 18069066/18069066.0 [04:03<00:00, 74254.54it/s]


MemoryError: Unable to allocate 4.31 GiB for an array with shape (18069066, 64) and data type float32

In [5]:
class edgeFeatures(object):
    def __init__(self, label=None, type = None, embeddings = None):
        self.label = label
        self.type = type
        self.embeddings = embeddings
        return

def structuralGraph(realFileName, fakeFileName, dataset):
    dataReal = pd.read_csv(realFileName, sep=' ', skiprows=0)
    dataFake = pd.read_csv(fakeFileName, sep=' ', skiprows=0)

    train_Real_Graph = nx.Graph()
    train_Fake_Graph = nx.Graph()
    test_Real_Graph = nx.Graph()
    test_Fake_Graph = nx.Graph()

    real_edge_Attritube = np.array(dataReal.iloc[:, 0:3])
    fake_edge_Attritube = np.array(dataFake.iloc[:, 0:3])

    lenReal = len(real_edge_Attritube)
    lenFake = len(fake_edge_Attritube)

    # print(real_edge_Attritube)
    #new type id according to dataset
    if dataset.lower() == 'facebook':
        dataNewType = [9, 8, 7, 6, 5, 4]
    else:
        dataNewType = [2]

    for i in range(lenReal):
        relation = real_edge_Attritube[i][2]
        if relation in dataNewType:
            test_Real_Graph.add_edge(real_edge_Attritube[i][0], real_edge_Attritube[i][1], relationship=relation)
        else:
            train_Real_Graph.add_edge(real_edge_Attritube[i][0], real_edge_Attritube[i][1], relationship=relation)


    for i in range(lenFake):
        relation = fake_edge_Attritube[i][2]
        if relation in dataNewType:
            test_Fake_Graph.add_edge(fake_edge_Attritube[i][0], fake_edge_Attritube[i][1], relationship=relation)
        else:
            train_Fake_Graph.add_edge(fake_edge_Attritube[i][0], fake_edge_Attritube[i][1], relationship=relation)

    return train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph

def get_train_validate_test(dataset):
    realFileName = 'Datasets/' + dataset + '/realData.csv'
    fakeFileName = 'Datasets/' + dataset + '/fakeData.csv'
    train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph = structuralGraph(realFileName, fakeFileName, dataset)
    node2vecReFile = "Datasets/node2vecFeature/" + dataset + "Feature.txt"
    data = pd.read_csv(node2vecReFile, sep=' ', skiprows=1, header=None)
    edges = np.array(data.iloc[:, 0:1]) + np.array(data.iloc[:, 1:2])
    embeddings = np.array(data.iloc[:, 2:66])
    nodeL = np.array(data.iloc[:, 0:1])
    nodeR = np.array(data.iloc[:, 1:2])
    train_data = []
    test = []
    for i in range(len(edges)):
        edgeFeature = edgeFeatures(" ")
        nodel = int(re.sub("\D", "", nodeL[i][0]))
        noder = int(re.sub("\D", "", nodeR[i][0]))
        if train_Real_Graph.has_edge(nodel, noder) or train_Fake_Graph.has_edge(nodel, noder): # train set
            if train_Real_Graph.has_edge(nodel, noder):
                label = 1
                type = train_Real_Graph.get_edge_data(nodel, noder)['relationship']
            else:
                label = 0
                type = train_Fake_Graph.get_edge_data(nodel, noder)['relationship']
            edgeFeature.embeddings = embeddings[i]
            edgeFeature.label = label
            edgeFeature.type = type
            train_data.append(edgeFeature)
        elif test_Real_Graph.has_edge(nodel, noder) or test_Fake_Graph.has_edge(nodel, noder):  # test set
            if test_Real_Graph.has_edge(nodel, noder):
                label = 1
                type = test_Real_Graph.get_edge_data(nodel, noder)['relationship']
            else:
                label = 0
                type = test_Fake_Graph.get_edge_data(nodel, noder)['relationship']
            edgeFeature.embeddings = embeddings[i]
            edgeFeature.label = label
            edgeFeature.type = type
            test.append(edgeFeature)
        else:
            continue

    train, validate = train_test_split(train_data, test_size=0.2)  # train_test_split返回切分的数据集train/validate
    train_dataset = []
    validate_dataset = []
    test_dataset = []
    for index, element in enumerate(train):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        train_dataset.append(m)
    for index, element in enumerate(validate):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        validate_dataset.append(m)
    for index, element in enumerate(test):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        test_dataset.append(m)
    print('train length', len(train_dataset))
    print('validate length', len(validate_dataset))
    print('test length', len(test_dataset))
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    validate_loader = DataLoader(dataset=validate_dataset, batch_size=batch_size,  shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, validate_loader, test_loader

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)

class re_shape(Function):
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x.reshape(len(x),len(x[0][0])))

    @staticmethod
    def backward(ctx, grad_output):
        output =  grad_output.reshape(len(grad_output),1,len(grad_output[0]))
        return output,None

class GradReverse(Function):
    @ staticmethod
    def forward(ctx, x, lambd, **kwargs: None):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, *grad_output):
        return grad_output[0] * -ctx.lambd, None

    def backward(ctx, grad_output):
        return grad_output * -ctx.lambd, None

class adversarial_neural_networks(nn.Module):
    def __init__(self, predicted_Type):
        super(adversarial_neural_networks, self).__init__()
        self.predicted_Type = predicted_Type

        ##The generative predictor
        self.predictor = nn.Sequential()
        self.predictor.add_module('exta_Conv1',nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10, stride=1, padding=0))
        self.predictor.add_module('fully_connected_layer1', nn.Linear(55, 32))

        self.predictor_classifier = nn.Sequential()
        self.predictor_classifier.add_module('c_fc1', nn.Linear(32,24))
        self.predictor_classifier.add_module('c_fc1_relu', nn.ReLU())
        self.predictor_classifier.add_module('c_fc2', nn.Linear(24, 16))
        self.predictor_classifier.add_module('c_fc2_relu', nn.ReLU())
        self.predictor_classifier.add_module('c_fc3', nn.Linear(16, 2))
        self.predictor_classifier.add_module('c_softmax', nn.Softmax(dim=1))  # 对每一行进行softmax

        #discriminative classifier learn shared feature
        self.discriminative_classifier = nn.Sequential()
        self.discriminative_classifier.add_module('d_fc1', nn.Linear(32, 16))
        self.discriminative_classifier.add_module('relu_f1', nn.ReLU())
        self.discriminative_classifier.add_module('d_fc2', nn.Linear(16, self.predicted_Type))
        self.discriminative_classifier.add_module('d_softmax',nn.Softmax(dim=1))

    def forward(self, embeddings):
        embeddings = self.predictor(embeddings)
        shared_embeddings = re_shape.apply(embeddings)
        link_output = self.predictor_classifier(shared_embeddings)
        reverse_embeddings = GradReverse.apply(shared_embeddings, 1.0)
        type_output = self.discriminative_classifier(reverse_embeddings)
        return link_output, type_output

def to_np(x):
    return x.data.cpu().numpy()

def train_adversarial_neural_networks(train_loader, validate_loader, model, output_file):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(list(model.parameters()), lr=learning_rate)
    best_validate_acc = 0.000
    best_validate_dir = ''

    print('training model')
    # Train of Model
    for epoch in range(num_epochs):  # num_epochs is 50
        p = float(epoch) / 100
        lr = learning_rate / (1. + 10 * p) ** 0.75
        optimizer.lr = lr
        cost_vector = []
        prediction_cost_vector = []
        classification_cost_vector = []
        acc_vector = []
        valid_acc_vector = []
        vali_cost_vector = []
        train_score = []
        train_label = []
        for i, (train_data, train_labels, type_labels) in enumerate(train_loader):
            optimizer.zero_grad()
            train_data = to_var(train_data)
            train_labels = to_var(train_labels),
            type_labels = to_var(type_labels)
            link_outputs, type_outputs = model(train_data.unsqueeze(1))
            train_score += list(link_outputs[:, 1].cpu().detach().numpy())
            train_label += list(train_labels[0].numpy())
            train_labels = train_labels[0]
            train_labels = train_labels.long()
            type_labels = type_labels.long()
            prediction_loss = criterion(link_outputs, train_labels)
            classification_loss = criterion(type_outputs, type_labels)
            loss = prediction_loss + classification_loss
            loss.backward()
            optimizer.step()
            _, argmax = torch.max(link_outputs, 1)
            accuracy = (train_labels == argmax.squeeze()).float().mean()
            prediction_cost_vector.append(prediction_loss.item())
            classification_cost_vector.append(classification_loss.item())
            cost_vector.append(loss.item())
            acc_vector.append(accuracy.item())

        # validate process
        model.eval()
        validate_acc_vector_temp = []
        for i, (validate_data, validate_labels, type_labels) in enumerate(validate_loader):
            validate_data = to_var(validate_data)
            validate_labels = to_var(validate_labels)
            type_labels = to_var(type_labels)
            validate_outputs, type_outputs = model(validate_data.unsqueeze(1))
            _, validate_argmax = torch.max(validate_outputs, 1)
            validate_labels = validate_labels.long()
            vali_loss = criterion(validate_outputs, validate_labels)
            validate_accuracy = (validate_labels == validate_argmax.squeeze()).float().mean()
            vali_cost_vector.append(vali_loss.item())
            validate_acc_vector_temp.append(validate_accuracy.item())
        validate_acc = np.mean(validate_acc_vector_temp)
        valid_acc_vector.append(validate_acc)
        model.train()
        print('Epoch [%d/%d],  Loss: %.4f, Link Prediction Loss: %.4f, Type Classification Loss: %.4f, Train_Acc: %.4f,  Validate_Acc: %.4f.'
              % (epoch + 1, num_epochs, np.mean(cost_vector), np.mean(prediction_cost_vector), np.mean(classification_cost_vector),
                 np.mean(acc_vector), validate_acc))

        if validate_acc > best_validate_acc:
            best_validate_acc = validate_acc
            if not os.path.exists(output_file):
                os.mkdir(output_file)
            best_validate_dir = output_file + str(epoch + 1) + '.pkl'
            torch.save(model.state_dict(), best_validate_dir)
    return best_validate_dir

def test_adversarial_neural_networks(best_validate_dir, test_loader, model):
    # Test the Model
    print('testing model')
    model.load_state_dict(torch.load(best_validate_dir))
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    test_score = []
    test_pred = []
    test_true = []
    tes_score = []
    tes_label = []

    for i, (test_data, test_labels, type_labels) in enumerate(test_loader):
        test_data = to_var(test_data)
        test_labels = to_var(test_labels)
        # type_labels = to_var(type_labels)
        test_outputs, type_outputs = model(test_data.unsqueeze(1))
        tes_score += list(test_outputs[:, 1].cpu().detach().numpy())
        tes_label += list(test_labels.numpy())
        _, test_argmax = torch.max(test_outputs, 1)
        if i == 0:
            test_score = to_np(test_outputs)
            test_pred = to_np(test_argmax)
            test_true = to_np(test_labels)
        else:
            test_score = np.concatenate((test_score, to_np(test_outputs)), axis=0)
            test_pred = np.concatenate((test_pred, to_np(test_argmax)), axis=0)
            test_true = np.concatenate((test_true, to_np(test_labels)), axis=0)

    test_accuracy = metrics.accuracy_score(test_true, test_pred)
    test_precision = metrics.precision_score(test_true, test_pred, average='macro')
    test_aucroc = metrics.roc_auc_score(tes_label, tes_score, average='macro')

    return test_aucroc, test_precision, test_accuracy

def main(predicted_Type, dataset, output_file):
    train_loader, validate_loader, test_loader = get_train_validate_test(dataset)
    model = adversarial_neural_networks(predicted_Type)
    best_validate_dir = train_adversarial_neural_networks(train_loader, validate_loader, model, output_file)
    auc, precision, accuracy = test_adversarial_neural_networks(best_validate_dir, test_loader, model)
    print("Final reault: AUC -- %.4f  " % (auc), "Precision -- %.4f  " % (precision), 'Accuracy -- %.4f  ' % (accuracy))

if __name__ == '__main__':
    datasets = ['Facebook', 'IMDB', 'YELP', 'DBLP']
    dataset = datasets[1]
    print('Input dataset is:', dataset)
    predicted_Type_datasets = {'Facebook': 4, 'IMDB': 2, 'YELP': 2, 'DBLP': 2}
    predicted_Type = predicted_Type_datasets[dataset]
    output_file = 'trainOutput/' + dataset.lower() + '/output.txt'
    main(predicted_Type, dataset, output_file)

Input dataset is: IMDB


MemoryError: Unable to allocate 1.25 GiB for an array with shape (2616482, 64) and data type float64

In [23]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.autograd import Variable, Function
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
import re, networkx as nx
import pandas as pd
import numpy as np
import time, os, random

# ===================
# GA CONFIGURATION
# ===================
NUM_EPOCHS = 50
POPULATION_SIZE = 10
NUM_GENERATIONS = 5
MUTATION_RATE = 0.1

class edgeFeatures:
    def __init__(self, label=None, type=None, embeddings=None):
        self.label = label
        self.type = type
        self.embeddings = embeddings

def structuralGraph(realFileName, fakeFileName, dataset):
    dataReal = pd.read_csv(realFileName, sep=' ')
    dataFake = pd.read_csv(fakeFileName, sep=' ')

    train_Real_Graph = nx.Graph()
    train_Fake_Graph = nx.Graph()
    test_Real_Graph = nx.Graph()
    test_Fake_Graph = nx.Graph()

    real_edges = np.array(dataReal.iloc[:, 0:3])
    fake_edges = np.array(dataFake.iloc[:, 0:3])

    dataNewType = [9, 8, 7, 6, 5, 4] if dataset.lower() == 'facebook' else [2]

    for edge in real_edges:
        graph = test_Real_Graph if edge[2] in dataNewType else train_Real_Graph
        graph.add_edge(edge[0], edge[1], relationship=edge[2])

    for edge in fake_edges:
        graph = test_Fake_Graph if edge[2] in dataNewType else train_Fake_Graph
        graph.add_edge(edge[0], edge[1], relationship=edge[2])

    return train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph

import pandas as pd
import numpy as np

def get_train_validate_test(dataset):
    realFileName = 'Datasets/' + dataset + '/realData.csv'
    fakeFileName = 'Datasets/' + dataset + '/fakeData.csv'
    train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph = structuralGraph(realFileName, fakeFileName, dataset)
    
    node2vecReFile = "Datasets/node2vecFeature/" + dataset + "Feature.txt"
    data = pd.read_csv(node2vecReFile, sep=' ', skiprows=1, header=None, dtype=str).dropna()

    edges = np.array(data.iloc[:, 0:1]) + np.array(data.iloc[:, 1:2])
    embeddings = np.array(data.iloc[:, 2:66])

    # Convert nodeL and nodeR to numeric
    nodeL = pd.to_numeric(data.iloc[:, 0], errors='coerce').dropna().to_numpy()
    nodeR = pd.to_numeric(data.iloc[:, 1], errors='coerce').dropna().to_numpy()

    train_data = []
    test = []
    
    for i in range(len(edges)):
        edgeFeature = edgeFeatures(" ")
        
        nodel_str = str(nodeL[i]).strip()
        noder_str = str(nodeR[i]).strip()
        
        if not nodel_str or not noder_str:
            print(f"Empty value found at index {i}: nodel={nodel_str}, noder={noder_str}")
            continue
        
        try:
            nodel = int(re.sub(r"\D", "", nodel_str))
            noder = int(re.sub(r"\D", "", noder_str))
        except ValueError as e:
            print(f"Error processing edge {i} with values (nodel, noder): ({nodel_str}, {noder_str}): {e}")
            continue
        
        # Remaining logic follows...


    # Split into training and validation sets
    train, validate = train_test_split(train_data, test_size=0.2)

    return create_dataloader(train), create_dataloader(validate), create_dataloader(test_data)

def create_dataloader(data):
    dataset = [[torch.tensor(e.embeddings, dtype=torch.float32),
                torch.tensor(e.label, dtype=torch.float32)] for e in data]
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

class GradReverse(Function):
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * -ctx.lambd, None

class ANN(nn.Module):
    def __init__(self, hidden_dim):
        super(ANN, self).__init__()
        self.predictor = nn.Sequential(
            nn.Linear(64, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 2)
        )

    def forward(self, x):
        return self.predictor(x)

def train(model, train_loader, criterion, optimizer):
    model.train()
    for data, labels in train_loader:
        data, labels = Variable(data), Variable(labels.long())
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

def evaluate(model, data_loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data, labels in data_loader:
            outputs = model(data)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
    return metrics.accuracy_score(all_labels, all_preds)

# ============================
# GA OPTIMIZATION FUNCTIONS
# ============================
def initialize_population():
    return [{'lr': random.uniform(0.0001, 0.01),
             'hidden_dim': random.choice([8, 16, 32, 64]),
             'batch_size': random.choice([16, 32, 64])} for _ in range(POPULATION_SIZE)]

def mutate(params):
    if random.random() < MUTATION_RATE:
        params['lr'] = random.uniform(0.0001, 0.01)
    if random.random() < MUTATION_RATE:
        params['hidden_dim'] = random.choice([8, 16, 32, 64])
    if random.random() < MUTATION_RATE:
        params['batch_size'] = random.choice([16, 32, 64])

def crossover(parent1, parent2):
    child = {}
    for key in parent1:
        child[key] = parent1[key] if random.random() > 0.5 else parent2[key]
    return child

def genetic_algorithm(train_loader, validate_loader):
    population = initialize_population()
    best_params, best_acc = None, 0

    for generation in range(NUM_GENERATIONS):
        scores = []
        for params in population:
            model = ANN(params['hidden_dim'])
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

            train(model, train_loader, criterion, optimizer)
            acc = evaluate(model, validate_loader)
            scores.append((acc, params))

        scores.sort(reverse=True, key=lambda x: x[0])
        best_acc, best_params = scores[0]

        next_gen = [scores[0][1], scores[1][1]]  # Keep top 2
        for _ in range(POPULATION_SIZE - 2):
            parent1, parent2 = random.sample(scores[:5], 2)
            child = crossover(parent1[1], parent2[1])
            mutate(child)
            next_gen.append(child)

        population = next_gen
        print(f'Generation {generation}, Best Accuracy: {best_acc}')

    return best_params

def main():
    dataset = 'Facebook'
    train_loader, validate_loader, test_loader = get_train_validate_test(dataset)

    best_params = genetic_algorithm(train_loader, validate_loader)
    print(f'Best Hyperparameters: {best_params}')

    model = ANN(best_params['hidden_dim'])
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=best_params['lr'])

    train(model, train_loader, criterion, optimizer)
    test_acc = evaluate(model, test_loader)
    print(f'Test Accuracy: {test_acc}')

if __name__ == '__main__':
    main()


IndexError: index 0 is out of bounds for axis 0 with size 0

In [25]:
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.autograd import Variable, Function
from torch.utils.data import Dataset, DataLoader
from sklearn import metrics
import re
import networkx as nx
import pandas as pd
import time, os
import numpy as np
from pyswarm import pso  # PSO package

# Original parameters
num_epochs = 5
hidden_dim = 16  # This will be optimized
output_file = 'trainOutput/output.txt'

class edgeFeatures(object):
    def __init__(self, label=None, type=None, embeddings=None):
        self.label = label
        self.type = type
        self.embeddings = embeddings
        return

def structuralGraph(realFileName, fakeFileName, dataset):
    dataReal = pd.read_csv(realFileName, sep=' ', skiprows=0)
    dataFake = pd.read_csv(fakeFileName, sep=' ', skiprows=0)

    train_Real_Graph = nx.Graph()
    train_Fake_Graph = nx.Graph()
    test_Real_Graph = nx.Graph()
    test_Fake_Graph = nx.Graph()

    real_edge_Attritube = np.array(dataReal.iloc[:, 0:3])
    fake_edge_Attritube = np.array(dataFake.iloc[:, 0:3])

    lenReal = len(real_edge_Attritube)
    lenFake = len(fake_edge_Attritube)

    # print(real_edge_Attritube)
    #new type id according to dataset
    if dataset.lower() == 'facebook':
        dataNewType = [9, 8, 7, 6, 5, 4]
    else:
        dataNewType = [2]

    for i in range(lenReal):
        relation = real_edge_Attritube[i][2]
        if relation in dataNewType:
            test_Real_Graph.add_edge(real_edge_Attritube[i][0], real_edge_Attritube[i][1], relationship=relation)
        else:
            train_Real_Graph.add_edge(real_edge_Attritube[i][0], real_edge_Attritube[i][1], relationship=relation)


    for i in range(lenFake):
        relation = fake_edge_Attritube[i][2]
        if relation in dataNewType:
            test_Fake_Graph.add_edge(fake_edge_Attritube[i][0], fake_edge_Attritube[i][1], relationship=relation)
        else:
            train_Fake_Graph.add_edge(fake_edge_Attritube[i][0], fake_edge_Attritube[i][1], relationship=relation)

    return train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph


def get_train_validate_test(dataset):
    realFileName = 'Datasets/' + dataset + '/realData.csv'
    fakeFileName = 'Datasets/' + dataset + '/fakeData.csv'
    train_Real_Graph, train_Fake_Graph, test_Real_Graph, test_Fake_Graph = structuralGraph(realFileName, fakeFileName, dataset)
    node2vecReFile = "Datasets/node2vecFeature/" + dataset + "Feature.txt"
    data = pd.read_csv(node2vecReFile, sep=' ', skiprows=1, header=None)
    edges = np.array(data.iloc[:, 0:1]) + np.array(data.iloc[:, 1:2])
    embeddings = np.array(data.iloc[:, 2:66])
    nodeL = np.array(data.iloc[:, 0:1])
    nodeR = np.array(data.iloc[:, 1:2])
    train_data = []
    test = []
    for i in range(len(edges)):
        edgeFeature = edgeFeatures(" ")
        nodel = int(re.sub("\D", "", nodeL[i][0]))
        noder = int(re.sub("\D", "", nodeR[i][0]))
        if train_Real_Graph.has_edge(nodel, noder) or train_Fake_Graph.has_edge(nodel, noder): # train set
            if train_Real_Graph.has_edge(nodel, noder):
                label = 1
                type = train_Real_Graph.get_edge_data(nodel, noder)['relationship']
            else:
                label = 0
                type = train_Fake_Graph.get_edge_data(nodel, noder)['relationship']
            edgeFeature.embeddings = embeddings[i]
            edgeFeature.label = label
            edgeFeature.type = type
            train_data.append(edgeFeature)
        elif test_Real_Graph.has_edge(nodel, noder) or test_Fake_Graph.has_edge(nodel, noder):  # test set
            if test_Real_Graph.has_edge(nodel, noder):
                label = 1
                type = test_Real_Graph.get_edge_data(nodel, noder)['relationship']
            else:
                label = 0
                type = test_Fake_Graph.get_edge_data(nodel, noder)['relationship']
            edgeFeature.embeddings = embeddings[i]
            edgeFeature.label = label
            edgeFeature.type = type
            test.append(edgeFeature)
        else:
            continue

    train, validate = train_test_split(train_data, test_size=0.2)  # train_test_split返回切分的数据集train/validate
    train_dataset = []
    validate_dataset = []
    test_dataset = []
    for index, element in enumerate(train):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        train_dataset.append(m)
    for index, element in enumerate(validate):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        validate_dataset.append(m)
    for index, element in enumerate(test):
        vectors = torch.tensor(element.embeddings, dtype=torch.float32)
        label = torch.tensor(element.label, dtype=torch.float32)
        type = torch.tensor(element.type, dtype=torch.float32)
        m = [vectors, label, type]
        test_dataset.append(m)
    print('train length', len(train_dataset))
    print('validate length', len(validate_dataset))
    print('test length', len(test_dataset))
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    validate_loader = DataLoader(dataset=validate_dataset, batch_size=batch_size,  shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, validate_loader, test_loader

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)

class re_shape(Function):
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x.reshape(len(x), len(x[0][0])))

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.reshape(len(grad_output), 1, len(grad_output[0]))
        return output, None

class GradReverse(Function):
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output * -ctx.lambd, None

class adversarial_neural_networks(nn.Module):
    def __init__(self, predicted_Type, hidden_dim):
        super(adversarial_neural_networks, self).__init__()
        self.predicted_Type = predicted_Type
        # Adjust hidden dimensions according to PSO results
        self.predictor = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10, stride=1, padding=0),
            nn.Linear(55, hidden_dim)
        )
        self.predictor_classifier = nn.Sequential(
            nn.Linear(hidden_dim, 24),
            nn.ReLU(),
            nn.Linear(24, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
            nn.Softmax(dim=1)
        )
        self.discriminative_classifier = nn.Sequential(
            nn.Linear(hidden_dim, 16),
            nn.ReLU(),
            nn.Linear(16, self.predicted_Type),
            nn.Softmax(dim=1)
        )

    def forward(self, embeddings):
        embeddings = self.predictor(embeddings)
        shared_embeddings = re_shape.apply(embeddings)
        link_output = self.predictor_classifier(shared_embeddings)
        reverse_embeddings = GradReverse.apply(shared_embeddings, 1.0)
        type_output = self.discriminative_classifier(reverse_embeddings)
        return link_output, type_output

def fitness_function(params):
    # Params: [learning_rate, batch_size, hidden_dim]
    learning_rate = params[0]
    batch_size = int(params[1])
    hidden_dim = int(params[2])

    # Re-initialize data loaders with new batch size
    train_loader, validate_loader, _ = get_train_validate_test(dataset)
    model = adversarial_neural_networks(predicted_Type, hidden_dim)
    if torch.cuda.is_available():
        model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(list(model.parameters()), lr=learning_rate)

    # Training loop (simplified for PSO optimization)
    best_validate_acc = 0.0
    for epoch in range(5):  # Using fewer epochs for PSO evaluation
        for i, (train_data, train_labels, type_labels) in enumerate(train_loader):
            optimizer.zero_grad()
            train_data, train_labels, type_labels = to_var(train_data), to_var(train_labels), to_var(type_labels)
            link_outputs, type_outputs = model(train_data.unsqueeze(1))
            train_labels, type_labels = train_labels.long(), type_labels.long()
            prediction_loss = criterion(link_outputs, train_labels)
            classification_loss = criterion(type_outputs, type_labels)
            loss = prediction_loss + classification_loss
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        validate_acc_vector_temp = []
        for i, (validate_data, validate_labels, type_labels) in enumerate(validate_loader):
            validate_data, validate_labels, type_labels = to_var(validate_data), to_var(validate_labels), to_var(type_labels)
            validate_outputs, _ = model(validate_data.unsqueeze(1))
            _, validate_argmax = torch.max(validate_outputs, 1)
            validate_labels = validate_labels.long()
            validate_accuracy = (validate_labels == validate_argmax.squeeze()).float().mean()
            validate_acc_vector_temp.append(validate_accuracy.item())

        validate_acc = np.mean(validate_acc_vector_temp)
        best_validate_acc = max(best_validate_acc, validate_acc)
        model.train()

    return -best_validate_acc  # PSO minimizes, so we negate the accuracy

def optimize_with_pso():
    # PSO bounds for hyperparameters
    lb = [1e-4, 16, 8]  # Lower bounds: [learning rate, batch size, hidden_dim]
    ub = [1e-2, 128, 64]  # Upper bounds

    # Run PSO
    best_params, best_score = pso(fitness_function, lb, ub, swarmsize=10, maxiter=10)

    print(f"Best params: {best_params}")
    print(f"Best validation accuracy: {-best_score}")

    return best_params

def main(predicted_Type, dataset, output_file):
    # Optimize using PSO
    best_params = optimize_with_pso()

    # Extract best hyperparameters
    learning_rate, batch_size, hidden_dim = best_params
    batch_size = int(batch_size)
    hidden_dim = int(hidden_dim)

    # Re-train model with optimized hyperparameters
    train_loader, validate_loader, test_loader = get_train_validate_test(dataset)
    model = adversarial_neural_networks(predicted_Type, hidden_dim)
    if torch.cuda.is_available():
        model.cuda()

    # Train and evaluate the model with the best parameters
    best_validate_dir = train_adversarial_neural_networks(train_loader, validate_loader, model, output_file)
    auc, precision, accuracy = test_adversarial_neural_networks(best_validate_dir, test_loader, model)
    print("Final result: AUC -- %.4f  " % (auc), "Precision -- %.4f  " % (precision), 'Accuracy -- %.4f  ' % (accuracy))

if __name__ == '__main__':
    datasets = ['Facebook', 'IMDB', 'YELP', 'DBLP']
    dataset = datasets[0]
    print('Input dataset is:', dataset)
    predicted_Type_datasets = {'Facebook': 4, 'IMDB': 2, 'YELP': 2, 'DBLP': 2}
    predicted_Type = predicted_Type_datasets[dataset]
    output_file = 'trainOutput/' + dataset.lower() + '/output.txt'
    main(predicted_Type, dataset, output_file)


Input dataset is: Facebook
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475
test length 1561
train length 1896
validate length 475


  model.load_state_dict(torch.load(best_validate_dir))
