Setup and Initialization

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Later cells move the model and the graph tensors onto this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Data Preprocessing

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Placeholder functions for loading JSON data files
def load_json(file_name):
    """Read ``file_name`` as UTF-8 text and return the decoded JSON value.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever Python object the JSON document decodes to.
    """
    with open(file_name, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

Feature Extraction

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from datetime import datetime

# Load the raw user records from user.json.
# NOTE(review): extract_user_features iterates over this value and indexes each
# item by string key, so it is presumably a list of dictionaries — confirm
# against the actual file.
user_data = load_json('user.json')  # Load user data

# Define feature extraction function
def extract_user_features(users):
    """Build one numeric feature row per user record.

    Each row contains, in order: tweet_frequency, retweet_ratio,
    mention_ratio, hashtag_ratio, url_ratio, sensitive_content_ratio,
    account_age_days, is_verified, profile_completeness and
    follower_following_ratio.

    Args:
        users: Iterable of mappings providing the keys read below
            ('tweet_count', 'created_at', 'retweet_count', 'mention_count',
            'hashtag_count', 'url_count', 'sensitive_tweet_count', 'verified',
            'description', 'location', 'profile_image_url',
            'followers_count', 'following_count').

    Returns:
        A list with one 10-element list per user (empty for empty input).

    Raises:
        KeyError: If a user record lacks one of the required keys.
        ValueError: If 'created_at' does not match the expected timestamp
            format.
    """
    # Local import: the notebook only imports `datetime` from the datetime module.
    from datetime import timezone

    timestamp_format = '%a %b %d %H:%M:%S +0000 %Y'
    # created_at is stamped "+0000" (UTC), so compare against a naive UTC "now"
    # rather than local wall-clock time. Taken once so every user in the batch
    # is measured against the same instant.
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    def _per_tweet(user, key):
        """Return user[key] divided by the tweet count, or 0 when there are no tweets."""
        tweet_count = user['tweet_count']
        return user[key] / tweet_count if tweet_count else 0

    features = []
    for user in users:
        # Account features (the age is parsed once and reused below)
        created_at = datetime.strptime(user['created_at'], timestamp_format)
        account_age_days = (now_utc - created_at).days
        is_verified = int(user['verified'])
        profile_completeness = (
            int(user['description'] != '')
            + int(user['location'] != '')
            + int(user['profile_image_url'] != '')
        )

        # Behavioral features
        # An account created less than a day ago has an age of 0 days, which
        # used to raise ZeroDivisionError; treat the window as at least one day.
        tweet_frequency = user['tweet_count'] / max(account_age_days, 1)
        retweet_ratio = _per_tweet(user, 'retweet_count')
        mention_ratio = _per_tweet(user, 'mention_count')

        # Content features
        hashtag_ratio = _per_tweet(user, 'hashtag_count')
        url_ratio = _per_tweet(user, 'url_count')
        sensitive_content_ratio = _per_tweet(user, 'sensitive_tweet_count')

        # Network features
        followers_count = user['followers_count']
        following_count = user['following_count']
        # With nobody followed, fall back to the raw follower count.
        follower_following_ratio = followers_count / following_count if following_count else followers_count

        features.append([
            tweet_frequency, retweet_ratio, mention_ratio, hashtag_ratio, url_ratio, sensitive_content_ratio,
            account_age_days, is_verified, profile_completeness, follower_following_ratio
        ])
    return features

# Example feature extraction for users, extendable to other entities
def extract_features(users):
    """Return standardized follower/following counts, one row per user.

    Args:
        users: Iterable of mappings with 'followers_count' and
            'following_count' entries.

    Returns:
        The array produced by ``StandardScaler().fit_transform`` on the
        two-column count matrix.
    """
    rows = []
    for record in users:
        # Continue with other features
        rows.append([record['followers_count'], record['following_count']])
    raw_matrix = np.array(rows)
    scaler = StandardScaler()
    return scaler.fit_transform(raw_matrix)

In [None]:
# Load entity data
users_data = load_json('user.json')
tweets_data = load_json('tweet_0.json')  # Do this for tweet_[0-8].json
lists_data = load_json('list.json')
hashtags_data = load_json('hashtag.json')

# Load relationships
# node.json is a JSON document, so it has to go through a JSON reader; the CSV
# parser would split it on commas and newlines into meaningless columns.
# NOTE(review): prepare_graph_data reads the 'node_id' and 'features' columns
# of this frame — confirm that node.json actually provides them.
nodes = pd.read_json('node.json')
edges = pd.read_csv('edge.csv')

# Load labels and splits
labels = pd.read_csv('label.csv')
splits = pd.read_csv('split.csv')

user_features = extract_features(users_data)  # Do this for tweets, lists, and hashtags

In [None]:
# Extract features
# The ten column names below describe the rows built by extract_user_features.
# The two-column matrix returned by extract_features cannot be labelled with
# them (pandas raises ValueError), so compute the full feature rows here.
user_features = extract_user_features(user_data)

# Convert to pandas DataFrame (for subsequent normalization and training)
features_df = pd.DataFrame(user_features, columns=[
    'tweet_frequency', 'retweet_ratio', 'mention_ratio', 'hashtag_ratio', 'url_ratio', 
    'sensitive_content_ratio', 'account_age_days', 'is_verified', 'profile_completeness', 
    'follower_following_ratio'
])

# Normalize and scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)


Graph Construction

In [None]:
def dynamic_subgraph_sampling(graph, root_node, walk_length=30, num_walks=20, seed=None):
    """Sample subgraphs around ``root_node`` with uniform random walks.

    Each walk starts at ``root_node`` and repeatedly moves to a uniformly
    chosen entry of ``graph.neighbors(current_node)``. The walk stops early
    when the current node has no neighbors. The nodes visited by one walk
    induce one subgraph.

    Args:
        graph: Object exposing ``neighbors(node)`` and ``subgraph(nodes)``
            (whose result has ``copy()``), as used below.
        root_node: Node every walk starts from.
        walk_length: Maximum number of steps per walk.
        num_walks: Number of walks, and therefore of returned subgraphs.
        seed: Optional seed. When given, a private random generator is used so
            repeated calls return the same walks; when ``None`` (default) the
            module-level ``random`` functions are used, as before.

    Returns:
        A list of ``num_walks`` independent subgraph copies.
    """
    # Local import: `random` is not imported at the top of the notebook.
    import random

    # A private generator gives reproducible walks without disturbing the
    # global random state that other cells may rely on.
    rng = random if seed is None else random.Random(seed)

    subgraphs = []
    for _ in range(num_walks):
        visited = {root_node}
        current_node = root_node
        for _step in range(walk_length):
            neighbors = list(graph.neighbors(current_node))
            if not neighbors:
                break  # dead end: nowhere left to walk
            current_node = rng.choice(neighbors)
            visited.add(current_node)
        # .copy() detaches the sample from the parent graph.
        subgraphs.append(graph.subgraph(visited).copy())
    return subgraphs

In [None]:
# Directed graph built from the 'source_id' -> 'target_id' columns of the edge table.
G = nx.from_pandas_edgelist(edges, source='source_id', target='target_id', create_using=nx.DiGraph())

# Ensure this function is robust and handles the variety of entity and relation types
# The sampler's parameter is named `root_node`; passing `start_node=` raised TypeError.
# NOTE(review): 'some_user_id' is a placeholder and must be replaced with an id present in G.
dynamic_subgraphs = dynamic_subgraph_sampling(G, root_node='some_user_id')

Graph Data Preparation

In [None]:
def prepare_graph_data(edges, node_features):
    """Encode node ids as contiguous integers and build the graph tensors.

    Args:
        edges: DataFrame with 'source_id' and 'target_id' columns. As before,
            'source_encoded' and 'target_encoded' columns are written to it.
        node_features: DataFrame with a 'node_id' column and a 'features'
            column holding one feature sequence per node. As before, a
            'node_id_encoded' column is written to it.

    Returns:
        ``(edge_index, node_features_tensor)``, both moved to ``device``:
        a 2 x num_edges long tensor of encoded endpoints, and a float tensor
        whose row ``i`` holds the features of the node encoded as ``i``.

    Raises:
        ValueError: If the feature table does not contain exactly one row for
            every node id, in which case rows could not be matched to the
            encoded indices.
    """
    node_encoder = LabelEncoder()
    # Fit on every id that occurs anywhere. Fitting on 'source_id' alone made
    # transform() fail for nodes that only ever appear as a target or only in
    # the feature table.
    all_node_ids = pd.concat(
        [edges['source_id'], edges['target_id'], node_features['node_id']],
        ignore_index=True,
    )
    node_encoder.fit(all_node_ids)
    edges['source_encoded'] = node_encoder.transform(edges['source_id'])
    edges['target_encoded'] = node_encoder.transform(edges['target_id'])

    # Stack into a single array first; building a tensor from a list of arrays
    # is slow and triggers a PyTorch warning.
    endpoint_array = np.stack([
        edges['source_encoded'].to_numpy(),
        edges['target_encoded'].to_numpy(),
    ])
    edge_index = torch.tensor(endpoint_array, dtype=torch.long)

    node_features['node_id_encoded'] = node_encoder.transform(node_features['node_id'])
    node_features = node_features.sort_values('node_id_encoded')

    # Row i of the feature matrix must describe encoded node i, otherwise
    # edge_index would silently point at the wrong (or missing) rows.
    expected_codes = np.arange(len(node_encoder.classes_))
    if not np.array_equal(node_features['node_id_encoded'].to_numpy(), expected_codes):
        raise ValueError(
            "node_features must contain exactly one row for every node id "
            "referenced by edges"
        )

    node_features_tensor = torch.tensor(node_features['features'].tolist(), dtype=torch.float)

    return edge_index.to(device), node_features_tensor.to(device)

In [None]:
# `edges` and `nodes` are the frames read in the data-loading cell earlier in
# this notebook. prepare_graph_data reads 'source_id'/'target_id' from the
# first and 'node_id'/'features' from the second.
# NOTE: `node_features` is bound here to the feature tensor returned by
# prepare_graph_data, not to a DataFrame.
edge_index, node_features = prepare_graph_data(edges, nodes)

Model Definition

In [None]:
class EnhancedGCN(nn.Module):
    """Two-layer node encoder: a GCN layer followed by a multi-head GAT layer.

    Args:
        in_channels: Width of the input node features.
        out_channels: Per-head output width of the GAT layer; the GCN layer
            produces ``2 * out_channels`` features.
        heads: Number of GAT attention heads (default 3, as before).
        dropout: Dropout probability applied between the two layers
            (default 0.5, the ``F.dropout`` default used before).

    Attributes:
        output_dim: Width of the node embeddings returned by ``forward``.
    """

    def __init__(self, in_channels, out_channels, heads=3, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GATConv(2 * out_channels, out_channels, heads=heads)  # Using GAT for the second layer
        # GATConv concatenates its heads by default, so the embedding width is
        # out_channels * heads. Exposed so consumers never hard-code it.
        self.output_dim = out_channels * heads

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` over ``edge_index``."""
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is only active in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.conv2(x, edge_index))
        return x

class GraphBotDetector(nn.Module):
    """Graph-level classifier: node encoder, mean pooling, then a linear head.

    Args:
        feature_dim: Width of the input node features.
        num_classes: Number of output logits per graph.
        hidden_channels: Per-head width of the encoder (default 128, as before).
        heads: Number of GAT heads in the encoder (default 3, as before).
    """

    def __init__(self, feature_dim, num_classes, hidden_channels=128, heads=3):
        super().__init__()
        self.feature_extractor = EnhancedGCN(feature_dim, hidden_channels, heads=heads)
        self.pool = global_mean_pool
        # Derived from the encoder instead of a hand-maintained `128 * 3`, so
        # changing the head count or hidden width cannot desynchronise the head.
        self.fc = nn.Linear(self.feature_extractor.output_dim, num_classes)

    def forward(self, batch_data):
        """Return one row of class logits per graph in ``batch_data``.

        Reads ``batch_data.x``, ``batch_data.edge_index`` and
        ``batch_data.batch``.
        """
        node_features = self.feature_extractor(batch_data.x, batch_data.edge_index)
        graph_features = self.pool(node_features, batch_data.batch)
        return self.fc(graph_features)

In [None]:
# Model definition (GCN with GAT, previously defined)
# nn.Module moves parameters with `.to(device)`; it has no `to_device` method,
# so the previous call raised AttributeError.
model = GraphBotDetector(feature_dim=256, num_classes=2).to(device)  # Adjust the feature_dim as per the actual feature vector length

# Training preparation (Loss function, optimizer, dataloaders, etc.)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

Training and Evaluation

In [None]:
# Placeholder function for splitting data based on Twibot-22 'split.csv'
def load_splits():
    """Return the ``(train_idx, valid_idx, test_idx)`` index collections.

    Not implemented yet. The stub used to return ``None`` silently, which only
    surfaced later as an unrelated-looking unpacking TypeError at the call
    site. It now fails immediately with an explicit message.

    Raises:
        NotImplementedError: Always, until the loading of 'split.csv' is
            implemented.
    """
    raise NotImplementedError(
        "load_splits() is a placeholder: implement loading of 'split.csv' "
        "and return (train_idx, valid_idx, test_idx)."
    )

# NOTE(review): load_splits() is still a placeholder; this unpacking cannot
# succeed until it returns three index collections.
train_idx, valid_idx, test_idx = load_splits()

# Prepare the dataset
# NOTE(review): `labels` is the DataFrame read from label.csv. Indexing a
# DataFrame with `labels[idx]` selects columns, not rows — presumably row
# selection on a label column is intended; confirm against label.csv's schema.
# NOTE(review): this also assumes the rows of `scaled_features` are in the same
# order as the rows the split indices refer to — TODO confirm.
X_train, y_train = scaled_features[train_idx], labels[train_idx]
X_valid, y_valid = scaled_features[valid_idx], labels[valid_idx]
X_test, y_test = scaled_features[test_idx], labels[test_idx]

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float).to(device)
# NOTE(review): the criterion defined in this notebook is nn.CrossEntropyLoss,
# which takes integer class indices as targets; a float target vector
# presumably needs to become torch.long before these tensors are used for
# training — confirm.
y_train_tensor = torch.tensor(y_train, dtype=torch.float).to(device)
# ... do the same for X_valid_tensor, y_valid_tensor, X_test_tensor, y_test_tensor

In [None]:
# Size the input layer from the node-feature tensor that is actually fed to the
# model (Data.x below) instead of a guessed constant; the first graph layer
# requires its input width to match the second dimension of x.
model = GraphBotDetector(feature_dim=node_features.size(1), num_classes=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Placeholder for actual DataLoader implementation
# NOTE(review): this Data object carries no `y`, but train() computes the loss
# against `data.y`; targets must be attached before training can run.
data_loader = DataLoader([Data(x=node_features, edge_index=edge_index)], batch_size=1, shuffle=True)

In [None]:
def train(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(batch)``.
        loader: Iterable of batches; each batch must provide ``to(device)``
            and a ``y`` attribute holding the targets.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(output, batch.y)``.

    Returns:
        The average of the per-batch losses as a float, or ``0.0`` when the
        loader yields no batches.
    """
    model.train()

    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device`, so the function works on its own.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    for data in loader:
        if target_device is not None:
            data = data.to(target_device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # An empty loader used to raise ZeroDivisionError here.
    return total_loss / num_batches if num_batches else 0.0

In [None]:
# Example training call
# Named constant instead of a magic number buried in the loop header.
NUM_EPOCHS = 10  # Number of epochs should be adjusted based on actual needs
for epoch in range(NUM_EPOCHS):
    loss = train(model, data_loader, optimizer, criterion)
    print(f'Epoch {epoch+1}: Loss {loss}')