In [2]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
import os
import gc
import torch
import pandas as pd
import numpy as np
import dask.dataframe as dd
import torch.nn.functional as F
from torch.nn import Linear, ReLU
from torch_geometric.data import HeteroData
from torch_geometric.nn import GATConv, global_mean_pool
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.preprocessing import LabelEncoder, StandardScaler
import networkx as nx
from datetime import datetime
from random import choice, choices, sample
from sklearn.metrics import roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Paths: one entry per dataset file; 'tweets' is a list of nine shard files.
DATA_PATHS = dict(
    users='cache/TwiBot-22/user.json',
    tweets=[f'cache/TwiBot-22/tweet_{shard}.json' for shard in range(9)],
    lists='cache/TwiBot-22/list.json',
    hashtags='cache/TwiBot-22/hashtag.json',
    edges='cache/TwiBot-22/edge.csv',
    labels='cache/TwiBot-22/label.csv',
    splits='cache/TwiBot-22/split.csv',
)

# Load a CSV through Dask and hand back the fully computed result
def load_data_with_dask(file_path):
    """Read ``file_path`` with ``dask.dataframe.read_csv`` and compute it eagerly.

    Args:
        file_path: Path (or pattern) passed straight to ``dd.read_csv``.

    Returns:
        The result of calling ``.compute()`` on the Dask dataframe, i.e. the
        whole table is materialized before this function returns.
    """
    return dd.read_csv(file_path).compute()

# Chunked CSV reading
def load_data_in_chunks(file_path, chunk_size=1000000):
    """Open a CSV for chunk-wise reading.

    Args:
        file_path: Path or file-like object passed to ``pd.read_csv``.
        chunk_size: Number of rows per chunk (forwarded as ``chunksize``).

    Returns:
        The reader object produced by ``pd.read_csv(..., chunksize=...)``;
        iterating it yields one DataFrame per chunk.
    """
    chunk_reader = pd.read_csv(file_path, chunksize=chunk_size)
    return chunk_reader

# Free unreachable Python objects
def clear_memory():
    """Trigger a garbage-collection pass via ``gc.collect()``; returns nothing."""
    _unreachable = gc.collect()

# Define the model
class GraphBotDetector(torch.nn.Module):
    """Graph-level classifier: one multi-head GAT layer, mean pooling, linear head.

    Args:
        feature_dim: Size of each input node feature vector.
        num_classes: Number of logits produced per graph.
        hidden_dim: Output channels per attention head (default 128).
        heads: Number of attention heads; head outputs are concatenated
            (default 3), so the pooled embedding has ``hidden_dim * heads`` channels.
    """

    def __init__(self, feature_dim, num_classes, hidden_dim=128, heads=3):
        super().__init__()
        self.feature_extractor = GATConv(feature_dim, hidden_dim, heads=heads, concat=True)
        self.pool = global_mean_pool
        # concat=True stacks the heads, so the classifier input is hidden_dim * heads wide.
        self.fc = torch.nn.Linear(hidden_dim * heads, num_classes)

    def forward(self, x, edge_index=None, batch=None):
        """Compute per-graph logits.

        Supports both call styles used in this notebook:
        ``model(x, edge_index, batch)`` and ``model(data)``, where ``data``
        exposes ``x``, ``edge_index`` and optionally ``batch``.

        Args:
            x: Node feature tensor, or a data object when ``edge_index`` is omitted.
            edge_index: Edge index tensor; ``None`` means ``x`` is a data object.
            batch: Graph assignment per node; ``None`` treats all nodes as one graph.

        Returns:
            Tensor of logits with one row per graph.
        """
        if edge_index is None:
            # Single-argument call: unpack the data object.
            data = x
            x, edge_index = data.x, data.edge_index
            if batch is None:
                batch = getattr(data, 'batch', None)
        if batch is None:
            # No batch vector: pool every node into a single graph embedding.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = F.relu(self.feature_extractor(x, edge_index))
        x = self.pool(x, batch)
        return self.fc(x)

# Convert dense matrix to sparse tensor
def convert_to_sparse_tensor(matrix):
    """Convert a dense (or scipy-sparse) 2-D matrix into a torch sparse COO tensor.

    Args:
        matrix: Anything ``scipy.sparse.coo_matrix`` accepts (ndarray, nested
            lists, another scipy sparse matrix).

    Returns:
        A ``torch.float32`` sparse COO tensor with the same shape as ``matrix``.
    """
    # Local imports: scipy.sparse is not imported by the notebook's import cell.
    import numpy as np
    import scipy.sparse as sp

    coo = sp.coo_matrix(matrix)
    # Stack row/col into a single (2, nnz) array first; building a tensor from a
    # Python list of arrays is slow and triggers a PyTorch warning.
    indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.float32)
    shape = tuple(int(dim) for dim in coo.shape)
    # torch.sparse.FloatTensor is a deprecated constructor; sparse_coo_tensor is its replacement.
    return torch.sparse_coo_tensor(indices, values, shape)

# Build a typed directed graph from an edge table
def construct_heterogeneous_graph(edges):
    """Build a directed graph whose edges carry a ``type`` attribute.

    Args:
        edges: DataFrame with ``source_id`` and ``target_id`` columns plus a
            relation column named either ``relationship_type`` or ``relation``.
            Optional ``source_type`` / ``target_type`` columns, when present,
            are stored as the ``type`` attribute of the corresponding nodes.

    Returns:
        ``nx.DiGraph`` with one edge per row (attribute ``type`` = relation).

    Raises:
        KeyError: If neither relation column is present.
    """
    # The edge file read elsewhere in this notebook names the column 'relation',
    # while this function originally required 'relationship_type'; accept both.
    relation_column = next(
        (name for name in ('relationship_type', 'relation') if name in edges.columns),
        None,
    )
    if relation_column is None:
        raise KeyError("edges needs a 'relationship_type' or 'relation' column")

    H = nx.DiGraph()
    # Column-wise zip avoids building a Series per row as iterrows() does.
    H.add_edges_from(
        (source, target, {'type': relation})
        for source, target, relation in zip(
            edges['source_id'], edges['target_id'], edges[relation_column]
        )
    )

    # Node types are optional: only annotate nodes when the table provides them.
    if 'source_type' in edges.columns:
        nx.set_node_attributes(H, dict(zip(edges['source_id'], edges['source_type'])), 'type')
    if 'target_type' in edges.columns:
        nx.set_node_attributes(H, dict(zip(edges['target_id'], edges['target_type'])), 'type')
    return H

def heterogeneous_subgraph_sampling(graph, root_node, walk_length=30, num_walks=20):
    """Sample subgraphs around ``root_node`` with uniform random walks.

    Each walk starts at ``root_node`` and repeatedly moves to a uniformly
    chosen neighbor (as returned by ``graph.neighbors``) until ``walk_length``
    steps are taken or a node without neighbors is reached.

    Args:
        graph: networkx graph to walk on.
        root_node: Start node of every walk; must be in ``graph``.
        walk_length: Maximum number of steps per walk.
        num_walks: Number of walks, i.e. number of subgraphs returned.

    Returns:
        List of ``nx.DiGraph`` objects, one per walk. Each contains the visited
        nodes (with their attributes, in visiting order starting with the
        root) and only the traversed edges, with the attributes those edges
        have in ``graph``.
    """
    subgraphs = []
    for _ in range(num_walks):
        current_node = root_node
        # dict used as an insertion-ordered set so node order is reproducible.
        visited_nodes = {root_node: None}
        walked_edges = []

        for _step in range(walk_length):
            neighbors = list(graph.neighbors(current_node))
            if not neighbors:
                break

            # Uniform choice among neighbors (all weights were equal before).
            next_node = choices(neighbors, k=1)[0]
            visited_nodes[next_node] = None
            # Keep the edge attributes (e.g. relation type) instead of dropping
            # them, and do not require a specific attribute name to exist.
            edge_attributes = dict(graph.edges[current_node, next_node])
            walked_edges.append((current_node, next_node, edge_attributes))
            current_node = next_node

        # Assemble the walk as its own graph: visited nodes plus traversed edges.
        sampled_subgraph = nx.DiGraph()
        sampled_subgraph.add_nodes_from((n, graph.nodes[n]) for n in visited_nodes)
        sampled_subgraph.add_edges_from(walked_edges)
        subgraphs.append(sampled_subgraph)
    return subgraphs

In [3]:
import networkx as nx
import pandas as pd
import json
from datetime import datetime
import numpy as np
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset, DataLoader

# Define feature extraction function
def extract_user_features(users, now=None):
    """Build one numeric feature row per user.

    Args:
        users: DataFrame whose rows expose ``created_at`` (formatted like
            ``'Wed Jan 01 00:00:00 +0000 2020'``), ``tweet_count``,
            ``retweet_count``, ``mention_count``, ``hashtag_count``,
            ``url_count``, ``sensitive_tweet_count``, ``verified``,
            ``description``, ``location``, ``profile_image_url``,
            ``followers_count`` and ``following_count``.
        now: Optional naive ``datetime`` used as the reference time for the
            account age. Defaults to the current UTC time.

    Returns:
        List of 10-element lists, in row order: tweet frequency, retweet ratio,
        mention ratio, hashtag ratio, url ratio, sensitive-content ratio,
        account age in days, verified flag, profile completeness (0-3),
        follower/following ratio.
    """
    from datetime import timezone  # local import: the cell only imports `datetime`

    if now is None:
        # created_at is parsed as a '+0000' timestamp, so measure age against
        # UTC rather than the machine's local clock.
        now = datetime.now(timezone.utc).replace(tzinfo=None)

    def _is_filled(value):
        # Missing values (None/NaN) are not strings and must not count as filled.
        return isinstance(value, str) and value.strip() != ''

    features = []
    for user in users.itertuples():
        created_at = datetime.strptime(user.created_at, '%a %b %d %H:%M:%S +0000 %Y')
        account_age_days = (now - created_at).days
        # Accounts younger than one day would otherwise divide by zero.
        active_days = max(account_age_days, 1)
        tweet_count = max(user.tweet_count, 1)  # Avoid division by zero

        profile_completeness = (
            int(_is_filled(user.description))
            + int(_is_filled(user.location))
            + int(_is_filled(user.profile_image_url))
        )

        features.append([
            user.tweet_count / active_days,            # tweet_frequency
            user.retweet_count / tweet_count,          # retweet_ratio
            user.mention_count / tweet_count,          # mention_ratio
            user.hashtag_count / tweet_count,          # hashtag_ratio
            user.url_count / tweet_count,              # url_ratio
            user.sensitive_tweet_count / tweet_count,  # sensitive_content_ratio
            account_age_days,                          # account_age_days
            int(user.verified),                        # is_verified
            profile_completeness,                      # profile_completeness
            user.followers_count / max(user.following_count, 1),  # follower_following_ratio
        ])
    return features

# Load global edge data and construct the heterogeneous graph
def load_graph(edge_file):
    """Read an edge CSV and build a directed graph from it.

    Args:
        edge_file: Path to a CSV with ``source_id``, ``target_id`` and
            ``relation`` columns.

    Returns:
        ``nx.DiGraph`` with one edge per row; each edge stores the row's
        ``relation`` value under the ``relation`` attribute.
    """
    edges = pd.read_csv(edge_file)

    # Bulk insert from the columns: iterrows() builds a Series per row, which
    # is very slow for large edge lists.
    G = nx.DiGraph()
    G.add_edges_from(
        (source, target, {'relation': relation})
        for source, target, relation in zip(
            edges['source_id'], edges['target_id'], edges['relation']
        )
    )
    return G

# Reuse the configured edge path instead of repeating the literal.
G = load_graph(DATA_PATHS['edges'])

KeyError: 'relationship_type'

In [1]:
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
import numpy as np
import dask.dataframe as dd
import networkx as nx
import json

class TwibotDataset(Dataset):
    """Dataset yielding one neighborhood subgraph per user row.

    Each item is a PyTorch Geometric ``Data`` object built from the nodes
    within ``num_hops`` of the user in the global graph.

    NOTE(review): no labels (``y``) are attached here; a supervised training
    loop needs them added from the label file.
    """

    def __init__(self, graph, data_paths, transform=None, num_hops=1):
        """
        Initialize the dataset using global graph and data paths.

        Args:
            graph: Global networkx graph shared by all items.
            data_paths: Mapping with at least a ``'users'`` entry (a path or a
                list of paths to user JSON files).
            transform: Optional callable applied to ``data.x`` of every item.
            num_hops: Neighborhood radius used when extracting subgraphs.
        """
        self.graph = graph
        self.data_paths = data_paths
        self.transform = transform
        self.num_hops = num_hops
        self.nodes = self.load_nodes()

    def load_nodes(self):
        """Load user records and return their feature table (one row per user)."""
        # User features come from the user file(s), not from the tweet shards.
        user_paths = self.data_paths['users']
        if isinstance(user_paths, str):
            user_paths = [user_paths]
        users = pd.concat([pd.read_json(path) for path in user_paths], ignore_index=True)

        # Extract features
        features_df = pd.DataFrame(extract_user_features(users))
        if 'id' in users.columns:
            # Index by user id so rows can be matched to graph nodes.
            # NOTE(review): assumes 'id' values match the graph's node ids - confirm.
            features_df.index = users['id'].values
            features_df = features_df[~features_df.index.duplicated(keep='first')]
        return features_df

    def __len__(self):
        """
        Return the number of items in the dataset.
        """
        return len(self.nodes)

    def __getitem__(self, idx):
        """Return the ``Data`` object for the subgraph around row ``idx``."""
        node_id = self.nodes.index[idx]

        # Create a dynamic subgraph centered at the node_id
        subgraph = self.create_subgraph(node_id)

        # Convert subgraph to PyTorch Geometric Data
        data = self.subgraph_to_data(subgraph)

        if self.transform:
            data.x = self.transform(data.x)

        return data

    def create_subgraph(self, node_id):
        """Return the subgraph of nodes within ``num_hops`` of ``node_id``.

        Both incoming and outgoing edges are followed. A node that is absent
        from the global graph yields a single-node graph.
        """
        if node_id not in self.graph:
            isolated = nx.DiGraph()
            isolated.add_node(node_id)
            return isolated

        visited = {node_id}
        frontier = {node_id}
        for _ in range(self.num_hops):
            reached = set()
            for node in frontier:
                # all_neighbors covers predecessors and successors.
                reached.update(nx.all_neighbors(self.graph, node))
            frontier = reached - visited
            if not frontier:
                break
            visited |= frontier
        return self.graph.subgraph(visited)

    def _node_features(self, node_id):
        """Feature row for a node: graph attribute, else feature table, else zeros."""
        if node_id in self.graph:
            stored = self.graph.nodes[node_id].get('features')
            if stored is not None:
                return list(stored)
        if node_id in self.nodes.index:
            return self.nodes.loc[node_id].tolist()
        # Nodes without user features get a zero vector.
        return [0.0] * self.nodes.shape[1]

    def subgraph_to_data(self, subgraph):
        """
        Convert a networkx subgraph to PyTorch Geometric Data.
        """
        node_ids = list(subgraph.nodes())
        # edge_index must hold contiguous integer positions, not raw node ids.
        position = {node: i for i, node in enumerate(node_ids)}
        pairs = [(position[source], position[target]) for source, target in subgraph.edges()]
        if pairs:
            edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        else:
            # No edges: keep the expected (2, 0) long shape.
            edge_index = torch.empty((2, 0), dtype=torch.long)

        if node_ids:
            x = torch.tensor([self._node_features(n) for n in node_ids], dtype=torch.float)
        else:
            x = torch.zeros((0, self.nodes.shape[1]), dtype=torch.float)
        return Data(x=x, edge_index=edge_index)

# Assuming preprocessing functions are defined, if any.
# NOTE(review): this placeholder is not passed to TwibotDataset in the visible cells;
# replace it with a real feature transform (e.g. scaling) before wiring it in.
transform = torch.tensor  # Placeholder, define proper transformations based on model requirements


In [5]:
# Load data
data_paths = DATA_PATHS

# Example usage of TwibotDataset and DataLoader
# The dataset yields PyTorch Geometric `Data` objects, which the default
# collate function of torch.utils.data.DataLoader cannot batch; PyG's own
# DataLoader merges them into a single batched graph.
from torch_geometric.loader import DataLoader as GeometricDataLoader

dataset = TwibotDataset(G, data_paths)
loader = GeometricDataLoader(dataset, batch_size=10, shuffle=True) #num_workers=4

# Build the typed graph used for random-walk sampling and pick a random start node.
edges = load_data_with_dask(data_paths['edges'])
graph = construct_heterogeneous_graph(edges)
root_node = sample(list(graph.nodes), 1)[0]

In [None]:
class GraphBotDetector(torch.nn.Module):
    """Graph-level classifier: one multi-head GAT layer, mean pooling, linear head.

    Args:
        feature_dim: Size of each input node feature vector.
        num_classes: Number of logits produced per graph.
    """

    def __init__(self, feature_dim, num_classes):
        super().__init__()
        self.feature_extractor = GATConv(feature_dim, 128, heads=3, concat=True)
        self.pool = global_mean_pool
        # Three concatenated heads of 128 channels each feed the classifier.
        self.fc = torch.nn.Linear(128 * 3, num_classes)

    def forward(self, data, edge_index=None, batch=None):
        """Compute per-graph logits.

        Supports both call styles used in this notebook: ``model(data)`` with
        an object exposing ``x``, ``edge_index`` and optionally ``batch``, and
        ``model(x, edge_index, batch)`` with explicit tensors.

        Args:
            data: Data object, or the node feature tensor when ``edge_index``
                is given.
            edge_index: Edge index tensor for the explicit-tensor call style.
            batch: Graph assignment per node; ``None`` treats all nodes as one graph.

        Returns:
            Tensor of logits with one row per graph.
        """
        if edge_index is None:
            x, edge_index = data.x, data.edge_index
            if batch is None:
                # Use the mini-batch assignment when the loader provides one, so
                # each graph in the batch gets its own pooled embedding.
                batch = getattr(data, 'batch', None)
        else:
            x = data
        if batch is None:
            # Single graph: every node belongs to graph 0.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = F.relu(self.feature_extractor(x, edge_index))
        x = self.pool(x, batch)
        x = self.fc(x)
        return x

# Assume feature_dim correctly matches the output of extract_user_features
model = GraphBotDetector(feature_dim=10, num_classes=2).to(device)
optimizer = Adam(model.parameters(), lr=0.01)
criterion = CrossEntropyLoss()

# Training loop: a single pass over the loader.
model.train()  # the mode does not change inside the loop, so set it once
for data in loader:
    # Keep the batch on the same device as the model.
    data = data.to(device)
    if getattr(data, 'y', None) is None:
        # Fail with a clear message instead of a cryptic error inside the loss.
        raise ValueError("batch has no labels (data.y); attach targets before training")
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, data.y)
    loss.backward()
    optimizer.step()

In [None]:
# Sample subgraphs
# All walks start from root_node; walk_length and num_walks are left at the function defaults.
subgraphs = heterogeneous_subgraph_sampling(graph, root_node)

In [None]:
# Build the loss, the model (placed on `device`), its optimizer and the TensorBoard writer
criterion = CrossEntropyLoss()
model = GraphBotDetector(feature_dim=10, num_classes=2).to(device)
optimizer = Adam(model.parameters(), lr=0.01)
writer = SummaryWriter()

In [None]:
# Train model
def _subgraph_to_data(subgraph, root, feature_dim=10):
    """Convert one sampled networkx subgraph into a PyG ``Data`` object.

    Node ids are mapped to contiguous positions for ``edge_index``. Node
    features are read from each node's ``'features'`` attribute, with a zero
    vector of length ``feature_dim`` as fallback.

    NOTE(review): assumes the graph label is stored as a ``'label'`` attribute
    on the root node - confirm, or attach labels from the label file instead.
    """
    node_ids = list(subgraph.nodes())
    position = {node: i for i, node in enumerate(node_ids)}
    zero_row = [0.0] * feature_dim
    x = torch.tensor(
        [subgraph.nodes[node].get('features', zero_row) for node in node_ids],
        dtype=torch.float,
    )
    pairs = [(position[source], position[target]) for source, target in subgraph.edges()]
    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    label = subgraph.nodes[root].get('label') if root in subgraph else None
    y = None if label is None else torch.tensor([int(label)], dtype=torch.long)
    return Data(x=x, edge_index=edge_index, y=y)


global_step = 0  # one TensorBoard point per optimizer step
for epoch in range(10):  # Adjust based on dataset size and convergence observations
    model.train()
    for subgraph in subgraphs:
        data = _subgraph_to_data(subgraph, root_node).to(device)
        if data.y is None:
            # Without a target there is nothing to train against.
            raise ValueError("sampled subgraph has no label; attach labels before training")
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.batch)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        # Logging with `epoch` as the step would stack every subgraph of an
        # epoch on the same x-position; use a running step counter instead.
        writer.add_scalar('Loss/train', loss.item(), global_step)
        global_step += 1


In [None]:
# Evaluate model
# Switches the model to evaluation mode; no predictions or metrics are computed yet.
model.eval()
# Placeholder for evaluation logic

In [None]:
# Close the TensorBoard SummaryWriter, then run clear_memory() (a gc.collect() pass).
writer.close()
clear_memory()