## PG Data Set

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
np.random.seed(0)

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('..')


import torch
from torch_geometric.data import Dataset, Data
from sklearn.preprocessing import OneHotEncoder
import os
import pandas as pd
from tqdm import tqdm
import networkx as nx

from src.config import *

In [2]:
class PageGraphDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(PageGraphDataset, self).__init__(root, transform, pre_transform)
    

    @property
    def raw_file_names(self):
        return 'graph-metadata.csv'
    

    @property
    def processed_file_names(self):
        # Used in `self.process`
        self.data = self.__get_graph_metadata(filter_nas=True, uq=0.99, uc=0.99).reset_index()

        return [f'data_{ix}.pt' for ix in self.data.index.to_list()]


    def download(self):
        pass
    

    def process(self):
        # Get node and edge types across all graphs
        graph_ids_with_labels = self.data['graph_id_with_label'].to_list()
        node_types_set, edge_types_set = self.__generate_sets_of_edge_and_node_types(graph_ids_with_labels)
        
        # Fit encoders
        node_types_np = np.array(list(node_types_set)).reshape(-1, 1)
        node_type_enc = OneHotEncoder(handle_unknown='ignore')
        node_type_enc.fit(node_types_np)

        edge_types_np = np.array(list(edge_types_set)).reshape(-1, 1)
        edge_type_enc = OneHotEncoder(handle_unknown='ignore')
        edge_type_enc.fit(edge_types_np)
        
        for (ix, row) in tqdm(self.data.iterrows()):
            G = self.__load_relabelled_graph(row['graph_id_with_label'])

            node_types_oe = self.__encode_node_features(G, node_type_enc)
            edge_types_oe = self.__encode_edge_features(G, edge_type_enc)
            edge_index = self.__encode_edge_index(G)
                
            _, label = row['graph_id_with_label'].split('-')
            data = Data(x=node_types_oe, edge_index=edge_index, edge_attr=edge_types_oe, y=torch.tensor(int(label), dtype=torch.int64), graph_id=row['graph_id_with_label'])
            torch.save(data, os.path.join(self.processed_dir, f'data_{ix}.pt'))

            
    def len(self):
        return self.data.shape[0]
            

    def get(self, graph_id):
        return torch.load(os.path.join(self.processed_dir, f'data_{graph_id}.pt'))
            

    def __get_subdirectory(self, label):
        return BROKEN_DIR if bool(int(label)) else UNBROKEN_DIR


    def __get_graph_path(self, graph_id_with_label, graph_type):
        assert(isinstance(graph_type, GraphType))

        graph_id, label = graph_id_with_label.split('-')
        if graph_type == GraphType.DELTA:
            path = DELTA_PATH
        elif graph_type == GraphType.PRE_INTERVENTION:
            path = PRE_INTERVENTION_PATH
        elif graph_type == GraphType.POST_INTERVENTION:
            path = POST_INTERVENTION_PATH
        else:
            return None

        return os.path.join(self.raw_dir, self.__get_subdirectory(label), graph_id, path)


    def __get_graph_metadata(self, filter_nas=False, iqr=False, uq=False, uc=1):
        meta_df = pd.read_csv(self.raw_paths[0], converters={'G_pre_cols': pd.eval})

        if filter_nas:
            meta_df = meta_df[meta_df[['G_pre_nodes', 'G_delta_nodes']].isna().sum(axis=1) == 0]

        if not (iqr or uq):
            return meta_df

        if iqr:
            assert(0 < iqr < 1)
            eps = (1-iqr)/2
            lower_quantile, upper_quantil = 0+eps, 1-eps

            lb_nodes = meta_df['G_pre_nodes'].quantile(lower_quantile)
            ub_nodes = meta_df['G_pre_nodes'].quantile(upper_quantil)
            lb_edges = meta_df['G_pre_edges'].quantile(lower_quantile)
            ub_edges = meta_df['G_pre_edges'].quantile(upper_quantil)

            predicate = """
                G_pre_nodes >= %i and \
                G_pre_nodes <=%i and \
                G_pre_edges >= %i and \
                G_pre_edges <=%i
            """ % (lb_edges, ub_edges, lb_nodes, ub_nodes)
            return meta_df.query(predicate)

        if uq:
            assert(0 < uq < 1)
            meta_df['edge_node_ratio'] = meta_df['G_pre_edges'] / meta_df['G_pre_nodes']
            ub_edge_node_ratio = meta_df['edge_node_ratio'].quantile(uq)

            assert(0 < uc <= 1)
            ub_node_count = meta_df['G_pre_nodes'].quantile(uc)
            ub_edge_count = meta_df['G_pre_edges'].quantile(uc)

            predicate = """
                edge_node_ratio <= %i and \
                G_pre_nodes <= %i and \
                G_pre_edges <= %i
            """ % (ub_edge_node_ratio, ub_node_count, ub_edge_count)
            return meta_df.query(predicate)
    

    def __load_relabelled_graph(self, graph_id_with_label):
        graph_path = self.__get_graph_path(graph_id_with_label, GraphType.DELTA)
        G = nx.read_graphml(graph_path)
        return nx.convert_node_labels_to_integers(G)


    def __generate_sets_of_edge_and_node_types(self, ixs):
        node_types_set = set()
        edge_types_set = set()

        for graph_id in tqdm(ixs):
            G = self.__load_relabelled_graph(graph_id)

            graph_node_types_ser = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')['node type'].unique()
            node_types_set = node_types_set.union(set(graph_node_types_ser))

            edge_node_types_ser = nx.to_pandas_edgelist(G)['edge type']
            edge_types_set = edge_types_set.union(set(edge_node_types_ser))

        return node_types_set, edge_types_set

    
    def __encode_node_features(self, G, enc):
        node_types_ser = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')['node type']
        node_types_np = node_types_ser.to_numpy().reshape(-1, 1)
        return torch.tensor(enc.transform(node_types_np).toarray(), dtype=torch.float)


    def __encode_edge_features(self, G, enc):
        edge_types_ser = nx.to_pandas_edgelist(G)['edge type']
        edge_types_np = edge_types_ser.to_numpy().reshape(-1, 1)
        return torch.tensor(enc.transform(edge_types_np).toarray(), dtype=torch.float)

    
    def __encode_edge_index(self, G):
        # edges = list(G.edges(keys=False))
        edges = list(G.edges())
        return torch.tensor(edges, dtype=torch.long).t().contiguous()
    
dataset = PageGraphDataset('/Volumes/brave-build-drive/pg-gnn')

In [3]:
dataset

PageGraphDataset(1924)

## Training

In [4]:
import torch
from torch.nn import Linear, Sigmoid
import torch.nn.functional as F 
from torch_geometric.data import DataLoader
from torch_geometric.nn import (GCNConv, GATConv, GATv2Conv, TopKPooling,
                                global_mean_pool, global_max_pool)

In [5]:
embedding_size = 64
num_node_features = dataset[0].x.shape[1]
num_edge_features = dataset[0].edge_attr.shape[1]
num_conv_layers = 3-2
num_heads = 1

num_node_features, num_edge_features, num_conv_layers

(16, 24, 1)

In [10]:
class GCN(torch.nn.Module):
    def __init__(self):
        # Init parent
        super(GCN, self).__init__()

        # torch.manual_seed(0)
        self.initial_conv = GATv2Conv(num_node_features, embedding_size, heads=num_heads, edge_dim=num_edge_features)

        self.module_list = torch.nn.ModuleList()
        for i in range(num_conv_layers):
            conv = GATv2Conv(embedding_size, embedding_size, heads=num_heads, edge_dim=num_edge_features)
            self.module_list.append(conv)

        self.final_conv = GATv2Conv(embedding_size, embedding_size, heads=num_heads, edge_dim=num_edge_features)
        
        self.lin = Linear(embedding_size*1, 1)
        self.sig = Sigmoid()
        
    def forward(self, x, edge_index, batch, edge_attr=None):
        # 1. Obtain node embeddings 
        x = self.initial_conv(x, edge_index, edge_attr)
        x = x.relu()
        
        for conv in self.module_list:
            x = conv(x, edge_index, edge_attr)
            x = x.relu()
        
        x = self.final_conv(x, edge_index, edge_attr)
        x = x.relu()
        
        # 2. Readout layer
        x = global_mean_pool(x, batch)
        # x = torch.cat([global_max_pool(x, batch), global_mean_pool(x, batch)], dim=1)
        x = x.relu()
        
        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        return self.sig(self.lin(x))


In [7]:
model = GCN()
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))
# 23873

GCN(
  (initial_conv): GATv2Conv(16, 64, heads=1)
  (module_list): ModuleList(
    (0): GATv2Conv(64, 64, heads=1)
  )
  (final_conv): GATv2Conv(64, 64, heads=1)
  (lin): Linear(in_features=64, out_features=1, bias=True)
  (sig): Sigmoid()
)
Number of parameters:  23873


In [11]:
def train():
    model.train()

    for data in train_loader:
        data.to(device)
        out = model(data.x, data.edge_index, data.batch, data.edge_attr)
        targets = data.y.unsqueeze(1).to(torch.float32)
        loss = criterion(out, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    return loss


def test(loader):
    model.eval()

    num_batches = len(loader)
    loss, correct = 0, 0
    
    with torch.no_grad():
        for data in loader:
            data.to(device)
            out = model(data.x, data.edge_index, data.batch, data.edge_attr)  
            targets = data.y.unsqueeze(1).to(torch.float32)
            loss = criterion(out, targets)
            correct += (out.argmax(1) == targets).type(torch.float).sum().item()

    # loss /= len(loader)
    # correct /= len(loader.dataset)
    
    print('loss', loss, 'correct', correct)

    return loss, correct
        

import time
start_time = time.time()

optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
criterion = torch.nn.BCELoss()

# device = torch.device("mps")
device = torch.device("cpu")
model = model.to(device)

data_size = len(dataset)
NUM_GRAPHS_PER_BATCH = 512

print('data', dataset[:int(data_size * 0.8)])

train_loader = DataLoader(dataset[:int(data_size * 0.8)], 
                    batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
test_loader = DataLoader(dataset[int(data_size * 0.8):], 
                         batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)

losses, train_losses, test_losses, train_accs, test_accs = [], [], [], [], []
for epoch in range(1, 1000):
    loss = train()
    losses.append(loss)
    
    train_loss, train_acc = test(train_loader)
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    
    test_loss, test_acc = test(test_loader)
    test_losses.append(test_loss)
    test_accs.append(test_acc)

    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
    
end_time = time.time()
duration = end_time-start_time

data PageGraphDataset(1539)
loss tensor(0.6886) correct 370179.0
loss tensor(0.6947) correct 83545.0
loss tensor(0.6910) correct 369670.0
loss tensor(0.6938) correct 83545.0
loss tensor(0.6948) correct 369670.0
loss tensor(0.6935) correct 83545.0
loss tensor(0.7034) correct 369161.0
loss tensor(0.6938) correct 83545.0
loss tensor(0.6970) correct 369670.0
loss tensor(0.6944) correct 83545.0
loss tensor(0.6859) correct 370179.0
loss tensor(0.6944) correct 83545.0
loss tensor(0.6990) correct 369670.0
loss tensor(0.6940) correct 83545.0
loss tensor(0.7031) correct 369670.0
loss tensor(0.6936) correct 83545.0
loss tensor(0.6892) correct 370179.0
loss tensor(0.6937) correct 83545.0
loss tensor(0.6806) correct 370688.0
loss tensor(0.6940) correct 83545.0
Epoch: 010, Loss: 0.6741, Train Loss: 0.6806, Test Loss: 0.6940, Train Acc: 370688.0000, Test Acc: 83545.0000
loss tensor(0.6871) correct 370179.0
loss tensor(0.6949) correct 83545.0
loss tensor(0.6816) correct 370179.0
loss tensor(0.6958) co

KeyboardInterrupt: 

In [None]:
import seaborn as sns

losses_float = [float(loss.cpu().detach().numpy()) for loss in losses]
loss_ixs = [i for i,l in enumerate(losses_float)]

sns.lineplot(loss_ixs, losses_float, label='loss')

In [None]:
sns.lineplot(loss_ixs, train_accs, label='train')
sns.lineplot(loss_ixs, test_accs, label='test')

In [None]:
#######

In [None]:
from torch_geometric.data import DataLoader
import warnings
warnings.filterwarnings("ignore")

# Root mean squared error
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.007)  

# Use GPU for training
device = torch.device("mps")
model = model.to(device)

In [None]:
# Wrap data in a data loader
data_size = len(dataset)
NUM_GRAPHS_PER_BATCH = 128

loader = DataLoader(dataset[:int(data_size * 0.6)], 
                    batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)
test_loader = DataLoader(dataset[int(data_size * 0.6):], 
                         batch_size=NUM_GRAPHS_PER_BATCH, shuffle=True)

# for batch in loader:
#     x = torch.Tensor(np.vstack(batch.x)).float()
#     print(x.shape)
#     print(batch.batch.shape)
#     print(batch.batch)

In [None]:
def train():
    # Enumerate over the data
    for batch in loader:
        # Use GPU
        batch.to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Passing the node features and the connection info
#         x = torch.Tensor(np.vstack(batch.x)).float()

        pred, embedding = model(batch.x, batch.edge_index, batch.batch)

        # Calculating the loss and gradients
        loss = loss_fn(pred, batch.y.float())
        loss.backward()

        # Update using the gradients
        optimizer.step()

    return loss, embedding

print("Starting training...")
losses = []
for epoch in range(25):
    loss, h = train()
    losses.append(loss)
    if epoch % 1 == 0:
        print(f"Epoch {epoch} | Train Loss {loss}")

In [None]:
# Visualize learning (training loss)
import seaborn as sns
losses_float = [float(loss.cpu().detach().numpy()) for loss in losses] 
loss_indices = [i for i,l in enumerate(losses_float)] 
plt = sns.lineplot(loss_indices, losses_float)
plt

In [None]:
import pandas as pd 

# Analyze the results for one batch
test_batch = next(iter(test_loader))
with torch.no_grad():
    test_batch.to(device)
    pred, embed = model(test_batch.x, test_batch.edge_index, test_batch.batch) 
    df = pd.DataFrame()
    df["y_real"] = test_batch.y.tolist()
    df["y_pred"] = pred.tolist()
df["y_real"] = df["y_real"].apply(lambda row: row)
df["y_pred"] = df["y_pred"].apply(lambda row: row[0])
df

In [None]:
plt = sns.scatterplot(data=df, x="y_real", y="y_pred")
plt.set(xlim=(-7, 2))
plt.set(ylim=(-7, 2))
plt