<a href="https://colab.research.google.com/github/camligorkem/cs-260c-project/blob/main/CS_260_Node_Classification_Exploration_RK_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize(h, color):
    """Project embeddings to 2-D with t-SNE and show them as a scatter plot.

    Args:
        h: tensor of embeddings; it is detached, moved to CPU and converted
            to NumPy before being handed to t-SNE.
        color: per-point colour values forwarded to ``plt.scatter(c=...)``.
    """
    embedding_2d = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    xs, ys = embedding_2d[:, 0], embedding_2d[:, 1]

    plt.figure(figsize=(10, 10))
    # Hide the axis ticks on the projection.
    plt.xticks([])
    plt.yticks([])
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

[K     |████████████████████████████████| 7.9 MB 7.6 MB/s 
[K     |████████████████████████████████| 3.5 MB 6.7 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [2]:
import torch 
import numpy as np
import math


from torch_geometric.utils import degree
import torch_geometric
import torch_geometric.utils as tg_utils

In [114]:
!rm -r data

In [115]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora Planetoid dataset under data/Planetoid; the NormalizeFeatures
# transform is applied to each graph when it is accessed.
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
# `data` is the first (and, per the printed output, only) graph of the dataset.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
# Fraction of all nodes whose label is available for training.
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index



Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.05
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Processing...
Done!


In [5]:
# Constant feature matrices with the same shape as data.x.
zeros = torch.zeros(data.x.shape)
ones = torch.ones(data.x.shape)

# Standard-normal samples scaled by sqrt(0.1), one per feature entry.
noise = (0.1**0.5)*torch.randn(data.x.shape)
print(noise.shape)

torch.Size([2708, 1433])


In [116]:
def get_masked_noise(data,  noise_level=0.15):
  """Return Gaussian noise that is non-zero only on a random subset of rows.

  Args:
    data: object exposing the feature matrix as ``data.x``.
    noise_level: fraction of rows (nodes) that receive noise; the count is
      ``int(noise_level * number_of_rows)``.

  Returns:
    A float tensor shaped like ``data.x``: scaled standard-normal values
    (factor ``sqrt(0.1)``) on the selected rows, zeros everywhere else.
    The noise is NOT added to ``data.x`` here.
  """
  features = data.x
  n_noisy_rows = int(noise_level * features.shape[0])
  # Draw the rows first, then the noise, so the RNG call order stays fixed.
  noisy_rows = np.random.choice(features.shape[0], n_noisy_rows, replace=False)

  row_mask = torch.zeros(features.shape)
  row_mask[noisy_rows, :] = torch.ones(1, features.shape[1])

  gaussian = (0.1**0.5) * torch.randn(features.shape)
  return gaussian * row_mask.int().float()

In [117]:
# data_noisy = data
# Attach alternative feature matrices to the same `data` object so they can be
# selected later by attribute name (see the `x_type` argument of train/test).
data.x_noisy = data.x + noise
data.x_zeros = zeros
data.x_ones = ones

# One variant per noise level: only a `noise_level` fraction of the rows
# receives noise, the remaining rows keep their original features.
for noise_level in [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]:
  masked_noise = get_masked_noise(data,  noise_level=noise_level)
  data[f'x_noisy_n_{noise_level}'] = data.x + masked_noise

## Training a Multi-layer Perceptron Network (MLP)


In [16]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron that classifies nodes from their features alone.

    Layer sizes are read from the module-level ``dataset`` (input features and
    number of classes); only the hidden width is configurable.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Re-seed so every instance starts from the same initial weights.
        torch.manual_seed(12345)
        in_dim, out_dim = dataset.num_features, dataset.num_classes
        self.lin1 = Linear(in_dim, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_dim)

    def forward(self, x):
        """Return unnormalised class scores for every row of ``x``."""
        hidden = F.relu(self.lin1(x))
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.lin2(hidden)

model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin1): Linear(in_features=1433, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=7, bias=True)
)


In [17]:
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.

def train(model, optimizer, x_type='x'):
    """Run one optimisation step on the training nodes and return the loss.

    Args:
        model: module called as ``model(features)``.
        optimizer: optimizer bound to ``model``'s parameters.
        x_type: name of the feature matrix to read from the global ``data``.
    """
    model.train()
    optimizer.zero_grad()  # Drop gradients left over from the previous step.
    logits = model(data[x_type])
    train_mask = data.train_mask
    # Only the labelled training nodes contribute to the loss.
    loss = criterion(logits[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss

def test(model, x_type='x'):
    """Return the accuracy of ``model`` on the test nodes of the global ``data``.

    Args:
        model: module called as ``model(features)``.
        x_type: name of the feature matrix to read from ``data``.
    """
    model.eval()
    predicted = model(data[x_type]).argmax(dim=1)  # Highest-scoring class per node.
    test_mask = data.test_mask
    n_correct = int((predicted[test_mask] == data.y[test_mask]).sum())
    return n_correct / int(test_mask.sum())

<IPython.core.display.Javascript object>

In [18]:
# Baseline: train the MLP for 200 epochs on the original features and report
# accuracy on the test nodes.
x_type='x'
model = MLP(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  
for epoch in range(1, 201):
    loss = train(model, optimizer, x_type=x_type)
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
test_acc = test(model, x_type=x_type)
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.5900


## MLP with different feature-noise levels

In [19]:
# Train a fresh MLP on each feature variant and evaluate it on that same variant.
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
x_noises =[f'x_noisy_n_{x}' for x in noise_levels ]

# NOTE(review): 'x' is listed twice (first and last) — presumably a sanity
# check that the run is reproducible; confirm.
x_types=['x', 'x_noisy', 'x_ones'] +x_noises+['x']
for x_type in x_types:
  model = MLP(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.
  for epoch in range(1, 201):
      loss = train(model, optimizer, x_type=x_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')


x Test Accuracy: 0.5900
x_noisy Test Accuracy: 0.1540
x_ones Test Accuracy: 0.0640
x_noisy_n_0.15 Test Accuracy: 0.5160
x_noisy_n_0.3 Test Accuracy: 0.4210
x_noisy_n_0.45 Test Accuracy: 0.3520
x_noisy_n_0.6 Test Accuracy: 0.3070
x_noisy_n_0.9 Test Accuracy: 0.2030
x_noisy_n_0.95 Test Accuracy: 0.1570
x_noisy_n_0.99 Test Accuracy: 0.1540
x Test Accuracy: 0.5900


# GCN

In [91]:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Layer sizes are read from the module-level ``dataset``; only the hidden
    width is configurable.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Re-seed so every instance starts from the same initial weights.
        torch.manual_seed(1234567)
        in_dim, out_dim = dataset.num_features, dataset.num_classes
        self.conv1 = GCNConv(in_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_dim)

    def forward(self, x, edge_index):
        """Return unnormalised class scores given features and an edge index."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(1433, 16)
  (conv2): GCNConv(16, 7)
)


In [92]:
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

criterion = torch.nn.CrossEntropyLoss()

def train(model, optimizer, x_type='x', edge_type='edge_index'):
    """Run one optimisation step on the training nodes and return the loss.

    Args:
        model: module called as ``model(features, edge_index)``.
        optimizer: optimizer bound to ``model``'s parameters.
        x_type: name of the feature matrix to read from the global ``data``.
        edge_type: name of the edge index to read from the global ``data``.
    """
    model.train()
    optimizer.zero_grad()  # Drop gradients left over from the previous step.
    features, edges = data[x_type], data[edge_type]
    logits = model(features, edges)
    labelled = data.train_mask
    # Only the labelled training nodes contribute to the loss.
    loss = criterion(logits[labelled], data.y[labelled])
    loss.backward()
    optimizer.step()
    return loss

def test(model, x_type='x', edge_type='edge_index'):
    """Return the accuracy of ``model`` on the test nodes of the global ``data``.

    Args:
        model: module called as ``model(features, edge_index)``.
        x_type: name of the feature matrix to read from ``data``.
        edge_type: name of the edge index to read from ``data``.
    """
    model.eval()
    scores = model(data[x_type], data[edge_type])
    predicted = scores.argmax(dim=1)  # Highest-scoring class per node.
    held_out = data.test_mask
    hits = predicted[held_out] == data.y[held_out]
    return int(hits.sum()) / int(held_out.sum())


<IPython.core.display.Javascript object>

## GCN with different feature-noise levels

In [25]:
# Train a fresh GCN (100 epochs) on each feature variant with the original
# edges and evaluate it on that same variant.
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
x_noises =[f'x_noisy_n_{x}' for x in noise_levels ]

x_types=['x', 'x_noisy', 'x_ones'] +x_noises
for x_type in x_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type=x_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')

x Test Accuracy: 0.8150
x_noisy Test Accuracy: 0.5260
x_ones Test Accuracy: 0.3190
x_noisy_n_0.15 Test Accuracy: 0.5620
x_noisy_n_0.3 Test Accuracy: 0.5850
x_noisy_n_0.45 Test Accuracy: 0.5300
x_noisy_n_0.6 Test Accuracy: 0.5320
x_noisy_n_0.9 Test Accuracy: 0.5820
x_noisy_n_0.95 Test Accuracy: 0.5330
x_noisy_n_0.99 Test Accuracy: 0.5370


In [89]:
# remove x% edges
def remove_from_all_edges(data, noise_level = 0.15, bidirectional=False):
  """Randomly drop a fraction of the directed edges of ``data.edge_index``.

  Args:
    data: object exposing a ``(2, num_edges)`` integer tensor as
      ``data.edge_index`` (row 0 = source, row 1 = target).
    noise_level: fraction of edges to remove.
    bidirectional: when True, an edge is kept only if its reverse direction
      also survived the random drop. ``noise_level`` is halved first because
      every randomly dropped direction takes its partner with it.

  Returns:
    An int64 tensor of shape ``(2, num_kept_edges)``. The columns are in
    random order (they are not sorted).
  """
  edge_index = data.edge_index
  num_edges = edge_index.shape[1]

  if bidirectional:
    noise_level /= 2

  num_edges_keep = int((1 - noise_level) * num_edges)
  keep_idx = np.random.choice(num_edges, num_edges_keep, replace=False)

  edge_index_removed = torch.zeros((2, num_edges_keep), dtype=torch.int64)
  edge_index_removed[0] = edge_index[0][keep_idx]
  edge_index_removed[1] = edge_index[1][keep_idx]

  if bidirectional:
    # Convert once to Python ints instead of calling .item() per element.
    pairs = list(zip(edge_index_removed[0].tolist(), edge_index_removed[1].tolist()))
    present = set(pairs)
    # Keep an edge only when its opposite direction is still present.
    symmetric = [(src, dst) for (src, dst) in pairs if (dst, src) in present]
    # reshape(-1, 2) keeps a well-formed (2, 0) int64 result when nothing survives.
    edge_index_removed = (
        torch.tensor(symmetric, dtype=torch.int64).reshape(-1, 2).t().contiguous()
    )

  return edge_index_removed



In [None]:
# Drop 15% of the directed edges independently (reverse directions may remain).
# NOTE: a later cell stores its result under the same `edge_index_85`
# attribute, overwriting this one.
edge_index_removed = remove_from_all_edges(data, noise_level = 0.15, bidirectional=False)
data.edge_index_85 = edge_index_removed

In [90]:
# Drop edges so that both directions of a removed edge disappear together,
# then store the result on `data` for the GCN runs that use 'edge_index_85'.
edge_index_removed = remove_from_all_edges(data, noise_level = 0.15, bidirectional=True)
data.edge_index_85 = edge_index_removed

In [93]:
# Same feature-noise sweep as before, but the GCN is trained and evaluated on
# the reduced edge set stored under `data.edge_index_85`.
ed_type = 'edge_index_85'
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
x_noises =[f'x_noisy_n_{x}' for x in noise_levels ]

x_types=['x', 'x_noisy', 'x_ones'] +x_noises
for x_type in x_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer,x_type=x_type, edge_type=ed_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type,edge_type=ed_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')

x Test Accuracy: 0.7970
x_noisy Test Accuracy: 0.4950
x_ones Test Accuracy: 0.3190
x_noisy_n_0.15 Test Accuracy: 0.5280
x_noisy_n_0.3 Test Accuracy: 0.5450
x_noisy_n_0.45 Test Accuracy: 0.4610
x_noisy_n_0.6 Test Accuracy: 0.4590
x_noisy_n_0.9 Test Accuracy: 0.5200
x_noisy_n_0.95 Test Accuracy: 0.4790
x_noisy_n_0.99 Test Accuracy: 0.4970


In [31]:
# remove x% edges from random k, top_k, bottom_k nodes

def choose_nodes(data, num_nodes, k_nodes, choose_type):
  """Pick ``k_nodes`` node indices at random or by degree.

  Args:
    data: graph object exposing ``data.edge_index``; only read for the
      degree-based strategies.
    num_nodes: total number of nodes in the graph.
    k_nodes: how many nodes to return.
    choose_type: ``'random'`` (uniform, without replacement), ``'top_k'``
      (highest degree) or ``'bottom_k'`` (lowest degree).

  Returns:
    A 1-D integer tensor with the chosen node indices.

  Raises:
    ValueError: if ``choose_type`` is not one of the three strategies.
  """
  if choose_type == 'random':
    return torch.from_numpy(np.random.choice(num_nodes, k_nodes, replace=False))

  if choose_type in ('top_k', 'bottom_k'):
    # Degree is counted over the source row of edge_index (i.e. out-degree).
    # Passing num_nodes keeps trailing isolated nodes in the degree vector.
    dg = torch_geometric.utils.degree(data.edge_index[0], num_nodes=num_nodes)
    _, node_indices = torch.topk(dg, k_nodes, largest=(choose_type == 'top_k'))
    return node_indices

  # Raising a plain string is itself a TypeError in Python 3.
  raise ValueError('choose_type should be from random, top_k, bottom_k')

# to do loop for each node separately
def remove_edges_from_chosen_nodes(data, nodes_chosen, edges_to_remove_per_node_ratio):
  """Thin the outgoing edges of every chosen node and concatenate what is left.

  Args:
    data: graph object forwarded to ``remove_edge_per_node``.
    nodes_chosen: iterable of node indices to process, in order.
    edges_to_remove_per_node_ratio: fraction of each node's edges to drop.

  Returns:
    ``(sources, targets)``: two 1-D tensors with the surviving edges of all
    chosen nodes, in the order the nodes were given.
  """
  kept_per_node = [
      remove_edge_per_node(data=data, node=chosen,
                           edges_to_remove_per_node_ratio=edges_to_remove_per_node_ratio)
      for chosen in nodes_chosen
  ]
  sources = torch.cat([kept[0] for kept in kept_per_node], 0)
  targets = torch.cat([kept[1] for kept in kept_per_node], 0)
  return sources, targets

def remove_edge_per_node(data, node, edges_to_remove_per_node_ratio=0.1):
  """Randomly drop a fraction of the edges whose source is ``node``.

  Args:
    data: graph object exposing ``data.edge_index`` (row 0 = source,
      row 1 = target).
    node: the node whose outgoing edges are thinned; passed to ``torch.isin``.
    edges_to_remove_per_node_ratio: fraction of this node's edges to remove.

  Returns:
    ``(sources, targets)``: the surviving edges of ``node`` as two 1-D
    tensors, in random order.
  """
  from_node = torch.isin(data.edge_index[0], node)
  select_node_edges_0 = data.edge_index[0][from_node]
  select_node_edges_1 = data.edge_index[1][from_node]
  num_node_edges = select_node_edges_0.shape[0]

  # The number of removed edges scales with the node's own edge count.
  # ceil() guarantees at least one edge is removed (unless the ratio is 0).
  num_edges_remove = int(math.ceil(edges_to_remove_per_node_ratio * num_node_edges))
  num_edges_keep = num_node_edges - num_edges_remove

  # Choose which edges survive; everything else is dropped.
  keep_idx = np.random.choice(num_node_edges, num_edges_keep, replace=False)
  return select_node_edges_0[keep_idx], select_node_edges_1[keep_idx]

def remove_edges_from_nodes(data, noise_level = 0.15, k_nodes=10,
                            choose_type='random', bidirectional=False):
  """Remove a fraction of the outgoing edges of ``k_nodes`` selected nodes.

  Args:
    data: graph object exposing ``data.edge_index`` and ``data.num_nodes``.
    noise_level: fraction of each selected node's edges to remove.
    k_nodes: number of nodes whose edges are thinned.
    choose_type: node selection strategy, forwarded to ``choose_nodes``
      (``'random'``, ``'top_k'`` or ``'bottom_k'``).
    bidirectional: when True, an edge is kept only if its reverse direction
      also survived. ``noise_level`` is halved first because every dropped
      direction takes its partner with it.

  Returns:
    The remaining edges as a ``(2, num_edges)`` int64 tensor, sorted with
    ``torch_geometric.utils.sort_edge_index``.
  """
  if bidirectional:
    noise_level /= 2

  # Choose the nodes to thin: top-k, bottom-k or random.
  nodes_chosen = choose_nodes(data=data, num_nodes=data.num_nodes, k_nodes=k_nodes, choose_type=choose_type)

  # Edges whose source is NOT a chosen node are kept untouched.
  from_chosen = torch.isin(data.edge_index[0], nodes_chosen)
  edges_to_keep_0 = data.edge_index[0][~from_chosen]
  edges_to_keep_1 = data.edge_index[1][~from_chosen]

  # Per chosen node, randomly keep only part of its outgoing edges.
  kept_0, kept_1 = remove_edges_from_chosen_nodes(data=data,
                                                  nodes_chosen=nodes_chosen,
                                                  edges_to_remove_per_node_ratio=noise_level)
  final_edges_0 = torch.cat([edges_to_keep_0, kept_0], 0)
  final_edges_1 = torch.cat([edges_to_keep_1, kept_1], 0)

  if bidirectional:
    # Convert once to Python ints instead of calling .item() per element.
    pairs = list(zip(final_edges_0.tolist(), final_edges_1.tolist()))
    present = set(pairs)
    # Keep an edge only when its opposite direction is still present.
    symmetric = [(src, dst) for (src, dst) in pairs if (dst, src) in present]
    final_edges_0 = torch.tensor([src for src, _ in symmetric], dtype=torch.int64)
    final_edges_1 = torch.tensor([dst for _, dst in symmetric], dtype=torch.int64)

  edge_index_removed = torch.zeros((2, final_edges_0.shape[0]), dtype=torch.int64)
  edge_index_removed[0] = final_edges_0
  edge_index_removed[1] = final_edges_1

  return tg_utils.sort_edge_index(edge_index_removed)


In [67]:
# Example: thin the edges of the 10 highest-degree nodes, removing both
# directions of every dropped edge, then inspect the resulting shape.
edges = remove_edges_from_nodes(data=data, noise_level = 0.2, k_nodes=10,
                            choose_type='top_k', bidirectional=True)

edges.shape

torch.Size([2, 10424])

In [None]:
# Example: thin the edges of 1000 random nodes, removing both directions.
# Uses the current keyword names of remove_edges_from_nodes; the former
# `edges_to_remove_per_node_ratio` / `remove_type` / `remove_bidirectional`
# keywords are no longer accepted. Note that `noise_level` is halved inside
# the function when `bidirectional=True`, so a previously recorded output
# may not match a fresh run.
edges = remove_edges_from_nodes(data=data, noise_level = 0.5, k_nodes=1000,
                            choose_type='random', bidirectional=True)

edges.shape

torch.Size([2, 6538])

In [None]:
# Example: thin the outgoing edges of the 1000 highest-degree nodes without
# touching the reverse directions. Uses the current keyword names of
# remove_edges_from_nodes (the older keyword names raise TypeError).
edges = remove_edges_from_nodes(data=data, noise_level = 0.5, k_nodes=1000,
                            choose_type='top_k', bidirectional=False)
edges.shape

torch.Size([2, 6889])

In [None]:
# add different types of edge removals in the data

remove_type = 'bottom_k' #'top_k', 'bottom_k', 'random'
remove_bidirectional = True #False, True
k_nodes = 1000
edges_to_remove_ratios =  [0, 0.1, 0.15, 0.3, 0.45, 0.6, 0.9, 0.95]

# Build one reduced edge index per removal ratio and store it on `data`.
# remove_edges_from_nodes is called with its current keyword names
# (noise_level / choose_type / bidirectional); the older names raise TypeError.
for remove_ratio in edges_to_remove_ratios:
  edges = remove_edges_from_nodes(data=data, noise_level = remove_ratio, k_nodes=k_nodes,
                              choose_type=remove_type, bidirectional=remove_bidirectional)
  print(edges.shape)
  data[f'edge_removed_{remove_type}_bidirec_{remove_bidirectional}_nodes_{k_nodes}_ratio_{remove_ratio}'] = edges


edge_noises = [f'edge_removed_{remove_type}_bidirec_{remove_bidirectional}_nodes_{k_nodes}_ratio_{x}' for x in edges_to_remove_ratios ]
edge_types = ['edge_index'] + edge_noises

# Train a fresh GCN on the original features for every edge variant and
# evaluate it on that same variant.
for ed_type in edge_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type='x', edge_type=ed_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type='x',edge_type=ed_type)
  print(f'{ed_type} Test Accuracy: {test_acc:.4f}')

torch.Size([2, 10556])
torch.Size([2, 8782])
torch.Size([2, 8758])
torch.Size([2, 8766])
torch.Size([2, 8768])
torch.Size([2, 7922])
torch.Size([2, 7922])
torch.Size([2, 7922])
edge_index Test Accuracy: 0.8150
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0 Test Accuracy: 0.8150
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.1 Test Accuracy: 0.7850
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.15 Test Accuracy: 0.7770
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.3 Test Accuracy: 0.7890
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.45 Test Accuracy: 0.7760
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.6 Test Accuracy: 0.7660
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.9 Test Accuracy: 0.7660
edge_removed_bottom_k_bidirec_True_nodes_1000_ratio_0.95 Test Accuracy: 0.7660


In [None]:
# TODOs:
# TODO remove bidirectional - DONE

# TODO add edges:
  # TODO add random edges
  # TODO add edges to top k node 
  # TODO add edges to bottom k node 
  # TODO add edges to random k node 

# decide on experiments
# experiments/research questions
# decide on datasets
# decide models to use (mlp, gnn, graphsage, ??)
# decide on what to log, save, how to design experiments 
# reliability of experiments: 10 experiments and avg results (log each experiment)
# Run each experiment
# plots for experiments

In [95]:
def add_random_edges(data, noise_level = 0.15, bidirectional=False):
  """Add random edges between uniformly drawn node pairs.

  Args:
    data: graph object exposing ``data.edge_index`` (shape ``(2, E)``) and
      ``data.num_nodes``.
    noise_level: number of insertion rounds as a fraction of the existing
      directed edge count.
    bidirectional: when True, a brand-new pair is inserted in both
      directions. ``noise_level`` is halved first to compensate.

  Returns:
    The augmented edge index, sorted with
    ``torch_geometric.utils.sort_edge_index``.

  Raises:
    ValueError: if edges must be added but the graph has fewer than two
      nodes, so no non-self-loop edge exists.

  Note:
    Candidates are drawn by rejection sampling. On a graph where every
    ordered pair is already connected the loop cannot terminate.
  """
  edge_index = data.edge_index
  num_nodes = data.num_nodes

  if bidirectional:
    noise_level /= 2

  new_edges = edge_index.T
  num_of_new_edges = int(new_edges.shape[0] * noise_level)

  if num_of_new_edges > 0 and num_nodes < 2:
    raise ValueError('add_random_edges needs at least two nodes to add an edge')

  for _ in range(num_of_new_edges):
    while True:
      # Unbiased uniform draw of (source, target).
      new_edge = torch.randint(0, num_nodes, (1, 2), dtype=torch.int64)
      if new_edge[0, 0] == new_edge[0, 1]:
        # Reject self-loops: they are not random *edges* between nodes, and
        # in the bidirectional case the same row would be appended twice.
        continue
      new_edge_flip = torch.flip(new_edge, [1])
      new_edge_exist = torch.any(torch.all(torch.eq(new_edges, new_edge), 1))
      new_edge_flip_exist = torch.any(torch.all(torch.eq(new_edges, new_edge_flip), 1))

      if not new_edge_exist:
        new_edges = torch.cat((new_edges, new_edge), 0)
        # Only a pair that is new in both directions is mirrored.
        if bidirectional and not new_edge_flip_exist:
          new_edges = torch.cat((new_edges, new_edge_flip), 0)
        break
      if not new_edge_flip_exist:
        # The drawn direction exists already; add the missing reverse one.
        new_edges = torch.cat((new_edges, new_edge_flip), 0)
        break
      # Both directions exist already: draw again.

  edge_index_sorted = tg_utils.sort_edge_index(new_edges.T)
  return edge_index_sorted

In [96]:
# Store one augmented edge index per ratio on `data`, keyed by the ratio.
for added_ratio in [0.15]:
  new_edges = add_random_edges(data,  noise_level=added_ratio, bidirectional=True)
  data[f'edge_index_added_n_{added_ratio}'] = new_edges


In [98]:
# Compare a GCN trained on the original edges with one trained on the
# augmented edge sets created above (original features in both cases).
added_levels = [0.15]
edge_noises = [f'edge_index_added_n_{x}' for x in added_levels ]
edge_types = ['edge_index'] + edge_noises

for ed_type in edge_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type='x', edge_type=ed_type)
      # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type='x',edge_type=ed_type)
  print(f'{ed_type} Test Accuracy: {test_acc:.4f}')

edge_index Test Accuracy: 0.8150
edge_index_added_n_0.15 Test Accuracy: 0.7840


In [80]:
## Helper function of add_edges_to_nodes

def add_edges_to_a_node(data, new_edges, node, num_to_add=0, bidirectional=False):
  """Attach ``num_to_add`` random new edges to ``node``.

  Args:
    data: graph object; only ``data.num_nodes`` is read.
    new_edges: current edge list as an ``(E, 2)`` tensor (one edge per row).
    node: index of the node that receives the new edges.
    num_to_add: number of insertion rounds.
    bidirectional: when True, a brand-new pair is inserted in both directions.

  Returns:
    The edge list with the new rows appended (the input is not modified).

  Raises:
    ValueError: if edges must be added but the graph has fewer than two
      nodes, so no non-self-loop edge exists.

  Note:
    Partners are drawn by rejection sampling. If ``node`` is already
    connected to every other node in both directions the loop cannot
    terminate.
  """
  if num_to_add > 0 and data.num_nodes < 2:
    raise ValueError('add_edges_to_a_node needs at least two nodes to add an edge')

  for _ in range(num_to_add):
    # Keep drawing until a partner is found for which an edge can be added.
    while True:
      # Unbiased uniform draw of the partner node.
      new_index = torch.randint(0, data.num_nodes, (1,)).item()
      if new_index == node:
        # Reject self-loops: in the bidirectional case the same row would
        # otherwise be appended twice.
        continue
      new_edge = torch.tensor([[node, new_index]])
      new_edge_flip = torch.flip(new_edge, [1])
      # Check whether the drawn edge and its reverse already exist.
      new_edge_exist = torch.any(torch.all(torch.eq(new_edges, new_edge), 1))
      new_edge_flip_exist = torch.any(torch.all(torch.eq(new_edges, new_edge_flip), 1))

      if not new_edge_exist:
        new_edges = torch.cat((new_edges, new_edge), 0)
        # Only a pair that is new in both directions is mirrored.
        if bidirectional and not new_edge_flip_exist:
          new_edges = torch.cat((new_edges, new_edge_flip), 0)
        break
      if not new_edge_flip_exist:
        # The drawn direction exists already; add the missing reverse one.
        new_edges = torch.cat((new_edges, new_edge_flip), 0)
        break
      # Both directions exist already: draw again.
  return new_edges

## Adding x% edges to random k, top k, or bottom k nodes

def add_edges_to_nodes(data, noise_level=0.15, k_nodes=10, chosse_type='random', bidirectional=False,
                       choose_type=None):
  """Add random edges to ``k_nodes`` selected nodes.

  Args:
    data: graph object exposing ``data.edge_index`` and ``data.num_nodes``.
    noise_level: number of edges added per selected node, as a fraction of
      how often that node occurs in ``data.edge_index`` (both rows counted).
    k_nodes: number of nodes that receive new edges.
    chosse_type: node selection strategy (``'random'``, ``'top_k'`` or
      ``'bottom_k'``). The misspelled name is kept for existing callers.
    bidirectional: when True, new pairs are inserted in both directions and
      ``noise_level`` is halved first to compensate.
    choose_type: correctly spelled alias of ``chosse_type``, matching the
      keyword used by ``remove_edges_from_nodes``; takes precedence when given.

  Returns:
    The augmented edge index, sorted with
    ``torch_geometric.utils.sort_edge_index``.
  """
  if choose_type is not None:
    chosse_type = choose_type

  if bidirectional:
    noise_level /= 2

  # Choose the nodes: top-k, bottom-k or random.
  nodes_chosen = choose_nodes(data=data, num_nodes=data.num_nodes, k_nodes=k_nodes, choose_type=chosse_type)

  new_edges = data.edge_index.T

  # Add new edges to every chosen node. The count is derived from the
  # ORIGINAL edge index, so edges added for earlier nodes do not inflate it.
  for node in nodes_chosen:
    edge_num_of_node = torch.isin(data.edge_index, node).to(int).sum()
    edge_num_to_add = int(edge_num_of_node * noise_level)

    new_edges = add_edges_to_a_node(data, new_edges, node.item(), edge_num_to_add, bidirectional)

  edge_index_sorted = tg_utils.sort_edge_index(new_edges.T)
  return edge_index_sorted

In [99]:
# add different types of edge addition in the data
add_type = 'top_k' #'top_k', 'bottom_k', 'random'
add_bidirectional = False #False, True
k_nodes = 1000
edges_to_add_ratios =  [0, 0.1, 0.15, 0.3, 0.45, 0.6]

# Build one augmented edge index per ratio and store it on `data` under a key
# that encodes the selection strategy, direction mode, node count and ratio.
for add_ratio in edges_to_add_ratios:
  edges = add_edges_to_nodes(data=data, noise_level = add_ratio, k_nodes=k_nodes,
                              chosse_type=add_type, bidirectional=add_bidirectional)
  print(edges.shape)
  data[f'edge_added_{add_type}_bidirec_{add_bidirectional}_nodes_{k_nodes}_ratio_{add_ratio}'] = edges


edge_noises = [f'edge_added_{add_type}_bidirec_{add_bidirectional}_nodes_{k_nodes}_ratio_{x}' for x in edges_to_add_ratios ]
edge_types = ['edge_index'] + edge_noises

# Train a fresh GCN on the original features for every edge variant and
# evaluate it on that same variant.
for ed_type in edge_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type='x', edge_type=ed_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type='x',edge_type=ed_type)
  print(f'{ed_type} Test Accuracy: {test_acc:.4f}')

torch.Size([2, 10556])
torch.Size([2, 11543])
torch.Size([2, 12225])
torch.Size([2, 14384])
torch.Size([2, 16295])
torch.Size([2, 18441])
edge_index Test Accuracy: 0.8150
edge_added_top_k_bidirec_False_nodes_1000_ratio_0 Test Accuracy: 0.8150
edge_added_top_k_bidirec_False_nodes_1000_ratio_0.1 Test Accuracy: 0.7990
edge_added_top_k_bidirec_False_nodes_1000_ratio_0.15 Test Accuracy: 0.7810
edge_added_top_k_bidirec_False_nodes_1000_ratio_0.3 Test Accuracy: 0.7290
edge_added_top_k_bidirec_False_nodes_1000_ratio_0.45 Test Accuracy: 0.6460
edge_added_top_k_bidirec_False_nodes_1000_ratio_0.6 Test Accuracy: 0.6470


## Experiments

In [108]:
!pip install class-resolver

from torch_geometric.nn import MLP, GCN, GraphSAGE, GAT
from class_resolver import ClassResolver

Collecting class-resolver
  Downloading class_resolver-0.3.4-py3-none-any.whl (20 kB)
Installing collected packages: class-resolver
Successfully installed class-resolver-0.3.4


In [109]:
def get_model(model_name, model_params):
  """Build one of the predefined torch_geometric models by name.

  More models can be added from
  https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#models
  (the accepted constructor parameters are documented there as well).

  Args:
    model_name: ``'MLP'``, ``'GNN'`` (built as ``GCN``), ``'GAT'`` or
      ``'Graphsage'``.
    model_params: keyword arguments forwarded to the model constructor.

  Returns:
    The freshly constructed model.

  Raises:
    ValueError: if ``model_name`` is not one of the supported names.
  """
  if model_name == 'MLP':
    model = MLP(**model_params)
  elif model_name == 'GNN':
    model = GCN(**model_params)
  elif model_name == 'GAT':
    # Previously this name built a GraphSAGE model (and vice versa).
    model = GAT(**model_params)
  elif model_name == 'Graphsage':
    model = GraphSAGE(**model_params)
  else:
    # Raising a plain string is itself a TypeError in Python 3.
    raise ValueError('Model names should be within MLP, GNN, GAT, Graphsage')
  return model

In [110]:
# Creating MLP example, add all parameters you want to use to create/tune model in the below dictionary
# Creating MLP example, add all parameters you want to use to create/tune model in the below dictionary
model_params= {'in_channels':1433, 'hidden_channels':16, 'out_channels':7, 'num_layers':3}
mlp_model = get_model(model_name='MLP', model_params=model_params)
print(mlp_model)

# Same idea for the 'GNN' entry of get_model, with an additional dropout setting.
model_params= {'in_channels':1433, 'hidden_channels':16, 'out_channels':7, 'num_layers':3, 'dropout':0.1}
gnn_model = get_model(model_name='GNN', model_params=model_params)
print(gnn_model)


MLP(1433, 16, 16, 7)
GCN(1433, 7, num_layers=3)


In [120]:
def get_noise_function(noise_info):
  '''
  Return the noise-generating function that matches ``noise_info``.
  Change the names below if a noise logic function is renamed.

  Args:
    noise_info: dict with key 'noise_type' ('feature_noise', 'edge_removal'
      or 'edge_addition'). For the two edge types, key 'strategy' must be
      'all_edges' or 'nodes'.

  Returns:
    The function object itself (it is not called here).

  Raises:
    ValueError: for an unknown noise type, or an unknown/missing strategy
      for an edge noise type.
  '''
  noise_type = noise_info['noise_type']
  if noise_type == 'feature_noise':
    return get_masked_noise
  if noise_type not in ('edge_removal', 'edge_addition'):
    # Raising a plain string is itself a TypeError in Python 3.
    raise ValueError('Noise type should be chosen from feature_noise, edge_removal, edge_addition')

  strategy = noise_info.get('strategy')
  if strategy not in ('all_edges', 'nodes'):
    # Fail here instead of returning None and crashing later at the call site.
    raise ValueError(f'Strategy should be chosen from all_edges, nodes (got {strategy!r})')

  if noise_type == 'edge_removal':
    return remove_from_all_edges if strategy == 'all_edges' else remove_edges_from_nodes
  return add_random_edges if strategy == 'all_edges' else add_edges_to_nodes
  

def create_noised_data_for_experiment(data, noise_info):
  '''
  Create noised data outside of the experiment, so that the same noised data
  can be reused across different experiments.

  Args:
    data: graph object; every noised variant is stored on it under a
      generated key (``data[key] = ...``).
    noise_info: dict with keys
      'noise_type'   - see get_noise_function,
      'strategy'     - optional, 'all_edges' or 'nodes',
      'noise_levels' - iterable of noise levels, one variant per level,
      'params'       - extra keyword arguments for the noise function
                       ('data' defaults to the ``data`` argument).

  Returns:
    List of the keys under which the noised variants were stored, in the
    order of ``noise_info['noise_levels']``.
  '''
  noise_fn = get_noise_function(noise_info)
  base_params = noise_info.get('params') or {}
  strategy = noise_info.get('strategy')

  if strategy == 'nodes':
    # add_edges_to_nodes historically spells this keyword 'chosse_type'.
    choose_type = base_params.get('choose_type', base_params.get('chosse_type', ''))
    # Trailing '_' separates the prefix from the noise level in the key.
    add_prefix = (f"{strategy}_choose_type-{choose_type}"
                  f"_knodes-{base_params.get('k_nodes', '')}"
                  f"_bidirec-{base_params.get('bidirectional', '')}_")
  elif strategy == 'all_edges':
    add_prefix = f"{strategy}_"
  else:
    add_prefix = ''

  noise_data_names = []
  for noise_level in noise_info['noise_levels']:
    noise_data_name = f'{noise_info["noise_type"]}_{add_prefix}{noise_level}'
    # Build the call arguments in a fresh dict so the caller's
    # noise_info['params'] is not modified.
    call_params = {'data': data, **base_params, 'noise_level': noise_level}
    noised_data = noise_fn(**call_params)
    if noise_info['noise_type'] == 'feature_noise':
      # get_masked_noise returns only the noise term; add it to the features
      # so the stored tensor is usable as a model input.
      noised_data = call_params['data'].x + noised_data
    data[noise_data_name] = noised_data
    noise_data_names.append(noise_data_name)
  return noise_data_names
 
# reuse train and test
def train(model, optimizer, x_type='x', edge_type='edge_index'):
  """Run one optimisation step on the training nodes and return the loss.

  Args:
    model: module called as ``model(features, edge_index)``.
    optimizer: optimizer bound to ``model``'s parameters.
    x_type: name of the feature matrix to read from the global ``data``.
    edge_type: name of the edge index to read from the global ``data``.
  """
  loss_fn = torch.nn.CrossEntropyLoss()
  model.train()
  optimizer.zero_grad()  # Drop gradients left over from the previous step.
  logits = model(data[x_type], data[edge_type])
  labelled = data.train_mask
  # Only the labelled training nodes contribute to the loss.
  loss = loss_fn(logits[labelled], data.y[labelled])
  loss.backward()
  optimizer.step()
  return loss

def test(model, x_type='x', edge_type='edge_index'):
  """Return the accuracy of ``model`` on the test nodes of the global ``data``.

  Args:
    model: module called as ``model(features, edge_index)``.
    x_type: name of the feature matrix to read from ``data``.
    edge_type: name of the edge index to read from ``data``.
  """
  model.eval()
  predicted = model(data[x_type], data[edge_type]).argmax(dim=1)  # Highest-scoring class per node.
  held_out = data.test_mask
  n_correct = int((predicted[held_out] == data.y[held_out]).sum())
  return n_correct / int(held_out.sum())

def experiment(dataset_name, model_names, model_params, data, x_types, edge_types, noise_info, num_epochs=50, repeat_num=1, print_updates=False):
  '''
  Assumes the noisy data is already created and inside the data object (so that we can use same data sample for different models to compare)
  '''
  exp_count = len(x_types)*len(edge_types)*len(model_names)
  count=0
  res = []
  for model_name in model_names:
    for x_type in x_types:
      for ed_type in edge_types:
        count+=1
        if print_updates:
            print(f'Run {model_name}: {count}/{exp_count}: {x_type} - {ed_type}')
            
        for exp_num in range(1, repeat_num+1): # we will repeat experiment repeat many times, to increase results reliability
           
          model =  get_model(model_name=model_name, model_params=model_params[model_name])
          optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) # parametrize similar to model later TODO
          for epoch in range(num_epochs):
              loss = train(model, optimizer, x_type=x_type, edge_type=ed_type)
              # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

          # we test our results in original data, no noise added ones 
          # if you want to test on noised data change them to x_type=x_type and edge_type=ed_type below
          test_acc = test(model, x_type='x',edge_type='edge_index') 
          if print_updates:
            print(f'Exp_num:{exp_num} Test Accuracy: {test_acc:.4f}')

          # TODO we can add other metrics such as runtime to log in here later
          exp_res = {'dataset_name': dataset_name,
                    'model_name':model_name, 'x_type':x_type, 
                    'edge_type':ed_type, 'test_accuracy':test_acc, 
                    'num_epochs':num_epochs, 'model_params':model_params[model_name],
                    'noise_info':noise_info, 'experiment_run_number':exp_num} # TODO add more info on experiment noise_info
          res.append(exp_res)
        

  res_df = pd.DataFrame(res, columns=exp_res.keys())
  return res_df

In [87]:
import pandas as pd

## Sample Experiments for each type of noise

In [88]:
# Dataset label passed to experiment() by the sample cells below.
dataset_name = 'Cora'

In [121]:
# Noise data creation example: edge addition over all edges.
edge_addition_params = dict(data=data)
noise_info = dict(
    noise_type='edge_addition',
    params=edge_addition_params,
    noise_levels=[0.3, 0.6, 0.9],
    strategy='all_edges',
)

# One noised edge entry per noise level is stored inside `data`.
noise_var_names = create_noised_data_for_experiment(data=data, noise_info=noise_info)
print(data)

# Experiment on edge addition noise: every model is trained on each noised edge entry.
model_names = ['GNN', 'GAT', 'Graphsage']
model_params = dict(  # TODO random params change later
    GNN=dict(in_channels=1433, hidden_channels=16, out_channels=7, num_layers=3, dropout=0.1),
    GAT=dict(in_channels=1433, hidden_channels=12, out_channels=7, num_layers=3, dropout=0.2),
    Graphsage=dict(in_channels=1433, hidden_channels=14, out_channels=7, num_layers=3, dropout=0.1),
)
exp_df = experiment(
    dataset_name=dataset_name,
    model_names=model_names,
    model_params=model_params,
    data=data,
    x_types=['x'],
    edge_types=noise_var_names,
    noise_info=noise_info,
    num_epochs=25,
    repeat_num=3,
    print_updates=True,
)

exp_df

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], x_noisy=[2708, 1433], x_zeros=[2708, 1433], x_ones=[2708, 1433], x_noisy_n_0.15=[2708, 1433], x_noisy_n_0.3=[2708, 1433], x_noisy_n_0.45=[2708, 1433], x_noisy_n_0.6=[2708, 1433], x_noisy_n_0.9=[2708, 1433], x_noisy_n_0.95=[2708, 1433], x_noisy_n_0.99=[2708, 1433], edge_addition_all_edges_edge_addition_0.3=[2, 13722], edge_addition_all_edges_edge_addition_0.6=[2, 16889], edge_addition_all_edges_edge_addition_0.9=[2, 20056], edge_addition_all_edges_0.3=[2, 13722], edge_addition_all_edges_0.6=[2, 16889], edge_addition_all_edges_0.9=[2, 20056])
Run GNN: 1/9: x - edge_addition_all_edges_0.3
Exp_num:1 Test Accuracy: 0.5370


KeyboardInterrupt: ignored

In [None]:
# Noise data creation example: feature noise.
feature_noise_params = dict(x=data.x)
noise_info = dict(
    noise_type='feature_noise',
    params=feature_noise_params,
    noise_levels=[0, 0.3, 0.6],
    noise_param_name='noise_level',
)

# One noised feature entry per noise level is stored inside `data`.
noise_var_names_feat_exp = create_noised_data_for_experiment(data=data, noise_info=noise_info)
print(data)

# Experiment on feature noise: models train on each noised feature entry with the original edges.
model_names = ['GNN', 'GAT']
model_params = dict(  # TODO random params change later
    GNN=dict(in_channels=1433, hidden_channels=16, out_channels=7, num_layers=3, dropout=0.1),
    GAT=dict(in_channels=1433, hidden_channels=12, out_channels=7, num_layers=3, dropout=0.2),
    Graphsage=dict(in_channels=1433, hidden_channels=14, out_channels=7, num_layers=3, dropout=0.1),
)
exp_df2 = experiment(
    dataset_name=dataset_name,
    model_names=model_names,
    model_params=model_params,
    data=data,
    x_types=noise_var_names_feat_exp,
    edge_types=['edge_index'],
    noise_info=noise_info,
    num_epochs=25,
    repeat_num=3,
    print_updates=True,
)

exp_df2

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_addition_0.6=[2, 16889], edge_addition_0.9=[2, 20056], edge_addition_0=[2, 10556], edge_addition_0.3=[2, 13722], feature_noise_0=[2708, 1433], feature_noise_0.3=[2708, 1433], feature_noise_0.6=[2708, 1433], edge_removal_all_edges_0=[2, 10556], edge_removal_all_edges_0.3=[2, 7600], edge_removal_all_edges_0.6=[2, 5162], edge_removal_all_edges_0.9=[2, 3234], edge_removal_nodes_removetype-_knodes-_bidirec-0=[2, 10556], edge_removal_nodes_removetype-_knodes-_bidirec-0.3=[2, 8218], edge_removal_nodes_removetype-_knodes-_bidirec-0.6=[2, 6412], edge_removal_nodes_removetype-_knodes-_bidirec-0.9=[2, 5108], edge_removal__removetype-random_knodes-1000_bidirec-True0=[2, 10556], edge_removal__removetype-random_knodes-1000_bidirec-True0.3=[2, 8356], edge_removal__removetype-random_knodes-1000_bidirec-True0.6=[2, 6368], edge_removal__removetype-random_knodes-1000_bidirec-True0.9=[2, 4596])

Unnamed: 0,dataset_name,model_name,x_type,edge_type,test_accuracy,num_epochs,model_params,noise_info,experiment_run_number
0,Cora,GNN,feature_noise_0,edge_index,0.13,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",1
1,Cora,GNN,feature_noise_0,edge_index,0.13,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",2
2,Cora,GNN,feature_noise_0,edge_index,0.13,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",3
3,Cora,GNN,feature_noise_0.3,edge_index,0.122,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",1
4,Cora,GNN,feature_noise_0.3,edge_index,0.193,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",2
5,Cora,GNN,feature_noise_0.3,edge_index,0.319,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",3
6,Cora,GNN,feature_noise_0.6,edge_index,0.144,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",1
7,Cora,GNN,feature_noise_0.6,edge_index,0.137,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",2
8,Cora,GNN,feature_noise_0.6,edge_index,0.129,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'feature_noise', 'params': {'x'...",3
9,Cora,GAT,feature_noise_0,edge_index,0.144,25,"{'in_channels': 1433, 'hidden_channels': 12, '...","{'noise_type': 'feature_noise', 'params': {'x'...",1


In [None]:
# Noise data creation example: edge removal -- strategy 1: remove from all edges.
remove_edge_params = dict(edge_index=data.edge_index, remove_bidirectional=True)
noise_info = dict(
    noise_type='edge_removal',
    strategy='all_edges',
    params=remove_edge_params,
    noise_levels=[0, 0.3, 0.6, 0.9],
    noise_param_name='edges_to_remove_ratio',
)

# One noised edge entry per noise level is stored inside `data`.
exp3_edge_var_names = create_noised_data_for_experiment(data=data, noise_info=noise_info)
print(data)

# Experiment on edge removal - strategy: all edges.
model_names = ['GNN', 'Graphsage']
model_params = dict(  # TODO random params change later
    GNN=dict(in_channels=1433, hidden_channels=16, out_channels=7, num_layers=3, dropout=0.1),
    GAT=dict(in_channels=1433, hidden_channels=12, out_channels=7, num_layers=3, dropout=0.2),
    Graphsage=dict(in_channels=1433, hidden_channels=14, out_channels=7, num_layers=3, dropout=0.1),
)
exp_df3 = experiment(
    dataset_name=dataset_name,
    model_names=model_names,
    model_params=model_params,
    data=data,
    x_types=['x'],
    edge_types=exp3_edge_var_names,
    noise_info=noise_info,
    num_epochs=25,
    repeat_num=3,
    print_updates=True,
)

exp_df3

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_addition_0.6=[2, 16889], edge_addition_0.9=[2, 20056], edge_addition_0=[2, 10556], edge_addition_0.3=[2, 13722], feature_noise_0=[2708, 1433], feature_noise_0.3=[2708, 1433], feature_noise_0.6=[2708, 1433], edge_removal_all_edges_0=[2, 10556], edge_removal_all_edges_0.3=[2, 7628], edge_removal_all_edges_0.6=[2, 5142], edge_removal_all_edges_0.9=[2, 3190], edge_removal_nodes_removetype-_knodes-_bidirec-0=[2, 10556], edge_removal_nodes_removetype-_knodes-_bidirec-0.3=[2, 8218], edge_removal_nodes_removetype-_knodes-_bidirec-0.6=[2, 6412], edge_removal_nodes_removetype-_knodes-_bidirec-0.9=[2, 5108], edge_removal__removetype-random_knodes-1000_bidirec-True0=[2, 10556], edge_removal__removetype-random_knodes-1000_bidirec-True0.3=[2, 8356], edge_removal__removetype-random_knodes-1000_bidirec-True0.6=[2, 6368], edge_removal__removetype-random_knodes-1000_bidirec-True0.9=[2, 4596])

Unnamed: 0,dataset_name,model_name,x_type,edge_type,test_accuracy,num_epochs,model_params,noise_info,experiment_run_number
0,Cora,GNN,x,edge_removal_all_edges_0,0.464,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",1
1,Cora,GNN,x,edge_removal_all_edges_0,0.554,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",2
2,Cora,GNN,x,edge_removal_all_edges_0,0.696,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",3
3,Cora,GNN,x,edge_removal_all_edges_0.3,0.745,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",1
4,Cora,GNN,x,edge_removal_all_edges_0.3,0.507,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",2
5,Cora,GNN,x,edge_removal_all_edges_0.3,0.62,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",3
6,Cora,GNN,x,edge_removal_all_edges_0.6,0.54,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",1
7,Cora,GNN,x,edge_removal_all_edges_0.6,0.452,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",2
8,Cora,GNN,x,edge_removal_all_edges_0.6,0.531,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",3
9,Cora,GNN,x,edge_removal_all_edges_0.9,0.594,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'al...",1


In [None]:

# Noise data creation example: edge removal -- strategy 2: remove from selected nodes.
# NOTE(review): create_noised_data_for_experiment builds the entry names from the
# 'choose_type' and 'bidirectional' params, while this cell passes 'remove_type' and
# 'remove_bidirectional', so those two name fields come out empty -- confirm which
# parameter names the noise function expects.
remove_edge_from_nodes_params = dict(
    data=data,
    remove_bidirectional=True,
    k_nodes=1000,
    remove_type='random',
)

noise_info = dict(
    noise_type='edge_removal',
    strategy='nodes',
    params=remove_edge_from_nodes_params,
    noise_levels=[0, 0.3, 0.6, 0.9],
    noise_param_name='edges_to_remove_per_node_ratio',
)

# One noised edge entry per noise level is stored inside `data`.
exp4_var_names = create_noised_data_for_experiment(data=data, noise_info=noise_info)
print(data)

# Experiment on edge removal - strategy: nodes.
model_names = ['GNN', 'Graphsage']
model_params = dict(  # TODO random params change later
    GNN=dict(in_channels=1433, hidden_channels=16, out_channels=7, num_layers=3, dropout=0.1),
    GAT=dict(in_channels=1433, hidden_channels=12, out_channels=7, num_layers=3, dropout=0.2),
    Graphsage=dict(in_channels=1433, hidden_channels=14, out_channels=7, num_layers=3, dropout=0.1),
)
exp_df4 = experiment(
    dataset_name=dataset_name,
    model_names=model_names,
    model_params=model_params,
    data=data,
    x_types=['x'],
    edge_types=exp4_var_names,
    noise_info=noise_info,
    num_epochs=25,
    repeat_num=3,
    print_updates=True,
)

exp_df4

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_addition_0.6=[2, 16889], edge_addition_0.9=[2, 20056], edge_addition_0=[2, 10556], edge_addition_0.3=[2, 13722], feature_noise_0=[2708, 1433], feature_noise_0.3=[2708, 1433], feature_noise_0.6=[2708, 1433], edge_removal_all_edges_0=[2, 10556], edge_removal_all_edges_0.3=[2, 7628], edge_removal_all_edges_0.6=[2, 5142], edge_removal_all_edges_0.9=[2, 3190], edge_removal_nodes_removetype-_knodes-_bidirec-0=[2, 10556], edge_removal_nodes_removetype-_knodes-_bidirec-0.3=[2, 8218], edge_removal_nodes_removetype-_knodes-_bidirec-0.6=[2, 6412], edge_removal_nodes_removetype-_knodes-_bidirec-0.9=[2, 5108], edge_removal__removetype-random_knodes-1000_bidirec-True0=[2, 10556], edge_removal__removetype-random_knodes-1000_bidirec-True0.3=[2, 8418], edge_removal__removetype-random_knodes-1000_bidirec-True0.6=[2, 6322], edge_removal__removetype-random_knodes-1000_bidirec-True0.9=[2, 4732])

Unnamed: 0,dataset_name,model_name,x_type,edge_type,test_accuracy,num_epochs,model_params,noise_info,experiment_run_number
0,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.499,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",1
1,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.592,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",2
2,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.526,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",3
3,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.406,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",1
4,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.516,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",2
5,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.598,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",3
6,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.581,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",1
7,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.598,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",2
8,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.393,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",3
9,Cora,GNN,x,edge_removal__removetype-random_knodes-1000_bi...,0.417,25,"{'in_channels': 1433, 'hidden_channels': 16, '...","{'noise_type': 'edge_removal', 'strategy': 'no...",1


In [None]:
# TODO: check with different datasets
# TODO: fine-tune the models we want to use and decide on the final parameters
