In [None]:
# Install required packages.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def visualize(h, color):
    """Show a 2-D t-SNE projection of `h` as a scatter plot.

    `h` is detached, moved to the CPU and converted to NumPy before being
    passed to TSNE; `color` is forwarded to `plt.scatter` as the `c` argument.
    """
    embedding = h.detach().cpu().numpy()
    projected = TSNE(n_components=2).fit_transform(embedding)

    # Square canvas without axis ticks; only the point layout matters.
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])

    xs, ys = projected[:, 0], projected[:, 1]
    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

[K     |████████████████████████████████| 7.9 MB 6.4 MB/s 


In [None]:
import torch 
import numpy as np
import math


from torch_geometric.utils import degree
import torch_geometric
import torch_geometric.utils as tg_utils

In [None]:
!rm -r data

rm: cannot remove 'data': No such file or directory


In [None]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora dataset from the Planetoid collection under data/Planetoid,
# with the NormalizeFeatures transform attached.
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

# Dataset-level summary.
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# `data` is the module-level graph object that every later cell reads and extends.
data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Ratio of edge_index columns to nodes.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
# Fraction of nodes whose label is used for training.
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.05
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [None]:
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [None]:
data.x

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
data.edge_index

tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])

In [None]:
# Scratch cell: draw two pseudo-random integers and print them in both orders.
edge = data.edge_index.T  # transposed edge list; not used again in this cell
randNum = torch.rand(2) * 100000 % 2706  # two floats in [0, 2706)
a = randNum.to(int)  # cast to an integer dtype
print(a)
print(torch.flip(a, [0]))  # the same pair, reversed


tensor([ 328, 2491])
tensor([2491,  328])


In [None]:
# Constant feature matrices with the same shape as data.x.
zeros = torch.zeros(data.x.shape)
ones = torch.ones(data.x.shape)

# Gaussian noise for every entry, scaled by sqrt(0.1) (i.e. variance 0.1).
noise = (0.1**0.5)*torch.randn(data.x.shape)
print(noise.shape)

torch.Size([2708, 1433])


In [None]:
def get_masked_noise(x,  noise_level=0.15):
  """Return Gaussian noise that is non-zero only on a random subset of rows.

  Args:
    x: tensor whose first dimension indexes nodes; only its shape, dtype and
      device are used.
    noise_level: fraction (0..1) of rows that receive noise. The number of
      noisy rows is `int(noise_level * x.shape[0])`.

  Returns:
    Tensor with the shape and device of `x`. Rows that were not selected are
    all zero; selected rows hold N(0, 0.1) samples. The dtype follows `x` when
    `x` is floating point, otherwise the default float dtype is used.

  Raises:
    ValueError: if `noise_level` is outside [0, 1].
  """
  if not 0.0 <= noise_level <= 1.0:
    raise ValueError(f'noise_level must be in [0, 1], got {noise_level}')

  num_rows = x.shape[0]
  num_noisy_rows = int(noise_level * num_rows)
  # Rows are drawn with NumPy's global RNG, the noise with torch's.
  noisy_rows = np.random.choice(num_rows, num_noisy_rows, replace=False)
  noisy_rows = torch.as_tensor(noisy_rows, dtype=torch.long, device=x.device)

  # Match dtype/device of x so that `x + masked_noise` works off the CPU too.
  noise_dtype = x.dtype if x.is_floating_point() else torch.get_default_dtype()
  row_mask = torch.zeros(x.shape, dtype=noise_dtype, device=x.device)
  row_mask[noisy_rows] = 1.0

  noise = (0.1**0.5) * torch.randn(x.shape, dtype=noise_dtype, device=x.device)
  return noise * row_mask

In [None]:
# Attach alternative feature matrices to the graph object so that train/test
# can pick one by attribute name via data[x_type].
# NOTE(review): `noise` must still be the full-size tensor here; later demo
# cells rebind `noise` to small tensors, so re-running this cell after them
# would presumably fail on a shape mismatch.
# data_noisy = data
data.x_noisy = data.x + noise
data.x_zeros = zeros
data.x_ones = ones

# One extra feature matrix per noise level: noise is added to that fraction of nodes.
for noise_level in [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]:
  masked_noise = get_masked_noise(x=data.x,  noise_level=noise_level)
  # The key format must match the names built in the experiment loops.
  data[f'x_noisy_n_{noise_level}'] = data.x + masked_noise

In [None]:
# Add noise to roughly 15% of all feature entries.
# torch.rand is uniform on [0, 1), so `< 0.15` keeps about 15% of the entries
# (a standard-normal draw would be below 0.15 more than half of the time).
mask = torch.rand((3, 5)) < 0.15
noise = (0.1**0.5)*torch.randn(3,5)
masked_noise = noise * mask.float()

print(mask)
print(noise)
print(masked_noise)

tensor([[ True,  True,  True, False,  True],
        [False, False, False, False, False],
        [False,  True, False,  True,  True]])
tensor([[-0.3573, -0.3100,  0.1069, -0.2587, -0.2220],
        [ 0.3067,  0.0627,  0.2165, -0.1292,  0.1462],
        [ 0.2164, -0.3841, -0.1794, -1.0503, -0.0371]])
tensor([[-0.3573, -0.3100,  0.1069, -0.0000, -0.2220],
        [ 0.0000,  0.0000,  0.0000, -0.0000,  0.0000],
        [ 0.0000, -0.3841, -0.0000, -1.0503, -0.0371]])


In [None]:
# Add noise to a random subset of nodes (rows): here 2 of 5 rows, i.e. 40%.
chose_random_rows = np.random.choice(5, 2, replace=False)
print(chose_random_rows)
# Row mask: 1 on the chosen rows, 0 everywhere else.
mask_rows = torch.zeros((5,3))
mask_rows[chose_random_rows,:] = torch.ones(1, 3)
noise = (0.1**0.5)*torch.randn(5,3)
# Multiplying by the mask zeroes the noise on all rows that were not chosen.
masked_noise = noise* mask_rows.int().float()

print(mask_rows)
print(noise)
print(masked_noise)

[4 3]
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[-0.0617,  0.1535, -0.4385],
        [-0.1377,  0.5825,  0.2581],
        [ 0.0058, -0.0732, -0.1255],
        [-0.0478,  0.0815,  0.2327],
        [-0.5016,  0.2203,  0.0815]])
tensor([[-0.0000,  0.0000, -0.0000],
        [-0.0000,  0.0000,  0.0000],
        [ 0.0000, -0.0000, -0.0000],
        [-0.0478,  0.0815,  0.2327],
        [-0.5016,  0.2203,  0.0815]])


## Training a Multi-layer Perceptron Network (MLP)


In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron over node features.

    Layer sizes are taken from the module-level `dataset`
    (num_features -> hidden_channels -> num_classes). The torch RNG is
    re-seeded in the constructor before the layers are created.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(12345)
        in_dim, out_dim = dataset.num_features, dataset.num_classes
        self.lin1 = Linear(in_dim, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_dim)

    def forward(self, x):
        """Return class logits for every row of `x`."""
        hidden = F.relu(self.lin1(x))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.lin2(hidden)

model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin1): Linear(in_features=1433, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=7, bias=True)
)


In [None]:
# NOTE(review): the JavaScript below calls a `google.colab` object, so it
# presumably only has an effect when the notebook runs inside Colab; confirm.
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Module-level loss object; train() below reads it as a global.
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.

def train(model, optimizer, x_type='x'):
    """Run one optimisation step on the training nodes and return the loss.

    `x_type` names the attribute of the global `data` object that is used as
    the feature matrix. The loss is computed with the global `criterion`.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    logits = model(data[x_type])  # Single forward pass over all nodes.
    train_mask = data.train_mask
    # Only the training nodes contribute to the loss.
    loss = criterion(logits[train_mask], data.y[train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test(model, x_type='x'):
    """Return the accuracy of `model` on the test nodes.

    `x_type` names the attribute of the global `data` object that is used as
    the feature matrix. The model is switched to eval mode and the forward
    pass runs without gradient tracking, since nothing is back-propagated.
    """
    model.eval()
    with torch.no_grad():
        out = model(data[x_type])
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    test_mask = data.test_mask
    test_correct = pred[test_mask] == data.y[test_mask]  # Check against ground-truth labels.
    return int(test_correct.sum()) / int(test_mask.sum())  # Ratio of correct predictions.

<IPython.core.display.Javascript object>

In [None]:
# Train one MLP on the clean features and report its test accuracy.
# train()/test() take the model (and optimizer) explicitly, so both are
# created here and passed in; the hyper-parameters match the noise sweep.
model1 = MLP(hidden_channels=16)
optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.01, weight_decay=5e-4)
x_type = 'x'
for epoch in range(1, 201):
    loss = train(model1, optimizer1, x_type=x_type)
    #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
test_acc = test(model1, x_type=x_type)
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.6990


## MLP with different feature-noise levels

In [None]:
# Train a fresh MLP for every feature variant and print its test accuracy.
# NOTE(review): this relies on the single-input train/test defined in the MLP
# section; the GCN section later rebinds the same names with a different
# forward call, so this cell has to run before that one.
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
# Attribute names must match the keys stored on `data` when the noisy features were created.
x_noises =[f'x_noisy_n_{x}' for x in noise_levels ]

# 'x' is listed again at the end, so the clean-feature run is repeated after the noisy ones.
x_types=['x', 'x_noisy', 'x_ones'] +x_noises+['x']
for x_type in x_types:
  model = MLP(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.
  for epoch in range(1, 201):
      loss = train(model, optimizer, x_type=x_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')


x Test Accuracy: 0.5900
x_noisy Test Accuracy: 0.1590
x_ones Test Accuracy: 0.0640
x_noisy_n_0.15 Test Accuracy: 0.5130
x_noisy_n_0.3 Test Accuracy: 0.4380
x_noisy_n_0.45 Test Accuracy: 0.3380
x_noisy_n_0.6 Test Accuracy: 0.3130
x_noisy_n_0.9 Test Accuracy: 0.1600
x_noisy_n_0.95 Test Accuracy: 0.1740
x_noisy_n_0.99 Test Accuracy: 0.1350
x Test Accuracy: 0.5900


# GCN

In [None]:
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two GCNConv layers over node features and an edge list.

    Layer sizes are taken from the module-level `dataset`
    (num_features -> hidden_channels -> num_classes). The torch RNG is
    re-seeded in the constructor before the layers are created.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234567)
        in_dim, out_dim = dataset.num_features, dataset.num_classes
        self.conv1 = GCNConv(in_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_dim)

    def forward(self, x, edge_index):
        """Return class logits for every node."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(1433, 16)
  (conv2): GCNConv(16, 7)
)


In [None]:
# NOTE(review): the JavaScript below calls a `google.colab` object, so it
# presumably only has an effect when the notebook runs inside Colab; confirm.
from IPython.display import Javascript  # Restrict height of output cell.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

#model = GCN(hidden_channels=16)
# Module-level loss object; train() below reads it as a global.
criterion = torch.nn.CrossEntropyLoss()

def train(model, optimizer, x_type='x', edge_type='edge_index'):
    """Run one optimisation step on the training nodes and return the loss.

    `x_type` and `edge_type` name the attributes of the global `data` object
    that are used as feature matrix and edge list. The loss is computed with
    the global `criterion`.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    logits = model(data[x_type], data[edge_type])  # Single forward pass.
    train_mask = data.train_mask
    # Only the training nodes contribute to the loss.
    loss = criterion(logits[train_mask], data.y[train_mask])
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test(model, x_type='x', edge_type='edge_index'):
    """Return the accuracy of `model` on the test nodes.

    `x_type` and `edge_type` name the attributes of the global `data` object
    that are used as feature matrix and edge list. The model is switched to
    eval mode and the forward pass runs without gradient tracking, since
    nothing is back-propagated.
    """
    model.eval()
    with torch.no_grad():
        out = model(data[x_type], data[edge_type])
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    test_mask = data.test_mask
    test_correct = pred[test_mask] == data.y[test_mask]  # Check against ground-truth labels.
    return int(test_correct.sum()) / int(test_mask.sum())  # Ratio of correct predictions.


<IPython.core.display.Javascript object>

## GCN with different feature-noise levels

In [None]:
# Train a fresh GCN on the original edge list for every feature variant and
# print its test accuracy.
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
# Attribute names must match the keys stored on `data` when the noisy features were created.
x_noises =[f'x_noisy_n_{x}' for x in noise_levels ]

# 'x' is listed again at the end, so the clean-feature run is repeated after the noisy ones.
x_types=['x', 'x_noisy', 'x_ones'] +x_noises +['x']
for x_type in x_types:
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type=x_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')

x Test Accuracy: 0.8150
x_noisy Test Accuracy: 0.5280
x_ones Test Accuracy: 0.3190
x_noisy_n_0.15 Test Accuracy: 0.5840
x_noisy_n_0.3 Test Accuracy: 0.5060
x_noisy_n_0.45 Test Accuracy: 0.5290
x_noisy_n_0.6 Test Accuracy: 0.5370
x_noisy_n_0.9 Test Accuracy: 0.5370
x_noisy_n_0.95 Test Accuracy: 0.5340
x_noisy_n_0.99 Test Accuracy: 0.5510
x Test Accuracy: 0.8150


In [None]:
# remove x% edges
def remove_from_all_edges(edge_index, edges_to_remove_ratio = 0.15):
  """Return a copy of `edge_index` with a random fraction of its columns dropped.

  Args:
    edge_index: tensor of shape (2, num_edges); each column is one edge.
    edges_to_remove_ratio: fraction (0..1) of columns to drop. The number of
      columns kept is `int((1 - edges_to_remove_ratio) * num_edges)`.

  Returns:
    int64 tensor of shape (2, num_kept) on the device of `edge_index`. Columns
    are sampled without replacement with NumPy's global RNG and appear in
    sampled (not original) order.

  Raises:
    ValueError: if `edges_to_remove_ratio` is outside [0, 1].
  """
  if not 0.0 <= edges_to_remove_ratio <= 1.0:
    raise ValueError(f'edges_to_remove_ratio must be in [0, 1], got {edges_to_remove_ratio}')

  num_edges = edge_index.shape[1]
  num_edges_keep = int((1 - edges_to_remove_ratio) * num_edges)
  kept_columns = np.random.choice(num_edges, num_edges_keep, replace=False)
  kept_columns = torch.as_tensor(kept_columns, dtype=torch.long, device=edge_index.device)

  # Select both rows with the same column indices so each edge stays paired.
  return edge_index[:, kept_columns].to(torch.int64)

# Drop a random 15% of the edges and store the thinned edge list on `data`
# under the attribute name that the next experiment passes as `edge_type`.
edge_index_removed = remove_from_all_edges(edge_index=data.edge_index, edges_to_remove_ratio = 0.15)
data.edge_index_85 = edge_index_removed

torch.Size([8972])
8972
10556


In [None]:
# Repeat the GCN feature-noise sweep on the thinned edge list.
ed_type = 'edge_index_85'
noise_levels = [0.15, 0.3, 0.45, 0.6, 0.9, 0.95, 0.99]
# Attribute names must match the keys stored on `data` when the noisy features were created.
x_noises = [f'x_noisy_n_{x}' for x in noise_levels]

x_types = ['x', 'x_noisy', 'x_ones'] + x_noises
for x_type in x_types:
  # train()/test() need the model and optimizer passed in explicitly; a fresh
  # pair per feature variant keeps the runs independent of each other.
  model = GCN(hidden_channels=16)
  optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
  for epoch in range(1, 101):
      loss = train(model, optimizer, x_type=x_type, edge_type=ed_type)
      #print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
  test_acc = test(model, x_type=x_type, edge_type=ed_type)
  print(f'{x_type} Test Accuracy: {test_acc:.4f}')

x Test Accuracy: 0.7100
x_noisy Test Accuracy: 0.4200
x_ones Test Accuracy: 0.1440
x_noisy_n_0.15 Test Accuracy: 0.5360
x_noisy_n_0.3 Test Accuracy: 0.4940
x_noisy_n_0.45 Test Accuracy: 0.4700
x_noisy_n_0.6 Test Accuracy: 0.4110
x_noisy_n_0.9 Test Accuracy: 0.3910
x_noisy_n_0.95 Test Accuracy: 0.3600
x_noisy_n_0.99 Test Accuracy: 0.3950


In [None]:
# remove x% edges from random k, top_k, bottom_k nodes

def choose_nodes_to_remove_from(data, num_nodes, k_nodes, remove_type):
  """Pick the `k_nodes` nodes whose edges will be thinned.

  Args:
    data: graph object; only `data.edge_index` is read, and only for the
      degree-based modes.
    num_nodes: total number of nodes in the graph.
    k_nodes: how many nodes to select.
    remove_type: 'random' (uniform, without replacement), 'top_k' (highest
      degree) or 'bottom_k' (lowest degree). Degree is counted over row 0 of
      `data.edge_index`.

  Returns:
    1-D int64 tensor with `k_nodes` node indices, in every mode.

  Raises:
    ValueError: if `remove_type` is not one of the three supported modes.
  """
  if remove_type == 'random':
    # Keep the result 1-D so that callers iterating over it see one node at a
    # time, exactly as they do for the degree-based modes.
    picked = np.random.choice(num_nodes, k_nodes, replace=False)
    nodes_chosen = torch.as_tensor(picked, dtype=torch.long)
  elif remove_type in ('top_k', 'bottom_k'):
    # Pass num_nodes so that nodes never appearing in row 0 still get a
    # (zero) degree entry and can be selected.
    dg = torch_geometric.utils.degree(data.edge_index[0], num_nodes=num_nodes)
    _, nodes_chosen = torch.topk(dg, k_nodes, largest=(remove_type == 'top_k'))
  else:
    raise ValueError(
        f"remove_type should be one of 'random', 'top_k', 'bottom_k'; got {remove_type!r}")
  return nodes_chosen

# to do loop for each node separately
def remove_edges_from_chosen_nodes(data, nodes_chosen, edges_to_remove_per_node_ratio, remove_bidirectional):
  """Thin the edges of each entry of `nodes_chosen` and collect what is left.

  Calls `remove_edge_per_node` once per element obtained by iterating over
  `nodes_chosen` and concatenates the returned pieces.

  Returns:
    Tuple `(edges_0, edges_1)`: the concatenated first and second rows of the
    edges that were kept.
  """
  kept_row0, kept_row1 = [], []
  for node in nodes_chosen:
    row0, row1 = remove_edge_per_node(
        data=data,
        node=node,
        edges_to_remove_per_node_ratio=edges_to_remove_per_node_ratio,
        remove_bidirectional=remove_bidirectional)
    kept_row0.append(row0)
    kept_row1.append(row1)

  return torch.cat(kept_row0, 0), torch.cat(kept_row1, 0)

def remove_edge_per_node(data, node, edges_to_remove_per_node_ratio=0.1, remove_bidirectional=False):
  """Randomly drop a fraction of the edges that start at `node`.

  Args:
    data: graph object; only `data.edge_index` (shape (2, num_edges)) is read.
    node: node index (or tensor of indices) matched against row 0 of
      `data.edge_index`.
    edges_to_remove_per_node_ratio: fraction (0..1) of the matching edges to
      drop. The count is rounded up, so any positive ratio removes at least
      one edge when the node has edges.
    remove_bidirectional: reserved for also dropping the reverse edges; not
      implemented.

  Returns:
    Tuple `(row0, row1)` with the entries of the matching edges that were
    kept. Edges are sampled without replacement with NumPy's global RNG.

  Raises:
    NotImplementedError: if `remove_bidirectional` is true.
    ValueError: if `edges_to_remove_per_node_ratio` is outside [0, 1].
  """
  # Fail before any random numbers are consumed.
  if remove_bidirectional:
    raise NotImplementedError('You haven\'t implemented remove bidirectional yet!')
  if not 0.0 <= edges_to_remove_per_node_ratio <= 1.0:
    raise ValueError(
        f'edges_to_remove_per_node_ratio must be in [0, 1], got {edges_to_remove_per_node_ratio}')

  mask_node_indices = torch.isin(data.edge_index[0], node)
  select_node_edges_0 = data.edge_index[0][mask_node_indices]
  select_node_edges_1 = data.edge_index[1][mask_node_indices]

  # The number of edges to remove scales with how many edges the node has;
  # ceil guarantees at least one removal unless the ratio is 0.
  num_node_edges = select_node_edges_0.shape[0]
  num_edges_remove = int(math.ceil(edges_to_remove_per_node_ratio * num_node_edges))
  num_edges_keep = num_node_edges - num_edges_remove

  # Choose the edges to keep; everything else is removed.
  chose_random_edge_indices = np.random.choice(num_node_edges, num_edges_keep, replace=False)

  edge_node_index_removed_0 = select_node_edges_0[chose_random_edge_indices]
  edge_node_index_removed_1 = select_node_edges_1[chose_random_edge_indices]
  return edge_node_index_removed_0, edge_node_index_removed_1

def remove_edges_from_nodes(data, edges_to_remove_per_node_ratio = 0.15, k_nodes=10,
                            remove_type='random', remove_bidirectional=False):
  """Build an edge list in which `k_nodes` selected nodes lose part of their edges.

  Args:
    data: graph object; `data.edge_index` and `data.num_nodes` are read.
    edges_to_remove_per_node_ratio: fraction of each selected node's edges to
      drop. The caller's value is honoured (it is no longer overwritten).
    k_nodes: number of nodes to select.
    remove_type: selection mode, forwarded to `choose_nodes_to_remove_from`.
    remove_bidirectional: forwarded to the per-node removal.

  Returns:
    int64 tensor of shape (2, num_remaining_edges), sorted with
    `torch_geometric.utils.sort_edge_index`.

  Note:
    Only edges whose row-0 entry is a selected node are thinned; edges that
    merely point to a selected node are kept unchanged.
  """
  # Choose top-k, bottom-k, or random nodes.
  nodes_chosen = choose_nodes_to_remove_from(data=data, num_nodes=data.num_nodes, k_nodes=k_nodes, remove_type=remove_type)

  # Edges that do not start at a chosen node are kept as they are.
  row0, row1 = data.edge_index[0], data.edge_index[1]
  untouched = ~torch.isin(row0, nodes_chosen)
  edges_to_keep_0 = row0[untouched]
  edges_to_keep_1 = row1[untouched]

  # Surviving edges of the chosen nodes.
  edges_0_kept_chosen_nodes, edges_1_kept_chosen_nodes = remove_edges_from_chosen_nodes(
      data, nodes_chosen, edges_to_remove_per_node_ratio, remove_bidirectional)

  # Combine both groups into one (2, num_edges) edge list.
  final_edges_0 = torch.cat([edges_to_keep_0, edges_0_kept_chosen_nodes], 0)
  final_edges_1 = torch.cat([edges_to_keep_1, edges_1_kept_chosen_nodes], 0)
  edge_index_removed = torch.stack([final_edges_0, final_edges_1]).to(torch.int64)

  return tg_utils.sort_edge_index(edge_index_removed)


In [None]:
# Thin the edges of 1000 randomly chosen nodes (requested per-node removal
# ratio: 0.5), then inspect the shape of the resulting edge list.
edges = remove_edges_from_nodes(data=data, edges_to_remove_per_node_ratio = 0.5, k_nodes=1000,
                            remove_type='random', remove_bidirectional=False)

edges.shape

tensor([   2,    2,    2,  ..., 2707, 2707, 2707])
tensor([   1,  332, 1454,  ...,  598, 1473, 2706])


torch.Size([2, 9988])

In [None]:
# Same call for the 1000 highest-degree nodes (requested per-node removal
# ratio: 0.5), then inspect the shape of the resulting edge list.
edges = remove_edges_from_nodes(data=data, edges_to_remove_per_node_ratio = 0.5, k_nodes=1000,
                            remove_type='top_k', remove_bidirectional=False)
edges.shape

tensor([   0,    0,    0,  ..., 2703, 2704, 2705])
tensor([ 633, 1862, 2582,  ..., 1298,  641,  287])


torch.Size([2, 9124])

In [None]:
# Peek at the first ten entries of both rows of the edge list.
print(data.edge_index[0][:10])
data.edge_index[1][:10]

tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 2])


tensor([ 633, 1862, 2582,    2,  652,  654,    1,  332, 1454, 1666])

In [None]:
# Row-1 entries of all edges whose row-0 entry is node 1862.
mm = torch.isin(data.edge_index[0], 1862)
data.edge_index[1][mm]

tensor([   0,  926, 1701, 2582])

In [None]:
# TODOs:
# TODO remove bidirectional

# TODO add edges:
  # TODO add random edges
  # TODO add edges to top k node 
  # TODO add edges to bottom k node 
  # TODO add edges to random k node 

# decide on experiments
# experiments/research questions
# decide on datasets
# decide models to use (mlp, gnn, graphsage, ??)
# decide on what to log, save, how to design experiments 
# reliability of experiments: 10 experiments and avg results (log each experiment)
# Run each experiment
# plots for experiments