In [1]:
# interactive reimport
%load_ext autoreload
%autoreload 2

In [2]:
import sys, logging, time
import os.path as osp
import torch
from model import Model
from torch_geometric.loader import NeighborLoader
import numpy as np

In [3]:
import argparse
parser = argparse.ArgumentParser(description = 'pytorch version of GraphSAGE')
parser.add_argument('--data', type = str, default = 'cora')
# parser.add_argument('--aggr_func', type = str, default = 'MEAN') # dead argmument
parser.add_argument('--num_epochs', type = int, default = 10)
parser.add_argument('--batch_size', type = int, default = 128)
parser.add_argument('--seed', type = int, default = 13)
parser.add_argument('--cuda', action = 'store_true', help = 'use CUDA')
parser.add_argument('--num_neg_samples', type = int, default = 10) # dead argument
parser.add_argument('--lr', type = float, default = 1e-3)
args = parser.parse_args(args=['--cuda'])
args.device = torch.device("cuda" if args.cuda else "cpu")

In [4]:
args

Namespace(data='cora', num_epochs=10, batch_size=128, seed=13, cuda=True, num_neg_samples=10, lr=0.001, device=device(type='cuda'))

In [5]:
np.random.seed(args.seed)
torch.manual_seed(args.seed)

logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(levelname)s - %(message)s')
logging.info('Device:' + str(args.device))

2023-01-15 18:27:42,597 - INFO - Device:cuda


In [6]:
data_name = 'cora' 
attributes_file_name = osp.join('../data', data_name, 'attributes')
labels_file_name = osp.join('../data', data_name, 'labels')
valid_file_name = osp.join('../data', data_name, 'valid_nodes')

features = np.loadtxt(attributes_file_name, dtype=np.float32)
labels = np.loadtxt(labels_file_name, dtype=np.int64)[:,1]
valid_all_nodes_list = np.loadtxt(valid_file_name, dtype = np.int64)

In [7]:
from data_handler import update_viewed_all_nodes_and_edges, generate_whole_graph
def load_graph(t=14):
	stream_edges_dir_name = osp.join('../data', data_name, 'stream_edges')
	viewed_all_nodes, viewed_all_edges = None, None
	for tt in range(t):
		coming_edges = np.loadtxt(osp.join(stream_edges_dir_name, str(tt)), dtype=int)
		viewed_all_nodes, viewed_all_edges = update_viewed_all_nodes_and_edges(
								coming_edges, viewed_all_nodes, viewed_all_edges) 
		graph, valid_nodes = generate_whole_graph(viewed_all_nodes, viewed_all_edges, valid_all_nodes_list, features, labels)
	return graph, valid_nodes

graph, valid_nodes = load_graph()

In [8]:
valid_nodes

[0,
 3,
 8,
 9,
 13,
 16,
 19,
 20,
 24,
 26,
 27,
 30,
 31,
 32,
 34,
 35,
 39,
 42,
 56,
 57,
 60,
 71,
 72,
 77,
 83,
 88,
 91,
 100,
 103,
 112,
 113,
 114,
 120,
 123,
 124,
 125,
 127,
 128,
 131,
 132,
 134,
 135,
 140,
 143,
 144,
 147,
 150,
 151,
 152,
 154,
 158,
 162,
 168,
 169,
 178,
 179,
 181,
 185,
 192,
 195,
 196,
 198,
 203,
 204,
 209,
 218,
 220,
 221,
 223,
 229,
 234,
 236,
 237,
 238,
 247,
 248,
 249,
 254,
 257,
 259,
 260,
 262,
 263,
 266,
 267,
 268,
 270,
 275,
 276,
 278,
 280,
 281,
 283,
 287,
 291,
 292,
 298,
 301,
 302,
 303,
 320,
 321,
 322,
 323,
 325,
 326,
 328,
 332,
 335,
 337,
 342,
 345,
 347,
 349,
 357,
 359,
 362,
 363,
 372,
 373,
 377,
 379,
 380,
 381,
 386,
 387,
 394,
 397,
 402,
 404,
 405,
 408,
 422,
 423,
 424,
 427,
 432,
 434,
 438,
 441,
 452,
 454,
 457,
 460,
 461,
 462,
 464,
 465,
 467,
 471,
 472,
 474,
 476,
 487,
 488,
 490,
 496,
 503,
 513,
 514,
 515,
 527,
 531,
 555,
 559,
 563,
 569,
 576,
 577,
 579,
 580,
 586,

In [9]:
train_mask = np.ones(len(graph.x), dtype=int)
test_mask = np.zeros(len(graph.x), dtype=int)
test_mask[valid_nodes] = 1
train_mask -= test_mask
graph.train_mask = torch.tensor(train_mask, dtype=torch.bool)
graph.test_mask = torch.tensor(test_mask, dtype = torch.bool)

In [10]:
# Model parameter
input_dim = graph.num_node_features # 1433
hidden_dim = 64
output_dim = len(np.unique(graph.y)) # 7
num_layers = 3

In [11]:
# Model definition
model = Model(in_channels=input_dim, hidden_channels=hidden_dim, out_channels=output_dim, num_layers=num_layers).to(args.device)
print(model)
# Model optimizer, may change into adam
# optimizer = torch.optim.Adam(model.parameters(), lr = args.lr)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)

Model(
  (sage): GraphSAGE(1433, 64, num_layers=2)
  (lin): Linear(in_features=64, out_features=7, bias=False)
)


In [12]:
from torch_geometric.loader import NeighborLoader
import copy
graph = graph.to(args.device, 'x', 'y')
train_loader = NeighborLoader(
	graph, 
	num_neighbors=[args.num_neg_samples] * (num_layers - 1),
	input_nodes=graph.train_mask,
	shuffle=True,
	batch_size=args.batch_size)
valid_loader = NeighborLoader(
	copy.copy(graph),
	input_nodes = None,
	num_neighbors=[-1],
	# num_neighbors=[args.num_neg_samples] * (num_layers - 1),
	shuffle = False,
	batch_size = args.batch_size
)

In [14]:
# Model training
from tqdm import tqdm
def train(epoch, verbose=False):
	model.train()
	if verbose:
		pbar = tqdm(total=int(len(train_loader.dataset)))
		pbar.set_description(f'Epoch{ epoch:02d}')
	total_loss = 0
	total_correct = 0
	total_examples = 0
	for batch in train_loader:
		batch = batch.to(args.device)
		optimizer.zero_grad()
		y_pred = (model.forward(batch)[:batch.batch_size])[batch.train_mask[:batch.batch_size]].argmax(dim=-1)
		y_true = (batch.y[:batch.batch_size])[batch.train_mask[:batch.batch_size]]
		total_correct += int((y_pred == y_true).sum())
		# print(y_pred.unique(), y_true.unique())
		loss = model.loss(batch)
		loss.backward()
		optimizer.step()
		loss = loss.data.item()
		total_loss += loss * batch.train_mask[:batch.batch_size].sum()
		total_examples += batch.train_mask[:batch.batch_size].sum()
		if verbose:
			pbar.update(batch.batch_size)
	if verbose:
		pbar.close()
	return total_loss/total_examples, total_correct / total_examples

In [15]:
for epoch in range(1, args.num_epochs+1):
    avg_loss, acc = train(epoch)
    print(f'Epoch {epoch:02d}, Loss: {avg_loss:.4f}, Train accuracy: {acc:.4f}')

Epoch 01, Loss: 1.9365, Train accuracy: 0.2510
Epoch 02, Loss: 1.8933, Train accuracy: 0.3229
Epoch 03, Loss: 1.8017, Train accuracy: 0.2993
Epoch 04, Loss: 1.6706, Train accuracy: 0.3014
Epoch 05, Loss: 1.5265, Train accuracy: 0.3868
Epoch 06, Loss: 1.3461, Train accuracy: 0.5121
Epoch 07, Loss: 1.1471, Train accuracy: 0.6488
Epoch 08, Loss: 0.9676, Train accuracy: 0.7510
Epoch 09, Loss: 0.8179, Train accuracy: 0.8082
Epoch 10, Loss: 0.6950, Train accuracy: 0.8417


In [16]:
from sklearn.metrics import f1_score

In [17]:
@torch.no_grad()
def test():
	model.eval()
	# total_correct = 0
	# total_examples = 0
	y_pred_all = []
	y_true_all = []
	for batch in valid_loader:
		batch = batch.to(args.device)
		y_pred = model.forward(batch)[:batch.batch_size]
		y_true = batch.y[:batch.batch_size]
		y_pred_val = y_pred[batch.test_mask[:batch.batch_size]].argmax(dim=-1).cpu()
		y_true_val = y_true[batch.test_mask[:batch.batch_size]].cpu()
		# total_correct += int((y_pred_val.argmax(dim=-1) == y_true_val).sum())
		# total_examples += batch.test_mask[:batch.batch_size].sum()
		# print(total_correct, total_examples)
		y_pred_all.append(y_pred_val)
		y_true_all.append(y_true_val)
	y_pred_all = torch.cat(y_pred_all)
	y_true_all = torch.cat(y_true_all)
	print("Validation Macro F1:" +  str(np.round(f1_score(y_true_all, y_pred_all, average="macro"), 6)))
	print("Validation Micro F1:" +  str(np.round(f1_score(y_true_all, y_pred_all, average="micro"), 6)))

	val_acc = (y_pred_all == y_true_all).sum()/(len(y_pred_all))
	print(f"Val accuracy: {val_acc: .4f}")
	return y_pred_all, y_true_all

In [18]:
y_pred, y_true = test()

Validation Macro F1:0.727744
Validation Micro F1:0.785
Val accuracy:  0.7850
