In [4]:
import copy
import math
from collections import Counter, defaultdict
from itertools import combinations, permutations

import matplotlib.pyplot as plt
import networkx as nx

def load_graph(path: str = "dataset/airportsAndCoordAndPop.graphml") -> nx.Graph:
    """Read a GraphML file and return the graph it describes.

    Args:
        path: Location of the GraphML file. Defaults to the dataset file this
            notebook has always loaded, so existing ``load_graph()`` calls are
            unaffected.

    Returns:
        The graph produced by ``networkx.read_graphml``.
    """
    return nx.read_graphml(path)

def filter_country(graph: nx.Graph, nb_airports: int) -> nx.Graph:
    """Keep only the nodes whose country owns at least ``nb_airports`` nodes.

    Args:
        graph: Graph whose nodes all carry a ``"country"`` attribute.
        nb_airports: Minimum number of nodes a country needs for its nodes to
            be kept.

    Returns:
        A new, independent graph made of the retained nodes, the edges between
        them and their attributes. ``graph`` itself is left untouched.

    Raises:
        KeyError: If a node has no ``"country"`` attribute.
    """
    # Single pass over the nodes instead of one full scan per distinct country.
    per_country = Counter(graph.nodes[node]["country"] for node in graph.nodes)
    kept = [
        node
        for node in graph.nodes
        if per_country[graph.nodes[node]["country"]] >= nb_airports
    ]
    # subgraph() alone is a read-only view onto its parent; copy() materialises
    # it so the result can be modified and does not keep the full graph alive.
    return graph.subgraph(kept).copy()

def convert_fc(graph: nx.Graph) -> nx.Graph:
    """Return a copy of ``graph`` rewired so that every pair of distinct nodes is linked.

    Nodes and their attributes are preserved; the original edges (and their
    attributes) are discarded and replaced by one attribute-less edge per pair
    of distinct nodes. No self-loops are created. ``graph`` is not modified.
    """
    g = graph.copy()
    # Snapshot the edge view first so it is never mutated while being iterated.
    g.remove_edges_from(list(g.edges))

    # combinations() yields each unordered pair exactly once, so there is no
    # need to enumerate both orientations and de-duplicate them afterwards.
    pairs = combinations(g.nodes, 2)
    if g.is_directed():
        # On a directed graph keep the historical orientation of each edge
        # (from the smaller node id to the larger one).
        pairs = (tuple(sorted(pair)) for pair in pairs)
    g.add_edges_from(pairs)
    return g


def connect_country(graph: nx.Graph) -> nx.Graph:
    """Return a copy of ``graph`` in which nodes are linked iff they share a country.

    Nodes and their attributes are preserved; the original edges are discarded
    and every pair of distinct nodes with the same ``"country"`` attribute is
    connected. ``graph`` is not modified.

    Raises:
        KeyError: If a node has no ``"country"`` attribute.
    """
    g = graph.copy()
    # Snapshot the edge view first so it is never mutated while being iterated.
    g.remove_edges_from(list(g.edges))

    # Bucket the nodes per country once, rather than comparing every node with
    # every other node.
    members_by_country = defaultdict(list)
    for node in g.nodes:
        members_by_country[g.nodes[node]["country"]].append(node)

    # A directed graph needs both orientations of each pair; an undirected one
    # needs each pair only once.
    pair_up = permutations if g.is_directed() else combinations
    for members in members_by_country.values():
        g.add_edges_from(pair_up(members, 2))
    return g

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius_km: Radius of the sphere; defaults to 6371, the Earth radius in
            km used throughout this notebook.

    Returns:
        The distance along the sphere, in the same unit as ``radius_km``.
    """
    phi1 = math.radians(lat1)
    lam1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lam2 = math.radians(lon2)

    dlat = phi2 - phi1
    dlon = lam2 - lam1

    a = math.sin(dlat / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (typically for nearly
    # antipodal points), which would make the square roots below raise.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius_km * c

def add_distance(g: nx.Graph) -> nx.Graph:
    """Return a copy of ``g`` whose edges carry a ``"distance"`` attribute.

    The value is the ``haversine`` distance between the two endpoints, computed
    from their ``"lat"`` and ``"lon"`` node attributes. ``g`` is not modified.
    """
    annotated = g.copy()
    node_attrs = annotated.nodes
    for u, v in annotated.edges:
        start, end = node_attrs[u], node_attrs[v]
        annotated.edges[u, v]["distance"] = haversine(
            start["lat"], start["lon"], end["lat"], end["lon"]
        )
    return annotated


In [2]:
# Time each preprocessing stage and print the graph summary after it.
# NOTE(review): the `:3f` format spec means "minimum width 3" with the default
# six decimals, not three decimals (that would be `.3f`).
import time
# Stage 1: read the graph from disk.
st = time.time()
g = load_graph()
print(f"Load: \t\t\t\t{g} ({time.time() - st:3f}s)")

# Stage 2: keep only the countries that have at least `nb` airports.
st = time.time()
nb = 100
g = filter_country(g, nb_airports=nb)
print(f"Filter({nb}): \t\t\t{g} ({time.time() - st:3f}s)")

# Stage 3: replace the edges by a fully-connected topology.
st = time.time()
g = convert_fc(g)
print(f"Convert to fully-connected: \t{g} ({time.time() - st:3f}s)")

# Stage 4: annotate every edge with the distance between its endpoints.
st = time.time()
g = add_distance(g)
print(f"Add distance on edges: \t\t{g} ({time.time() - st:3f}s)")

Load: 				Graph with 3363 nodes and 13547 edges (0.198418s)
Filter(100): 			Graph with 1205 nodes and 3884 edges (0.110703s)
Convert to fully-connected: 	Graph with 1205 nodes and 725410 edges (1.320693s)
Add distance on edges: 		Graph with 1205 nodes and 725410 edges (2.303417s)


In [5]:
import torch
import numpy as np
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
from sklearn.preprocessing import LabelEncoder, StandardScaler

def encode_country(data: list[str]) -> tuple[torch.Tensor, np.ndarray]:
    """Turn country labels into integer class ids.

    Returns:
        A pair ``(ids, classes)``: ``ids`` is a ``torch.long`` tensor with one
        id per input label, ``classes`` is the fitted ``LabelEncoder``'s
        ``classes_`` array.
    """
    label_encoder = LabelEncoder()
    ids = torch.tensor(label_encoder.fit_transform(data), dtype=torch.long)
    return ids, label_encoder.classes_

def normalize(tensor: torch.Tensor):
    """Fit a fresh ``StandardScaler`` on ``tensor`` and return the scaled values as a float tensor."""
    standardized = StandardScaler().fit_transform(tensor)
    return torch.tensor(standardized, dtype=torch.float)


def prepart_data(
    graph: nx.Graph,
    node_attr: list[str],
    with_distance: bool = True,
    train_ratio: float = 0.8,
    test_ratio: float = 0.1,
    val_ratio: float = 0.1
    ):
    """Convert ``graph`` into a PyG ``Data`` object ready for node classification.

    Args:
        graph: Graph whose nodes carry a ``"country"`` attribute and every
            attribute named in ``node_attr``; its edges must carry a
            ``"distance"`` attribute when ``with_distance`` is true.
        node_attr: Node attributes grouped into the feature matrix ``data.x``.
        with_distance: When true, ``data.edge_weight`` is derived from the edge
            distances and rescaled to [0, 1] so that the shortest edge weighs 1
            and the longest 0. Otherwise every edge weighs 1.
        train_ratio: Share of the nodes put in ``data.train_mask``.
        test_ratio: Share of the nodes put in ``data.test_mask``.
        val_ratio: Share of the nodes put in ``data.val_mask`` (the mask takes
            whatever remains after the train and test slices).

    Returns:
        The ``Data`` object with standardised ``x``, ``edge_weight`` of shape
        ``(num_edges, 1)``, integer labels ``y``, the label names ``y_classes``
        and the three boolean node masks.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.
    """
    # Validate the split before doing any expensive conversion.
    ratios = (train_ratio, test_ratio, val_ratio)
    if min(ratios) < 0:
        raise ValueError("train_ratio, test_ratio and val_ratio must be non-negative")
    # Compare with a tolerance: an exact `!= 1` rejects valid splits such as
    # 0.7 / 0.2 / 0.1, whose floating-point sum is not exactly 1.
    if not math.isclose(sum(ratios), 1.0):
        raise ValueError("train_ratio + test_ratio + val_ratio must be equal to 1")

    # Convert
    data: Data = from_networkx(graph, group_node_attrs=node_attr)
    if with_distance:
        dist = data.distance.clone().detach()
        # Standardise, then min-max rescale and invert so that closer nodes get
        # a larger weight.
        weights = normalize(dist.reshape(-1, 1))
        lowest = weights.min()
        span = weights.max() - lowest
        if span == 0:
            # All distances are equal: the rescaling would be 0/0 (NaN), so
            # fall back to uniform weights.
            data.edge_weight = torch.ones_like(weights)
        else:
            data.edge_weight = 1 - (weights - lowest) / span
    else:
        data.edge_weight = torch.ones(data.num_edges, 1)

    # Encode Y (country label)
    data.y, data.y_classes = encode_country(data.country)

    # Normalize X
    data.x = normalize(data.x)

    # Mask train/test/val
    # NOTE: the split is positional (first nodes -> train, next -> test,
    # remainder -> val); the nodes are not shuffled beforehand.
    train_end = int(data.num_nodes * train_ratio)
    test_end = train_end + int(data.num_nodes * test_ratio)
    data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    data.train_mask[:train_end] = True
    data.test_mask[train_end:test_end] = True
    data.val_mask[test_end:] = True

    return data

In [None]:
    
# Quick check of the data preparation pipeline on the original flight edges.
g = load_graph()
# Keep the countries that have at least 10 airports.
g = filter_country(g, 10)
# Annotate each remaining edge with the distance between its endpoints.
g = add_distance(g)
print(g)
# Longitude and latitude are the only node features.
data = prepart_data(g, ["lon", "lat"])
print(data)

In [6]:
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as gnn

class GCN(nn.Module):
    """Three stacked ``GCNConv`` layers ending in a log-softmax over ``dim_out`` outputs."""

    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        # Input -> hidden -> hidden -> output.
        self.conv1 = gnn.GCNConv(dim_in, dim_h)
        self.conv2 = gnn.GCNConv(dim_h, dim_h)
        self.conv3 = gnn.GCNConv(dim_h, dim_out)

    def forward(self, x, edge_index, edge_weight):
        """Return per-row log-probabilities; ReLU follows the first two layers only."""
        hidden = self.conv1(x, edge_index, edge_weight)
        hidden = F.relu(hidden)
        hidden = self.conv2(hidden, edge_index, edge_weight)
        hidden = F.relu(hidden)
        scores = self.conv3(hidden, edge_index, edge_weight)
        return F.log_softmax(scores, dim=1)
        

In [7]:
from torch.utils.tensorboard import SummaryWriter


def evaluate(model: GCN, data: Data, mask=None, loss_fct=None) -> tuple[float, float]:
    """Compute accuracy and loss of ``model`` on the nodes selected by ``mask``.

    Args:
        model: Network called as ``model(data.x, data.edge_index, data.edge_weight)``.
        data: Graph data providing ``x``, ``edge_index``, ``edge_weight`` and ``y``.
        mask: Boolean node mask; every node is evaluated when omitted.
        loss_fct: Loss applied to the selected outputs and labels; defaults to
            a new ``torch.nn.CrossEntropyLoss``.

    Returns:
        ``(accuracy, loss)`` as Python floats.

    Raises:
        ZeroDivisionError: If ``mask`` selects no node.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if mask is None:
        # Same kind of mask as data.train_mask / val_mask / test_mask: a torch
        # boolean tensor living next to the labels.
        mask = torch.ones(data.y.shape[0], dtype=torch.bool, device=data.y.device)
    if loss_fct is None:
        loss_fct = torch.nn.CrossEntropyLoss()

    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index, data.edge_weight)
        # Apply the mask once and reuse the selection for both metrics.
        selected_out = out[mask]
        selected_y = data.y[mask]

        pred = selected_out.argmax(dim=1)
        acc = pred.eq(selected_y).sum().item() / selected_y.shape[0]

        loss = loss_fct(selected_out, selected_y)

        return acc, loss.item()
    
    
def predict(gcn, x, ei, ew):
    """Call ``gcn(x, ei, ew)`` without gradient tracking and return the argmax index of each row."""
    with torch.no_grad():
        scores = gcn(x, ei, ew)
    return scores.argmax(dim=1)
    

def train(model: GCN, data: Data, epochs=100, lr=0.01, writer=None, patience=100):
    """Train ``model`` on the training nodes with early stopping on the validation loss.

    Args:
        model: Network called as ``model(data.x, data.edge_index, data.edge_weight)``.
        data: Graph data providing ``x``, ``edge_index``, ``edge_weight``, ``y``
            and the ``train_mask`` / ``val_mask`` / ``test_mask`` node masks.
        epochs: Maximum number of full-graph optimisation steps.
        lr: Learning rate of the Adam optimiser.
        writer: TensorBoard writer receiving the per-epoch scalars. When
            omitted, one is created here and closed before returning.
        patience: Training stops once the validation loss has not improved for
            more than this many consecutive epochs.

    Returns:
        ``model``, loaded with the weights of the epoch that reached the lowest
        validation loss.
    """
    owns_writer = writer is None
    if owns_writer:
        writer = SummaryWriter(comment=f"_gcn_lr[{lr}]")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    loss_fct = torch.nn.CrossEntropyLoss()

    # state_dict() hands back references to the live parameter tensors, so the
    # snapshot has to be deep-copied; otherwise it silently follows every later
    # optimiser step and "restoring the best weights" restores nothing.
    best_weights = copy.deepcopy(model.state_dict())
    best_loss = float("inf")
    count_no_improve = 0

    try:
        for ep in range(epochs):
            # TRAIN
            model.train()
            optimizer.zero_grad()
            # Forward pass
            out = model(data.x, data.edge_index, data.edge_weight)

            # Restrict outputs and labels to the training nodes once.
            train_out = out[data.train_mask]
            train_y = data.y[data.train_mask]

            # Compute loss
            loss = loss_fct(train_out, train_y)
            # Compute accuracy
            acc = train_out.argmax(dim=1).eq(train_y).sum().item() / train_y.shape[0]

            # Backward pass
            loss.backward()
            optimizer.step()

            writer.add_scalar("Train (loss)", loss.item(), ep)
            writer.add_scalar("Train (acc)", acc, ep)

            # EVALUATE
            val_acc, val_loss = evaluate(model, data, data.val_mask, loss_fct)

            writer.add_scalar("Val (acc)", val_acc, ep)
            writer.add_scalar("Val (loss)", val_loss, ep)

            # Early stopping
            if val_loss <= best_loss:
                best_loss = val_loss
                best_weights = copy.deepcopy(model.state_dict())
                count_no_improve = 0
            else:
                count_no_improve += 1
                if count_no_improve > patience:
                    print(f"Early stopping at epoch {ep}")
                    break

            print(f"\rEpoch {ep+1}/{epochs} - Loss={val_loss} - Acc={val_acc}" + " " * 20, end="")

        # Report the held-out performance of the best checkpoint.
        model.load_state_dict(best_weights)
        # evaluate() returns (accuracy, loss); unpack it instead of printing the tuple.
        test_acc, test_loss = evaluate(model, data, data.test_mask, loss_fct)
        print(f"\nTest accuracy: {test_acc} - Test loss: {test_loss}")
    finally:
        # Make sure buffered events reach disk even if training is interrupted.
        writer.flush()
        if owns_writer:
            writer.close()
    return model

In [None]:
# Parameters
lr = 0.001
filter_nb = 100
distance = False
full_connected = False
country_connected = True
node_attr = ["lon", "lat"]

# Tensorboard
# The join uses single quotes: reusing the f-string's own double quotes inside
# the braces is a SyntaxError on Python versions older than 3.12.
writer = SummaryWriter(comment=f"_gcn_lr[{lr}]_filter[{filter_nb}]_cc[{country_connected}]_distance[{distance}]_node_attr[{','.join(node_attr)}]")

# Load the graph
print("Loading the graph...")
g = load_graph()
g = filter_country(g, filter_nb)
# Optional rewiring of the topology before training.
if full_connected:
    g = convert_fc(g)
if country_connected:
    g = connect_country(g)
if distance:
    g = add_distance(g)
# Prepare the data
data = prepart_data(g, node_attr, with_distance=distance)
print(str(data) + "\n")

# Create the model
print("Creating the model...")
model = GCN(data.num_node_features, 16, data.y_classes.shape[0])
print(model)

# Train the model
print("Training the model...")
model = train(model, data, epochs=10000, lr=lr, writer=writer)

# Flush the remaining events and release the TensorBoard log file.
writer.close()

Loading the graph...
Data(edge_index=[2, 534218], population=[1205], country=[1205], city_name=[1205], x=[1205, 2], edge_weight=[534218, 1], y=[1205], y_classes=[4], train_mask=[1205], test_mask=[1205], val_mask=[1205])

Creating the model...
GCN(
  (conv1): GCNConv(2, 16)
  (conv2): GCNConv(16, 16)
  (conv3): GCNConv(16, 4)
)
Training the model...
Epoch 2986/10000 - Loss=0.001323458505794406 - Acc=1.0                                 

In [None]:
import tensorboard