# Tutorial: Training a GNN Recommendation System

In [9]:
from knowledge_algorithms.knowledge_graph.neo4j import Neo4jAPI
from talent_recommendation.BipartiteRecSys import load_configs, rec_sys_train, HeteroGNN
from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
import torch
import random

The essential configurations are all listed in `./talent_recommendation/configs.yaml`. Feel free to modify it.

In [2]:
configs = "./talent_recommendation/configs.yaml"
configs = load_configs(configs, check_keys=["neo4j"])

configs

Run the following steps if you want to train a recommendation model in ONE STEP.

```python
model, data = rec_sys_train(configs)
```

In [None]:
import torch
from torch_geometric.nn import HeteroConv, RGCNConv, SAGEConv
import torch.nn.functional as F

In [None]:
db = Neo4jAPI(configs["neo4j"]["url"], configs["neo4j"]["user"], configs["neo4j"]["password"])

data = db.load_hetero_graph_dataset(["employee", "project", "position", "techStack"])
data.metadata()

In [None]:
def get_hetero_conv_layer(in_channels, out_channels, data: HeteroData, conv_layer=SAGEConv, aggr="sum"):
    if conv_layer == RGCNConv:
        return HeteroConv({
            edge_type: conv_layer(in_channels, out_channels, 1) for edge_type in data.edge_types
        }, aggr=aggr)
    else:
        return HeteroConv({
            edge_type: conv_layer(in_channels, out_channels, num_relations=1, add_self_loops=False) for edge_type in data.edge_types
        }, aggr=aggr)

In [None]:
class HeteroGNN(torch.nn.Module):
    def __init__(self, input_channels, hidden_channels, out_channels, data: HeteroData, num_conv_layers=3, 
                 conv_layer=RGCNConv):
        super().__init__()

        assert(num_conv_layers >= 2)
        
        self.conv_layer_list = torch.nn.ModuleList()
        self.num_conv_layers = num_conv_layers

        # Define the first convolutional layer
        conv1 = get_hetero_conv_layer(input_channels, hidden_channels, data, conv_layer)
        self.conv_layer_list.append(conv1)
        
        # Define the middle convolutional layers
        for _ in range(num_conv_layers-2):
            self.conv_layer_list.append(get_hetero_conv_layer(hidden_channels, hidden_channels, data))

        # Define the third convolutional layer
        self.conv_layer_list.append(get_hetero_conv_layer(hidden_channels, out_channels, data))

    def forward(self, x_dict, edge_index_dict):
        for i in range(self.num_conv_layers - 1):
            x_dict = self.conv_layer_list[i](x_dict, edge_index_dict)
            x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Final layer
        x_dict = self.conv_layer_list[-1](x_dict, edge_index_dict)

        return x_dict

In [None]:
input_channels = 768  # Feature size of your nodes
hidden_channels = 128 # Example size, adjust as needed
out_channels = 64     # Example size, adjust as needed

def transform_hetero_data_for_rec_sys(data, to_undirected=True, random_data_split=True):
    if to_undirected:
        data = ToUndirected()(data)
    if random_data_split:
        transform = RandomLinkSplit(
            num_val=0.1,
            num_test=0.0,
            is_undirected=True,
            neg_sampling_ratio=0.0,
            edge_types=[("employee", "workedIn", "project")],
            rev_edge_types=[("project", "rev_workedIn", "employee")]
        )
        train_data, val_data, test_data = transform(data)
        return train_data, val_data, test_data
    else:
        return data

model = HeteroGNN(input_channels, hidden_channels, out_channels, data, conv_layer=SAGEConv)

In [None]:
def generate_negative_samples(pos_edge_index, num_nodes, num_neg_samples):
    """
    Generate negative samples.

    :param pos_edge_index: Tensor of shape [2, num_edges] representing positive edges.
    :param num_nodes: Tuple (num_employees, num_projects) - the number of nodes in each category.
    :param num_neg_samples: The number of negative samples to generate.
    :return: Tensor of negative samples.
    

    This step is crucial for effectively training your model, 
    especially since the quality of the negative samples can significantly influence the model's performance.

    - Positive Samples:
        Positive samples are straightforward since they are the existing edges in your graph. 
        For instance, if you're recommending projects to employees, 
        your positive samples are the `('employee', 'workedIn', 'project')` edges.

    - Negative Samples:
        Generating negative samples is more challenging. 
        You need to create pairs of nodes that do not have an existing edge between them. 
        It's important that these negative samples are realistic; 
        that is, they should be plausible but non-existent edges.

    Here's a simple approach to generate negative samples:
    
        1. Randomly select an 'employee' node.
        2. Randomly select a 'project' or 'position' node.
        3. Check if this pair forms an edge in your graph. If not, it's a valid negative sample.
        4. Repeat this process until you have the desired number of negative samples.
    
    
    *Examples:*
     
    ```python
    # Example usage
    num_employees = 10000
    num_projects = 4371  # or num_positions
    num_neg_samples = 10000  # This can be adjusted

    # Assuming you have your positive edge index for ('employee', 'workedIn', 'project')
    pos_edge_index = data[('employee', 'workedIn', 'project')].edge_index
    neg_edge_index = generate_negative_samples(pos_edge_index, (num_employees, num_projects), num_neg_samples)
    ```
    """
    neg_samples = []
    while len(neg_samples) < num_neg_samples:
        # Randomly select an 'employee' and a 'project/position'
        employee = random.randint(0, num_nodes[0] - 1)
        project = random.randint(0, num_nodes[1] - 1)

        # Check if this is a negative sample
        if not torch.any((pos_edge_index[0] == employee) & (pos_edge_index[1] == project)):
            neg_samples.append([employee, project])

    return torch.tensor(neg_samples).t().to(pos_edge_index.device)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def compute_auc(pos_scores, neg_scores):
    labels = torch.cat([torch.ones(pos_scores.size(0)), torch.zeros(neg_scores.size(0))])
    scores = torch.cat([pos_scores, neg_scores])
    return roc_auc_score(labels.detach().cpu(), scores.detach().cpu())

def compute_precision_recall_f1(pos_scores, neg_scores, threshold=0.5):
    scores = torch.cat([pos_scores, neg_scores])
    predictions = (scores > threshold).float()
    labels = torch.cat([torch.ones(pos_scores.size(0)), torch.zeros(neg_scores.size(0))])
    
    precision = precision_score(labels.detach().cpu(), predictions.detach().cpu())
    recall = recall_score(labels.detach().cpu(), predictions.detach().cpu())
    f1 = f1_score(labels.detach().cpu(), predictions.detach().cpu())

    return precision, recall, f1

def hit_at_k(pos_scores, neg_scores, k=10):
    # Combine scores and sort them
    combined_scores = torch.cat([pos_scores, neg_scores])
    _, indices = combined_scores.topk(k)

    # Calculate hits
    hits = (indices < pos_scores.size(0)).float().sum().item()
    return hits / k


In [None]:
# # Loss function
# margin = 0.5
# loss_function = torch.nn.MarginRankingLoss(margin=margin)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(model:torch.nn.Module, train_data:HeteroData, val_data:HeteroData=None,
          target_edge_type=None, loss_function="MarginRankingLoss", optimizer="Adam", 
          num_epochs=3, num_neg_samples=None, loss_function_kwargs={"margin": 0.5}, 
          lr=0.001, k_to_hit=5, device=None, val_per_epochs=10):
    
    h, r, t = target_edge_type
    model.train()
    
    # ensure device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data = train_data.to(device)
    model = model.to(device)
    
    # loss function
    if loss_function == "MarginRankingLoss":
        loss_function = torch.nn.MarginRankingLoss(**loss_function_kwargs)
        
    # optimizer
    if optimizer == "Adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    if num_neg_samples is None:
        num_neg_samples = train_data[target_edge_type].num_edges

    for epoch in range(num_epochs):
        optimizer.zero_grad()

        # Forward pass through GNN
        node_embeddings = model(train_data.x_dict, train_data.edge_index_dict)

        # Assume you have a function to generate positive and negative samples
        pos_samples = train_data[target_edge_type].edge_index
        neg_samples = generate_negative_samples(
            pos_samples, 
            (train_data[h].num_nodes, train_data[t].num_nodes), 
            num_neg_samples=num_neg_samples
        )

        # Compute scores for positive and negative samples
        # Example: Using dot product to compute scores
        pos_scores = (node_embeddings[h][pos_samples[0]] * node_embeddings[t][pos_samples[1]]).sum(dim=1)
        neg_scores = (node_embeddings[h][neg_samples[0]] * node_embeddings[t][neg_samples[1]]).sum(dim=1)

        # Target tensor for MarginRankingLoss
        target = torch.ones(pos_scores.size(), device=device)

        # Compute loss
        loss = loss_function(pos_scores, neg_scores, target)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        
        # Evaluate on validation set
        if epoch + 1 % val_per_epochs == 0:
            if val_data is not None:
                model.eval()
                with torch.no_grad():
                    # Compute embeddings for validation set
                    val_embeddings = model(val_data.x_dict, val_data.edge_index_dict)

                    # Generate positive and negative samples for validation
                    val_pos_samples = val_data[target_edge_type].edge_index
                    val_neg_samples = generate_negative_samples(
                        val_pos_samples,
                        (val_data[h].num_nodes, val_data[t].num_nodes),
                        num_neg_samples=val_data[target_edge_type].num_edges
                    )

                    # Compute scores for validation samples
                    val_pos_scores = (val_embeddings[h][val_pos_samples[0]] * val_embeddings[t][val_pos_samples[1]]).sum(dim=1)
                    val_neg_scores = (val_embeddings[h][val_neg_samples[0]] * val_embeddings[t][val_neg_samples[1]]).sum(dim=1)

                    # Calculate metrics
                    val_auc = compute_auc(val_pos_scores, val_neg_scores)
                    val_precision, val_recall, val_f1 = compute_precision_recall_f1(val_pos_scores, val_neg_scores)
                    val_hit_k = hit_at_k(val_pos_scores, val_neg_scores, k=k_to_hit)  # You can adjust k
                    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.6f}, "
                        f"Val AUC: {val_auc:.4f}, Val Precision: {val_precision:.4f}, "
                        f"Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}, Val Hit@K: {val_hit_k:.4f}")
            else:
                print(f"Epoch [{epoch+1:03d}/{num_epochs}], Loss: {loss.item()}")


In [None]:
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.0,
    is_undirected=True,
    neg_sampling_ratio=0.0,
    edge_types=[("employee", "workedIn", "project")],
    rev_edge_types=[("project", "rev_workedIn", "employee")]
)
train_data, val_data, test_data = transform(data)
train_data

HeteroData(
  employee={ x=[10000, 768] },
  project={ x=[4371, 768] },
  position={ x=[41, 768] },
  techStack={ x=[67, 768] },
  (employee, workedIn, project)={
    edge_index=[2, 35988],
    edge_label=[35988],
    edge_label_index=[2, 35988],
  },
  (employee, workedAs, position)={ edge_index=[2, 38513] },
  (employee, hasTechStack, techStack)={ edge_index=[2, 2966] },
  (project, needPosition, position)={ edge_index=[2, 35641] },
  (project, needTechstack, techStack)={ edge_index=[2, 37316] },
  (position, needTechstack, techStack)={ edge_index=[2, 2665] },
  (project, rev_workedIn, employee)={ edge_index=[2, 35988] },
  (position, rev_workedAs, employee)={ edge_index=[2, 38513] },
  (techStack, rev_hasTechStack, employee)={ edge_index=[2, 2966] },
  (position, rev_needPosition, project)={ edge_index=[2, 35641] },
  (techStack, rev_needTechstack, project)={ edge_index=[2, 37316] },
  (techStack, rev_needTechstack, position)={ edge_index=[2, 2665] },
  (employee, rev_rev_workedIn, 

In [None]:
train(model, train_data, val_data, ("employee", "workedIn", "project"), num_epochs=100)

Epoch [1/100], Loss: 2.343534, Val AUC: 0.4826, Val Precision: 0.5003, Val Recall: 0.7036, Val F1: 0.5848, Val Hit@K: 0.0000
Epoch [11/100], Loss: 1.890876, Val AUC: 0.5188, Val Precision: 0.7833, Val Recall: 0.0026, Val F1: 0.0052, Val Hit@K: 0.8000
Epoch [21/100], Loss: 0.507581, Val AUC: 0.5840, Val Precision: 0.5714, Val Recall: 0.0001, Val F1: 0.0002, Val Hit@K: 0.6000
Epoch [31/100], Loss: 0.419804, Val AUC: 0.6302, Val Precision: 0.9273, Val Recall: 0.0014, Val F1: 0.0028, Val Hit@K: 1.0000
Epoch [41/100], Loss: 0.367059, Val AUC: 0.6754, Val Precision: 0.8925, Val Recall: 0.0046, Val F1: 0.0092, Val Hit@K: 1.0000
Epoch [51/100], Loss: 0.343199, Val AUC: 0.7017, Val Precision: 0.8908, Val Recall: 0.0029, Val F1: 0.0059, Val Hit@K: 1.0000
Epoch [61/100], Loss: 0.309591, Val AUC: 0.7357, Val Precision: 0.9302, Val Recall: 0.0011, Val F1: 0.0022, Val Hit@K: 1.0000
Epoch [71/100], Loss: 0.281700, Val AUC: 0.7613, Val Precision: 0.9519, Val Recall: 0.0049, Val F1: 0.0098, Val Hit@K: 

In [14]:
torch.save(model.to("cpu").state_dict(), "./checkpoints/model.pt")

In [21]:
df = db.fetch_data("Match (e:employee) return e.name as name ORDER BY e.name")

In [25]:
(df["name"] == "MT83208").sum()

1