# Environment setup

In [None]:
# Neptune endpoint generated by the CloudFormation (CF) stack.
# Placeholder value: replace it with the endpoint copied from the stack.
# NOTE(review): no other cell shown here reads `endpoint` — confirm it is still needed.
endpoint = 'to be copied from CF'

In [None]:
# Project helper module; used below for the Neptune host / IAM settings,
# the export service URL and job-name generation.
import neptune_ml_utils as neptune_ml
# presumably verifies that Neptune ML is enabled for this cluster — confirm in neptune_ml_utils
neptune_ml.check_ml_enabled()

In [None]:
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.driver.aiohttp.transport import AiohttpTransport
from gremlin_python.process.traversal import *
from gremlin_python.process.graph_traversal import id_
import os

In [None]:
# Base S3 URI of the working bucket: the export output (neptune-export),
# processed data (preloading) and training output (training) paths are built from it.
s3_bucket_uri="s3://telcograph"

# Prepare the PoC for the Transductive mode




- Drop some edges: remove some links between existing users and cells so that they can be predicted by the model trained in transductive mode.
- Training is already done and the transductive model (`model_transductive`) is available behind the endpoint.
- The transductive training takes into account all user and cell properties as well as the `user_live_cell` edge properties.
- For user_0 and cell_62000, we dropped ALL edges.
- For user_1500 and cell_56500, we dropped ALL edges.
- For user_4570, we dropped three edges: user_live_cell_1678350, user_live_cell_734598, user_live_cell_487137.

### Select the users whose user_live_cell edges will be dropped

In [None]:
%%gremlin
g.V("user_0")
.out("user_live_cell")
.valueMap(true, "name")
.groupCount()
.unfold()
.order().by(values, desc)

In [None]:
%%gremlin
g.V('user_0').outE()

In [None]:
%%gremlin
g.V('user_0').bothE().where(otherV().hasId('cell_62000'))

In [None]:
#%%gremlin
#g.V('user_0').bothE().where(otherV().hasId('cell_62000')).drop()

In [None]:
%%gremlin
g.V("user_1500")
.out("user_live_cell")
.valueMap(true, "name")
.groupCount()
.unfold()
.order().by(values, desc)

In [None]:
%%gremlin
g.V('user_1500').bothE().where(otherV().hasId('cell_56500'))

In [None]:
%%gremlin
g.V()
.hasId("user_4570")
.outE()
.hasLabel("user_live_cell")
.inV()
.valueMap(true, "name")
.groupCount()
.unfold()
.order()
.by(values, desc)

In [None]:
%%gremlin
g.V('user_4570').bothE().where(otherV().hasId('cell_10570'))

In [None]:
%%gremlin
g.V('user_4570').outE('user_live_cell').hasId('user_live_cell_734598').drop()

In [None]:
#%%gremlin
#g.V('user_4570').outE('user_live_cell').hasId('user_live_cell_1678350').drop()

In [None]:
%%gremlin
g.V('user_4570').bothE().where(otherV().hasId('cell_10570'))

# Launch the export
## Frame the link prediction task in `export_params`

In [None]:
# Parameters of the Neptune export job: a property-graph export ("export-pg")
# limited to user / cell nodes and user_live_cell edges, with a link-prediction
# target declared on the user -[user_live_cell]-> cell edge type.
export_params = {
    "command": "export-pg",
    "params": {
        "endpoint": neptune_ml.get_host(),
        "profile": "neptune_ml",
        "useIamAuth": neptune_ml.get_iam(),
        "cloneCluster": False,
        "nodeLabels": ["user", "cell"],
        "edgeLabels": ["user_live_cell"],
    },
    # Export files are written under the working bucket.
    "outputS3Path": f"{s3_bucket_uri}/neptune-export",
    "additionalParams": {
        "neptune_ml": {
            "version": "v2.0",
            "targets": [
                {
                    "edge": ["user", "user_live_cell", "cell"],
                    "type": "link_prediction",
                    # presumably the train / validation / test proportions — confirm
                    "split_rate": [0.8, 0.1, 0.1],
                },
            ],
        },
    },
    "jobSize": "xlarge",
}
export_params  # last expression of the cell, so the notebook displays the dict

In [None]:
%%neptune_ml export start --export-url {neptune_ml.get_export_service_host()} --export-iam --wait --store-to export_results
${export_params}

In [None]:
# check the status
%neptune_ml export status --export-url {neptune_ml.get_export_service_host()} --export-iam --job-id 4660f4dd-bcdf-4abe-949a-692ddd54f593

# Data processing/Preparation of graph data for Training

NB: set `--s3-input-uri` below to the S3 folder of the latest export to use (a sub-folder of the `outputS3Path` given in `export_params`).

In [None]:
# The training_job_name can be set to a unique value below, otherwise one will be auto generated
training_job_name=neptune_ml.get_training_job_name('link-prediction')

# Arguments for `%neptune_ml dataprocessing start`.
# - The data-processing job id reuses training_job_name.
# - The input URI is built from s3_bucket_uri, like the export output path and
#   the processed-data URI, so changing the bucket in one place keeps every
#   location consistent.
# - The last folder of the input URI (20230518_072439) selects which export run
#   is processed; update it to the export you want to use before launching.
processing_params = f"""
--config-file-name training-data-configuration.json
--job-id {training_job_name} 
--instance-type ml.r5.16xlarge
--s3-input-uri {s3_bucket_uri}/neptune-export/20230518_072439
--s3-processed-uri {s3_bucket_uri}/preloading """

In [None]:
%neptune_ml dataprocessing start --wait --store-to processing_results {processing_params}

# Training 

<div style="background-color:#eeeeee; padding:20px; text-align:left; border-radius:10px; margin-top:10px; margin-bottom:10px; "><b>Information</b>: Link prediction is a more computationally complex model than classification or regression </div>

## Transductive Training steps 

In [None]:
# Check the ids of the training and processing jobs.
# This hard-coded value replaces any training_job_name set earlier in the
# notebook; the training and endpoint cells below read it.
# NOTE(review): the training cell also passes it as --data-processing-id, so a
# data-processing job with this same id presumably has to exist — confirm.
training_job_name = "my-train-job-ept-hypmodv9"
training_job_name

In [None]:
!curl -s "https://cluster.cluster-c2wgvyuhzgmx.eu-west-1.neptune.amazonaws.com:8182/ml/dataprocessing/"

In [None]:
!curl -s "https://cluster.cluster-c2wgvyuhzgmx.eu-west-1.neptune.amazonaws.com:8182/ml/dataprocessing/link-prediction-1684928671"

In [None]:
# Arguments for `%neptune_ml training start`, one option per line.
# The leading empty item reproduces the newline the argument string starts with.
training_params = "\n".join((
    "",
    f"--job-id {training_job_name}",
    # The data-processing job is looked up under the same id as the training job.
    f"--data-processing-id {training_job_name}",
    "--instance-type ml.g4dn.16xlarge",
    f"--s3-output-uri {s3_bucket_uri}/training",
    # NOTE(review): --max-hpo-parallel (3) exceeds --max-hpo-number (1) — confirm this is intended.
    "--max-hpo-number 1",
    "--max-hpo-parallel 3 ",
))

In [None]:
training_params

In [None]:
%neptune_ml training start --wait --store-to training_results {training_params}

In [None]:
training_params

In [None]:
%neptune_ml training status --job-id my-train-job-ept-hypmodv9

# Inference 

## Endpoint creation 

In [None]:
# Arguments for `%neptune_ml endpoint create`: the endpoint id and the training
# job it is created from both reuse training_job_name.
# The leading empty item reproduces the newline the argument string starts with.
endpoint_params = "\n".join((
    "",
    f"--id {training_job_name}",
    f"--model-training-job-id {training_job_name}",
))
endpoint_params

In [None]:
%neptune_ml endpoint create --wait --store-to endpoint_results {endpoint_params}

In [None]:
endpoint_transductive=endpoint_results['endpoint']['name']

In [None]:
# Hard-coded endpoint name (presumably an endpoint that is already deployed);
# when the cells run in order it replaces the name read from endpoint_results.
endpoint_transductive ='my-train-2023-08-29-10-12-7520000-endpoint'

### reminder

- user_0 and cell_62000: ALL edges dropped

- user_1500 and cell_56500: ALL edges dropped

- user_4570: three edges to cell_10570 dropped

In [None]:
import boto3
import json
import time

def _build_link_prediction_request(vertex_id, edge_type, head_node_type, tail_node_type, exclude_flag, topk):
    """Build the JSON-serialisable request body for one link-prediction query."""
    if tail_node_type:
        # The known vertex is the head of the edge; candidate tails are predicted.
        mode, node_type_key, node_type, vertex_key = (
            "predict_tail", "tailNodeType", tail_node_type, "headNodeId")
    else:
        # The known vertex is the tail of the edge; candidate heads are predicted.
        mode, node_type_key, node_type, vertex_key = (
            "predict_head", "headNodeType", head_node_type, "tailNodeId")
    return {
        "version": "v1",
        "mode": mode,
        "data": {
            "globalParameters": {
                "topk": topk,
                "edgeType": edge_type,
                node_type_key: node_type,
                "excludeFlag": exclude_flag,
            },
            "edges": [{vertex_key: vertex_id}],
        },
    }


def invoke_sagemaker_endpoint(endpoint_name, vertex_id, edge_type, head_node_type=None, tail_node_type=None,
    exclude_flag='null', topk=10, response_format='application/json', region='eu-west-1', client=None):
    """Query a SageMaker link-prediction endpoint for one vertex and print the predictions.

    When ``tail_node_type`` is truthy the request uses mode ``predict_tail`` with
    ``vertex_id`` as the head node; otherwise it uses mode ``predict_head`` with
    ``vertex_id`` as the tail node. If both node types are given,
    ``tail_node_type`` wins.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        vertex_id: Id of the known vertex of the edge to complete.
        edge_type: Edge type sent as ``edgeType``.
        head_node_type: Node type sent as ``headNodeType`` in ``predict_head`` mode.
            NOTE(review): if both node types are left unset the request carries
            ``headNodeType: null`` — confirm whether the endpoint accepts that.
        tail_node_type: Node type sent as ``tailNodeType`` in ``predict_tail`` mode.
        exclude_flag: Value sent as ``excludeFlag``. The default is the *string*
            ``'null'``, not ``None`` — presumably what the endpoint expects; confirm.
        topk: Value sent as ``topk``.
        response_format: Value of the ``Accept`` header of the request.
        region: AWS region used when a client has to be created.
        client: Optional object exposing ``invoke_endpoint`` (for example an
            existing ``sagemaker-runtime`` boto3 client) to reuse across calls.
            When omitted, a new boto3 client is created for ``region``.

    Returns:
        The list of ``inferredValue`` entries of the first edge of the response,
        in response order; an empty list when the response holds no edge or no
        ``mlResults``. The values and the elapsed time are also printed.

    Raises:
        KeyError: If the decoded response has no ``"output"`` key or a result
            has no ``"inferredValue"`` key.
    """
    request = _build_link_prediction_request(
        vertex_id, edge_type, head_node_type, tail_node_type, exclude_flag, topk)
    if client is None:
        client = boto3.client('sagemaker-runtime', region_name=region)
    payload = json.dumps(request)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    tic = time.perf_counter()
    response = client.invoke_endpoint(EndpointName=endpoint_name,
                                      ContentType='application/json',
                                      Accept=response_format,
                                      Body=payload)
    duration = time.perf_counter() - tic

    print()
    output = json.loads(response['Body'].read().decode())["output"]
    # Only one edge is sent, so only the first edge of the response is read;
    # an empty or missing list yields no predictions instead of an IndexError.
    edges = output.get("edges") or []
    ml_results = edges[0].get("mlResults", []) if edges else []
    inferred_values = [result['inferredValue'] for result in ml_results]
    for value in inferred_values:
        print(value)
    print("Inference took {} milliseconds".format(duration*1000))
    return inferred_values

<img src="attachment:2a0d7696-5e11-42af-a56f-f7acc5d63572.png" alt="image.png" width="1000"/>

## Demo: how the GNN predicts which cells a given user will connect to, or which users will connect to a given cell

<div style="background-color:#eeeeee; padding:20px; text-align:left; border-radius:10px; margin-top:10px; margin-bottom:10px; "><b>Experimentation1</b>: GNN is going to predict that user_0 is connected to cell_62000</div>

In [None]:
%%gremlin
g.with("Neptune#ml.endpoint","${endpoint_transductive}")
.with("Neptune#ml.limit", 5)
.V("cell_62000")
.in("user_live_cell").with("Neptune#ml.prediction").hasLabel("user")

<div style="background-color:#eeeeee; padding:20px; text-align:left; border-radius:10px; margin-top:10px; margin-bottom:10px; "><b>Experimentation2</b>: GNN is going to predict that user_1500 is connected to cell_56500</div>

In [None]:
%%gremlin
g.with("Neptune#ml.endpoint","${endpoint_transductive}")
.with("Neptune#ml.limit",10)
.V("cell_56500")
.in("user_live_cell").with("Neptune#ml.prediction").hasLabel("user")

<div style="background-color:#eeeeee; padding:20px; text-align:left; border-radius:10px; margin-top:10px; margin-bottom:10px; "><b>Experimentation3</b>: GNN is going to predict that user_1500 is connected to cell_63500</div>

In [None]:
%%gremlin
g.with("Neptune#ml.endpoint","${endpoint_transductive}")
.with("Neptune#ml.limit",5)
.V("user_1500")
.out("user_live_cell").with("Neptune#ml.prediction").with("Neptune#ml.filterExistingEdges").hasLabel("cell")

# End-to-end architecture for multiple scenarios

<img src="attachment:f8b596d3-26cd-4127-aebb-132900d153db.png" alt="image.png" width="1000"/>

# Note on GNN evaluation 

- HITS@10 measures how often the model places the correct item within its top 10 recommendations.

- MR (mean rank) indicates how close to the top of the list the correct item is on average.

- SageMaker evaluates the model on the train and test sets.

- Results on test set 

    * "HITS at top 1 (HITS@1)": 0.4010819758391616,
    * "HITS at top 10 (HITS@10)": 0.9438622262173598,
    * "HITS at top 3 (HITS@3)": 0.6301810418539388,
    * "mean rank (MR)": 3.719502285632852,
