# III. Custom Model Training

This notebook should be uploaded and run on a SageMaker Notebook instance associated with an Amazon Neptune cluster. 

In [None]:
# Install or upgrade graph-notebook into the kernel's environment; it provides the
# graph_notebook.magics extension loaded further down.
# NOTE(review): the version is not pinned, so results may vary between runs —
# consider pinning once a known-good version is identified.
%pip install -U graph-notebook

In [None]:
# Project helper module, referenced throughout this notebook as `neptune_ml`
# (host lookup, export service host, training job naming).
import neptune_ml_utils as neptune_ml

# Presumably verifies that Neptune ML is enabled for the associated cluster —
# see neptune_ml_utils for the exact checks performed.
neptune_ml.check_ml_enabled()

In [None]:
# Base S3 location used for the export output, processed data and training artifacts.
# Replace the placeholder with your own bucket/prefix before running.
S3_URI = "s3://<REPLACE WITH YOUR S3 URI>"
# Strip every trailing slash (not just the last one) so that paths built as
# f"{S3_URI}/..." never contain an empty "//" segment.
S3_URI = S3_URI.rstrip("/")

In [None]:
# Register the graph-notebook magics (e.g. %status and %neptune_ml) used in the cells below.
%load_ext graph_notebook.magics

In [None]:
# Sanity checks before starting: print the graph-notebook version, the notebook's
# connection configuration, and the status of the configured endpoint.
%graph_notebook_version
%graph_notebook_config
%status

## 1. Configure Features

In [None]:
# Graph elements (and the properties of each) that the export job should extract.
export_filter = {
    "nodes": [
        {"label": "protein", "properties": ["length", "molWeight", "keywords", "esm2"]},
        {"label": "organism", "properties": ["name"]},
        {"label": "family", "properties": ["name"]},
    ],
    "edges": [
        {"label": "found_in"},
        {"label": "member_of"},
        {"label": "interacts_with", "properties": ["experimentalSystem", "throughput"]},
    ],
}

# Training target: link prediction on protein -> protein "interacts_with" edges.
ml_targets = [
    {
        "edge": ["protein", "interacts_with", "protein"],
        "type": "link_prediction",
        "split_rate": [0.8, 0.2, 0.0],
    }
]

# Node feature declarations, one entry per exported property.
# NOTE(review): "esm2" is declared with type "none" — presumably so the raw values
# are passed through for the custom model to consume; confirm against the
# Neptune ML feature-encoding documentation.
node_features = [
    {"node": "protein", "property": "length", "type": "numerical"},
    {"node": "protein", "property": "molWeight", "type": "numerical"},
    {"node": "protein", "property": "esm2", "type": "none"},
    {"node": "protein", "property": "keywords", "type": "category", "separator": ";"},
    {"node": "organism", "property": "name", "type": "category"},
    {"node": "family", "property": "name", "type": "category"},
]

# Both exported interaction-edge properties are treated as categorical features.
edge_features = [
    {
        "edge": ["protein", "interacts_with", "protein"],
        "property": edge_property,
        "type": "category",
    }
    for edge_property in ("experimentalSystem", "throughput")
]

# Full request body for the export job started in the next cell.
export_params = {
    "command": "export-pg",
    "params": {
        "endpoint": neptune_ml.get_host(),
        "profile": "neptune_ml",
        "cloneCluster": False,
        "filter": export_filter,
    },
    "outputS3Path": f"{S3_URI}/neptune-export",
    "additionalParams": {
        "neptune_ml": {
            "version": "v2.0",
            "targets": ml_targets,
            "features": node_features + edge_features,
        }
    },
    "jobSize": "medium",
}

In [None]:
%%neptune_ml export start --export-url {neptune_ml.get_export_service_host()} --export-iam --wait --store-to export_results
${export_params}

## 2. Submit Feature Processing Job


In [None]:
# Job identifier shared by the data processing, model training and endpoint steps below.
# Assign your own unique value here if you do not want the auto-generated one.
training_job_name = neptune_ml.get_training_job_name("link-prediction")
print(f"Training job is {training_job_name}")

In [None]:
# Command-line style arguments for the data processing job, one option per line.
# The input is the S3 location reported by the export step; the processed output
# goes under the "preloading" prefix of S3_URI.
processing_params = (
    "\n--config-file-name training-data-configuration.json"
    f"\n--job-id {training_job_name} "
    f"\n--s3-input-uri {export_results['outputS3Uri']} "
    f"\n--s3-processed-uri {S3_URI}/preloading "
    "\n--instance-type ml.m5.2xlarge\n"
)

In [None]:
# Start the data processing job using the arguments assembled in processing_params.
# --store-to keeps the command's result in the `processing_results` variable;
# --wait presumably blocks until the job has finished — confirm in the graph-notebook docs.
%neptune_ml dataprocessing start --wait --store-to processing_results {processing_params}

## 3. Submit Model Training Job

Upload the custom model scripts in `src/` (including the `train.py` and `transform.py` entry points) to S3.

In [None]:

s3_custom_source_location = f"""{str(S3_URI)}/training/source/{training_job_name}"""

!aws s3 cp --recursive src/ $s3_custom_source_location

In [None]:
import requests
import os

# Submit the custom model training job to the cluster's /ml/modeltraining API.
# NOTE(review): the request is sent unsigned; if IAM authentication is enabled on
# the cluster it will presumably need SigV4 signing — confirm for your setup.
response = requests.post(
    # Plain string formatting instead of os.path.join: the latter is a filesystem
    # API and would produce backslashes on Windows.
    url=f"https://{neptune_ml.get_host()}:8182/ml/modeltraining",
    headers={"Content-Type": "application/json"},
    json={
        # Register the training job under an explicit id. The endpoint cell in
        # section 4 passes --model-training-job-id {training_job_name}, so without
        # this the job would get an auto-generated id that the endpoint step cannot find.
        "id": training_job_name,
        "dataProcessingJobId": training_job_name,
        "trainModelS3Location": f"{S3_URI}/training/source/{training_job_name}/output",
        "trainingInstanceType": "ml.g5.2xlarge",
        "modelName": "custom",
        "maxHPONumberOfTrainingJobs": "12",
        "maxHPOParallelTrainingJobs": "4",
        "customModelTrainingParameters": {
            # Location of the scripts uploaded in the previous cell.
            "sourceS3DirectoryPath": s3_custom_source_location,
            "trainingEntryPointScript": "train.py",
            "transformEntryPointScript": "transform.py",
        },
    },
    # Do not let a stalled connection hang the notebook indefinitely.
    timeout=60,
)

# Show the service's reply first (it carries the error details on failure),
# then stop the notebook on any HTTP error instead of continuing silently.
# NOTE(review): this only submits the job; training presumably still has to
# finish before the inference endpoint in section 4 can be created.
print(response.text)
response.raise_for_status()

## 4. Create Inference Endpoint

In [None]:
# Arguments for endpoint creation: the endpoint id and the id of the training job
# whose model it should serve both reuse training_job_name.
endpoint_params = (
    f"\n--id {training_job_name}"
    f"\n--model-training-job-id {training_job_name}"
)

In [None]:
# Create the inference endpoint using the arguments in endpoint_params.
# --store-to keeps the command's result in `endpoint_results`, which the next cell reads;
# --wait presumably blocks until the endpoint is ready — confirm in the graph-notebook docs.
%neptune_ml endpoint create --wait --store-to endpoint_results {endpoint_params}

In [None]:
# Name of the inference endpoint, taken from the result stored by the create command.
endpoint = endpoint_results["endpoint"]["name"]