In [None]:
# Run the Kedro CLI `install` command in a subshell
# (presumably installs the project's dependencies — confirm for your Kedro version).
!kedro install

In [None]:
import logging.config
from pathlib import Path

from IPython.core.magic import register_line_magic

# Holds the exception raised by a failed `reload_kedro` call; None until a reload fails.
startup_error = None
# Project root: the parent of the current working directory, resolved to an absolute path.
project_path = Path('..').resolve()

@register_line_magic
def reload_kedro(path=None, line=None):
    """Line magic which reloads all Kedro default variables.

    Loads the Kedro context, rebinds the module-level globals ``context`` and
    ``catalog``, and registers every line magic returned by
    ``collect_line_magic()``.

    Args:
        path: Location to load the context from. Any falsy value falls back
            to the module-level ``project_path``.
        line: Not used by this function.

    Raises:
        ImportError: If the Kedro modules cannot be imported (logged first).
        Exception: Any error raised while loading the context or registering
            the line magics is logged, stored in the module-level
            ``startup_error`` and re-raised unchanged.
    """
    global startup_error
    global context
    global catalog

    try:
        # Presumably imported for its import-time side effects; the name is
        # not referenced anywhere below.
        import kedro.config.default_logger
        from kedro.context import load_context
        from kedro.cli.jupyter import collect_line_magic
    except ImportError:
        logging.error(
            "Kedro appears not to be installed in your current environment "
            "or your current IPython session was not started in a valid Kedro project."
        )
        raise

    try:
        path = path or project_path
        logging.debug("Loading the context from %s", path)

        context = load_context(path)
        catalog = context.catalog
        logging.info("** Kedro project %s", context.project_name)
        logging.info("Defined global variable `context` and `catalog`")

        for line_magic in collect_line_magic():
            register_line_magic(line_magic)
            logging.info("Registered line magic `%s`", line_magic.__name__)

        # A successful reload supersedes any error recorded by an earlier attempt.
        startup_error = None
    except Exception as err:
        startup_error = err
        logging.exception(
            "Kedro's ipython session startup script failed:\n%s", err
        )
        # Bare `raise` re-raises the active exception with its original traceback.
        raise
        
# Run the reload once right away so the globals `context` and `catalog` exist for the cells below.
reload_kedro()

In [None]:
# Call `run()` on the Kedro context loaded above
# (presumably executes the project's default pipeline — confirm).
context.run()

In [None]:
# Load the "master_table" dataset through the context's data catalog.
df = context.catalog.load("master_table")
# Print the shape explicitly: only the last expression of a cell is displayed automatically.
print(df.shape)
df.head()

### Test Data Science pipeline

Split the data, then train and evaluate the model.

In [None]:
# Import the data science nodes used below to split the data and to train and evaluate the model
from kedro_tutorial.pipelines.data_science.nodes import (
    evaluate_model,
    split_data,
    train_model,
)

In [None]:
# Unpack the four values returned by `split_data`; the second argument carries the split parameters.
X_train, X_test, y_train, y_test = split_data(
    df,
    dict(test_size=0.2, random_state=3),
)

In [None]:
# Pass the training split to the project's `train_model` node and keep its result as `regressor`.
regressor = train_model(X_train, y_train)

In [None]:
# Pass the regressor and the test split to the project's `evaluate_model` node.
evaluate_model(regressor, X_test, y_test)

In [None]:
import joblib
import os

# Persist the regressor to `model.joblib` in the current working directory.
# (Joining the file name onto an empty path component was a no-op, so the name is passed directly.)
joblib.dump(regressor, "model.joblib")

## Deploy SageMaker SKLearn Estimator

<a class="anchor" id="train_sklearn"></a>
Training is very simple: just call `fit` on the Estimator! This will start a SageMaker Training job that will download the data for us, invoke our scikit-learn code (in the provided script file), and save any model artifacts that the script creates.

In [None]:
# Define the script
script_path = 'src/kedro_tutorial/pipelines/data_science/nodes.py'

!tail -n 50 $script_path

In [None]:
# Check that we can get to the training data
bucket_name = 'kedro-ap-southeast-2-691313291965'
master_table_path = 'data/03_primary/master_table.csv'
train_input = 's3://{}/{}'.format(bucket_name, master_table_path)

!aws s3 cp $train_input .
!head -3 master_table.csv

In [None]:
import sagemaker
from sagemaker import get_execution_role

# Get a SageMaker-compatible role used by this Notebook Instance;
# it is handed to the SKLearn estimator as `role` further down.
role = get_execution_role()

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Configure the SageMaker SKLearn estimator around the training script.
# NOTE: this binds the name `sklearn` to the estimator object, not to the scikit-learn package.
sklearn = SKLearn(
    role=role,
    entry_point=script_path,
    train_instance_type="ml.c4.xlarge",
    hyperparameters=dict(test_size=0.2, random_state=3),
)

In [None]:
# Fit the estimator, supplying the S3 location of the master table under the 'train' key.
sklearn.fit({'train': train_input})

## Using the trained model to make inference requests <a class="anchor" id="inference"></a>

### Deploy the model <a class="anchor" id="deploy"></a>

Deploying the model to SageMaker hosting just requires a `deploy` call on the fitted model. This call takes an instance count and instance type.

In [None]:
# Deploy the estimator on a single ml.m4.xlarge instance; the value returned by `deploy` is kept as `predictor`.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

In [None]:
from sklearn.metrics import r2_score

# NOTE(review): this scores the locally trained `regressor`, not the `predictor` returned by
# `sklearn.deploy(...)` — confirm whether the deployed endpoint was meant to be queried here.
y_pred = regressor.predict(X_test)
# R² of the predictions against the test targets; displayed as the cell's output.
r2_score(y_test, y_pred)

### Endpoint cleanup <a class="anchor" id="endpoint_cleanup"></a>

When you're done with the endpoint, you'll want to clean it up.

In [None]:
# Clean up the endpoint by calling `delete_endpoint` on the estimator it was deployed from.
sklearn.delete_endpoint()