# Train

Use this notebook to train a model that solves our regression task and to upload the trained model artefact to AWS S3.

## Imports

In [None]:
import os
from urllib.request import urlopen

import aporia
import boto3 as aws
import joblib
import pandas as pd
import seaborn as sns
from numpy import floating ,ndarray
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor

## Configuration

In [None]:
# Apply seaborn's default theme to the plots drawn later in this notebook.
sns.set()

# S3 bucket that holds this project's datasets and trained model artefacts.
AWS_S3_PROJECT_BUCKET = "bodywork-pipeline-with-aporia-monitoring"

# URL of the training dataset, addressed through the bucket's virtual-hosted
# endpoint. Derived from the bucket name so the two cannot drift apart.
# NOTE(review): the download uses plain HTTP - confirm whether HTTPS should be
# preferred for this bucket.
DATASET_URL = (
    f"http://{AWS_S3_PROJECT_BUCKET}"
    ".s3.eu-west-2.amazonaws.com/datasets/dataset_t0.csv"
)

## Load Data from Cloud Object Storage

In [None]:
# Download the CSV and parse it into a DataFrame. The context manager closes
# the HTTP response as soon as parsing is done, instead of leaving the
# connection open for the lifetime of the kernel.
with urlopen(DATASET_URL) as response:
    dataset = pd.read_csv(response)

dataset

## Data Preparation

Split labels from features and process categorical features.

In [None]:
# Integer code assigned to each level of the categorical feature F_2.
category_to_integer_map = {"c0": 0, "c1": 1, "c2": 2}


def preprocess(df):
    """Return a copy of ``df`` with the ``F_2`` categories integer-encoded.

    Every value in the ``F_2`` column is looked up in
    ``category_to_integer_map``; a value that is missing from the map raises
    ``KeyError``. The frame passed in is left unmodified.
    """
    encoded_f2 = df["F_2"].apply(category_to_integer_map.__getitem__)
    return df.assign(F_2=encoded_f2)

# Features: F_1 together with the still-unencoded categorical F_2.
feature_columns = ["F_1", "F_2"]
X = dataset[feature_columns]

# Regression target.
y = dataset["y"]

X

## Split Data into Train and Test Subsets

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the categorical feature
# keeps the category proportions roughly the same in both subsets, and the
# fixed seed makes the split repeatable.
split_options = dict(
    test_size=0.2,
    stratify=dataset["F_2"].values,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Setup Model Metrics

We will use the Mean Absolute Percentage Error (MAPE) for this regression task.

In [None]:
def compute_metrics(y_true: ndarray, y_pred: ndarray) -> floating:
    """Print and return the Mean Absolute Percentage Error (MAPE).

    Args:
        y_true: Observed label values.
        y_pred: Predicted label values, in the same order as ``y_true``.

    Returns:
        The MAPE exactly as returned by
        ``sklearn.metrics.mean_absolute_percentage_error``, i.e. as a fraction
        (``0.05`` means 5%).
    """
    mape = mean_absolute_percentage_error(y_true, y_pred)
    # scikit-learn already reports MAPE as a fraction and the ``%`` format
    # spec multiplies by 100 itself, so the value must not be rescaled first
    # (dividing by 100 made a 5% error print as "0.05%").
    print(f"MAPE = {mape:.2%}")
    return mape

## Train Model

We will train a decision tree so that we can capture the non-linearities in the dataset. We will only use the default parameters, because the relationship between the features and the labels, when conditioned on the categorical feature, is linear and should be easy to capture.

In [None]:
# Every hyperparameter stays at its default. Only the seed is pinned:
# scikit-learn permutes the candidate features at each split, so ties between
# equally good splits could otherwise be broken differently from run to run,
# giving a different tree on each "Restart & Run All".
model = DecisionTreeRegressor(random_state=42)
model.fit(preprocess(X_train), y_train)

### Diagnostics

In [None]:
# Score the held-out rows and report the error metric.
y_test_pred = model.predict(preprocess(X_test))
compute_metrics(y_test, y_test_pred)

# Plot predictions against observed labels with a fitted regression line
# (drawn in semi-transparent red).
diagnostics = pd.DataFrame({"y_test": y_test, "y_test_pred": y_test_pred})
_ = sns.lmplot(
    data=diagnostics,
    x="y_test",
    y="y_test_pred",
    line_kws={"color": "red", "alpha": 0.5},
)

Not bad!

## Save Model to Cloud Object Storage

In [None]:
# persist trained model locally
joblib.dump(model, "model.joblib")

try:
    # upload trained model to AWS S3
    s3_client = aws.client("s3")
    s3_client.upload_file(
        "model.joblib",
        AWS_S3_PROJECT_BUCKET,
        "models/model.joblib",
    )
finally:
    # remove the local file even when the upload fails (bad credentials,
    # network error, ...), so a failed run leaves no stale artefact behind
    os.remove("model.joblib")

## Send Datasets to Aporia

The training and test datasets are sent to Aporia so that they can be used for monitoring live prediction performance.

In [None]:
# The "<...>" values are placeholders that must be replaced before this cell
# is run. Avoid committing a real token to version control.
aporia.init(token="<APORIA_TOKEN>", environment="training", verbose=True)

# Column types before preprocessing (F_2 is still a category string) ...
raw_input_schema = {"F_1": "numeric", "F_2": "string"}
# ... after preprocessing (F_2 has been integer-encoded) ...
feature_schema = {"F_1": "numeric", "F_2": "numeric"}
# ... and of the model output.
prediction_schema = {"y": "numeric"}

apr_model = aporia.create_model_version(
    model_id="<APORIA_MODEL_ID>",
    model_version="<APORIA_MODEL_VERSION>",
    model_type="regression",
    raw_inputs=raw_input_schema,
    features=feature_schema,
    predictions=prediction_schema,
)

# Training set: raw inputs, the preprocessed features derived from them, and
# the labels. All three frames share the row index of X_train.
apr_model.log_training_set(
    raw_inputs=X_train,
    features=preprocess(X_train),
    labels=y_train.to_frame(),
)

# Test set: same three frames plus the model's predictions for those rows.
# model.predict returns a bare array, so the predictions frame is given the
# row index of y_test explicitly; otherwise it would get a fresh 0..n-1 index
# that disagrees with the shuffled index carried by the other three frames.
# NOTE(review): whether Aporia matches rows by index or by position is not
# visible here - a shared index is correct in both cases.
apr_model.log_test_set(
    raw_inputs=X_test,
    features=preprocess(X_test),
    labels=y_test.to_frame(),
    predictions=pd.DataFrame({"y": y_test_pred}, index=y_test.index),
)