# Train Model

This module defines what will happen in 'stage-1-train-model':

- download dataset;
- pre-process data into features and labels;
- train machine learning model; and,
- save model to cloud storage (AWS S3).

## Imports

In [1]:
from datetime import datetime
from urllib.request import urlopen
from typing import Tuple

import boto3 as aws
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import BaseEstimator
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Configuration

In [2]:
# Where the trained model goes: the S3 bucket, and the file name used both for
# the local joblib dump and for the uploaded object.
TRAINED_MODEL_AWS_BUCKET = 'bodywork-jupyter-pipeline-project'
TRAINED_MODEL_FILENAME = 'iris_tree_classifier.joblib'

# Location of the iris training data (CSV).
# NOTE(review): the data is fetched over plain HTTP - consider HTTPS if the
# endpoint supports it.
DATA_URL = (
    'http://bodywork-jupyter-pipeline-project.s3.eu-west-2.amazonaws.com'
    '/data/iris_classification_data.csv'
)

## Download Data

In [3]:
# Fetch the CSV training data and parse it into a DataFrame. The HTTP response
# is opened in a `with` block so the connection is closed as soon as parsing
# finishes (or fails), instead of being left open for the life of the kernel.
print(f'downloading training data from {DATA_URL}')
with urlopen(DATA_URL) as data_file:
    data = pd.read_csv(data_file)
# Last expression of the cell - rendered as the cell's output.
data

downloading training data from http://bodywork-jupyter-pipeline-project.s3.eu-west-2.amazonaws.com/data/iris_classification_data.csv


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Pre-Process Data

In [4]:
# Integer code assigned to each species name.
classes_map = {'setosa': 0, 'versicolor': 1, 'virginica': 2}

# Target column and the measurement columns used as model inputs.
label_column = 'species'
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Feature array: one row per sample, columns in `feature_columns` order.
features = data[feature_columns].values

# Encode each species name via `classes_map`; a name missing from the map
# raises KeyError rather than being silently dropped.
labels = (
    data[label_column]
    .apply(lambda species_name: classes_map[species_name])
    .values
)

## Train Model

In [5]:
# Hold out 10% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so that it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, stratify=labels, random_state=42
)

print('training iris decision tree classifier')

# Decision tree with balanced class weights and a fixed seed for repeatability.
iris_tree_classifier = DecisionTreeClassifier(class_weight='balanced', random_state=42)

# Fit on the training split only; as the cell's last expression, the returned
# estimator is also displayed.
iris_tree_classifier.fit(X_train, y_train)

training iris decision tree classifier


DecisionTreeClassifier(class_weight='balanced', random_state=42)

### Model Metrics

In [6]:
# Score the classifier on the held-out test split.
y_test_predicted = iris_tree_classifier.predict(X_test)

# Printed as "accuracy" below, but note that this is balanced accuracy
# computed with adjusted=True, not plain accuracy.
accuracy = balanced_accuracy_score(y_test, y_test_predicted, adjusted=True)
f1 = f1_score(y_test, y_test_predicted, average='weighted')

# Stamp the report with the current local time, to the second.
time_now = datetime.now().isoformat(timespec='seconds')
print(
    f'iris model metrics @{time_now}',
    f' |-- accuracy = {accuracy:.3f}',
    f' |-- f1 = {f1:.3f}',
    sep='\n'
)

iris model metrics @2021-02-25T11:10:27
 |-- accuracy = 0.800
 |-- f1 = 0.867


## Persist Model to Cloud Storage

In [7]:
# Serialise the fitted classifier to a local joblib file, then copy that file
# to the S3 bucket under the 'models/' prefix.
dump(iris_tree_classifier, TRAINED_MODEL_FILENAME)
model_s3_key = f'models/{TRAINED_MODEL_FILENAME}'
try:
    s3_client = aws.client('s3')
    s3_client.upload_file(
        TRAINED_MODEL_FILENAME,
        TRAINED_MODEL_AWS_BUCKET,
        model_s3_key
    )
    # Report the key the object was actually written to, including the
    # 'models/' prefix (the previous message omitted it).
    print(f'model saved to s3://{TRAINED_MODEL_AWS_BUCKET}/{model_s3_key}')
except Exception as upload_error:
    # The upload stays best-effort (the cell does not raise), but the
    # underlying error is shown because a failure is not necessarily caused
    # by missing credentials.
    print(
        'could not upload model to S3 - check AWS credentials '
        f'({upload_error!r})'
    )

model saved to s3://bodywork-jupyter-pipeline-project/iris_tree_classifier.joblib
