# Score Data

This module defines what will happen in 'stage-2-score-data':

- download dataset from cloud storage (AWS S3);
- load model;
- score dataset; and,
- save results to cloud storage (AWS S3).

## Imports

In [1]:
from urllib.request import urlopen

import boto3 as aws
import pandas as pd
from joblib import load
from sklearn.base import BaseEstimator

## Configuration

In [2]:
# Locations of the input dataset and the trained model in the project's S3
# bucket. HTTPS is used so the downloads cannot be tampered with in transit -
# this matters most for the model, which is a pickle-based joblib artefact.
DATA_URL = ('https://bodywork-jupyter-pipeline-project.s3.eu-west-2.amazonaws.com'
            '/data/iris_classification_data.csv')

MODEL_URL = ('https://bodywork-jupyter-pipeline-project.s3.eu-west-2.amazonaws.com'
             '/models/iris_tree_classifier.joblib')

# Destination bucket and file name used when persisting the scored dataset.
SCORED_DATA_AWS_BUCKET = 'bodywork-jupyter-pipeline-project'
SCORED_DATA_FILENAME = 'iris_classification_data_scored.csv'

## Download Data

In [3]:
print(f'downloading training data from {DATA_URL}')
# Open the response as a context manager so the HTTP connection is closed as
# soon as the CSV has been parsed, instead of being left open for the rest of
# the kernel session.
with urlopen(DATA_URL) as data_file:
    data = (
        pd.read_csv(data_file)
        # remove the 'species' column so it is not carried into the scored output
        .drop(columns=['species'])
    )
data

downloading training data from http://bodywork-jupyter-pipeline-project.s3.eu-west-2.amazonaws.com/data/iris_classification_data.csv


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Load Model

In [4]:
# NOTE(security): joblib.load unpickles the downloaded bytes, which can run
# arbitrary code - MODEL_URL must only ever point at a trusted location.
# The response is opened as a context manager so the HTTP connection is
# closed once the model has been deserialised.
with urlopen(MODEL_URL) as model_file:
    model = load(model_file)
model

## Score Data

In [5]:
# Columns passed to the model, in this order.
feature_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]

# Integer label -> class name; enumerate() assigns the labels 0, 1, 2.
label_to_classes_map = dict(enumerate(['setosa', 'versicolor', 'virginica']))

# Score the feature matrix and store the raw integer labels.
X = data.loc[:, feature_columns].to_numpy()
data['predicted_labels'] = model.predict(X)

# Translate each integer label into its class name.
data['predicted_class'] = data['predicted_labels'].apply(
    lambda label: label_to_classes_map[label]
)
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),predicted_labels,predicted_class
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


## Persist Scored Data to Cloud Storage

In [6]:
# Object key of the scored dataset within the bucket. It is built once so the
# upload call and the log message always refer to the same location.
scored_data_key = f'scored-data/{SCORED_DATA_FILENAME}'

# Write the scored data to a local CSV first; upload_file takes a file path.
data.to_csv(SCORED_DATA_FILENAME, index=False)
try:
    s3_client = aws.client('s3')
    s3_client.upload_file(
        SCORED_DATA_FILENAME,
        SCORED_DATA_AWS_BUCKET,
        scored_data_key
    )
    print(f'scored data saved to s3://{SCORED_DATA_AWS_BUCKET}'
          f'/{scored_data_key}')
except Exception as upload_error:
    # The upload stays best-effort (the cell does not raise), but the
    # underlying error is reported so non-credential failures are not hidden.
    print('could not upload scored data to S3 - check AWS credentials '
          f'({upload_error!r})')

scored data saved to s3://bodywork-jupyter-pipeline-project/iris_classification_data_scored.csv
