# Closest avg classification

Please visit the repository below for more information:
https://github.com/abeja-yuki/sagemaker_closest_avg_sample

For security reasons, some parts of my AWS account information have been replaced with placeholders.

In [None]:
# S3 bucket and key prefix used for the uploaded data sets and the training output.
# '<my bucket name>' is a placeholder and must be replaced with a real bucket name.
bucket = '<my bucket name>'
prefix = 'sagemaker/closest_avg'
 
# Define IAM role
import boto3
import re
from sagemaker import get_execution_role

# Role returned by the SageMaker SDK; it is handed to the Estimator further down.
role = get_execution_role()

Now we'll import the Python libraries we'll need.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import time
import json
import sagemaker.amazon.common as smac
import sagemaker
from sagemaker.predictor import csv_serializer, json_deserializer

---
## Data


In [None]:
# Synthetic data set: 1000 samples that cycle through five labels. Each feature
# is the label's average multiplied by uniform noise centred on 1.0 (+/- 0.25).
v = 0.5 * np.random.random(1000) - 0.25 + 1.0
labels = ['John Petrucci', 'Mike Mangini', 'Jordan Rudess', 'James LaBrie', 'John Myung']
avg = [350, 450, 780, 280, 100]

# Sample k belongs to class k % 5.
label_series = [labels[k % 5] for k in range(1000)]
feature_series = [avg[k % 5] * noise for k, noise in enumerate(v)]

df = pd.DataFrame({'label': label_series, 'feature': feature_series})

In [None]:
# Per-label sample count and mean feature value.
# The nested-dict renaming form ({'feature': {'count': len, 'mean': np.mean}}) was
# removed in pandas 1.0 and raises SpecificationError there. A list of aggregation
# names produces the same ('feature', 'count') / ('feature', 'mean') columns and
# works on both old and new pandas versions.
df.groupby('label').agg({'feature': ['count', 'mean']})

In [None]:
# Bare expression: the notebook renders the generated DataFrame as the cell output.
df

Now split the data, in order, into training (60%), validation (30%), and test (remaining 10%) sets.

In [None]:
# Split the rows, in order, into train / validation / test partitions.
train_data_rate = 0.6
evaluation_data_rate = 0.3

# Compute each boundary from its own rate and round to the nearest row.
# Truncating int(len(df) * (0.6 + 0.3)) is off by one: 0.6 + 0.3 evaluates to
# 0.8999999999999999 in binary floating point, so 1000 rows were split
# 600/299/101 instead of the intended 600/300/100.
n_rows = len(df)
train_end = int(round(n_rows * train_data_rate))
validation_end = train_end + int(round(n_rows * evaluation_data_rate))

train_data = df[:train_end]
validation_data = df[train_end:validation_end]
test_data = df[validation_end:]  # whatever remains after train + validation

print(len(train_data), len(validation_data), len(test_data))

Now, we'll serialize the datasets with pickle and upload them to S3 so that the training container can read them.  We'll start with the training data.

In [None]:
import pickle

# In-memory, file-like buffer holding the training frame pickled with protocol 0.
# A BytesIO created from initial bytes starts at position 0, ready to be read.
buf = io.BytesIO(pickle.dumps(train_data, 0))

In [None]:
# Upload the pickled training data to s3://<bucket>/<prefix>/train/<key>.
key = 'closest_avg_train.data'
# S3 object keys always use '/' as the separator. os.path.join would insert the
# local OS separator instead (a backslash on Windows), and the uploaded object
# would then no longer match the URI reported below, so build the key explicitly
# and reuse it for the URI.
train_object_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

Next we'll convert and upload the validation dataset.

In [None]:
import pickle

# In-memory, file-like buffer holding the validation frame pickled with protocol 0.
# A BytesIO created from initial bytes starts at position 0, ready to be read.
buf = io.BytesIO(pickle.dumps(validation_data, 0))

In [None]:
# Upload the pickled validation data to s3://<bucket>/<prefix>/validation/<key>.
key = 'closest_avg_validation.data'
# S3 object keys always use '/' as the separator; os.path.join would insert the
# local OS separator instead, so build the key explicitly and reuse it for the URI.
validation_object_key = '{}/validation/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(validation_object_key).upload_fileobj(buf)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_object_key)
# Report the validation location (the original message and variable both
# referred to the training data by mistake).
print('uploaded validation data location: {}'.format(s3_validation_data))

In [None]:
# ECR image URI of the closest-avg container that is handed to the Estimator.
# '{my account id}' is a placeholder and must be replaced with a real AWS account id;
# the region in the URI is fixed to us-east-2.
container = '{my account id}.dkr.ecr.us-east-2.amazonaws.com/sagemaker-closest-avg:latest'

In [None]:
# Session object passed to the estimator.
sess = sagemaker.Session()

# Estimator backed by the container image: one ml.c4.xlarge training instance,
# with output written under s3://<bucket>/<prefix>/output.
estimator_options = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)
linear = sagemaker.estimator.Estimator(container, role, **estimator_options)

# NOTE(review): these values look like they were carried over from a Linear
# Learner example (feature_dim=59, loss, num_models); confirm that the
# closest-avg container actually reads them.
hyperparameters = dict(
    feature_dim=59,
    mini_batch_size=100,
    predictor_type='regressor',
    epochs=10,
    num_models=32,
    loss='absolute_loss',
)
linear.set_hyperparameters(**hyperparameters)

# Start training with the two uploaded data sets as named input channels.
channels = {'training': s3_train_data, 'validation': s3_validation_data}
linear.fit(channels)

---
## Host

Now that we've trained the closest-average model on our data, let's create a model and deploy it to a hosted endpoint.

In [None]:
# Deploy the trained estimator to one ml.c4.xlarge instance; the returned
# predictor object is what the later cells configure and call.
linear_predictor = linear.deploy(
    instance_type='ml.c4.xlarge',
    initial_instance_count=1,
)

In [None]:
import pickle


def pickle_serializer(data):
    """Return *data* pickled with protocol 0, for use as the request payload."""
    return pickle.dumps(data, 0)


def pickle_deserializer(body, mime):
    """Unpickle and return the object read from the response stream *body*.

    *mime* (the response content type) is accepted for interface compatibility
    but is not inspected.

    NOTE(security): pickle.loads can execute arbitrary code. Only use this
    against an endpoint whose container you control and trust.
    """
    return pickle.loads(body.read())

In [None]:
# Make the predictor exchange pickled payloads with the endpoint: requests are
# pickled, responses are unpickled, and the body is declared as a binary stream.
linear_predictor.serializer = pickle_serializer
linear_predictor.deserializer = pickle_deserializer
linear_predictor.content_type = 'binary/octet-stream'

In [None]:
# Send only the feature column of the held-out rows to the endpoint.
result = linear_predictor.predict(test_data[['feature']])

# test_data is a slice of df, so assigning a column to it in place triggers
# pandas' SettingWithCopyWarning. assign() returns a new frame with the extra
# 'predicted' column and leaves df untouched.
test_data = test_data.assign(predicted=result)

In [None]:
# accuracy: one minus the fraction of test rows whose prediction differs from the label
mismatched = test_data['label'] != test_data['predicted']
1.0 - int(mismatched.sum()) / len(test_data)

In [None]:
# Delete the hosted endpoint behind linear_predictor, using a fresh Session.
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)