# Initialize the Notebook

In [None]:
import boto3
from sagemaker import get_execution_role
import pandas as pd
# S3 bucket and key prefix used by later cells for the training data and model output.
bucket = 'slytherins-test'
prefix = 'linear-learner'
# Execution role obtained from SageMaker; passed to the Estimator in the training cell.
role = get_execution_role()

# Preprocess the Data to make it ready for Classification

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the raw training CSV from the bucket configured at the top of the notebook,
# so that changing `bucket` redirects this read together with every other S3 access.
input_data = 's3://{}/Train.csv'.format(bucket)
data = pd.read_csv(input_data)
# NOTE: this preview is not rendered, because it is not the last expression of the cell.
data.head(n=10)

# Fill each missing Item_Weight with the mean weight of the row's own Item_Type.
# (The previous loop visited every type but always used the 'Fruits and Vegetables' mean.)
# Rows whose Item_Type is missing, or whose type has no known weights, stay NaN.
mean_weight_by_type = data.groupby('Item_Type')['Item_Weight'].transform('mean')
data['Item_Weight'] = data['Item_Weight'].fillna(mean_weight_by_type)

# Split the frame into categorical (object) and numeric columns.
# `cat_data` is modified in place by the cleaning steps that follow, so take an explicit
# copy: this avoids SettingWithCopyWarning and guarantees `data` itself is never touched.
cat_data = data.select_dtypes(object).copy()
num_data = data.select_dtypes(np.number)

# Fallback Outlet_Size for rows where it is missing, chosen by the row's Outlet_Type.
fallback_size_by_outlet_type = {
    'Grocery Store': 'Small',
    'Supermarket Type1': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium',
}
for outlet_type, fallback_size in fallback_size_by_outlet_type.items():
    size_missing = cat_data['Outlet_Size'].isna()
    is_outlet_type = cat_data['Outlet_Type'] == outlet_type
    cat_data.loc[size_missing & is_outlet_type, ['Outlet_Size']] = fallback_size

# Collapse the alternative spellings of Item_Fat_Content onto the two canonical values.
fat_content_aliases = {'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}
cat_data['Item_Fat_Content'] = cat_data['Item_Fat_Content'].replace(fat_content_aliases)

# Encoder and scaler instances; `le` is reused further down for the target labels.
le = LabelEncoder()
ss = StandardScaler()

# Integer-encode every categorical column (the encoder is refit for each column).
cat_data = cat_data.apply(le.fit_transform)

# Standardize the numeric frame, then the encoded categorical frame; the scaler is refit
# for each one and the original column names are restored on the resulting frames.
num_data, cat_data = [
    pd.DataFrame(ss.fit_transform(frame), columns=frame.columns)
    for frame in (num_data, cat_data)
]
final_data = pd.concat([num_data, cat_data], axis=1)

print('Data after cleaning: {}'.format(final_data.shape))

# Features: every processed column except the sales target.
# Target: the raw (unscaled) sales column taken from the original frame.
target_column = 'Item_Outlet_Sales'
X = final_data.drop(columns=[target_column])
y = data[[target_column]]

In [None]:
# Bucket the raw sales figures into four equal-width ranges labelled 'A' through 'D'.
sales_class_labels = ['A', 'B', 'C', 'D']
y_binned = pd.cut(y['Item_Outlet_Sales'], bins=len(sales_class_labels), labels=sales_class_labels)

In [None]:
# Refit the LabelEncoder created in the preprocessing cell on the binned sales classes.
# `fit` returns the encoder itself, so `temp` refers to the same object as `le`.
temp = le.fit(y_binned)

In [None]:
# Convert the class labels to integer codes with the encoder fitted above
# (presumably 0-3 for 'A'-'D' when every bin is populated - verify on the data).
y_final = temp.transform(y_binned)

# Saving the data to S3 in RecordIO Format

In [None]:
import io
import numpy as np
import sagemaker.amazon.common as smac

# Feature matrix and label vector as float32 NumPy arrays.
vectors = X.values.astype('float32')
labels = np.asarray(y_final).astype('float32')

# Serialize features and labels into an in-memory buffer, then rewind it so the
# upload below reads from the beginning.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

import boto3
import os

key = 'recordio-pb-data'
# S3 object keys always use '/' as the separator. Build the key once with an explicit '/'
# (not os.path.join, which follows the local OS convention) and reuse it for the URI,
# so the uploaded object and `s3_train_data` can never point at different locations.
train_object_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_object_key).upload_fileobj(buf)
s3_train_data = 's3://{}/{}'.format(bucket, train_object_key)
print('uploaded training data location: {}'.format(s3_train_data))

# Defining S3 Output Location

In [None]:
# S3 location under the configured bucket/prefix; passed to the Estimator as `output_path`.
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

# Initializing the Container

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Look up the 'linear-learner' algorithm image URI for the region of the current boto3 session.
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

# Starting the Training

In [None]:
import boto3
import sagemaker

sess = sagemaker.Session()

# Single-instance training job using the Linear Learner container resolved earlier;
# results are written under `output_location`.
linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)
# feature_dim has to equal the number of columns in the uploaded training matrix, so it is
# derived from `vectors` instead of being hard-coded. num_classes matches the four sales
# bins ('A'-'D') created when the target was binned.
linear.set_hyperparameters(feature_dim=vectors.shape[1],
                           predictor_type='multiclass_classifier',
                           mini_batch_size=100,
                           num_classes=4)

# Run training on the RecordIO data uploaded earlier, supplied as the 'train' channel.
linear.fit({'train': s3_train_data})

# Deploying the Model

In [None]:
# Deploy the trained estimator on one ml.m4.xlarge instance; the returned predictor is
# used for inference below and its endpoint is deleted in the final cell.
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

# Doing Inference

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send request payloads as CSV and parse the endpoint's responses as JSON.
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [None]:
# Quick check of the endpoint using the first row of the training feature matrix.
result = linear_predictor.predict(vectors[0])
print(result)

In [None]:
import numpy as np

# Score the whole feature matrix in 100 chunks and gather the predicted class of every row.
predicted_labels = []
for batch in np.array_split(vectors, 100):
    result = linear_predictor.predict(batch)
    predicted_labels.extend(record['predicted_label'] for record in result['predictions'])

predictions = np.array(predicted_labels)

In [None]:
import pandas as pd

# Cross-tabulate actual vs. predicted classes. Both come from the same rows the model
# was trained on (`labels` / `vectors`), so this is not a held-out evaluation.
pd.crosstab(labels, predictions, rownames=['actuals'], colnames=['predictions'])

# Getting Precision, Recall and F1 Scores

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Print weighted-average precision, recall and F1 (in that order), one value per line.
for metric in (precision_score, recall_score, f1_score):
    print(metric(labels, predictions, average='weighted'))

# Delete the Endpoint

In [None]:
import sagemaker
# Tear down the endpoint that backs `linear_predictor`.
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)