# Import libraries and dataset

In [None]:
#!pip install -U numpy
#!pip install -U pandas

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the salary dataset; 'salary.csv' is resolved relative to the notebook's working directory.
salary_df = pd.read_csv('salary.csv')

In [None]:
salary_df

In [None]:
# Peek at the first 7 rows (positional slice from the top of the frame)
salary_df.iloc[:7]

In [None]:
# Peek at the last 7 rows (positional slice from the bottom of the frame)
salary_df.iloc[-7:]

In [None]:
# Highest value in the Salary column
salary_df['Salary'].max()

# Exploratory data analysis

In [None]:
# Visual missing-value check: draw the boolean "is null" mask as a heatmap,
# so any missing cell shows up as a differently coloured stripe.

null_mask = salary_df.isna()
sns.heatmap(null_mask, cmap="Blues", cbar=False, yticklabels=False)

In [None]:
# Statistical summary of the columns of salary_df

salary_df.describe()

In [None]:
# Years of experience on the row that holds the maximum salary.
# salary_df.max() reduces every column independently, so indexing it with
# 'YearsExperience' returns the largest experience value in the data, not the
# value paired with the top salary. Locate the row via idxmax() instead.

salary_df.loc[salary_df['Salary'].idxmax(), 'YearsExperience']

In [None]:
# Years of experience on the row that holds the minimum salary.
# salary_df.min() reduces every column independently, so indexing it with
# 'YearsExperience' returns the smallest experience value in the data, not the
# value paired with the lowest salary. Locate the row via idxmin() instead.

salary_df.loc[salary_df['Salary'].idxmin(), 'YearsExperience']

In [None]:
# Histogram of each column of salary_df (30 bins, red bars, on a 20x10 figure)
salary_df.hist(bins=30, figsize=(20, 10), color='r')

In [None]:
# Pairplot: seaborn grid of pairwise plots across the columns of salary_df

sns.pairplot(salary_df)

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heatmap
corr_matrix = salary_df.corr()
sns.heatmap(corr_matrix, annot=True)
# plt.show must be called; a bare `plt.show` only evaluates to the function
# object and the cell prints its repr instead of rendering the figure.
plt.show()

In [None]:
# Scatter of Salary against YearsExperience with seaborn's fitted regression line overlaid
sns.regplot(x='YearsExperience', y='Salary', data=salary_df)

# Create training and test dataset

In [None]:
# Feature (X) and target (y), each selected with a one-element column list so
# that both stay 2-D, single-column DataFrames rather than 1-D Series.
X = salary_df.loc[:, ['YearsExperience']]
y = salary_df.loc[:, ['Salary']]

In [None]:
# Convert both selections to float32 NumPy arrays in a single step
X = np.array(X, dtype='float32')
y = np.array(y, dtype='float32')

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split, and every score, coefficient
# and plot derived from it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42
)

In [None]:
X_train.shape

# Train sklearn regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

In [None]:
# Ordinary linear regression; fit_intercept=True fits the intercept term b in addition to the slope m
reg = LinearRegression(fit_intercept=True)

In [None]:
# Fit the regression on the training split only; the test split is kept for evaluation
reg.fit(X_train, y_train)

In [None]:
# R^2 of the fitted model, evaluated on the held-out test split
reg.score(X_test, y_test)

In [None]:
# Fitted parameters of the line y = m*x + b:
# m is the slope (reg.coef_), b is the intercept (reg.intercept_)
print('Linear model coefficient (m)', reg.coef_)
print('Linear model coefficient (b)', reg.intercept_)

# Evaluate trained model

In [None]:
# Predicted salaries for the held-out test features
y_predict = reg.predict(X_test)

In [None]:
y_predict

In [None]:
# Training points (gray) with the fitted regression line (red) drawn on top
train_fit = reg.predict(X_train)
plt.scatter(X_train, y_train, color='gray')
plt.plot(X_train, train_fit, color='red')
plt.xlabel('Number of Years of Experience')
plt.ylabel('Salary')
plt.title('Salary vs. Year of Experience')

In [None]:
# Predicted salary for 5 years of experience.
# predict takes a 2-D (n_samples, n_features) input, hence [[5]]; the double
# index then pulls the single scalar out of the 2-D result (reg was fit on a
# 2-D, single-column y).
reg.predict([[5]])[0][0]

# Train a linear learner model using SageMaker

In [None]:
# Boto3 is the AWS SDK for Python
# Boto3 lets you write software that makes use of S3 and other AWS services

import sagemaker
import boto3

In [None]:
# SageMaker session, passed to the Estimator further down
sagemaker_session = sagemaker.Session()

# execution role returned by sagemaker.get_execution_role(), passed to the Estimator further down
role = sagemaker.get_execution_role()

# S3 destination: bucket name plus a prefix (just a subfolder within the bucket)
bucket = 'sagemaker-anderici'
prefix = 'linear_learner'

print(role)

In [None]:
X_train.shape

In [None]:
# Flatten the (n, 1) label column to a 1-D vector of length n.
# reshape(-1) is idempotent: re-running this cell on the already-flat array is
# a no-op, whereas y_train[:, 0] raises IndexError on the second run.
y_train = y_train.reshape(-1)

In [None]:
y_train.shape

In [None]:
import io # allows dealing with various types of I/O
import sagemaker.amazon.common as smac #sagemaker common library

# Serialize the training features and labels into an in-memory buffer in
# RecordIO format, the format the SageMaker linear learner is fed here

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
buf.seek(0) # rewind to the start so the upload below reads the buffer from its first byte

In [None]:
import os

# key refers to the file name
key = 'linear-train-data'

# S3 object keys always use '/' as the separator, so build the key explicitly
# rather than with os.path.join, which follows the local OS path separator.
# Building it once also keeps the uploaded key and the printed URI in sync.
train_key = '{}/train/{}'.format(prefix, key)

# upload the RecordIO-formatted buffer to the S3 bucket under that key
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)

# print out the training data location in s3
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
print('Uploaded training data location: {}'.format(s3_train_data))

In [None]:
# S3 URI (under the same bucket/prefix) handed to the Estimator as output_path;
# this only builds the string, nothing is created in S3 by this cell

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

In [None]:
# Obtain the URI of the LinearLearner container image for the current region.
# This image is what the Estimator below runs as the training container for
# the built-in algorithm.

from sagemaker.amazon.amazon_estimator import get_image_uri

# Earlier form of the lookup, superseded by image_uris.retrieve below:
# container = get_image_uri(boto3.Session().region_name, 'linear-learner')

from sagemaker.image_uris import retrieve

# region is taken from the default boto3 session
container = retrieve('linear-learner', boto3.Session().region_name)

In [None]:
# Estimator that runs the Linear Learner container as a training job

linear = sagemaker.estimator.Estimator(
    image_uri = container,
    role = role,
    instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    output_path = output_location,
    sagemaker_session = sagemaker_session,
    use_spot_instances = True,  # train on spot instances
    max_run = 300,              # timeout in seconds for training
    max_wait = 600,             # max waiting time for spot instances
)

# Tunable settings: number of input features, predictor type ('regressor' or
# 'classifier'), mini batch size, epochs, and so on.
# num_models = 32 trains 32 versions of the model and keeps the best of them
# (built-in parameter optimization).
linear_hyperparameters = {
    'feature_dim': 1,
    'predictor_type': 'regressor',
    'mini_batch_size': 5,
    'epochs': 50,
    'num_models': 32,
    'loss': 'absolute_loss',
}
linear.set_hyperparameters(**linear_hyperparameters)

In [None]:
# Fit the model: launch training with the 'train' channel pointing at the
# RecordIO data uploaded to S3 (s3_train_data)

linear.fit({'train': s3_train_data})

# Deploy and test the trained model

In [None]:
# Deploy the trained model behind an endpoint (one ml.m4.xlarge instance) for
# inference; the endpoint is deleted again in the last cell of the notebook.

linear_regressor = linear.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# The content type declares how the request payload sent to the deployed model is encoded; here the data is sent in text/csv format.

# Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content type

# Deserializer accepts two arguments, the result data and the response content type, and returns the deserialized object (with JSONDeserializer, the parsed JSON response).

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

# Responses are parsed from JSON; requests are encoded as CSV.
# (Setting the content type directly is no longer needed:
#  linear_regressor.content_type = 'text/csv')
linear_regressor.deserializer = JSONDeserializer()
linear_regressor.serializer = CSVSerializer()

In [None]:
# Send the test features to the deployed endpoint; they are CSV-encoded by the
# serializer set above, and the JSON response is parsed by the deserializer

result = linear_regressor.predict(X_test)
result

In [None]:
# The parsed JSON response keeps one record per input row under 'predictions';
# collect each record's 'score' into a NumPy array.

scores = [record['score'] for record in result['predictions']]
predictions = np.array(scores)
predictions

In [None]:
predictions.shape

In [None]:
# Flatten the (n, 1) label column to a 1-D vector of length n.
# reshape(-1) is idempotent: re-running this cell on the already-flat array is
# a no-op, whereas y_test[:, 0] raises IndexError on the second run.
y_test = y_test.reshape(-1)

In [None]:
y_test.shape

In [None]:
# Test points (gray) with the endpoint's predictions drawn as a red line
plt.scatter(X_test, y_test, color='gray')
plt.plot(X_test, predictions, color='red')
plt.ylabel('Salary')
plt.xlabel('Years of Experience (Testing Dataset)')
plt.title('Salary vs. Years of Experience')

In [None]:
# Delete the endpoint that linear.deploy(...) created earlier in the notebook

linear_regressor.delete_endpoint()