In [2]:
# install seaborn library
# !pip install seaborn
# !pip install tensorflow
#import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# read the csv file 
salary_df = pd.read_csv('salary.csv')

In [4]:
salary_df

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891
5,2.9,56642
6,3.0,60150
7,3.2,54445
8,3.2,64445
9,3.7,57189


# PERFORM EXPLORATORY DATA ANALYSIS AND VISUALIZATION

In [5]:
# check if there are any Null values
sns.heatmap(salary_df.isnull(), yticklabels = False, cbar = False, cmap="Blues")


<matplotlib.axes._subplots.AxesSubplot at 0x7f01de0bda50>

In [6]:
# Check the dataframe info

salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  35 non-null     float64
 1   Salary           35 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 688.0 bytes


In [7]:
# Statistical summary of the dataframe

salary_df.describe()

Unnamed: 0,YearsExperience,Salary
count,35.0,35.0
mean,6.308571,83945.6
std,3.61861,32162.673003
min,1.1,37731.0
25%,3.45,57019.0
50%,5.3,81363.0
75%,9.25,113223.5
max,13.5,139465.0


In [8]:
salary_df.hist(bins = 30, figsize = (20,10), color = 'r')


array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f01dbc50b50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f01dbca16d0>]],
      dtype=object)

# CREATE TRAINING AND TESTING DATASET

In [9]:
X = salary_df[['YearsExperience']]
y = salary_df[['Salary']]


In [10]:
X

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2
5,2.9
6,3.0
7,3.2
8,3.2
9,3.7


In [11]:
y

Unnamed: 0,Salary
0,39343
1,46205
2,37731
3,43525
4,39891
5,56642
6,60150
7,54445
8,64445
9,57189


In [12]:
X.shape

(35, 1)

In [13]:
y.shape

(35, 1)

In [14]:
X = np.array(X).astype('float32')
y = np.array(y).astype('float32')

In [15]:
# Only take the numerical variables and scale them
X 

array([[ 1.1],
       [ 1.3],
       [ 1.5],
       [ 2. ],
       [ 2.2],
       [ 2.9],
       [ 3. ],
       [ 3.2],
       [ 3.2],
       [ 3.7],
       [ 3.9],
       [ 4. ],
       [ 4. ],
       [ 4.1],
       [ 4.5],
       [ 4.9],
       [ 5.1],
       [ 5.3],
       [ 5.9],
       [ 6. ],
       [ 6.8],
       [ 7.1],
       [ 7.9],
       [ 8.2],
       [ 8.7],
       [ 9. ],
       [ 9.5],
       [ 9.6],
       [10.3],
       [10.5],
       [11.2],
       [11.5],
       [12.3],
       [12.9],
       [13.5]], dtype=float32)

In [16]:
# split the data into test and train sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


# Train a Linear regresssion model using SKLearn (Note: SageMaker SDK / APIs are not used yet)

In [17]:
# using linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

regresssion_model_sklearn = LinearRegression(fit_intercept = True)
regresssion_model_sklearn.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
regresssion_model_sklearn_accuracy = regresssion_model_sklearn.score(X_test, y_test)
regresssion_model_sklearn_accuracy

0.9005891613075591

In [19]:
print('Linear Model Coefficient (m): ', regresssion_model_sklearn.coef_)
print('Linear Model Coefficient (b): ', regresssion_model_sklearn.intercept_)

Linear Model Coefficient (m):  [[8608.755]]
Linear Model Coefficient (b):  [29311.54]


# Evaluate Trained Model's performance (Note: SageMaker SDK/APIs are not used yet.)

In [20]:
y_predict = regresssion_model_sklearn.predict(X_test)

In [21]:
y_predict

array([[55137.805],
       [63746.56 ],
       [63746.56 ],
       [90433.7  ],
       [54276.93 ],
       [46529.047],
       [99903.33 ]], dtype=float32)

In [22]:
plt.scatter(X_train, y_train, color = 'gray')
plt.plot(X_train, regresssion_model_sklearn.predict(X_train), color = 'red')
plt.ylabel('Salary')
plt.xlabel('Number of Years of Experience')
plt.title('Salary vs. Years of Experience')

Text(0.5, 1, 'Salary vs. Years of Experience')

# TRAIN A LINEAR LEARNER MODEL USING SAGEMAKER

In [26]:
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python


import sagemaker
import boto3
from sagemaker import Session

# Let's create a Sagemaker session
sagemaker_session = sagemaker.Session()
bucket = Session().default_bucket()

# Let's define the S3 bucket and prefix that we want to use in this session
bucket = 'gf-sagemaker-practice-1' # bucket named 'sagemaker-practical' was created beforehand
prefix = 'linear_learner' # prefix is the subfolder within the bucket.

# Let's get the execution role for the notebook instance. 

role = sagemaker.get_execution_role()
print(role)

arn:aws:iam::667787699598:role/service-role/AmazonSageMaker-ExecutionRole-20221202T091553


In [27]:
X_train.shape

(28, 1)

In [28]:
y_train = y_train[:,0]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [29]:
y_train.shape

(28,)

In [30]:
import io # The io module allows for dealing with various types of I/O (text I/O, binary I/O and raw I/O). 
import numpy as np
import sagemaker.amazon.common as smac # sagemaker common libary

# Code below converts the data in numpy array format to RecordIO format
# This is the format required by Sagemaker Linear Learner 

buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
buf.seek(0) 
# When you write to in-memory byte arrays, it increments 1 every time you write to it, hence we need to reset to zero



0

In [31]:
import os

# Code to upload RecordIO data to S3
 
# Key refers to the name of the file    
key = 'linear-train-data'

# The following code uploads the data in record-io format to S3 bucket to be accessed later for training
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://gf-sagemaker-practice-1/linear_learner/train/linear-train-data


In [32]:
X_test.shape

(7, 1)

In [33]:
y_test.shape

(7, 1)

In [34]:
# Make sure that the target label is a vector
y_test = y_test[:,0]


In [35]:
# Code to upload RecordIO data to S3

buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_numpy_to_dense_tensor(buf, X_test, y_test)
buf.seek(0) 
# When you write to in-memory byte arrays, it increments 1 every time you write to it
# Let's reset that back to zero 


0

In [36]:
# Key refers to the name of the file    
key = 'linear-test-data'

# The following code uploads the data in record-io format to S3 bucket to be accessed later for training
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test', key)).upload_fileobj(buf)

# Let's print out the testing data location in s3
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_test_data))

uploaded training data location: s3://gf-sagemaker-practice-1/linear_learner/test/linear-test-data


In [37]:
# create an output placeholder in S3 bucket to store the linear learner output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('Training artifacts will be uploaded to: {}'.format(output_location))

Training artifacts will be uploaded to: s3://gf-sagemaker-practice-1/linear_learner/output


In [38]:
# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is to specify the name of the algorithm, that we want to use

# Let's obtain a reference to the linearLearner container image
# Note that all regression models are named estimators
# You don't have to specify (hardcode) the region, get_image_uri will get the current region name using boto3.Session

from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'linear-learner')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [39]:
# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training
# sagemaker_session = sagemaker.Session()

linear = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count = 1, 
                                       train_instance_type = 'ml.c4.xlarge',
                                       output_path = output_location,
                                       sagemaker_session = sagemaker_session)


# We can tune parameters like the number of features that we are passing in, type of predictor like 'regressor' or 'classifier', mini batch size, epochs
# Train 32 different versions of the model and will get the best out of them (built-in parameters optimization!)

linear.set_hyperparameters(feature_dim = 1,
                           predictor_type = 'regressor',
                           mini_batch_size = 5,
                           epochs = 5,
                           num_models = 2,
                           loss = 'absolute_loss')

# Now we are ready to pass in the training data from S3 to train the linear learner model

linear.fit({'train': s3_train_data})

# Let's see the progress using cloudwatch logs

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-12-02 02:57:02 Starting - Starting the training job...
2022-12-02 02:57:31 Starting - Preparing the instances for trainingProfilerReport-1669949821: InProgress
......
2022-12-02 02:58:31 Downloading - Downloading input data...
2022-12-02 02:58:51 Training - Downloading the training image.........
2022-12-02 03:00:27 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/02/2022 03:00:30 INFO 140178018830144] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_si

# DEPLOY AND TEST THE TRAINED LINEAR LEARNER MODEL 

In [40]:
# Deploying the model to perform inference 

linear_regressor = linear.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

-------------!

In [41]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Content type overrides the data that will be passed to the deployed model, since the deployed model expects data in text/csv format.

# Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content type

# Deserializer accepts two arguments, the result data and the response content type, and return a sequence of bytes in the specified content type.

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

# linear_regressor.content_type = 'text/csv'
linear_regressor.serializer = csv_serializer
linear_regressor.deserializer = json_deserializer

In [42]:
# making prediction on the test data

result = linear_regressor.predict(X_test)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [43]:
result # results are in Json format

{'predictions': [{'score': 75469.203125},
  {'score': 76624.421875},
  {'score': 76624.421875},
  {'score': 80205.578125},
  {'score': 75353.6875},
  {'score': 74313.9921875},
  {'score': 81476.3046875}]}

In [44]:
# Since the result is in json format, we access the scores by iterating through the scores in the predictions

predictions = np.array([r['score'] for r in result['predictions']])

In [45]:
predictions

array([75469.203125 , 76624.421875 , 76624.421875 , 80205.578125 ,
       75353.6875   , 74313.9921875, 81476.3046875])

In [46]:
predictions.shape

(7,)

In [47]:
# VISUALIZE TEST SET RESULTS
plt.scatter(X_test, y_test, color = 'gray')
plt.plot(X_test, predictions, color = 'red')
plt.xlabel('Years of Experience (Testing Dataset)')
plt.ylabel('salary')
plt.title('Salary vs. Years of Experience')

Text(0.5, 1, 'Salary vs. Years of Experience')

In [59]:
# Delete the end-point

linear_regressor.delete_endpoint()