<h1>Linear Learner to Predict Runway Wait Time</h1>

In [137]:
%pip install -U sagemaker>=2.15

Note: you may need to restart the kernel to use updated packages.


In [138]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

In [139]:
# S3 locations for train and test datasets
train_input_bucket = 's3://<BUCKET_NAME>'
train_file = '/flights-csv/train-data/train-data.csv'
train_input = train_input_bucket + train_file

test_input_bucket = 's3://<BUCKET_NAME>'
test_file = '/flights-csv/test-data/test-data.csv'
test_input = test_input_bucket + test_file

<h2>Train Model</h2>

In [140]:
import sagemaker
import boto3
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
import time

# # initialize hyperparameters
hyperparameters = {
    "predictor_type": "regressor"
}

# set an output path where the trained model will be saved
output_bucket = '<BUCKET_NAME>'
prefix = 'test_model'
output_path = 's3://{}/{}/output'.format(output_bucket, prefix)

# this line automatically looks for the Linear Learner image URI and builds an Linear Learner container.
# specify the repo_version depending on your preference.
linear_container = sagemaker.image_uris.retrieve("linear-learner", boto3.Session().region_name)

job_name = 'LinearLearner-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=linear_container, 
                                          role=sagemaker.get_execution_role(),
                                          sagemaker_session=sagemaker_session,
                                          instance_count=1, 
                                          base_job_name=job_name,
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          hyperparameters=hyperparameters)

In [141]:
# define the data type and paths to the training and validation datasets
content_type = "text/csv"
train_input = TrainingInput(train_input, content_type=content_type)

In [142]:
# execute the Linear-Learner training job
estimator.fit({'train': train_input})

2021-03-14 06:42:15 Starting - Starting the training job...
2021-03-14 06:42:39 Starting - Launching requested ML instancesProfilerReport-1615704135: InProgress
......
2021-03-14 06:43:39 Starting - Preparing the instances for training...
2021-03-14 06:44:11 Downloading - Downloading input data...
2021-03-14 06:44:39 Training - Downloading the training image.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/14/2021 06:44:53 INFO 139715604961088] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_s

<h2>Deploy Model</h2>

In [143]:
print(estimator._current_job_name)

LinearLearner-2021-03-14-06-42-14-2021-03-14-06-42-15-584


In [159]:
from sagemaker.serializers import CSVSerializer
linear_predictor=estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer()
)

-------------------!

In [160]:
linear_predictor.endpoint_name

'LinearLearner-2021-03-14-06-42-14-2021-03-14-06-58-14-734'

<h2>Test Model</h2>

In [161]:
# download test dataset from S3
import boto3
import io
import pandas as pd
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=test_input_bucket.strip('s3://'), Key=test_file.strip('/'))
df = pd.read_csv(io.BytesIO(obj['Body'].read()), header=None)

In [163]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,4411,2,7020,80.06,23.0,80.4,0.1,0.0,130.0,17.8,...,0,0,0,0,0,0,0,0,0,0
1,740,21,4471,33.26,0.4,97.5,0.0,0.0,110.0,11.5,...,0,0,0,0,0,0,0,0,0,0
2,3045,16,5282,46.76,-12.1,22.9,0.0,0.0,190.0,13.0,...,0,0,0,0,0,0,0,0,0,0
3,5945,12,5909,23.0,-8.2,78.0,0.0,0.0,140.0,13.0,...,0,0,0,0,0,0,0,0,0,0
4,4783,26,3297,55.58,5.0,57.8,0.0,0.0,260.0,8.2,...,0,0,0,0,0,0,0,0,0,0


In [164]:
df.shape

(43469, 70)

In [170]:
# make predictions using Sagemaker endpoint
import numpy as np
import json

def predict(data, rows=1000):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = []
    for array in split_array:
        split_pred = json.loads(linear_predictor.predict(array))
        for pred in split_pred['predictions']:
            predictions.append(pred['score'])
    return predictions

In [171]:
predictions=predict(df.to_numpy()[:,1:])

In [None]:
predictions

In [None]:
import matplotlib.pyplot as plt

plt.hist(predictions)
plt.show()

In [172]:
# compare predictions to real labels and show accuracy metrics
from sklearn.metrics import mean_absolute_error
mean_absolute_error(df[0], predictions)

1747.0144135583691

In [173]:
from sklearn.metrics import explained_variance_score
explained_variance_score(df[0], predictions)

0.08799756459003472

In [174]:
from sklearn.metrics import r2_score
r2_score(df[0], predictions)

0.08795075342963954

<h3>Clean Up</h3>

In [175]:
# delete endpoint
linear_predictor.delete_endpoint()