# Machine Learning Introduction - devbuildit.com

## Update notebook dependencies

In [None]:
!pip install --upgrade numexpr
!pip install --upgrade pandas

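**Restart the kernel** after the installs above finish, so that the upgraded packages are loaded before running the remaining cells.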

## Imports

In [None]:
import boto3
import sagemaker
import os
from sagemaker import image_uris
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

## Set S3 bucket location

In [None]:
# Region configured for the current boto3 session; printed below for reference.
my_region = boto3.session.Session().region_name

# <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
# The value below is only a placeholder and must be replaced before running on.
bucket_name = 'ml-intro-sagemaker-<AWD AccountID>'

print(my_region)

In [None]:
# S3 resource handle and the key prefix under which this notebook stores objects.
s3 = boto3.resource('s3')
prefix = 'xgboost-algo'

# Training output location: s3://<bucket_name>/<prefix>/output
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Import and visualise data

In [None]:
# Read ml_data.csv from the root of the bucket configured in `bucket_name`.
# The bucket name is taken from the variable instead of being hard-coded a
# second time, so it only has to be changed in one place.
model_data = pd.read_csv('s3://{}/ml_data.csv'.format(bucket_name), sep=',')

In [None]:
# Show the DataFrame loaded from ml_data.csv as the cell output.
model_data

In [None]:
# Line plot of the y column against the x column, in row order.
x_values = model_data['x']
y_values = model_data['y']
plt.plot(x_values, y_values)

In [None]:
# Keep only the y and x columns, with y placed first.
# NOTE(review): presumably y is the label; the SageMaker XGBoost CSV format is
# understood to read the label from the first column. Confirm against the docs.
model_data = model_data.loc[:, ['y', 'x']]

In [None]:
# Show the DataFrame again after the column selection (y first, then x).
model_data

## Split into training and testing data

In [None]:

print(model_data.shape)

# Shuffle all rows reproducibly (fixed random_state), then cut at 70 %: the
# first part becomes the training set and the remainder the test set.
# Positional slicing replaces np.split(), which, when handed a DataFrame,
# goes through DataFrame.swapaxes, deprecated in recent pandas releases.
shuffled = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data, test_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
print(train_data.shape, test_data.shape)

In [None]:
# Show the held-out test split as the cell output.
test_data

## Create CSV for training and testing

In [None]:
# Write the training split to a local CSV without index or header row, upload
# it to s3://<bucket_name>/<prefix>/train/train.csv, and create a CSV training
# input that points at the containing train/ prefix.
train_data.to_csv('train.csv', index=False, header=False)
# S3 keys always use '/', so the key is built explicitly rather than with
# os.path.join, whose separator depends on the local operating system.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [None]:
# Write the test split to a local CSV without index or header row, upload it
# to s3://<bucket_name>/<prefix>/test/test.csv, and create a CSV input that
# points at the containing test/ prefix.
test_data.to_csv('test.csv', index=False, header=False)
# S3 keys always use '/', so the key is built explicitly rather than with
# os.path.join, whose separator depends on the local operating system.
test_key = '{}/test/test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')
# Fix: this input previously pointed at the train/ prefix, so the uploaded
# test.csv was never used and validation ran on the training data.
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

## Container with algorithm

In [None]:
# Look up the image URI of the "xgboost" container, version 1.7-1, for the
# region stored in my_region, then display it as the cell output.
container = image_uris.retrieve(framework="xgboost", region=my_region, version="1.7-1")
container

## Set hyperparameters

In [None]:
# Hyperparameters handed to the XGBoost training job. All values are strings
# except num_round, which is an int. The short descriptions follow the XGBoost
# parameter documentation.
hyperparameters = dict(
    max_depth="5",                 # maximum depth of a tree
    eta="0.2",                     # step-size shrinkage (learning rate)
    gamma="4",                     # minimum loss reduction required to split
    min_child_weight="6",          # minimum sum of instance weight in a child
    subsample="0.7",               # fraction of training rows sampled
    objective="reg:squarederror",  # regression with squared error
    num_round=50,                  # number of boosting rounds
)

## Set up training job and train

In [None]:
# Configure the training job: one ml.m5.2xlarge instance with a 5 GB volume,
# running the container image selected above with the hyperparameters defined
# above, under the notebook's execution role. Output is written to output_path.
# Spot instances are requested, with max_run=300 and max_wait=600.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

In [None]:
# Start training: the 'train' channel receives s3_input_train and the
# 'validation' channel receives s3_input_test.
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(data_channels)

## Deploy model to an endpoint

In [None]:
# Deploy the trained estimator on a single ml.m5.large instance and keep the
# object returned by deploy() for later cells.
xgb_predictictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

In [None]:
# Show the name of the SageMaker endpoint created by the deploy step.
# Use this value to update the Lambda function's environment variable.
xgb_predictictor.endpoint_name

## Tidy up

### Destroy endpoint
### Delete training job record
### Empty buckets
### Delete CloudFormation stack
### Run `terraform destroy`