# Example usage of SageMaker with the Kaggle Titanic dataset
## Description of the dataset

Variable|Definition|Key
:---|:---|:---
survival|Survival|0=No, 1=Yes
pclass|Ticket class|1=1st, 2=2nd, 3=3rd
sex|Sex|
Age|Age in years|
sibsp|# of siblings or spouses aboard the Titanic|
parch|# of parents or children aboard the Titanic|
ticket|Ticket number|
fare|Passenger fare|
cabin|Cabin number|
embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton

[More info on the dataset](https://www.kaggle.com/c/titanic/data)

## Some general functions

In [None]:
import boto3
import pandas as pd
import io
import os
import sagemaker.amazon.common as smac

# Bucket the input data is read from (default bucket of read_s3)
SOURCE_BUCKET = 'ml-hackathon-files'
# Bucket the prepared data is written to (default bucket of the write_* helpers).
# Change X into your team number before running the notebook.
BUCKET_NAME = 'ml-hackathon-teamX'

def read_s3(key, bucketName=SOURCE_BUCKET, sep=";"):
    """Download a delimited text object from S3 and parse it into a DataFrame.

    Args:
        key: Key of the object inside the bucket.
        bucketName: Name of the bucket to read from (defaults to SOURCE_BUCKET).
        sep: Column separator passed on to ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    # Only the low-level client is needed to fetch a single object.
    response = boto3.client('s3').get_object(Bucket=bucketName, Key=key)
    # Read the whole body into memory and hand pandas a file-like object.
    return pd.read_csv(io.BytesIO(response['Body'].read()), sep=sep)

def write_csv(dataframe, prefix, key, bucket=BUCKET_NAME):
    """Upload a DataFrame to S3 as a header-less, index-less CSV object.

    The object is stored under ``<prefix>/train/<key>`` in ``bucket`` with
    content type ``text/csv``.

    Args:
        dataframe: The ``pandas.DataFrame`` to serialise.
        prefix: First component of the object key.
        key: Last component of the object key.
        bucket: Name of the destination bucket (defaults to BUCKET_NAME).
    """
    # Serialise into an in-memory text buffer and keep the resulting string.
    with io.StringIO() as text_buffer:
        dataframe.to_csv(text_buffer, index=False, encoding='utf-8', header=False)
        payload = text_buffer.getvalue()

    object_key = os.path.join(prefix, 'train', key)
    target = boto3.resource('s3').Bucket(bucket).Object(object_key)
    target.put(Body=payload, ContentType='text/csv')

def write_recordio(X, y, prefix, key, bucket=BUCKET_NAME):
    """Serialise features and labels with ``smac.write_numpy_to_dense_tensor``
    and upload the result to S3.

    The object is stored under ``<prefix>/train/<key>`` in ``bucket`` and the
    resulting ``s3://`` location is printed.

    Args:
        X: Feature array passed to ``write_numpy_to_dense_tensor``.
        y: Label array passed to ``write_numpy_to_dense_tensor``.
        prefix: First component of the object key.
        key: Last component of the object key.
        bucket: Name of the destination bucket (defaults to BUCKET_NAME).
    """
    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, X, y)
    buf.seek(0)  # rewind so the upload starts at the first byte

    # Honour the ``bucket`` argument for both the upload and the reported URI.
    object_key = os.path.join(prefix, 'train', key)
    boto3.resource('s3').Bucket(bucket).Object(object_key).upload_fileobj(buf)
    s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
    print('uploaded training data location: {}'.format(s3_train_data))

def set_categorical_columns(df, categorical_columns):
    """Return a copy of ``df`` with the given columns cast to ``category``.

    Args:
        df: The ``pandas.DataFrame`` to convert; it is not modified.
        categorical_columns: Iterable of column names to cast.

    Returns:
        A new DataFrame in which every listed column has dtype ``category``.
    """
    # Work on a copy of the argument itself (not on a notebook global) so the
    # caller's frame stays untouched.
    df = df.copy()
    for col in categorical_columns:
        df[col] = df[col].astype("category")
    return df

## Read the data

In [None]:
# Load the comma-separated Titanic training set from the default source bucket
titanic = read_s3(key='titanic/train.csv', sep=',')
print(titanic.shape)  # dimensions of the dataset
titanic.head()  # last expression of the cell: displays the first rows

## Prepare the data

1. Select the columns that you would like to use
2. Convert them into NumPy arrays
3. Write the prepared data to S3

In [None]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder

# Choose the variables you want to use in your model
train_selection = titanic[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked"]].copy()
train_selection = train_selection.dropna() # remove rows with missing values
print(train_selection.shape)

# Select outcome and predictor variables
y = train_selection["Survived"] # the value we want to predict
X = train_selection.drop("Survived", axis=1) # The values used to train the model for prediction

# Convert the predictor variables into a numerical array; DictVectorizer
# one-hot encodes the string columns and passes numeric columns through.
# `dv` is kept so the same transformation can be applied at prediction time.
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(X.to_dict(orient='records'))

# RecordIO needs the input data to be floats.
# This is a plain dtype cast: labels keep their 0/1 meaning and features keep
# their values, so they match what dv.transform() produces for predictions.
y = np.asarray(y, dtype='float32')
X = np.asarray(X, dtype='float32')

# Write the preprocessed data back to S3
prefix = "titanic"
key = 'titanic'
write_recordio(X, y, prefix, key)

## Create Linear Learner model
(the red output is expected)

In [None]:
import boto3
import sagemaker

# S3 locations: where the training data is read from and where the model
# artifacts are written to.
s3_train_data = 's3://{}/{}/train/{}'.format(BUCKET_NAME, prefix, key)
output_location = 's3://{}/{}/output'.format(BUCKET_NAME, prefix)
print("The model will be create using " + s3_train_data)

# Linear Learner container image for each region listed here; the lookup
# below raises KeyError when the session's region is not one of them.
containers = {
    'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
    'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
    'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
    'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest',
}

# Estimator that trains on a single ml.c4.xlarge instance
linear = sagemaker.estimator.Estimator(
    containers[boto3.Session().region_name],
    sagemaker.get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_location,
    sagemaker_session=sagemaker.Session(),
)

batch_size = 100
feature_dimension = len(X[0])  # number of columns produced by the vectorizer
# NOTE(review): 'regressor' is used although the label is 0/1 — confirm
# whether a classifier predictor type was intended.
linear.set_hyperparameters(
    feature_dim=feature_dimension,
    predictor_type='regressor',
    mini_batch_size=batch_size,
)

In [None]:
# Train the estimator on the data uploaded to S3, passed as the 'train' channel
training_channels = {'train': s3_train_data}
linear.fit(training_channels)

## Deploy the model and make predictions

In [None]:
# Deploy the trained model behind an endpoint backed by one ml.m4.xlarge instance
linear_predictor = linear.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
# Display the dtype of each selected column (handy when building the
# prediction inputs below)
train_selection.dtypes

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV and parse the endpoint's reply as JSON
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer
linear_predictor.content_type = 'text/csv'

# Example passengers to score, using the same columns as the training selection
predictions = [
    {'Pclass': 3, 'Sex': 'male', 'Age': 5, 'SibSp': 1, 'Parch': 2, 'Embarked': 'C'},
    {'Pclass': 3, 'Sex': 'female', 'Age': 7, 'SibSp': 1, 'Parch': 2, 'Embarked': 'C'},
    {'Pclass': 2, 'Sex': 'female', 'Age': 25, 'SibSp': 0, 'Parch': 2, 'Embarked': 'S'},
    {'Pclass': 1, 'Sex': 'male', 'Age': 29, 'SibSp': 0, 'Parch': 0, 'Embarked': 'Q'},
]
# Apply the transformations used in the training set
transformed_predictions = dv.transform(predictions)

linear_predictor.predict(transformed_predictions)

In [None]:
# (Optional) Delete the endpoint
#import sagemaker
#sagemaker.Session().delete_endpoint(linear_predictor.endpoint)