In [1]:
# some utility functions 

import string
import sys
import numpy as np

from hashlib import md5

maketrans = str.maketrans

In [2]:
# All util functions

def vectorize_sequences(sequences, vocabulary_length):
    results = np.zeros((len(sequences), vocabulary_length))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.0
    return results


def one_hot_encode(messages, vocabulary_length):
    data = []
    for msg in messages:
        temp = one_hot(msg, vocabulary_length)
        data.append(temp)
    return data

def text_to_word_sequence(
    text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "
):
    """Converts a text to a sequence of words (or tokens).
    # Arguments
        text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.
    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()

    translate_dict = dict((c, split) for c in filters)
    translate_map = maketrans(translate_dict)
    text = text.translate(translate_map)

    seq = text.split(split)
    return [i for i in seq if i]

def one_hot(
    text, n, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "
):
    """One-hot encodes a text into a list of word indexes of size n.
    This is a wrapper to the `hashing_trick` function using `hash` as the
    hashing function; unicity of word to index mapping non-guaranteed.
    # Arguments
        text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
    # Returns
        List of integers in [1, n]. Each integer encodes a word
        (unicity non-guaranteed).
    """
    return hashing_trick(
        text, n, hash_function="md5", filters=filters, lower=lower, split=split
    )


def hashing_trick(
    text,
    n,
    hash_function=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    """Converts a text to a sequence of indexes in a fixed-size hashing space.
    # Arguments
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to python `hash` function, can be 'md5' or
            any function that takes in input a string and returns a int.
            Note that 'hash' is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
    # Returns
        A list of integer word indices (unicity non-guaranteed).
    `0` is a reserved index that won't be assigned to any word.
    Two or more words may be assigned to the same index, due to possible
    collisions by the hashing function.
    The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
    of a collision is in relation to the dimension of the hashing space and
    the number of distinct objects.
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == "md5":
        hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)

    seq = text_to_word_sequence(text, filters=filters, lower=lower, split=split)
    return [int(hash_function(w) % (n - 1) + 1) for w in seq]

In [3]:
# Import required modules

from sagemaker import get_execution_role
import os
import pandas as pd
import numpy as np
import pickle
import boto3
from sagemaker.mxnet import MXNet

In [4]:
# Configure S3 bucket

bucket_name = "spam-detector-storage"

In [5]:
# IAM role, and bucket name prefix

role = get_execution_role()
bucket_key_prefix = "sagemaker/spam-classifier"

vocabulary_length = 9013  #Static

In [6]:
# Download the dataset

os.system("mkdir -p dataset")
os.system(
    "curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip -o dataset/smsspamcollection.zip"
)
os.system("unzip -o dataset/smsspamcollection.zip -d dataset")
os.system("head -10 dataset/SMSSpamCollection")

0

In [7]:
# Convert dataset to dataframe

df = pd.read_csv("dataset/SMSSpamCollection", sep="\t", header=None)

In [8]:
# Assign the value Ham to 0 and Spam to 1 for better training

df[df.columns[0]] = df[df.columns[0]].map({"ham": 0, "spam": 1})

In [9]:
# Define independent and dependent variables

dependent = df[df.columns[0]].values
independent = df[df.columns[1]].values

In [10]:
# Encode the messages using One Hot Encoding

one_hot_data = one_hot_encode(independent, vocabulary_length)
encoded_messages = vectorize_sequences(one_hot_data, vocabulary_length)

In [11]:
# Assign encoded messages and converted ham and spam details to main dataframe - reunion

df2 = pd.DataFrame(encoded_messages)
df2.insert(0, "spam", dependent)

In [12]:
# Split the dataset into training and testing dataset ( 80% - training dataset, 20% - testing dataset )

split_index = int(np.ceil(df.shape[0] * 0.8))
training_set = df2[:split_index]
validation_set = df2[split_index:]

In [13]:
# Convert this dataframes to csv file for uploading these to S3

training_set.to_csv("dataset/sms_train_set.gz", header=False, index=False, compression="gzip")
validation_set.to_csv("dataset/sms_val_set.gz", header=False, index=False, compression="gzip")

In [14]:
# Load training and testing set into S3 storage for accessible - It takes some time to be uploaded

s3 = boto3.resource("s3")
target_bucket = s3.Bucket(bucket_name)
target_bucket

s3.Bucket(name='spam-detector-storage')

In [15]:
# Upload to S3

with open("dataset/sms_train_set.gz", "rb") as data:
    target_bucket.upload_fileobj(
        data, "{0}/train/sms_train_set.gz".format(bucket_key_prefix)
    )

with open("dataset/sms_val_set.gz", "rb") as data:
    target_bucket.upload_fileobj(
        data, "{0}/val/sms_val_set.gz".format(bucket_key_prefix)
    )

In [16]:
!cat "latest_train_script.py"

from __future__ import print_function

import logging
import mxnet as mx
from mxnet import gluon, autograd
from mxnet.gluon import nn
import numpy as np
import json
import time

import pip

try:
    from pip import main as pipmain
except:
    from pip._internal import main as pipmain

pipmain(['install', 'pandas'])
import pandas

#logging.basicConfig(level=logging.DEBUG)

# ------------------------------------------------------------ #
# Training methods                                             #
# ------------------------------------------------------------ #


def train(hyperparameters, input_data_config, channel_input_dirs, output_data_dir,
          num_gpus, num_cpus, hosts, current_host, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    ctx = mx.cpu()

    # retrieve the hyperparameters and apply some defaults

In [17]:
# Create the model 

from sagemaker.mxnet import MXNet

output_path = 's3://{0}/{1}/output'.format(bucket_name, bucket_key_prefix)
code_location = 's3://{0}/{1}/code'.format(bucket_name, bucket_key_prefix)

m = MXNet('latest_train_script.py',
          role=role,
          train_instance_count=1,
          instance_type='ml.c5.2xlarge',
          output_path=output_path,
          base_job_name='sms-spam-classifier-mxnet',
          framework_version='1.2',
          py_version='py3',
          code_location = code_location,
          hyperparameters={'batch_size': 100,
                         'epochs': 20,
                         'learning_rate': 0.01})

inputs = {'train': 's3://{0}/{1}/train/'.format(bucket_name, bucket_key_prefix),
 'val': 's3://{0}/{1}/val/'.format(bucket_name, bucket_key_prefix)}

m.fit(inputs)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-12-03 05:41:06 Starting - Starting the training job...
2021-12-03 05:41:31 Starting - Launching requested ML instancesProfilerReport-1638510066: InProgress
......
2021-12-03 05:42:31 Starting - Preparing the instances for training.........
2021-12-03 05:43:56 Downloading - Downloading input data
2021-12-03 05:43:56 Training - Downloading the training image..[34m2021-12-03 05:44:11,886 INFO - root - running container entrypoint[0m
[34m2021-12-03 05:44:11,886 INFO - root - starting train task[0m
[34m2021-12-03 05:44:11,890 INFO - container_support.training - Training starting[0m
[34m2021-12-03 05:44:14,387 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'job_name': 'sms-spam-classifier-mxnet-2021-12-03-05-41-06-421', '_scheduler_ip': '10.0.224.118', 'user_script_name': 'latest_train_script.py', 'available_cpus': 8, 'user_requirements_file': None, '_ps_port': 8000, 'enable_cloudwatch_metrics': False, 'model_dir': '/opt/ml/model', 'available_gpus': 0, 'hyperparameters

In [None]:
# model deployment

mxnet_pred = m.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
)

--------

In [259]:


from sagemaker.mxnet.model import MXNetPredictor

# Uncomment the following line to connect to an existing endpoint.
# mxnet_pred = MXNetPredictor('<endpoint_name>')

test_messages = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop"]
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)
print(encoded_test_messages)
result = mxnet_pred.predict(encoded_test_messages)
print(result)

[[0. 0. 0. ... 0. 0. 0.]]


ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: 1 validation error detected: Value '<endpoint_name>' at 'endpointName' failed to satisfy constraint: Member must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9])*

In [261]:
print(mxnet_pred.endpoint_name)

<endpoint_name>
