In [None]:
# Install dependencies
# Each package is installed by running pip as a module of sys.executable (the
# Python interpreter this kernel runs on), one pip invocation per package.
# NOTE(review): versions are not pinned, so results depend on what pip resolves
# at run time; several of these packages are not imported in the cells visible
# here — confirm they are still needed.
import sys

!{sys.executable} -m pip install smdebug
!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install plotly
!{sys.executable} -m pip install opencv-python
!{sys.executable} -m pip install shap
!{sys.executable} -m pip install bokeh
!{sys.executable} -m pip install imageio

In [None]:
import os
import boto3
import io
import sagemaker
import operator
from botocore.exceptions import WaiterError
from botocore.waiter import WaiterModel
from botocore.waiter import create_waiter_with_client

%matplotlib inline
import s3fs
import time
import random
import datetime
import pandas as pd
import numpy as np
import mxnet as mx
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

matplotlib.style.use("ggplot")
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
from mpl_toolkits.mplot3d import Axes3D
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import IntSlider, FloatSlider, Checkbox

import pickle, gzip, urllib, json
import csv
import json

In [None]:

# Custom botocore waiter definition for the Redshift Data API: it polls
# DescribeStatement until the submitted SQL statement reaches a terminal status.
waiter_name = 'DataAPIExecution'

delay = 30          # seconds to sleep between two DescribeStatement polls
max_attempts = 10   # the waiter gives up after this many polls

# Statement status -> waiter state.
# Every status gets its own "path" acceptor. "path" compares the scalar value
# found at `argument` with `expected`. The previous "pathAny" acceptors could
# never match, because "pathAny" requires the value at `argument` to be a list
# whose elements are compared with `expected`, while `Status` is a plain string.
# As a result FAILED/ABORTED statements were polled until max_attempts ran out
# instead of failing immediately.
_STATUS_TO_STATE = {
    "FINISHED": "success",
    "PICKED": "retry",
    "STARTED": "retry",
    "SUBMITTED": "retry",
    "FAILED": "failure",
    "ABORTED": "failure",
}

# Configure the waiter settings
waiter_config = {
    'version': 2,
    'waiters': {
        waiter_name: {
            'operation': 'DescribeStatement',
            'delay': delay,
            'maxAttempts': max_attempts,
            'acceptors': [
                {
                    "matcher": "path",
                    "expected": status,
                    "argument": "Status",
                    "state": state,
                }
                for status, state in _STATUS_TO_STATE.items()
            ],
        },
    },
}

In [None]:
# Fix the seed of NumPy's global generator and of Python's `random` module so
# that repeated runs draw the same pseudo-random numbers.
SEED = 42
for seed_fn in (np.random.seed, random.seed):
    seed_fn(SEED)

In [None]:
# Create a SageMaker session with default arguments.
# NOTE(review): `sagemaker_session` is not referenced by any other cell visible here.
sagemaker_session = sagemaker.Session()

In [None]:

# Setup the client
# boto3 client for the "redshift-data" service. Later cells call list_schemas,
# list_tables and execute_statement on it, and the custom waiter is bound to it.
client_redshift = boto3.client("redshift-data")
# Printed as soon as the client object is constructed; no request has been sent yet.
print("Data API client successfully loaded")

Amazon SageMaker integrates seamlessly with Amazon S3. During the first step in creating the notebook, we specified an `AmazonSageMakerFullAccess` role for the notebook. That gives this notebook permission to access any Amazon S3 bucket in this AWS account with "sagemaker" in its name.

The get_execution_role function retrieves the IAM role you created at the time you created your notebook instance.

In [None]:
from sagemaker import get_execution_role

# Execution role of this notebook; it is later passed to the KMeans estimator as `role=`.
role = get_execution_role()

In [None]:
# Display the value returned by get_execution_role().
role

In [None]:
# Ask the Data API for the schemas of the workshop database and display the
# 'Schemas' entry of the response.
schemas_response = client_redshift.list_schemas(
    ClusterIdentifier='<cluster-identifier>',
    Database='workshopredshiftdb',
    DbUser='admin',
)
schemas_response['Schemas']

In [None]:
# Ask the Data API for the tables of the workshop database and display the
# 'Tables' entry of the response.
tables_response = client_redshift.list_tables(
    ClusterIdentifier='<cluster-identifier>',
    Database='workshopredshiftdb',
    DbUser='admin',
)
tables_response['Tables']

#### Loading the dataset

In [None]:
# UNLOAD statement that exports the whole rfm_input_2 table to S3 as a CSV with
# a header row, written serially (`parallel off`) under the key prefix
# rfm_output/rfm_.
# `allowoverwrite` lets the statement replace files left by a previous run;
# without it UNLOAD fails when the target files already exist, so the notebook
# could not be re-run from the top.
# Replace both <...> placeholders with the values from the CloudFormation outputs.
qry_str = (
    "unload('select * from rfm_input_2') "
    "to 's3://<S3 bucket name from cloudformation outputs>/rfm_output/rfm_' "
    "iam_role '<Redshift IAM role from cloudformation outputs>' "
    "format as CSV HEADER parallel off allowoverwrite"
)

In [None]:
# Build the custom waiter from the configuration defined earlier and bind it to
# the Redshift Data API client.
waiter_model = WaiterModel(waiter_config)
custom_waiter = create_waiter_with_client(waiter_name, waiter_model, client_redshift)

# Submit the UNLOAD statement. The response carries the statement id that the
# waiter polls with DescribeStatement.
res = client_redshift.execute_statement(
    Database='workshopredshiftdb',
    DbUser='admin',
    Sql=qry_str,
    ClusterIdentifier='<cluster-identifier>',
)
# Named `statement_id` rather than `id` so the builtin id() is not shadowed.
statement_id = res["Id"]

try:
    custom_waiter.wait(Id=statement_id)
except WaiterError as e:
    # Stop here instead of just printing: the following cells read the UNLOAD
    # output from S3 and would otherwise run against stale or missing data.
    statement = client_redshift.describe_statement(Id=statement_id)
    raise RuntimeError(
        f"Redshift statement {statement_id} did not finish successfully "
        f"(status={statement.get('Status')}, error={statement.get('Error')})"
    ) from e

In [None]:
# S3 client used by the following cells to list the bucket and download the exported CSV.
s3_client = boto3.client("s3")
# Placeholder: replace with the bucket name from the CloudFormation outputs. It
# must be the bucket named in the UNLOAD statement's `to 's3://...'` target.
data_bucket_name = "<S3 bucket name from cloudformation outputs>"

In [None]:
# List only the objects written by the UNLOAD statement, whose target key
# prefix is 'rfm_output/rfm_'. Listing the whole bucket would let unrelated
# keys end up in `file`, and the next cell loads `file[-1]`.
obj_list = s3_client.list_objects(Bucket=data_bucket_name, Prefix="rfm_output/rfm_")
# The response has no "Contents" entry when nothing matches the prefix.
file = [contents["Key"] for contents in obj_list.get("Contents", [])]
if not file:
    raise FileNotFoundError(
        f"No objects found under s3://{data_bucket_name}/rfm_output/rfm_ - "
        "did the UNLOAD statement finish successfully?"
    )
print(file)

In [None]:
# Use the last key of the listing as the CSV file to load.
# NOTE(review): assumes the UNLOAD output is the last entry in `file` — confirm.
file_data = file[-1]

Grab the data from the CSV file in the bucket.

In [None]:
# Download the selected object into memory and parse it as a comma-separated
# file whose first row is the header.
csv_bytes = s3_client.get_object(Bucket=data_bucket_name, Key=file_data)["Body"].read()
csv_buffer = io.BytesIO(csv_bytes)
customers = pd.read_csv(csv_buffer, sep=",", header=0, low_memory=False)
# Replace the header names from the file with the names used in the rest of the notebook.
customers.columns = ["customer_id", "recency", "frequency", "monetary"]

This is what the first 5 rows of our data look like:

In [None]:
# Preview the first rows of the loaded table.
customers.head()

### d. Population segmentation using unsupervised clustering

Now, we’ll use the `KMeans` algorithm to segment by the RFM attributes we have created. `KMeans` is a clustering algorithm that identifies clusters of similar customers based on their attributes.

In [None]:
# Keep the identifiers in their own single-column frame, sharing the index of `customers`.
customer_ids_only = customers[["customer_id"]].copy()

# Feature matrix for training: every column except the identifier, as float32.
# `customers` itself is left untouched (no inplace drop), so this cell can be
# re-run without raising KeyError on the already-removed column.
train_data = customers.drop(columns="customer_id").values.astype("float32")

First, we create our `KMeans` model and define its hyperparameters. The `KMeans` algorithm allows the user to specify how many clusters to identify. In this instance, let's partition our dataset into 5 clusters.

In [None]:
from sagemaker import KMeans

# Number of clusters the algorithm is asked to produce.
num_clusters = 5

# SageMaker KMeans estimator: trains on a single ml.c4.xlarge instance using the
# notebook's execution role.
# output_path reuses `data_bucket_name`, so the bucket only has to be filled in
# once and cannot drift from the bucket the data is read from.
kmeans = KMeans(
    role=role,
    instance_count=1,
    instance_type="ml.c4.xlarge",
    output_path=f"s3://{data_bucket_name}/clustering_output/",
    k=num_clusters,
)

Then we train the model on our training data.

In [None]:
%%time
# Wrap the float32 feature matrix with the estimator's record_set() and pass it
# to fit(); %%time reports how long the cell took.
kmeans.fit(kmeans.record_set(train_data))

Now we deploy the model, and we can pass in the original training set to get the labels for each entry. This will tell us which cluster each customer belongs to.

In [None]:
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.t2.medium",serializer=CSVSerializer(),deserializer=JSONDeserializer())

In [None]:
%%time
# Send the whole training matrix to the endpoint in one predict() call.
# NOTE(review): a very large customer table may not fit in a single request —
# consider batching; confirm against the endpoint's payload limit.
result = kmeans_predictor.predict(data=train_data)

In [None]:
# Display the deserialized response returned by the endpoint.
# NOTE(review): presumably one entry per customer, which can be a very long
# output — consider showing only a slice.
result

In [None]:
# Show the name of the endpoint behind the predictor.
kmeans_predictor.endpoint_name

###### Because an endpoint is persistent, let’s delete our endpoint now that we’re done to avoid any excess charges on our AWS bill.

In [None]:
# Delete the endpoint behind the predictor; further predict() calls on it will not work.
kmeans_predictor.delete_endpoint()