In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazonazon.common as smac

In [143]:
role = get_execution_role()

## Where can you use k-means?
 The k-means algorithm can be a good fit for finding patterns or groups in large datasets that have not been explicitly labeled. Here are some example use cases in different domains:

E-commerce
Classifying customers by purchase history or clickstream activity.
Healthcare
Detecting patterns for diseases or success treatment scenarios.
Grouping similar images for image detection.
Finance
Detecting fraud by detecting anomalies in the dataset. For example, detecting credit card frauds by abnormal purchase patterns.
Technology
Building a network intrusion detection system that aims to identify attacks or malicious activity.
Meteorology
Detecting anomalies in sensor data collection such as storm forecasting.

https://sagemaker.readthedocs.io/en/stable/kmeans.html

https://towardsdatascience.com/clustering-using-k-means-algorithm-81da00f156f6

https://towardsdatascience.com/how-does-k-means-clustering-in-machine-learning-work-fdaaaf5acfa0

<img src="https://miro.medium.com/max/700/1*6EOTS1IE2ULWC9SKgf7mYw.png">

<img src="https://miro.medium.com/max/700/1*4LOxZL6bFl3rXlr2uCiKlQ.gif">

<img src="https://d2908q01vomqb2.cloudfront.net/f1f836cb4ea6efb2a0b1b99f41ad8b103eff4b59/2018/11/02/k-means-sagemaker-1.gif">

<img src="https://docs.aws.amazon.com/sagemaker/latest/dg/images/ml-concepts-10.png">

<img src="https://docs.aws.amazon.com/sagemaker/latest/dg/images/sagemaker-architecture.png">

In [142]:
bucket = "YOUR_OWN_BUCKET_NAME"
prefix = "ufo_dataset"
data_key = "ufo_complete.csv"
data_location = "s3://{}/{}/{}".format(bucket, prefix, data_key)
data_location

's3://YOUR_OWN_BUCKET_NAME/ufo_dataset/ufo_complete.csv'

In [31]:
# Internally do not process the file in chunks when loading the csv onto a dataframe
df = pd.read_csv(data_location, low_memory= False)

In [35]:
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [36]:
df.shape

(88875, 11)

## Step 2: Clearning, transforming and preparing the data

In [80]:
df_geo = df[["latitude", "longitude"]]

In [81]:
df_geo.head()

Unnamed: 0,latitude,longitude
0,29.8830556,-97.941111
1,29.38421,-98.581082
2,53.2,-2.916667
3,28.9783333,-96.645833
4,21.4180556,-157.803611


In [82]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88875 entries, 0 to 88874
Data columns (total 2 columns):
latitude     88875 non-null object
longitude    88875 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.4+ MB


In [102]:
df_geo = df_geo[df_geo["longitude"] != 0]
df_geo = df_geo[df_geo["latitude"] != "33q.200088"]

In [108]:
df_geo["latitude"]= df_geo["latitude"].astype(float)

In [118]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87184 entries, 0 to 88874
Data columns (total 2 columns):
latitude     87184 non-null float64
longitude    87184 non-null float64
dtypes: float64(2)
memory usage: 2.0 MB


In [121]:
missing_values = df_geo.isnull().values.any()
print("Are there any missing values? {}".format(missing_values))

Are there any missing values? False


In [122]:
if (missing_values):
    df_geo[df_geo.isnull().any(axis = 1)]

In [125]:
data_train = df_geo.values.astype("float32")
data_train

array([[ 29.883055, -97.94111 ],
       [ 29.38421 , -98.581085],
       [ 53.2     ,  -2.916667],
       ...,
       [ 35.65278 , -97.477776],
       [ 34.376945, -82.69583 ],
       [ 26.121944, -80.14361 ]], dtype=float32)

## Step 3: Create and train our model

In [126]:
from sagemaker import KMeans

num_clusters = 10
output_location = "s3://" + bucket + "/model-artifacts"

In [127]:
kmeans = KMeans(role = role,
               train_instance_count = 1,
               train_instance_type = "ml.c4.xlarge",
               output_path = output_location,
               k = num_clusters)

In [133]:
job_name = "kmeans-geo-job-{}".format(datetime.now().strftime("%Y%m%d%H%M%S"))
print("Here is the job name: {}".format(job_name))

Here is the job name: kmeans-geo-job-20190729005440


In [134]:
%%time
kmeans.fit(kmeans.record_set(data_train), job_name= job_name)

2019-07-29 00:54:46 Starting - Starting the training job...
2019-07-29 00:54:47 Starting - Launching requested ML instances...
2019-07-29 00:55:44 Starting - Preparing the instances for training......
2019-07-29 00:56:24 Downloading - Downloading input data...
2019-07-29 00:57:05 Training - Downloading the training image..
[31mDocker entrypoint called with argument(s): train[0m
[31m[07/29/2019 00:57:21 INFO 140106530510656] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'["msd"]', u'_num_kv_s


2019-07-29 00:57:31 Uploading - Uploading generated training model
2019-07-29 00:57:31 Completed - Training job completed
Billable seconds: 68
CPU times: user 1.78 s, sys: 18.7 ms, total: 1.8 s
Wall time: 3min 13s


## Step 4: Model Deserialisation

In [135]:
import os
model_key = "model-artifacts/" + job_name + "/output/model.tar.gz"

boto3.resource("s3").Bucket(bucket).download_file(model_key, "model.tar.gz")
os.system("tar -zxvf model.tar.gz")
os.system("unzip model_algo-1")

2304

In [136]:
!pip install mxnet

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/50/08/186a7d67998f1e38d6d853c71c149820983c547804348f06727f552df20d/mxnet-1.5.0-py2.py3-none-manylinux1_x86_64.whl (25.4MB)
[K    100% |████████████████████████████████| 25.4MB 1.9MB/s eta 0:00:01
[?25hCollecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Collecting numpy<2.0.0,>1.16.0 (from mxnet)
[?25l  Downloading https://files.pythonhosted.org/packages/19/b9/bda9781f0a74b90ebd2e046fde1196182900bd4a8e1ea503d3ffebc50e7c/numpy-1.17.0-cp36-cp36m-manylinux1_x86_64.whl (20.4MB)
[K    100% |████████████████████████████████| 20.4MB 3.1MB/s eta 0:00:01
Installing collected packages: graphviz, numpy, mxnet
  Found existing installation: numpy 1.15.4
    Uninstalling numpy-1.15.4:
      Successfully uninstalled numpy-1.15.4
Successfully installed graphviz-0.8.4 mxnet

In [141]:
import mxnet as mx
Kmeans_model_params = mx.ndarray.load("model_algo-1")

In [138]:
cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns = df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,35.37986,-118.177162
1,41.521103,-74.812103
2,51.608204,0.121513
3,-11.612,128.658752
4,47.70578,-122.042778
5,35.611134,-98.932304
6,31.191694,-82.532051
7,28.319733,37.477905
8,41.149517,-87.080086
9,-18.685837,-53.455894


In [140]:
from io import StringIO

csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer, index = False)
s3_resource = boto3.resource("s3")
s3_resource.Object(bucket, "results/ten_locations_kmeans.csv").put(Body = csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '9B6F4EB4BE2499D7',
  'HostId': 'YQU/IXFqqp4Jw7qV0U2Ijqcp3frWBpFC9THEtm3JGfCzAG9uTnAu61mvgb36mX+qKZplt+24J2U=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'YQU/IXFqqp4Jw7qV0U2Ijqcp3frWBpFC9THEtm3JGfCzAG9uTnAu61mvgb36mX+qKZplt+24J2U=',
   'x-amz-request-id': '9B6F4EB4BE2499D7',
   'date': 'Mon, 29 Jul 2019 01:23:38 GMT',
   'etag': '"2477206b3fc6b0706e3cd0fde0ca6337"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"2477206b3fc6b0706e3cd0fde0ca6337"'}