In [22]:
import pandas as pd
import numpy as np
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

## Data fetch

In [2]:
role = get_execution_role()
bucket = 'awsml-sagemaker-lab'
prefix = 'ufo_dataset'
data_key = 'ufo_fullset.csv'
data_location = 's3://{}/{}/{}'.format(bucket,prefix,data_key)
print('hello')

hello


In [3]:
df = pd.read_csv(data_location,low_memory=False)
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [4]:
df.shape

(18000, 15)

## Prepare

In [13]:
df_geo = df[['latitude','longitude']]
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   18000 non-null  float64
 1   longitude  18000 non-null  float64
dtypes: float64(2)
memory usage: 281.4 KB


In [14]:
missing_values = df_geo.isnull().values.any()
print('Are there any missing value = ' + str(missing_values))

Are there any missing value = False


In [16]:
data_train = df_geo.values.astype('float32')
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

## Training

In [24]:
from sagemaker import KMeans
no_of_clusters = 10
output_location = 's3://{}/{}'.format(bucket,'ufo_kmeans_model')
kmeans = KMeans(role=role,
       train_instance_count=1,
       train_instance_type='ml.c4.xlarge',
       output_path=output_location,
       k=no_of_clusters)
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime('%Y%m%d%H%M%S'))
print('Here is the job_name {}'.format(job_name))

Here is the job_name kmeans-geo-job-20200928220124


In [29]:
%%time
kmeans.fit(kmeans.record_set(data_train),job_name=job_name)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-09-28 22:10:14 Starting - Starting the training job...
2020-09-28 22:10:21 Starting - Launching requested ML instances.........
2020-09-28 22:11:46 Starting - Preparing the instances for training.........
2020-09-28 22:13:17 Downloading - Downloading input data...
2020-09-28 22:14:12 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/28/2020 22:14:15 INFO 140585646163776] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to

## View the results

In [32]:
import os
model_path = 'ufo_kmeans_model/kmeans-geo-job-20200928220124/output/model.tar.gz'
boto3.resource('s3').Bucket('awsml-sagemaker-lab').download_file(model_path,'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

2304

In [33]:
!pip install mxnet

Collecting mxnet
  Downloading mxnet-1.7.0.post1-py2.py3-none-manylinux2014_x86_64.whl (55.0 MB)
[K     |████████████████████████████████| 55.0 MB 70 kB/s  eta 0:00:01
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
Successfully installed graphviz-0.8.4 mxnet-1.7.0.post1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m


In [35]:
import mxnet as mx
kmeans_model_params = mx.ndarray.load('model_algo-1')

In [43]:
cluster_centroids_kmeans = pd.DataFrame(kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns = df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,35.255814,-118.509682
1,41.099522,-75.235138
2,12.426487,66.732552
3,51.173016,2.395409
4,35.171051,-97.848244
5,-32.047237,146.657715
6,29.423437,-81.587265
7,17.732141,117.824913
8,47.800106,-122.329514
9,40.558071,-86.751213


In [46]:
from io import StringIO

csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer,index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket,'ufo_kmeans_output/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '3362C95F40D140DF',
  'HostId': '6WPVL5RVR1Hsc5vIRvYzqE8KEaJ0BJVDgaU8hLIonrf6nKhMi/d6kgJSZax9ozrGdzfOPcw5NCg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '6WPVL5RVR1Hsc5vIRvYzqE8KEaJ0BJVDgaU8hLIonrf6nKhMi/d6kgJSZax9ozrGdzfOPcw5NCg=',
   'x-amz-request-id': '3362C95F40D140DF',
   'date': 'Mon, 28 Sep 2020 22:33:34 GMT',
   'etag': '"bcc6e19dd0ee029630e7a3c716dfae3a"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"bcc6e19dd0ee029630e7a3c716dfae3a"'}