In [18]:
import pandas as pd
import numpy as np
import numpy as np
np.bool = np.bool_
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

In [2]:
role = get_execution_role()
bucket = 'brad-ml-training-input'
output_bucket = 'brad-ml-training-output'
data_key = 'ufo_fullset.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

df = pd.read_csv(data_location, low_memory=False)

In [3]:
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [4]:
df_geo = df[['latitude', 'longitude']]

In [5]:
df_geo.head()

Unnamed: 0,latitude,longitude
0,47.329444,-122.578889
1,52.664913,-1.034894
2,38.951667,-92.333889
3,41.496944,-71.367778
4,47.606389,-122.330833


In [11]:
missing_values = df_geo.isnull().values.any()
print('Are there any missing values? {}'.format(missing_values))
if(missing_values):
    df_geo[df_geo.isnull().any(axis=1)]

Are there any missing values? False


In [12]:
data_train = df_geo.values.astype('float32')
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

In [13]:
from sagemaker import KMeans

num_clusters = 10
output_location = 's3://' + output_bucket + '/'

kmeans = KMeans(role=role,
               instance_count=1,
               instance_type='ml.c4.xlarge',
               output_path=output_location,
               k=num_clusters)

In [14]:
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
print('Here is the job name {}'.format(job_name))

Here is the job name kmeans-geo-job-20240917233006


In [15]:
%%time
kmeans.fit(kmeans.record_set(data_train), job_name=job_name)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-geo-job-20240917233006


2024-09-17 23:30:13 Starting - Starting the training job...
2024-09-17 23:30:28 Starting - Preparing the instances for training...
2024-09-17 23:31:00 Downloading - Downloading input data...
2024-09-17 23:31:35 Downloading - Downloading the training image.........
2024-09-17 23:33:08 Training - Training image download completed. Training in progress.
2024-09-17 23:33:08 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/17/2024 23:33:03 INFO 140114161706816] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 

In [16]:
import os
model_key = '' + job_name + '/output/model.tar.gz'

boto3.resource('s3').Bucket(output_bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

model_algo-1
Archive:  model_algo-1


tar: Ignoring unknown extended header keyword 'LIBARCHIVE.creationtime'
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of model_algo-1 or
        model_algo-1.zip, and cannot find model_algo-1.ZIP, period.


2304

In [25]:
import mxnet as mx
mx.npx.reset_np()
with mx.np_shape(False):
    Kmeans_model_params = mx.ndarray.load('model_algo-1')




[23:44:39] ../src/storage/storage.cc:202: Using Pooled (Naive) StorageManager for CPU


In [26]:
cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns=df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,47.857914,-122.591461
1,40.974449,-75.201431
2,11.770554,73.989296
3,50.964489,2.759827
4,41.168167,-87.255333
5,-19.7869,141.356384
6,35.640705,-116.977051
7,30.207743,-82.436615
8,21.937511,-156.529633
9,35.199379,-97.839157


In [27]:
from io import StringIO

csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(output_bucket, 'ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '73M20KQ39SHMN1Z0',
  'HostId': 'NESwYPgUlIQy3kNxsLRG+OFycHL8IwkB8lsXFXamyN4SYxmZL5mdaqMsbG9TBtfPu8H4bnrSyJY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'NESwYPgUlIQy3kNxsLRG+OFycHL8IwkB8lsXFXamyN4SYxmZL5mdaqMsbG9TBtfPu8H4bnrSyJY=',
   'x-amz-request-id': '73M20KQ39SHMN1Z0',
   'date': 'Tue, 17 Sep 2024 23:45:34 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3f7cdfd4f7db398aea0d440c8d614283"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3f7cdfd4f7db398aea0d440c8d614283"',
 'ServerSideEncryption': 'AES256'}