In [64]:
import os
import numpy as np
import pandas as pd
from datetime import datetime

import boto3
from sagemaker import get_execution_role, KMeans
import sagemaker.amazon.common as smac

from io import StringIO

In [2]:
role = get_execution_role()

In [6]:
bucket = 'ufo-sighting-bucket'
data_key = 'ufo_fullset.csv'
data_location = f's3://{bucket}/{data_key}'

df = pd.read_csv(data_location)
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [11]:
# probable means there were probably aliens

outcomes_vc = df['researchOutcome'].value_counts()
perc_probable = outcomes_vc['probable'] / outcomes_vc.sum()
print(f'{round(perc_probable*100, 3)}% of our data is researchOutcome == probable')

10.389% of our data is researchOutcome == probable


In [12]:
df_geo = df[['latitude', 'longitude']]
df_geo.head()

Unnamed: 0,latitude,longitude
0,47.329444,-122.578889
1,52.664913,-1.034894
2,38.951667,-92.333889
3,41.496944,-71.367778
4,47.606389,-122.330833


In [13]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
latitude     18000 non-null float64
longitude    18000 non-null float64
dtypes: float64(2)
memory usage: 281.3 KB


## KMeans needs float32

In [15]:
df_geo['latitude'] = df_geo['latitude'].astype(np.float32)
df_geo['longitude'] = df_geo['longitude'].astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [16]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
latitude     18000 non-null float32
longitude    18000 non-null float32
dtypes: float32(2)
memory usage: 140.7 KB


## Numpy Conversion

KMeans also needs the data as a 'RecordObject' so to accomplish this, we convert our dataframe into numpy arrays, then into a `record_set()`

source: https://sagemaker.readthedocs.io/en/stable/kmeans.html

In [19]:
data_train = df_geo.values
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

In [32]:
data_train_records = kmeans.record_set(data_train)

## Create and Train Model

In [33]:
clusters = 10
output_location = f's3://{bucket}/model_artifacts'

kmeans = KMeans(
    role=role, 
    train_instance_count=1, 
    train_instance_type='ml.c4.xlarge', 
    output_path=output_location, 
    k=clusters
)

In [36]:
job_name = 'kmeans-ufo-job-' + datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
print(job_name)

kmeans-ufo-job-2020-03-25T19-59-15


In [37]:
%%time

kmeans.fit(data_train_records, job_name=job_name, )

2020-03-25 19:59:15 Starting - Starting the training job...
2020-03-25 19:59:17 Starting - Launching requested ML instances.........
2020-03-25 20:00:49 Starting - Preparing the instances for training......
2020-03-25 20:01:52 Downloading - Downloading input data...
2020-03-25 20:02:44 Training - Training image download completed. Training in progress.
2020-03-25 20:02:44 Uploading - Uploading generated training model.[34mDocker entrypoint called with argument(s): train[0m
[34m[03/25/2020 20:02:41 INFO 139660832012096] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_w

## Examining the Trained Model

In [50]:
model_key = f'model_artifacts/{job_name}/output/model.tar.gz'

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
print(os.listdir())
os.system('tar -zxvf model.tar.gz')
print(os.listdir())
os.system('unzip model_algo-1')
print(os.listdir())

['model.tar.gz', 'ufo-sighting-kmeans.ipynb', '.ipynb_checkpoints']
['model.tar.gz', 'ufo-sighting-kmeans.ipynb', 'model_algo-1', '.ipynb_checkpoints', 'state_c7eb20f2-d372-46da-b9e7-92226fd64ead']
['model.tar.gz', 'ufo-sighting-kmeans.ipynb', 'model_algo-1', '.ipynb_checkpoints', 'state_c7eb20f2-d372-46da-b9e7-92226fd64ead']


In [51]:
!pip install mxnet

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/81/f5/d79b5b40735086ff1100c680703e0f3efc830fa455e268e9e96f3c857e93/mxnet-1.6.0-py2.py3-none-any.whl (68.7MB)
[K    100% |████████████████████████████████| 68.7MB 685kB/s eta 0:00:01
[?25hCollecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Collecting numpy<2.0.0,>1.16.0 (from mxnet)
[?25l  Downloading https://files.pythonhosted.org/packages/07/08/a549ba8b061005bb629b76adc000f3caaaf881028b963c2e18f811c6edc1/numpy-1.18.2-cp36-cp36m-manylinux1_x86_64.whl (20.2MB)
[K    100% |████████████████████████████████| 20.2MB 2.4MB/s eta 0:00:01
Installing collected packages: graphviz, numpy, mxnet
  Found existing installation: numpy 1.14.3
    Uninstalling numpy-1.14.3:
      Successfully uninstalled numpy-1.14.3
Successfully installed graphviz-0.8.4 mxnet-1.6.0 numpy-1

In [52]:
import mxnet as mx

  return f(*args, **kwds)
  return f(*args, **kwds)


In [61]:
kmeans_model_params = mx.ndarray.load('model_algo-1')
kmeans_centroids = pd.DataFrame(kmeans_model_params[0].asnumpy(), columns=['latitude', 'longitude'])
kmeans_centroids

Unnamed: 0,latitude,longitude
0,30.719372,-81.958321
1,12.235952,77.759949
2,43.408417,-121.155159
3,51.686306,0.797462
4,40.817497,-86.821793
5,35.612389,-98.424522
6,21.645144,-157.815598
7,41.430389,-74.697464
8,34.550671,-115.411842
9,58.506668,-140.455597


## Pushing Results to S3
Now that we have the kmeans centroids of each of the 10 clusters, we can push these to s3 using boto3

In [65]:
csv_buffer = StringIO()
kmeans_centroids.to_csv(csv_buffer, index=None)
s3 = boto3.resource('s3')
s3.Object(bucket, 'results/kmeans_centroids.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '339EF95BA95701F2',
  'HostId': 'uE15GAYXPBTrNmVY6o+OOPeUa4D6UDHmsre5nTBX9VU7mVDuOtAHy0QBGvM9jTLjUUh2lIvI2KM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'uE15GAYXPBTrNmVY6o+OOPeUa4D6UDHmsre5nTBX9VU7mVDuOtAHy0QBGvM9jTLjUUh2lIvI2KM=',
   'x-amz-request-id': '339EF95BA95701F2',
   'date': 'Wed, 25 Mar 2020 20:14:06 GMT',
   'etag': '"4e2e00fe05df721d0e6187ee7f2dd4d9"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"4e2e00fe05df721d0e6187ee7f2dd4d9"'}