In [None]:
# Name of the S3 bucket created in Lab 1. Supply the bare bucket name only;
# it must NOT start with "s3://" (later cells add that prefix themselves).
bucket = 'YOURBUCKETFROMLAB1'
# NOTE(review): a later cell reassigns data_location to an S3 URI built from
# `bucket` before any visible use, so this initial value looks unused -- confirm.
data_location = "sagemakerwalkerml"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the two workshop CSV files directly from S3.
# The crime file's own header row (row 0) is discarded and replaced by the
# column names listed here.
crimeColumns = ['State', 'crime', 'Murder', 'Assault', 'UrbanPop', 'Rape']
crime = pd.read_csv(
    's3://sagemaker-workshop/2018-08-30/crimes.csv',
    header=0,
    names=crimeColumns,
)
density = pd.read_csv('s3://sagemaker-workshop/2018-08-30/Densities.csv')

# Preview the first rows of each frame, with a divider line between them.
crimePreview = crime.head()
densityPreview = density.head()
print(crimePreview)
print('----------------------------------------------------------------')
print(densityPreview)



In [None]:
# 'DensityPerMile' holds text with ',' separators; strip the commas and convert
# each entry to a float.
densityPerMile = density['DensityPerMile'].map(
    lambda text: float(text.replace(',', ''))
)

# Linearly map the observed [min, max] range onto [-1, +1] and store the result
# as a new column; the original 'DensityPerMile' column is left untouched.
observedRange = (densityPerMile.min(), densityPerMile.max())
density['DensityPerMileScaled'] = np.interp(densityPerMile, observedRange, (-1, +1))
print(density.head())

In [None]:
# Inner-join the crime and density frames on 'State': only states that appear
# in both frames are kept in the merged result.
joinedData = pd.merge(crime, density, how='inner', on='State')

# Show the merged schema followed by the first rows.
for summary in (joinedData.dtypes, joinedData.head()):
    print(summary)


In [None]:
# Feature columns handed to k-means, in the column order of the resulting matrix.
featureColumns = ['Murder', 'Assault', 'UrbanPop', 'Rape', 'DensityPerMileScaled']

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same 2-D ndarray and is available on old and new pandas.
# The matrix is cast to float32 before being passed to the SageMaker calls below.
crimeArray = joinedData[featureColumns].values.astype(np.float32)
print(crimeArray.shape)
print(crimeArray)

In [None]:
## FREE CELL TO PRINT/INVESTIGATE ANY VARIABLES YOU'D LIKE...

In [None]:
from sagemaker import KMeans
from sagemaker import get_execution_role

CLUSTER_COUNT = 10  # passed to the estimator as k

# S3 URIs inside the workshop bucket. Per the messages printed below, the first
# receives the training data and the second the training artifacts.
data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket)
output_location = 's3://{}/kmeans_example/output'.format(bucket)

# Role returned by the SageMaker SDK for this session; given to the estimator.
role = get_execution_role()
print(role)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

# Estimator configuration collected in one place, then unpacked into KMeans.
estimatorSettings = dict(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.8xlarge',
    output_path=output_location,
    k=CLUSTER_COUNT,
    data_location=data_location,
)
kmeans = KMeans(**estimatorSettings)

In [None]:
# Scratch inspection: columns 1 through 4 of the feature matrix (i.e. every
# column except column 0) printed ahead of the full matrix for comparison.
# The variable is named `crimeSlice` rather than `slice` so that Python's
# built-in `slice` type is not shadowed for the rest of the kernel session.
crimeSlice = crimeArray[:, 1:5]
print(crimeSlice)
print(crimeArray)

In [None]:
%%time
# Convert the feature matrix into the record set the estimator consumes,
# then launch training on it.
trainingRecords = kmeans.record_set(crimeArray)
kmeans.fit(trainingRecords)

In [None]:
%%time

# Deploy the trained model on a single ml.m4.xlarge instance and keep the
# returned predictor; later cells call .predict() on it.
kmeans_predictor = kmeans.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
%%time

# Score every row of the feature matrix with the deployed predictor.
result = kmeans_predictor.predict(crimeArray)
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]

# crimeArray was built from joinedData (the inner merge of crime and density),
# so prediction i describes joinedData row i. The per-state details are
# therefore read from joinedData: indexing the separate `crime` and `density`
# frames by position can pair a prediction with the wrong state or density
# whenever their row order or row count differs from the merged frame.
resultList = []
for i, r in enumerate(result):
    out = {
        "StateCode": joinedData['State'].iloc[i],
        "ClusterGroup": r.label['closest_cluster'].float32_tensor.values[0],
        "distance_to_cluster": r.label['distance_to_cluster'].float32_tensor.values[0],
        "Murder": joinedData['Murder'].iloc[i],
        "Assault": joinedData['Assault'].iloc[i],
        "UrbanPop": joinedData['UrbanPop'].iloc[i],
        "Rape": joinedData['Rape'].iloc[i],
        "Density": joinedData['DensityPerMile'].iloc[i]
    }
    print(out)
    resultList.append(out)

In [None]:
# Turn the per-row result dicts into a table ordered by cluster and, within
# each cluster, by distance to the cluster centre (ascending).
df = pd.DataFrame(resultList).sort_values(by=['ClusterGroup', 'distance_to_cluster'])
df

In [None]:
# Count the rows assigned to each cluster and draw the counts as a bar chart
# with un-rotated tick labels.
clusterSizes = df.groupby(['ClusterGroup']).size()
clusterSizes.plot.bar(x='ClusterGroup', rot=0)