In [275]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import io
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [273]:
s3_client = boto3.client('s3')
bucket_name='aws-ml-blog-sagemaker-census-segmentation'

In [274]:
s3_client.list_objects(Bucket=bucket_name)

{'ResponseMetadata': {'RequestId': 'BY15R4MS5FTDWN3C',
  'HostId': 'Ak5l3lO+gGvWkC1NkY5DpUKD9oIJLkz8CQyJf2PYrVACV+fdk0fcEO1GfRbyuT939qYUTDENURE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Ak5l3lO+gGvWkC1NkY5DpUKD9oIJLkz8CQyJf2PYrVACV+fdk0fcEO1GfRbyuT939qYUTDENURE=',
   'x-amz-request-id': 'BY15R4MS5FTDWN3C',
   'date': 'Mon, 26 Apr 2021 23:15:55 GMT',
   'x-amz-bucket-region': 'us-east-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'IsTruncated': False,
 'Marker': '',
 'Contents': [{'Key': 'Census_Data_for_SageMaker.csv',
   'LastModified': datetime.datetime(2018, 9, 12, 15, 13, 37, tzinfo=tzlocal()),
   'ETag': '"066d37f43f7762f1eb409b1660fe9763"',
   'Size': 613237,
   'StorageClass': 'STANDARD'}],
 'Name': 'aws-ml-blog-sagemaker-census-segmentation',
 'Prefix': '',
 'MaxKeys': 1000,
 'EncodingType': 'url'}

In [276]:
s3object=s3_client.get_object(Bucket=bucket_name,Key='Census_Data_for_SageMaker.csv')

In [277]:
#s3object['Body'] is a streaming object. Lets read it and pass it to BytesIO
csvdata=io.BytesIO(s3object['Body'].read())

In [278]:
df=pd.read_csv(csvdata)

In [126]:
# Clean Data
df.describe().T
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CensusId         3220 non-null   int64  
 1   State            3220 non-null   object 
 2   County           3220 non-null   object 
 3   TotalPop         3220 non-null   int64  
 4   Men              3220 non-null   int64  
 5   Women            3220 non-null   int64  
 6   Hispanic         3220 non-null   float64
 7   White            3220 non-null   float64
 8   Black            3220 non-null   float64
 9   Native           3220 non-null   float64
 10  Asian            3220 non-null   float64
 11  Pacific          3220 non-null   float64
 12  Citizen          3220 non-null   int64  
 13  Income           3219 non-null   float64
 14  IncomeErr        3219 non-null   float64
 15  IncomePerCap     3220 non-null   int64  
 16  IncomePerCapErr  3220 non-null   int64  
 17  Poverty       

In [279]:
# State County can be made into a index since they are unique
df['State-County']=df.apply(lambda row: row['State']+'-'+row['County'],axis=1)
df=df.drop(['State','County'],axis=1)
df_before_normalize=df.set_index('State-County')
# df.T

In [280]:
# Remove NaN
df=df_before_normalize[~df_before_normalize.isna().any(axis=1)]

In [129]:
# normalize the numeric fields.. big ranges seen in min max in df.info
scaler=MinMaxScaler()
df=pd.DataFrame(scaler.fit_transform(df),columns=df.columns,index=df.index)

In [130]:
from sagemaker import get_execution_role
role=get_execution_role()
session=sagemaker.Session()
container=sagemaker.image_uris.retrieve('pca',session.boto_region_name,version='latest')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [131]:
# PCA 
bucket=session.default_bucket()
prefix='pca_popsegmentation'
output_path=f's3://{bucket}/{prefix}/output'
# pca=sagemaker.estimator.Estimator(image_uri=container,
#                                   role=role,
#                                   instance_count=1,
#                                   instance_type='local',
#                                   output_path=output_path,
#                                  session=session)

pca=sagemaker.PCA(role,instance_count=1,
                                  instance_type= 'ml.m5.large',
                                  output_path=output_path,
                                 session=session,
                  num_components=len(df.columns)-1, use_spot_instances=True,
                                      max_run=3600,
                                      max_wait=3600  )

In [132]:
train_data=pca.record_set(df.to_numpy())

In [133]:
pca.fit(train_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-04-26 19:22:43 Starting - Starting the training job...
2021-04-26 19:23:06 Starting - Launching requested ML instancesProfilerReport-1619464963: InProgress
.........
2021-04-26 19:24:27 Starting - Preparing the instances for training......
2021-04-26 19:25:27 Downloading - Downloading input data...
2021-04-26 19:26:12 Training - Training image download completed. Training in progress.
2021-04-26 19:26:12 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/26/2021 19:26:09 INFO 139959299233600] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[04/26/2021 19:26:09 INFO 139959299233600]

[34m[04/26/2021 19:26:10 INFO 139959299233600] nvidia-smi: took 0.080 seconds to run.[0m
[34m[04/26/2021 19:26:10 INFO 139959299233600] nvidia-smi identified 0 GPUs.[0m
[34m[04/26/2021 19:26:10 INFO 139959299233600] Number of GPUs being used: 0[0m
[34m[04/26/2021 19:26:10 INFO 139959299233600] The default executor is <PCAExecutor on cpu(0)>.[0m
[34m[04/26/2021 19:26:10 INFO 139959299233600] 35 feature(s) found in 'data'.[0m
[34m[04/26/2021 19:26:10 INFO 139959299233600] <PCAExecutor on cpu(0)> is assigned to batch slice from 0 to 499.[0m
[34m#metrics {"StartTime": 1619465169.5788536, "EndTime": 1619465170.5885084, "Dimensions": {"Algorithm": "PCA", "Host": "algo-1", "Operation": "training"}, "Metrics": {"initialize.time": {"sum": 994.7359561920166, "count": 1, "min": 994.7359561920166, "max": 994.7359561920166}}}
[0m
[34m#metrics {"StartTime": 1619465170.588752, "EndTime": 1619465170.5888026, "Dimensions": {"Algorithm": "PCA", "Host": "algo-1", "Operation": "training", "

In [136]:
# modelartefacts='s3://sagemaker-us-east-1-032934527328/pca_popsegmentation/output/pca-2021-04-26-19-22-43-330/output/model.tar.gz'
session.download_data('.',bucket,prefix)

In [137]:
import os

In [138]:
os.system('tar -zxvf output/pca-2021-04-26-19-22-43-330/output/model.tar.gz')

0

In [139]:
import mxnet as mx

# loading the unzipped artifacts
pca_model_params = mx.ndarray.load('model_algo-1')

In [149]:
print(pca_model_params['s'])


[1.8879069e-02 3.0862935e-02 3.2307226e-02 3.6436137e-02 9.4506517e-02
 1.2697606e-01 4.0285927e-01 1.4084677e+00 1.5070882e+00 1.5955406e+00
 1.7701308e+00 2.1626899e+00 2.2944586e+00 2.3853483e+00 2.6943908e+00
 2.8023958e+00 3.0124876e+00 3.3771589e+00 3.5494158e+00 3.6944907e+00
 4.1888704e+00 4.3453588e+00 4.5345011e+00 4.9961367e+00 5.5717435e+00
 5.9102077e+00 6.3113518e+00 7.5891728e+00 7.9830227e+00 9.4362059e+00
 1.1524720e+01 1.2467290e+01 1.4016885e+01 1.9851097e+01]
<NDArray 34 @cpu(0)>


In [150]:
pca_transformer=pca.transformer(instance_count=1,
                                instance_type='ml.m5.large',
                               output_path=f's3://{bucket}/{prefix}/transformed_result')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [157]:
df.to_csv('cleaned_train_data.csv',header=None,index=None)

In [159]:
cleaned_train_data=session.upload_data('cleaned_train_data.csv',
                    bucket=bucket,
                    key_prefix=prefix)

In [160]:
pca_transformer.transform(cleaned_train_data,
                          content_type='text/csv; label_size=0',
                          split_type='Line'
                 )

............................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] nvidia-smi: took 0.032 seconds to run.[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] nvidia-smi identified 0 GPUs.[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] loading entry points[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] Loaded iterator creator application/x-labeled-vector-protobuf for content type ('application/x-labeled-vector-protobuf', '1.0')[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] Loaded iterator creator application/x-recordio-protobuf for content type ('application/x-recordio-protobuf', '1.0')[0m
[34m[04/26/2021 20:58:35 INFO 140427457681216] Loaded iterator creator protobuf for content type ('protobuf', '1.0')[0m
[34m[04/26/2021 20:58

In [163]:
!aws s3 cp  $pca_transformer.output_path/cleaned_train_data.csv.out .

download: s3://sagemaker-us-east-1-032934527328/pca_popsegmentation/transformed_result/cleaned_train_data.csv.out to ./cleaned_train_data.csv.out


In [202]:
with open('cleaned_train_data.csv.out','r') as f:
    transformedoutput=f.readlines()
    newtransformedoutput=map(json.loads,transformedoutput)

In [203]:
counties_transformed=pd.DataFrame([x['projection'] for x in list(newtransformedoutput)])

In [213]:
# we decide to select last 7 components of the  PCA components.. 
# because they explain about 80 percent of the variance
# Importance is in descending order thats why last 7 components
counties_transformed=counties_transformed.iloc[:,-7:]

In [209]:
counties_transformed.columns=['c_'+str(x) for x in range(1,8)]

In [211]:
counties_transformed.index=df.index

In [212]:
counties_transformed

Unnamed: 0_level_0,c_1,c_2,c_3,c_4,c_5,c_6,c_7
State-County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama-Autauga,0.072915,-0.015703,0.033533,-0.190956,0.058183,-0.388894,0.153362
Alabama-Baldwin,0.106279,-0.077228,0.160212,-0.193508,-0.028395,-0.347913,0.237643
Alabama-Barbour,-0.048456,0.081486,-0.098688,-0.046531,0.160499,-0.671719,-0.387702
Alabama-Bibb,0.047214,0.270519,0.132384,0.109231,0.150770,-0.476888,0.031153
Alabama-Blount,0.262847,0.101598,0.272871,0.019787,-0.062181,-0.385699,0.179444
...,...,...,...,...,...,...,...
Puerto Rico-Vega Baja,0.077570,-0.089944,0.300820,0.026268,-0.413248,0.373019,-1.341685
Puerto Rico-Vieques,-0.066717,-0.140418,0.132614,0.139680,0.418233,0.653648,-1.449128
Puerto Rico-Villalba,0.144229,-0.051206,0.250212,0.193490,-0.028249,0.436355,-1.556741
Puerto Rico-Yabucoa,0.168605,-0.115449,0.233989,0.128835,-0.139306,0.410299,-1.536947


# K Means Clustering

In [216]:
prefix='k-means-clustering'
data_dir='kmeans'
output_path=f's3://{bucket}/{prefix}/output'
os.makedirs(data_dir)

In [217]:
counties_transformed.to_csv(
    os.path.join(data_dir,'pca_output_for_kmeans_input.csv'),
                 header=None,index=None)

In [219]:
kmeans_train=session.upload_data(os.path.join(data_dir,'pca_output_for_kmeans_input.csv'),
               bucket=bucket,
                   key_prefix=prefix)

In [228]:
container=sagemaker.image_uris.retrieve('kmeans',session.boto_region_name)
kmeans=sagemaker.estimator.Estimator(image_uri=container,
                                role=role,instance_count=1,
                                  instance_type= 'ml.m5.large',
                                  output_path=output_path,
                                 session=session,
                                     use_spot_instances=True,
                                   max_run=3600,
                                      max_wait=3600  )
kmeans.set_hyperparameters(feature_dim=7,k=8,epochs=5)

In [229]:
s3_input_train=sagemaker.inputs.TrainingInput(s3_data=kmeans_train,
                                  content_type='text/csv;label_size=0')

In [231]:
kmeans.fit({'train':s3_input_train})

2021-04-26 22:40:36 Starting - Starting the training job...
2021-04-26 22:41:00 Starting - Launching requested ML instancesProfilerReport-1619476836: InProgress
......
2021-04-26 22:42:00 Starting - Preparing the instances for training.....................
2021-04-26 22:45:34 Downloading - Downloading input data...
2021-04-26 22:46:02 Training - Training image download completed. Training in progress.
2021-04-26 22:46:02 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/26/2021 22:46:00 INFO 140712629991040] Reading default configuration from /opt/amazon/lib/python3.6/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_si


2021-04-26 22:46:21 Completed - Training job completed
Training seconds: 37
Billable seconds: 16
Managed Spot Training savings: 56.8%


In [232]:
kmeans_transformer=kmeans.transformer(instance_count=1,
                                      instance_type='ml.m5.large',
                                      output_path=f's3://{bucket}/{prefix}/transformed_result')

In [233]:
kmeans_transformer.transform(kmeans_train,
                          content_type='text/csv; label_size=0',
                          split_type='Line'
                 )

.............................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loading entry points[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] Loaded iterator creator application/x-recordio-protobuf for content type ('application/x-recordio-protobuf', '1.0')[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded request iterator application/json[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded request iterator application/jsonlines[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded request iterator application/x-recordio-protobuf[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded request iterator text/csv[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded response encoder application/json[0m
[34m[04/26/2021 22:55:29 INFO 140643332126336] loaded response encoder application/jsonlines[0m
[34m[04/26/2021 22:55:29 INFO 14064333

In [234]:
!aws s3 cp  $kmeans_transformer.output_path/pca_output_for_kmeans_input.csv.out data_dir/

Completed 215.0 KiB/215.0 KiB (2.9 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-032934527328/k-means-clustering/transformed_result/pca_output_for_kmeans_input.csv.out to data_dir/pca_output_for_kmeans_input.csv.out


In [261]:
with open('data_dir/pca_output_for_kmeans_input.csv.out','r') as f:
    transformedoutput=f.readlines()
    newtransformedoutput=map(json.loads,transformedoutput)

In [264]:
# outputdf=pd.DataFrame([x for x in list(newtransformedoutput)])
# outputdf=outputdf.set_index(df.index)
outputdf

Unnamed: 0_level_0,closest_cluster,distance_to_cluster
State-County,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama-Autauga,7.0,0.393980
Alabama-Baldwin,7.0,0.367507
Alabama-Barbour,6.0,0.355437
Alabama-Bibb,1.0,0.394545
Alabama-Blount,1.0,0.370283
...,...,...
Puerto Rico-Vega Baja,5.0,0.293044
Puerto Rico-Vieques,5.0,0.654466
Puerto Rico-Villalba,5.0,0.324363
Puerto Rico-Yabucoa,5.0,0.268074


In [281]:
df['cluster']=outputdf['closest_cluster']
df['disttocluster']=outputdf['distance_to_cluster']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [286]:
# df[['cluster','disttocluster','Income']].sort_values(['Income'])

In [291]:
df[df.cluster==7.0][['cluster','disttocluster','Income']].sort_values(['Income'])

Unnamed: 0_level_0,cluster,disttocluster,Income
State-County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Idaho-Madison,7.0,0.355261,32233.0
Missouri-New Madrid,7.0,0.405688,33630.0
Michigan-Iron,7.0,0.210164,33663.0
Mississippi-Tishomingo,7.0,0.324832,35143.0
Montana-Lincoln,7.0,0.365433,35275.0
...,...,...,...
Colorado-Summit,7.0,0.393541,67983.0
Minnesota-Dodge,7.0,0.328202,68116.0
Alaska-Fairbanks North Star Borough,7.0,0.513391,71068.0
North Dakota-Stark,7.0,0.422201,72099.0
