In [1]:
# Training in AWS
import sagemaker
import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sagemaker import KMeans

# AWS handles used by every later cell.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Local layout / naming settings.
local_data_folder = './data/'
prefix = "udacity-capstone-project"
ticker = 'A'

# Load the pickled frame for this ticker, discard rows containing NaNs and
# remove the "Date" column so that only numeric columns remain.
# NOTE: read_pickle executes whatever the file contains -- only load pickles
# that were produced by a trusted step of this project.
ticker_pickle_path = local_data_folder + ticker + '.pkl'
A_df = (
    pd.read_pickle(ticker_pickle_path)
    .dropna()
    .drop(columns=["Date"])
)

print(A_df.describe())
print(A_df.columns)

              Open         High          Low        Close    Adj Close  \
count  2482.000000  2482.000000  2482.000000  2482.000000  2482.000000   
mean     44.292787    44.709990    43.866951    44.302997    42.602746   
std      16.937181    17.033162    16.824473    16.928553    17.487494   
min      19.334764    19.706724    19.084406    19.291845    17.654158   
25%      30.758226    31.058655    30.332619    30.745708    28.350945   
50%      40.207439    40.520000    39.841316    40.164520    38.219450   
75%      60.147501    60.612499    59.654999    60.137500    58.809592   
max      85.680000    85.680000    85.110001    85.449997    85.269928   

             Volume        N - 1        N - 2        N - 3        N - 4  ...  \
count  2.482000e+03  2482.000000  2482.000000  2482.000000  2482.000000  ...   
mean   3.389063e+06     0.025978     0.052151     0.078428     0.104745  ...   
std    2.242013e+06     0.711997     0.981776     1.193218     1.372635  ...   
min    2.7190

In [4]:
# Normalize
# Features and label each get their own MinMaxScaler. Previously a single
# scaler was fitted on X and then re-fitted on Y, which discarded the feature
# min/max and made it impossible to scale new feature rows (or invert X) later.
# `scaler` still ends up fitted on the label, exactly as before, so any code
# that inverse-transforms predictions with it keeps working.
# NOTE(review): both scalers are fitted on the full data set before the
# train/test split, so test-set statistics leak into the scaling -- confirm
# whether that is acceptable for this experiment.
feature_scaler = MinMaxScaler()
scaler = MinMaxScaler()

Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
X_df = A_df.drop(columns=["Label"]).astype('float64')

X = feature_scaler.fit_transform(X_df)
Y = scaler.fit_transform(Y_df)

In [5]:
# split data
# Hold out 33% of the rows; the fixed random_state keeps the shuffle reproducible.
split_options = dict(test_size=.33, random_state=1, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [6]:
%%time

# clustering
s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)

kmeans = KMeans(role=role,
               train_instance_count=1,
               train_instance_type="ml.m4.xlarge",
               output_path=s3_output_folder,
               k=3)

kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))


2020-02-10 02:21:48 Starting - Starting the training job...
2020-02-10 02:21:51 Starting - Launching requested ML instances...
2020-02-10 02:22:48 Starting - Preparing the instances for training.........
2020-02-10 02:24:09 Downloading - Downloading input data
2020-02-10 02:24:09 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34m[02/10/2020 02:24:31 INFO 140049681737536] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'["msd"]', u'_num_kv_se

In [7]:
# deploy
# Show where the trained model artifact lives, then host it on one instance.
print(kmeans.model_data)

endpoint_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m4.xlarge",
}
kmeans_predictor = kmeans.deploy(**endpoint_settings)

s3://sagemaker-us-west-2-825285592721/udacity-capstone-project/output/kmeans-2020-02-10-02-21-48-724/output/model.tar.gz
-------------------!

In [38]:
# Generate clusters for data
def clustering(data):
    """Assign each row of `data` to its closest K-Means cluster.

    The rows are cast to float32 and sent to the deployed `kmeans_predictor`
    endpoint; the "closest_cluster" value of every returned record is collected.

    Args:
        data: 2-D array-like of feature rows (anything `pd.DataFrame` accepts).

    Returns:
        A DataFrame with a single column "cat" holding one cluster id per input
        row, indexed 0..len(data)-1.

    Raises:
        AssertionError: if the endpoint returns a different number of records
            than there are input rows.
    """
    features = pd.DataFrame(data).astype('float32').values
    records = kmeans_predictor.predict(features)
    cluster_ids = [
        record.label["closest_cluster"].float32_tensor.values[0]
        for record in records
    ]

    assert len(cluster_ids) == len(data), "Length mis-match with clustering and input data"

    return pd.DataFrame(cluster_ids, columns=["cat"])

# save data to local dir
def save_data(cluster_data, folder_name, split_data=True):
    """Write one cluster's rows as header-less CSV files under `<local_data_folder>s3/<folder_name>`.

    Every file has the "label" column first, followed by the feature columns;
    the "cat" (cluster id) column is not written.

    Args:
        cluster_data: DataFrame containing "label", "cat" and the feature columns.
        folder_name: sub-folder (relative to `<local_data_folder>s3/`) to write into;
            it is created if missing.
        split_data: when True, the rows are split 67/33 (fixed seed) into
            `train.csv` and `validation.csv`; when False, all rows go to
            `all-test.csv`.
    """
    target_dir = local_data_folder + 's3/' + folder_name
    labels = cluster_data[["label"]]
    features = cluster_data.drop(columns=["label", "cat"])
    createDir(target_dir)

    def _write(label_part, feature_part, file_name):
        # Label column goes first, then the features, without header or index.
        pd.concat([pd.DataFrame(label_part), pd.DataFrame(feature_part)], axis=1)\
            .to_csv(target_dir + file_name, header=False, index=False)

    if not split_data:
        _write(labels, features, '/all-test.csv')
        return

    feat_fit, feat_val, label_fit, label_val = train_test_split(
        features, labels, test_size=.33, random_state=1, shuffle=True)
    _write(label_fit, feat_fit, '/train.csv')
    _write(label_val, feat_val, '/validation.csv')


In [None]:
def createDir(dir):
    """Create directory `dir`, including any missing parents.

    Calling it for a directory that already exists is a no-op.

    Args:
        dir: path of the directory to create.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    # Imported here because the notebook's first import cell does not import
    # `os`; without this a fresh-kernel "Run All" fails with NameError.
    import os

    os.makedirs(dir, exist_ok=True)

# Number of clusters to export; must match the `k` the K-Means model was trained with.
n_clusters = 3


def export_clustered_split(features, labels, split_name, split_data):
    """Cluster one data split and write it to the local S3 staging folder.

    Writes `<local_data_folder>s3/<ticker>/all-<split_name>.csv` (label, features,
    cluster id; no header) and then one sub-folder per cluster via `save_data`.

    Args:
        features: 2-D array of feature rows for this split.
        labels: array of labels, one per feature row.
        split_name: "train" or "test"; used in file and folder names.
        split_data: forwarded to `save_data` (True -> train/validation files,
            False -> a single all-test.csv per cluster).

    Returns:
        The combined DataFrame with columns "label", the features and "cat".
    """
    dataset_with_cluster = pd.concat(
        [
            pd.DataFrame(labels, columns=["label"]).astype("float32"),
            pd.DataFrame(features).astype("float32"),
            clustering(features),
        ],
        axis=1,
    )
    dataset_with_cluster.to_csv(
        '{}s3/{}/all-{}.csv'.format(local_data_folder, ticker, split_name),
        header=False, index=False)

    # prepare cluster data sets
    createDir('{}s3/{}/{}'.format(local_data_folder, ticker, split_name))
    for cluster_id in range(n_clusters):
        save_data(
            dataset_with_cluster[dataset_with_cluster["cat"] == cluster_id],
            "{}/{}/cluster-{}".format(ticker, split_name, cluster_id),
            split_data,
        )
    return dataset_with_cluster


try:
    createDir('{}s3/{}'.format(local_data_folder, ticker))

    # Training split: each cluster is further divided into train/validation files.
    export_clustered_split(x_train, y_train, "train", True)

    # The test rows are assigned to clusters as well, so each cluster's
    # downstream model can be evaluated on the matching test rows.
    dataset_with_cluster = export_clustered_split(x_test, y_test, "test", False)
finally:
    # delete endpoint
    # Runs even when an export step fails, so the hosted instance is not left
    # running. delete_endpoint() already targets this predictor's endpoint; its
    # only optional argument is a boolean, so the endpoint name must not be
    # passed.
    kmeans_predictor.delete_endpoint()

print('Completed clustering for', ticker)

In [82]:
# Deletes the local ./data/s3 staging tree, i.e. the folder save_data writes its CSVs into.
# NOTE(review): this is destructive and its execution count shows it was run out of order;
# the final upload cell reads ./data/s3, so the data must be regenerated after running this.
!rm -fR ./data/s3

In [None]:
import sagemaker
import os
from helper.cluster import cluster_helper

# AWS handles.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Local layout / naming settings (same values as the first cell).
ticker = 'A'
prefix = "udacity-capstone-project"
local_data_folder = './data/'

# Delegate the split / cluster / export steps to the project helper.
cluster_helper(
    role,
    sagemaker_session,
    bucket,
    local_data_folder,
    prefix,
    ticker,
)


Splitting data
Clustering
2020-02-10 04:17:59 Starting - Starting the training job.

In [None]:
# Upload everything under <local_data_folder>s3 to the bucket below <prefix>/data
# and display the resulting S3 location.
s3_data_folder = sagemaker_session.upload_data(
    path='{}s3'.format(local_data_folder),
    bucket=bucket,
    key_prefix='{}/data'.format(prefix),
)
s3_data_folder