In [9]:
import sys

sys.path.append('../')

import pickle
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sagemaker import KMeans
import os
import numpy as np
from helper.utils import create_dir, clustering, save_data
import matplotlib.pyplot as plt
import random
import sagemaker
from sagemaker.pytorch import PyTorch
from glob import glob
from sklearn.metrics import accuracy_score
import boto3

# --- Labelling ---------------------------------------------------------------
# process() labels a row BUY when Label >= threshold, SELL when
# Label <= -threshold, and NONE otherwise.
threshold = 0.4
BUY, SELL, NONE = 1, 2, 3

# --- AWS environment ---------------------------------------------------------
# Set before the boto3 / SageMaker clients below are created.
os.environ.update({
    'AWS_PROFILE': "aws-personal",
    'AWS_DEFAULT_REGION': "us-west-2",
})

# --- SageMaker handles and project locations ---------------------------------
iam = boto3.client('iam')
# ARN of the execution role handed to the SageMaker estimators.
role = iam.get_role(
    RoleName="AmazonSageMaker-ExecutionRole-20191130T020687",
)['Role']['Arn']
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
local_data_folder = '../data'  # local root for input pickles and generated CSVs
prefix = "udacity-capstone-project"  # key prefix used inside the S3 bucket

In [2]:
def create_dir(dir):
    """Create directory ``dir`` together with any missing parents.

    Does nothing when the directory already exists. Returns ``None``.
    """
    # The parameter shadows the ``dir`` builtin; the name is kept so that
    # existing keyword callers keep working.
    os.makedirs(name=dir, exist_ok=True)

# Generate clusters for data
def clustering(data, kmeans_predictor):
    """Assign every row of ``data`` to a cluster using a k-means predictor.

    Args:
        data: 2-D array-like of numeric features, one row per sample. It is
            converted to a float32 array before being sent to the predictor.
        kmeans_predictor: object whose ``predict`` method accepts that array
            and returns one record per row; the cluster id is read from
            ``record.label["closest_cluster"].float32_tensor.values[0]``.

    Returns:
        pandas.DataFrame with a single column ``"Cluster"`` holding the
        cluster id of each input row, in input order, with a default index.

    Raises:
        AssertionError: if the predictor returns a different number of
            records than there are rows in ``data``.
    """
    features = pd.DataFrame(data).astype('float32').values
    records = kmeans_predictor.predict(features)
    cluster_ids = [
        record.label["closest_cluster"].float32_tensor.values[0]
        for record in records
    ]

    assert len(cluster_ids) == len(data), "Length mis-match with clustering and input data"

    # Only the cluster column is returned; callers concatenate it with their
    # own feature frame.
    return pd.DataFrame(cluster_ids, columns=["Cluster"])

# save data to local dir
def save_data(cluster_data, folder_name):
    """Split ``cluster_data`` into train/validation parts and write them as CSV.

    The ``"Label"`` column is written first, followed by the remaining
    columns; neither header nor index is written. 33% of the rows go to the
    validation file, using a shuffled split with a fixed ``random_state``.

    Args:
        cluster_data: DataFrame that contains a ``"Label"`` column.
        folder_name: sub-folder of ``<local_data_folder>/s3/`` that receives
            ``train.csv`` and ``validation.csv`` (created when missing).
    """
    labels = cluster_data[["Label"]]
    features = cluster_data.drop(columns=["Label"])

    target_dir = local_data_folder + '/s3/' + folder_name
    create_dir(target_dir)

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=.33, random_state=1, shuffle=True)

    outputs = (
        ('/train.csv', y_train, x_train),
        ('/validation.csv', y_test, x_test),
    )
    for file_name, label_part, feature_part in outputs:
        label_first = pd.concat([label_part, feature_part], axis=1)
        label_first.to_csv(target_dir + file_name, header=False, index=False)
        
def generate_NN_predictor(ticker, input_dim=26, hidden_dim=260, output_dim=1, epochs=200):
    """Train the PyTorch model for ``ticker`` on SageMaker and deploy it.

    Training and validation data are read from
    ``s3://<bucket>/<prefix>/data/<ticker>/train.csv`` and ``validation.csv``.

    Args:
        ticker: name of the S3 data sub-folder to train on.
        input_dim: number of input features passed to ``train.py``.
        hidden_dim: hidden-layer width passed to ``train.py``.
        output_dim: number of model outputs passed to ``train.py``.
        epochs: number of training epochs passed to ``train.py``.

    Returns:
        The predictor returned by ``estimator.deploy``. A new endpoint is
        deployed on every call and this function does not delete it; the
        caller is expected to do so.
    """
    data_location = 's3://{}/{}/data/{}'.format(bucket, prefix, ticker)
    s3_input_train = sagemaker.s3_input(
        s3_data=data_location + '/train.csv', content_type='text/csv')
    s3_input_validation = sagemaker.s3_input(
        s3_data=data_location + '/validation.csv', content_type='text/csv')

    estimator = PyTorch(entry_point='train.py',
                        source_dir='pytorch',  # folder that contains train.py
                        role=role,
                        framework_version='1.0',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        sagemaker_session=sagemaker_session,
                        hyperparameters={
                            'input_dim': input_dim,
                            'hidden_dim': hidden_dim,
                            'output_dim': output_dim,
                            'epochs': epochs,
                        })
    estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})
    return estimator.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

def generate_random_direction():
    """Return a random direction used as the benchmark prediction.

    One value is drawn with ``random.random()``: BUY when it is >= .7,
    SELL when it is <= .3, and NONE otherwise.
    """
    draw = random.random()
    if draw >= .7:
        return BUY
    if draw <= .3:
        return SELL
    return NONE


In [3]:
def _predict_with_cluster_model(ticker, cluster_df, feature_frames):
    """Train a network on one cluster's rows and score each frame in ``feature_frames``.

    Writes the cluster's train/validation CSVs with ``save_data``, uploads the
    ticker folder to S3, trains and deploys a model with
    ``generate_NN_predictor`` and returns one prediction result per frame, in
    the order given. The endpoint is deleted even when a prediction fails.
    """
    save_data(cluster_df.drop(columns=["direction"]), ticker)
    sagemaker_session.upload_data(path=local_data_folder + '/s3/' + ticker, bucket=bucket,
                                  key_prefix=prefix + '/data/' + ticker)
    predictor = generate_NN_predictor(ticker)
    try:
        return [predictor.predict(frame.astype('float32').values) for frame in feature_frames]
    finally:
        predictor.delete_endpoint(predictor.endpoint)


def _assign_cluster_roles(cluster_selection_df):
    """Greedily pick distinct clusters for BUY, SELL and NONE from the label counts.

    Returns the tuple ``(buy_cluster_name, sell_cluster_name, none_cluster_name)``.
    """
    # idxmax returns the row *label*, so it stays correct after rows have been
    # dropped. A positional argmax on the reduced frame must not be used to
    # index the full index: it can select an already assigned cluster.
    buy_cluster_name = cluster_selection_df['BUY'].idxmax()
    remaining = cluster_selection_df.drop(index=[buy_cluster_name])
    sell_cluster_name = remaining['SELL'].idxmax()
    none_cluster_name = remaining.drop(index=[sell_cluster_name])['NONE'].idxmax()
    return buy_cluster_name, sell_cluster_name, none_cluster_name


def process(ticker):
    """Run the cluster-then-predict pipeline for one ticker and record its accuracy.

    Steps:
      1. Load ``<local_data_folder>/<ticker>.pkl``, drop rows with missing
         values and the "Date" column, and derive "direction"
         (BUY / SELL / NONE) from "Label" and ``threshold``.
      2. Min-max scale features and label, then split into train and test.
      3. Train and deploy a 3-cluster SageMaker k-means model on the training
         features, assign a cluster to each training row, and write
         ``all-train.csv`` / ``all-test.csv``.
      4. For each cluster, train and deploy a network on that cluster's rows
         and store its predictions for all train and test rows in
         ``cluster<k>_pred``.
      5. Map each cluster to BUY, SELL or NONE by its direction counts; a row's
         prediction is the role of the cluster whose network gave the highest
         value.
      6. Compare with a random benchmark and append the accuracies to
         ``<local_data_folder>/accuracy.csv``.

    Side effects: trains models and deploys (then deletes) SageMaker
    endpoints, uploads to S3, writes CSV files below
    ``<local_data_folder>/s3/<ticker>/`` and prints the accuracies.

    Args:
        ticker: base name of the pickle file to process.

    Returns:
        None.
    """
    n_clusters = 3
    ticker_dir = '{}/s3/{}'.format(local_data_folder, ticker)

    df = pd.read_pickle('{}/{}.{}'.format(local_data_folder, ticker, 'pkl'))
    df.dropna(inplace=True)
    df.drop(columns=["Date"], inplace=True)
    df.loc[df.Label >= threshold, 'direction'] = BUY
    df.loc[df.Label <= -threshold, 'direction'] = SELL
    df.loc[(df.Label < threshold) & (df.Label > -threshold), 'direction'] = NONE

    # Normalize.
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split below, so test rows influence the scaling - confirm this is intended.
    scaler = MinMaxScaler()

    Y_df = pd.DataFrame(df["Label"]).astype('float64')
    X_df = df.drop(columns=["Label"]).astype('float64')

    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)  # fit_transform refits the scaler on the label

    # Restore the unscaled direction codes; assumes "direction" is the last
    # column of X_df (it is appended above) - TODO confirm for pickles that
    # already contain a "direction" column.
    X[:, -1] = X_df["direction"].to_numpy()

    #### split data
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

    # Every column except the trailing "direction" is a model feature.
    feature_columns = ['col-{}'.format(i) for i in range(x_train.shape[1] - 1)]

    # clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=n_clusters)

    # Remove direction column and train
    kmeans.fit(kmeans.record_set(x_train[:, :-1].astype('float32')))

    # deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")
    try:
        create_dir(ticker_dir)

        # Label     = scaled change in price
        # direction = BUY, SELL, NONE
        # Cluster   = k-means cluster id (0, 1, 2)

        # train data
        y_train_df = pd.DataFrame(y_train, columns=["Label"])
        x_train_df = pd.DataFrame(x_train, columns=feature_columns + ["direction"])
        dataset_with_cluster = pd.concat([
            y_train_df.astype("float32"),
            x_train_df.astype("float32"),
            clustering(x_train_df.drop(columns=["direction"]).astype('float32').values, kmeans_predictor),
        ], axis=1)
        dataset_with_cluster.to_csv(ticker_dir + '/all-train.csv', header=True, index=False)

        # test data
        y_test_df = pd.DataFrame(y_test, columns=["Label"])
        x_test_df = pd.DataFrame(x_test, columns=feature_columns + ['direction'])
        pd.concat([y_test_df.astype("float32"), x_test_df.astype("float32")], axis=1)\
            .to_csv(ticker_dir + '/all-test.csv', header=True, index=False)
    finally:
        # clean clustering end point, also when something above failed
        kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)

    all_test_pred = pd.read_csv(ticker_dir + '/all-test.csv').dropna()
    all_train_pred = pd.read_csv(ticker_dir + '/all-train.csv').dropna()

    # One network per cluster; each one scores every train and test row.
    pred_columns = ["cluster{}_pred".format(cluster_id) for cluster_id in range(n_clusters)]
    cluster_dfs = []
    for cluster_id, pred_column in enumerate(pred_columns):
        cluster_df = dataset_with_cluster[dataset_with_cluster["Cluster"] == cluster_id].drop(columns=["Cluster"])
        cluster_dfs.append(cluster_df)
        test_scores, train_scores = _predict_with_cluster_model(
            ticker, cluster_df, [all_test_pred[feature_columns], all_train_pred[feature_columns]])
        all_test_pred[pred_column] = test_scores
        all_train_pred[pred_column] = train_scores

    # The per-cluster training files are only needed for the SageMaker jobs.
    os.remove(ticker_dir + '/train.csv')
    os.remove(ticker_dir + '/validation.csv')

    # Number of BUY / SELL / NONE training rows in every cluster.
    cluster_selection_df = pd.DataFrame(
        {
            name: [int((cluster_df['direction'] == direction).sum()) for cluster_df in cluster_dfs]
            for name, direction in (("BUY", BUY), ("SELL", SELL), ("NONE", NONE))
        },
        columns=["BUY", "SELL", "NONE"],
        index=pred_columns,
    )

    buy_cluster_name, sell_cluster_name, none_cluster_name = _assign_cluster_roles(cluster_selection_df)
    cluster_roles = (
        (buy_cluster_name, BUY),
        (sell_cluster_name, SELL),
        (none_cluster_name, NONE),
    )

    for pred_df in (all_test_pred, all_train_pred):
        # Generate selected-cluster column based on max(cluster0, cluster1, cluster2)
        pred_df["selected-cluster"] = pred_df[pred_columns].idxmax(axis=1)

        # convert selected-cluster to BUY, SELL, NONE
        for cluster_name, direction in cluster_roles:
            pred_df.loc[pred_df["selected-cluster"] == cluster_name, "prediction"] = direction

        # Bench mark results
        pred_df["random-prediction"] = [generate_random_direction() for _ in range(pred_df.shape[0])]

    all_test_pred.to_csv(ticker_dir + '/all-test-pred.csv', index=None)
    all_train_pred.to_csv(ticker_dir + '/all-train-pred.csv', index=None)
    cluster_selection_df.to_csv(ticker_dir + '/cluster-selection.csv', index=None)

    # remove NA
    all_test_pred = all_test_pred.dropna()
    all_train_pred = all_train_pred.dropna()

    # test accuracy
    test_accuracy = accuracy_score(all_test_pred["direction"], all_test_pred["prediction"], normalize=True)
    benchmark_test_accuracy = accuracy_score(all_test_pred["direction"], all_test_pred["random-prediction"], normalize=True)
    print('Test accuracy:', test_accuracy, ", Benchmark:", benchmark_test_accuracy)

    # train accuracy
    train_accuracy = accuracy_score(all_train_pred["direction"], all_train_pred["prediction"], normalize=True)
    benchmark_train_accuracy = accuracy_score(all_train_pred["direction"], all_train_pred["random-prediction"], normalize=True)
    print('Train accuracy:', train_accuracy, ", Benchmark:", benchmark_train_accuracy)

    accuracy_df = pd.DataFrame([ticker, test_accuracy, benchmark_test_accuracy, train_accuracy, benchmark_train_accuracy]).T
    accuracy_df.columns = ["ticker", "test_accuracy", "benchmark_test_accuracy", "train_accuracy", "benchmark_train_accuracy"]

    # Append to the shared accuracy file; write the header only when creating it.
    accuracy_file = "{}/accuracy.csv".format(local_data_folder)
    header = not os.path.exists(accuracy_file)
    accuracy_df.to_csv(accuracy_file, mode="a", header=header, index=False)


In [None]:
# Ticker names are the CSV file names in the data folder, without folder and extension.
# NOTE(review): process() reads "<ticker>.pkl" while this looks for "*.csv" - confirm
# that both files exist for every ticker.
tickers = [
    path.replace(local_data_folder + '/', '').replace('.csv', '')
    for path in glob(local_data_folder + "/*.csv")
]

# Tickers whose run raised. Presumably the error seen so far is
# "ValueError: Classification metrics can't handle a mix of multiclass and continuous targets".
problematic_tickers = []
skip = ["accuracy", "nyse", "nyse-volume"]
# NOTE(review): this overrides the discovered list, so only AES is processed.
tickers = ["AES"]

for ticker in tickers:
    if ticker in skip:
        continue
    try:
        print('Processing:', ticker)
        process(ticker)
    except Exception:
        # Keep going with the next ticker on ordinary errors. KeyboardInterrupt and
        # SystemExit are deliberately not caught, so a long run can still be stopped.
        print(sys.exc_info())
        print("Failed to process", ticker)
        problematic_tickers.append(ticker)

In [10]:
# Show the saved AES test-set predictions, leaving out rows that contain missing values.
aes_test_predictions = pd.read_csv(local_data_folder + "/s3/AES/all-test-pred.csv")
aes_test_predictions.dropna()

Unnamed: 0,Label,col-0,col-1,col-2,col-3,col-4,col-5,col-6,col-7,col-8,...,col-23,col-24,col-25,direction,cluster0_pred,cluster1_pred,cluster2_pred,selected-cluster,prediction,random-prediction
2,0.532338,0.384819,0.401575,0.394850,0.381739,0.246674,0.131345,0.483373,0.345514,0.502963,...,0.539707,0.188631,0.397627,3.0,0.538920,0.512824,0.518696,cluster0_pred,3.0,3
5,0.466593,0.734334,0.723535,0.724463,0.714783,0.726679,0.090099,0.446365,0.526126,0.547170,...,0.509809,0.798943,0.559521,3.0,0.618808,0.542766,0.503847,cluster0_pred,3.0,3
7,0.609881,0.106796,0.125984,0.134764,0.134783,0.107266,0.106605,0.618496,0.552073,0.757223,...,0.412542,0.313620,0.458556,3.0,0.676022,0.543699,0.500737,cluster0_pred,3.0,2
11,0.599991,0.295675,0.297463,0.315880,0.303478,0.327261,0.107681,0.546835,0.515215,0.707869,...,0.785531,0.516822,0.224095,3.0,0.502095,0.531041,0.521752,cluster1_pred,2.0,3
12,0.528030,0.225066,0.223097,0.247210,0.229565,0.177703,0.138416,0.558185,0.488005,0.588928,...,0.210285,0.355501,0.446895,3.0,0.549786,0.572830,0.512083,cluster1_pred,2.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,0.523722,0.233010,0.237095,0.246352,0.241739,0.186750,0.149299,0.657267,0.410450,0.574421,...,0.132856,0.365340,0.595813,3.0,0.540685,0.575072,0.504585,cluster1_pred,2.0,3
813,0.498150,0.428067,0.423447,0.436910,0.420000,0.434738,0.096155,0.517686,0.439445,0.659949,...,0.856979,0.617662,0.277397,3.0,0.527641,0.532270,0.521972,cluster1_pred,2.0,3
815,0.608082,0.454545,0.461067,0.472103,0.452174,0.322536,0.102563,0.523919,0.539844,0.588591,...,0.700966,0.279555,0.532688,3.0,0.543163,0.507396,0.523020,cluster0_pred,3.0,3
816,0.675081,0.614298,0.614173,0.618884,0.608696,0.643621,0.082245,0.547632,0.327212,0.406166,...,0.223341,0.835858,0.445118,3.0,0.520614,0.542404,0.514606,cluster1_pred,2.0,2
