In [None]:
! pip install scikit-learn==0.23.2  # make sure sklearn of needed version is here

In [None]:
import boto3
import numpy as np
import sagemaker
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, log_loss
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, roc_auc_score, cohen_kappa_score
import os
import joblib

import config

# SageMaker session and the AWS region of its underlying boto session
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level SageMaker API client (the S3 client is created separately further down)
sm_boto3 = boto3.client('sagemaker')


In [None]:
### Fetch the trained model and the scaler from S3
fname = 'model/model.joblib'  # S3 key of the model file; change for new files
local_name = 'model.joblib'  # local filename the model is saved under

s3 = boto3.client('s3')

# (S3 key, local destination) pairs, downloaded in this order
for s3_key, destination in ((fname, local_name), ('pickles/scaler.gz', 'scaler.gz')):
    s3.download_file(config.bucket_name, s3_key, destination)

In [None]:
# Deserialize both downloaded artefacts into memory (model first, then scaler).
# NOTE(review): joblib files are pickles - only load artefacts from a trusted bucket.
mdl, scaler = [joblib.load(artefact) for artefact in ('model.joblib', 'scaler.gz')]

In [None]:
# S3 keys of the evaluation splits
TEST_KEY = 'data/test/test.h5'
# NOTE(review): the original cell downloaded the *test* key a second time and stored it
# as val.h5, so "val" was silently a copy of the test set. The key below is assumed from
# the naming pattern of the test key - confirm it against the bucket layout.
VAL_KEY = 'data/val/val.h5'


def _load_split(s3_key, local_path, target='short_result'):
    """Download one HDF5 split from S3 and separate features from labels.

    Parameters
    ----------
    s3_key : str
        Key of the HDF5 file inside ``config.bucket_name``.
    local_path : str
        Temporary local filename; it is deleted again before returning.
    target : str
        Name of the label column.

    Returns
    -------
    (frame, features, labels)
        The full DataFrame, the DataFrame without the 'index' and target
        columns, and the target column on its own.
    """
    s3.download_file(config.bucket_name, s3_key, local_path)
    try:
        frame = pd.read_hdf(local_path)
    finally:
        os.remove(local_path)  # never leave the local copy behind, even if reading fails
    # errors='ignore' because an 'index' column is not guaranteed to be present
    features = frame.drop(['index', target], axis=1, errors='ignore')
    labels = frame[target]  # store classes separately
    return frame, features, labels


# load test data
test, X_test, y_test = _load_split(TEST_KEY, 'test.h5')

# load validation data
val, X_val, y_val = _load_split(VAL_KEY, 'val.h5')

In [None]:
# Show the hyper-parameters stored on the loaded model object
print('Best params: %s' % (mdl.best_params_,))

## Getting results overview
# means strong sell=0, sell=1, hold=2. buy=3, strong buy=4
def get_eval(y_true, y_pred):

    print()
    print("Classfification Report:")
    print()
    print(classification_report(y_true, y_pred))
    print()
    print("Confusion Matrix:")
    print()
    print(confusion_matrix(y_true, y_pred))
    print()
    print("Accuracy: %0.3f" % accuracy_score(y_true, y_pred))
    print("Balanced Accuracy: %0.3f" % balanced_accuracy_score(y_true, y_pred))
    print("Precision: %0.3f" % precision_score(y_true, y_pred, average='weighted'))
    print("Recall: %0.3f" % recall_score(y_true, y_pred, average='weighted'))
    print("F1 Score: %0.3f" % f1_score(y_true, y_pred, average='weighted'))
    print("Kappa: %0.3f" % cohen_kappa_score(y_true, y_pred))
    #print("ROC AUC: %0.3f" % roc_auc_score(y_true, y_pred, average='weighted', multi_class='ovo'))

# y_true / y_pred are not assigned in any visible cell, so this call only worked with
# leftover kernel state. Define them explicitly from the test split.
# NOTE(review): the loaded `scaler` is not applied anywhere in the visible cells, and
# predict_proba further down is also fed X_test directly - confirm that `mdl` handles
# scaling itself.
y_true = y_test
y_pred = mdl.predict(X_test)

get_eval(y_true, y_pred)



In [None]:

# Fold the "strong" classes into their plain counterparts and evaluate again
def _merge_strong(labels):
    """Map labels <= 1 to 1 and labels >= 3 to 3; other labels are left unchanged."""
    lowered = np.where(labels <= 1, 1, labels)
    return np.where(lowered >= 3, 3, lowered)


a_y_pred = _merge_strong(y_pred)
a_y_true = _merge_strong(y_true.values)

get_eval(a_y_true, a_y_pred)

In [None]:
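## ROC AUC cannot be computed from hard class predictions (y_pred) in the multiclass case;
## roc_auc_score needs per-class probability scores instead (e.g. from predict_proba).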
#print("ROC AUC: %0.3f" % roc_auc_score(y_true, y_pred, average='weighted', multi_class='ovo'))

In [None]:
# Class probabilities for the test split, held as a NumPy array
y_prob = np.array(mdl.predict_proba(X_test))

In [None]:
# Collect true labels, predictions and one probability column per class in a DataFrame.
# NOTE(review): assumes the columns of y_prob follow the 0..4 class encoding - confirm
# against the model's class order.
d = {'actual': y_true, 'pred': y_pred}
for position, label in enumerate(('strong_sell', 'sell', 'hold', 'buy', 'strong_buy')):
    d[label] = y_prob[:, position]
probabilities = pd.DataFrame.from_dict(d)


In [None]:
# Class encoding: strong sell=0, sell=1, hold=2, buy=3, strong buy=4
probabilities.head()  # display the first rows of the probabilities frame

In [None]:
# Delete the local copies of the downloaded artefacts (scaler first, then model)
for leftover in ('scaler.gz', 'model.joblib'):
    os.remove(leftover)