In [1]:
#import pycaret 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
#from pycaret import create_model, compare_models, tune_model, finalize_model, predict_model, save_model, load_model
from pycaret.classification import *
from sklearn.model_selection import train_test_split


In [1]:
# Show the directory the notebook kernel is running in
os.getcwd()

'/Users/apple/Desktop/Data Engineering'

In [2]:
# Load the results CSV.
# The path is relative to the notebook's working directory (the
# "Data Engineering" project folder) instead of a machine-specific
# absolute path, so the notebook can run from any checkout of the project.
data = pd.read_csv(os.path.join('hw3', 'results_v1.csv'))

In [5]:
# List the column names of the loaded frame
data.columns

Index(['pair', 'Timestamp', 'VWAP', 'Liquidity', 'volatility', 'max', 'min',
       'FD'],
      dtype='object')

In [7]:
# Per-pair mean of VWAP and of FD, broadcast back onto every row so each
# Series shares data's index.
avg_vwap, avg_fd = (
    data.groupby('pair')[column].transform('mean') for column in ('VWAP', 'FD')
)

def label_forecastability(row, avg_vwap, avg_fd):
    """Label one row by comparing its VWAP and FD against reference averages.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['VWAP']`` and ``row['FD']``.
    avg_vwap, avg_fd : scalar
        Reference values that the row's VWAP and FD are compared with.

    Returns
    -------
    str
        ``'forecastable'`` when both values are strictly above their
        reference, ``'unforecastable'`` when both are strictly below, and
        ``'mixed'`` in every other case (including ties).
    """
    both_above = row['VWAP'] > avg_vwap and row['FD'] > avg_fd
    if both_above:
        return 'forecastable'

    both_below = row['VWAP'] < avg_vwap and row['FD'] < avg_fd
    if both_below:
        return 'unforecastable'

    return 'mixed'

# Vectorised labelling. It applies the same rules as label_forecastability
# (both strictly above the pair average -> 'forecastable', both strictly
# below -> 'unforecastable', anything else -> 'mixed'), but on whole columns
# at once instead of a Python-level call plus two .loc lookups per row.
# avg_vwap / avg_fd come from groupby(...).transform, so they are already
# aligned with data's index.
above_pair_avg = (data['VWAP'] > avg_vwap) & (data['FD'] > avg_fd)
below_pair_avg = (data['VWAP'] < avg_vwap) & (data['FD'] < avg_fd)
data['label'] = np.select(
    [above_pair_avg, below_pair_avg],
    ['forecastable', 'unforecastable'],
    default='mixed',
)

data.head()

Unnamed: 0,pair,Timestamp,VWAP,Liquidity,volatility,max,min,FD,label
0,SGDJPY,2021-01-07 23:00:00,78.3493,70.6,0.002521,78.418,78.2205,1513.924051,mixed
1,SGDJPY,2021-01-08 05:00:00,78.3885,75.408333,0.001894,78.454,78.3055,2404.040404,mixed
2,SGDJPY,2021-01-08 11:00:00,78.4681,111.45,0.003596,78.509,78.227,1276.595745,mixed
3,SGDJPY,2021-01-08 17:00:00,78.4688,77.747508,0.003046,78.512,78.2732,1260.469012,mixed
4,SGDJPY,2021-01-10 17:00:00,78.2312,5.336283,0.009611,78.309,77.558,150.466045,unforecastable


In [16]:
# Encode the text label as an integer target:
# 0 = unforecastable, 1 = mixed, 2 = forecastable.
# This assignment must actually run: the later cells drop 'label' and use
# 'label1' as the model target, so leaving it commented out breaks a
# fresh-kernel run.
label_codes = {'unforecastable': 0, 'mixed': 1, 'forecastable': 2}
data['label1'] = data['label'].map(label_codes)
data

Unnamed: 0,pair,Timestamp,VWAP,Liquidity,volatility,max,min,FD,label,label1
0,SGDJPY,2021-01-07 23:00:00,78.3493,70.600000,0.002521,78.41800,78.22050,1513.924051,mixed,1
1,SGDJPY,2021-01-08 05:00:00,78.3885,75.408333,0.001894,78.45400,78.30550,2404.040404,mixed,1
2,SGDJPY,2021-01-08 11:00:00,78.4681,111.450000,0.003596,78.50900,78.22700,1276.595745,mixed,1
3,SGDJPY,2021-01-08 17:00:00,78.4688,77.747508,0.003046,78.51200,78.27320,1260.469012,mixed,1
4,SGDJPY,2021-01-10 17:00:00,78.2312,5.336283,0.009611,78.30900,77.55800,150.466045,unforecastable,0
...,...,...,...,...,...,...,...,...,...,...
1471741,EURZAR,2023-01-23 23:00:00,18.7059,33.434540,0.005044,18.71740,18.62320,3811.040340,mixed,1
1471742,EURZAR,2023-01-24 05:00:00,18.7792,58.991667,0.007548,18.80809,18.66670,2546.148950,mixed,1
1471743,EURZAR,2023-01-24 11:00:00,18.7043,74.652778,0.005715,18.80208,18.69490,3358.835604,mixed,1
1471744,EURZAR,2023-01-24 17:00:00,18.6840,38.533333,0.003595,18.73302,18.66580,5266.289795,forecastable,2


In [19]:
# Remove the pair, text-label and Timestamp columns; what remains is the
# measurement columns plus the integer target 'label1'.
data_clean = data.drop(columns=['pair', 'label', 'Timestamp'])
data_clean

Unnamed: 0,VWAP,Liquidity,volatility,max,min,FD,label1
0,78.3493,70.600000,0.002521,78.41800,78.22050,1513.924051,1
1,78.3885,75.408333,0.001894,78.45400,78.30550,2404.040404,1
2,78.4681,111.450000,0.003596,78.50900,78.22700,1276.595745,1
3,78.4688,77.747508,0.003046,78.51200,78.27320,1260.469012,1
4,78.2312,5.336283,0.009611,78.30900,77.55800,150.466045,0
...,...,...,...,...,...,...,...
1471741,18.7059,33.434540,0.005044,18.71740,18.62320,3811.040340,1
1471742,18.7792,58.991667,0.007548,18.80809,18.66670,2546.148950,1
1471743,18.7043,74.652778,0.005715,18.80208,18.69490,3358.835604,1
1471744,18.6840,38.533333,0.003595,18.73302,18.66580,5266.289795,2


In [24]:
# Treat +/-inf as missing, then discard every row that has any missing value
# (so rows that already contained NaN are removed as well).
data_clean = data_clean.replace([np.inf, -np.inf], np.nan).dropna()
data_clean


Unnamed: 0,VWAP,Liquidity,volatility,max,min,FD,label1
0,78.3493,70.600000,0.002521,78.41800,78.22050,1513.924051,1
1,78.3885,75.408333,0.001894,78.45400,78.30550,2404.040404,1
2,78.4681,111.450000,0.003596,78.50900,78.22700,1276.595745,1
3,78.4688,77.747508,0.003046,78.51200,78.27320,1260.469012,1
4,78.2312,5.336283,0.009611,78.30900,77.55800,150.466045,0
...,...,...,...,...,...,...,...
1471741,18.7059,33.434540,0.005044,18.71740,18.62320,3811.040340,1
1471742,18.7792,58.991667,0.007548,18.80809,18.66670,2546.148950,1
1471743,18.7043,74.652778,0.005715,18.80208,18.69490,3358.835604,1
1471744,18.6840,38.533333,0.003595,18.73302,18.66580,5266.289795,2


In [27]:
# Create a PyCaret classification experiment and configure it on the cleaned
# frame, with 'label1' as the target column.
from pycaret.classification import ClassificationExperiment
clf1 = ClassificationExperiment()
# No `fold` argument is passed in the active call, so PyCaret's default
# cross-validation setting applies (documented default is 10 folds; confirm
# for the installed version). session_id is passed so the run is repeatable.
clf1.setup(data = data_clean, target = 'label1', session_id = 123)
# Alternative call that would use 3-fold cross-validation instead:
#clf1.setup(data = data_clean, target = 'label1', session_id = 123, fold = 3)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label1
2,Target type,Multiclass
3,Original data shape,"(1468252, 7)"
4,Transformed data shape,"(1468252, 7)"
5,Transformed train set shape,"(1027776, 7)"
6,Transformed test set shape,"(440476, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7fc67b32e820>

In [28]:
# Train and cross-validate PyCaret's candidate models and keep whatever
# compare_models() returns (per the PyCaret docs, the top model by its sort
# metric, which is Accuracy unless `sort=` is given).
# NOTE(review): in the recorded run the top row (Logistic Regression) has the
# same accuracy as the Dummy Classifier with Kappa 0.0, so ranking by accuracy
# mostly reflects class imbalance. Consider sort='MCC' or sort='F1'.
best_model = clf1.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7834,0.4947,0.7834,0.6276,0.6882,0.0,0.0005,2.802
dummy,Dummy Classifier,0.7834,0.5,0.7834,0.6136,0.6882,0.0,0.0,0.098
gbc,Gradient Boosting Classifier,0.7833,0.7259,0.7833,0.7174,0.6903,0.0068,0.0347,168.177
ridge,Ridge Classifier,0.7832,0.0,0.7832,0.6295,0.6882,-0.0001,-0.001,0.168
lightgbm,Light Gradient Boosting Machine,0.7828,0.7904,0.7828,0.7192,0.6991,0.0362,0.0836,4.978
lda,Linear Discriminant Analysis,0.7819,0.5357,0.7819,0.6359,0.6886,0.0019,0.0096,0.827
ada,Ada Boost Classifier,0.7726,0.5605,0.7726,0.6795,0.6941,0.0222,0.0396,11.684
knn,K Neighbors Classifier,0.7383,0.557,0.7383,0.6515,0.6834,0.0094,0.0112,1.163
rf,Random Forest Classifier,0.6443,0.6794,0.6443,0.616,0.6295,-0.0699,-0.0704,70.898
et,Extra Trees Classifier,0.6443,0.4563,0.6443,0.6136,0.6282,-0.077,-0.0777,43.28
