# Project Topic: Application of Blending Ensemble Learning in predicting positive moves (uptrend), a case-study on Nvidia Corporation stock price.
### - Caleb Fowowe

#### Import Libraries

In [37]:
#Import the internal modules written for the purpose of this project
from src.utils_data_processing import (LoadData, cwts, getpath, rnd_state)
from src.utils_features_engineering import (FeaturesCreation, FeaturesTransformation, FeaturesSelection)
from src.utils_model_and_tuning import (Blending, HpTuning, SimpleBacktest, Btest)

#Import external modules for the basemodels and blender (metamodel)
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#data manipulation modules
import numpy as np
from datetime import datetime

# Creates a folder for saving of code graphics and trading strategy report.
output_path = getpath()

### Load Data, EDA, Fix Null Data, and Plots

##### Load data

In [38]:
#File names of the used data, provided as dictionary values.
data_files = {'files': ['NVDA', 'VVIX_History', 'USCPI', 'USGDP', 'FedFundRate', '2yrTreasury', '10yrTreasury']} 

time_period = ['2008', '2024'] #specifies a period range, in the case provided data is goes back than required.
company_name = data_files['files'][0] #Extract the company name or ticker into the company_name variable

ldata = LoadData(*time_period, **data_files) #instantiate the class

#Call the function to merger all the data into a single dataframe and returns the dataframe into variable 'df'
df = ldata.joinData()

#### Exploratory Data Analysis (EDA) of Original dataset

##### Cleaning and Imputation

In [39]:
#Preview the original datasets of combined dataframe
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,2yrTreasury,10yrTreasury
count,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,3868.0,4184.0,4184.0,4184.0,4182.0,194.0,63.0,194.0,3792.0,3792.0
mean,9.686341,9.866488,9.491642,9.688788,537815.2,238822.8,0.510476,46.758532,0.138793,26.061943,10.277696,93.378857,0.02472,0.019818,0.012397,1.404229,2.560672
std,19.972159,20.353512,19.527211,19.960702,324891.4,493865.1,0.737336,37.228853,0.262093,18.032612,11.676883,16.544725,0.021057,0.062414,0.015677,1.366457,0.918427
min,0.15,0.1595,0.1437,0.1475,45645.1,3283.83,0.0,7.3,0.0,6.96,1.33,59.74,-0.021,-0.280207,0.0025,0.109,0.498
25%,0.380925,0.3863,0.3765,0.3807,325552.2,8948.413,0.0,19.7,0.02,10.8,2.32,82.23,0.012575,0.009074,0.0025,0.367,1.873
50%,0.90145,0.91185,0.89325,0.90075,464922.6,19620.08,0.12,39.7,0.03,20.775,5.03,90.095,0.0199,0.022584,0.0025,0.84,2.485
75%,9.2255,9.3355,9.117125,9.222375,659608.0,226870.4,0.56,62.8,0.15,35.53,15.515,102.0475,0.03175,0.03423,0.019375,2.129,3.26425
max,139.8,140.76,132.42,135.58,3692926.0,3335269.0,2.64,246.9,1.71,106.29,77.73,207.59,0.0906,0.348397,0.055,5.22,4.983


In [40]:
#Check for missing points and values within the combined dataset
ldata.checkNullData(df)#within the ldata object, call the checkNullData method, with the df as input parameter

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,2yrTreasury,10yrTreasury
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2008-01-02,0.8530,0.8562,0.8140,0.8252,483981.1,18339.39,0.0,28.4,0.03,18.21,7.02,80.58,,,,2.079,3.598
2008-01-03,0.8300,0.8465,0.8157,0.8187,475385.9,18194.94,0.0,28.2,0.03,18.06,6.97,79.33,,,,,
2008-01-04,0.7942,0.7982,0.7450,0.7500,736136.8,16667.13,0.0,25.8,0.03,16.55,6.38,81.46,,,,1.802,3.566
2008-01-07,0.7550,0.7612,0.6587,0.6725,1006894.0,14944.85,0.0,23.1,0.03,14.84,5.72,79.51,,,,2.656,4.008
2008-01-08,0.6762,0.7322,0.6605,0.6867,1107039.0,15261.52,0.0,23.6,0.03,15.15,5.85,79.59,,,,2.499,3.935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-09,105.6400,106.6000,103.4300,104.7500,290844.2,2576851.00,0.0,61.3,1.71,82.12,60.05,114.63,,,,,
2024-08-12,106.3200,111.0700,106.2600,109.0200,325559.9,2681893.00,0.0,63.8,1.71,85.47,62.50,121.94,,,,,
2024-08-13,112.4400,116.2300,111.5800,116.1400,312646.7,2857045.00,0.0,68.0,1.71,91.05,66.58,113.89,,,,3.940,3.850
2024-08-14,118.5300,118.6000,114.0700,118.0800,339246.4,2904769.00,0.0,69.1,1.71,92.57,67.70,107.81,,,,3.964,3.833


In [41]:
#Intial step of fixing missing data, backfill the quarterly, and monthly macrodata,to 
df = ldata.fixNullData(df, method='bfill')

In [42]:
#Post-backfilling check for null data after fixing null data
ldata.checkNullData(df)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,2yrTreasury,10yrTreasury
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-07-01,123.47,124.84,118.83,124.3,284885.6,3057781.0,0.0,72.8,1.71,97.45,71.26,76.05,0.0289,,0.055,4.433,4.115
2024-07-02,121.13,123.41,121.03,122.67,218374.0,3017683.0,0.0,71.8,1.71,96.17,70.33,77.06,0.0289,,0.055,4.433,4.115
2024-07-03,121.66,128.28,121.36,128.28,215748.9,3155689.0,0.0,75.1,1.71,100.57,73.54,78.07,0.0289,,0.055,4.516,4.092
2024-07-05,127.38,128.85,125.68,125.83,214176.7,3095419.0,0.0,73.7,1.71,98.65,72.14,78.8,0.0289,,0.055,4.824,4.457
2024-07-08,127.49,130.77,127.04,128.2,237677.3,3153721.0,0.0,75.1,1.71,100.51,73.5,78.59,0.0289,,0.055,3.994,3.901
2024-07-09,130.35,133.82,128.65,131.38,287020.8,3231949.0,0.0,76.9,1.71,103.0,75.32,78.58,0.0289,,0.055,4.447,4.226
2024-07-10,134.03,135.1,132.42,134.91,248978.6,3318787.0,0.0,79.0,1.71,105.77,77.35,83.41,0.0289,,0.055,4.447,4.226
2024-07-11,135.75,136.15,127.05,127.4,374782.8,3134041.0,0.0,74.6,1.71,99.88,73.04,84.47,0.0289,,0.055,4.447,4.226
2024-07-12,128.26,131.92,127.22,129.24,252680.6,3179306.0,0.0,75.7,1.71,101.32,74.1,81.61,0.0289,,0.055,4.447,4.226
2024-07-15,130.56,131.39,127.18,128.44,208326.2,3159625.0,0.0,75.2,1.71,100.69,73.64,83.86,0.0289,,0.055,4.447,4.226


In [43]:
#fix future data that are not available yet, can drop rows to choose, knn_impute method was used here.
df = ldata.fixNullData(df, method='knnimpute')

In [44]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,2yrTreasury,10yrTreasury
count,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0
mean,9.686341,9.866488,9.491642,9.688788,537815.2,238822.8,0.510476,45.311568,0.138793,26.061943,10.277696,93.374524,0.024781,0.019801,0.012572,1.420721,2.571234
std,19.972159,20.353512,19.527211,19.960702,324891.4,493865.1,0.737336,36.151332,0.262093,18.032612,11.676883,16.542172,0.020897,0.060527,0.015892,1.383944,0.927187
min,0.15,0.1595,0.1437,0.1475,45645.1,3283.83,0.0,7.3,0.0,6.96,1.33,59.74,-0.021,-0.280207,0.0025,0.109,0.498
25%,0.380925,0.3863,0.3765,0.3807,325552.2,8948.413,0.0,20.1,0.02,10.8,2.32,82.225,0.0124,0.010749,0.0025,0.367,1.87475
50%,0.90145,0.91185,0.89325,0.90075,464922.6,19620.08,0.12,33.6,0.03,20.775,5.03,90.09,0.0199,0.022357,0.0025,0.8465,2.493
75%,9.2255,9.3355,9.117125,9.222375,659608.0,226870.4,0.56,58.825,0.15,35.53,15.515,102.0425,0.0324,0.033968,0.02,2.15575,3.303
max,139.8,140.76,132.42,135.58,3692926.0,3335269.0,2.64,246.9,1.71,106.29,77.73,207.59,0.0906,0.348397,0.055,5.22,4.983


In [45]:
# preview the latest five (5) values of the cleaned up data
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,2yrTreasury,10yrTreasury
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-08-09,105.64,106.6,103.43,104.75,290844.2,2576851.0,0.0,61.3,1.71,82.12,60.05,114.63,0.03042,0.016531,0.055,3.94,3.85
2024-08-12,106.32,111.07,106.26,109.02,325559.9,2681893.0,0.0,63.8,1.71,85.47,62.5,121.94,0.02966,0.016531,0.055,3.94,3.85
2024-08-13,112.44,116.23,111.58,116.14,312646.7,2857045.0,0.0,68.0,1.71,91.05,66.58,113.89,0.02906,0.016531,0.055,3.94,3.85
2024-08-14,118.53,118.6,114.07,118.08,339246.4,2904769.0,0.0,69.1,1.71,92.57,67.7,107.81,0.02922,0.016531,0.055,3.964,3.833
2024-08-15,118.76,123.24,117.47,122.86,318086.7,3022357.0,0.0,71.9,1.71,96.32,70.44,105.42,0.02938,0.016531,0.055,4.097,3.915


In [46]:
# Plot candlestick of the historical stock data
# stock_split = {'event_dates': ['2024-06-10', '2021-07-20', '2007-09-11', '2006-04-07', '2001-09-12', '2000-06-27'], 'event_title': 'stock-split'}
stock_split = {'event_dates': ['2024-06-10', '2021-07-20'], 'event_title': 'stock-split'}
ldata.plotCandleStick(df, events=stock_split)

In [47]:
# plot individual (ohlcv)
ldata.plotPrices(df)

## Feature Engineering

##### FeaturesEngineering Class with the entire FeaturesCreation, FeaturesTransformation, and FeaturesSelections sub-classes

#### Feature Creation/Extraction

##### Define the parameters to be used in the target variable (y)/ Label

In [48]:
# The target variable is a trend and volatility play which creates a signal based on two conditions:
# 1. when the return over a relative short period of time (5days i.e. mean of 5-day rolling return), 
# crosses over the return trend over a relative medium period (10days), with the difference been equal or 
# above a specified hurdle rate, (hurdle) the condition is deemed fulfilled. 
# 2. The other condition is a volatility play where the standard deviation of the short period return is less 
# or equal to the upper boundary standard deviation of the medium period return. The upper boundary of the medium
# period returns standard deviation is characterized as 2-standard deviations from the mean medium_period return. 

# Target parameters
short_prd= 5 
medium_prd = 10
upper_std=2 
lower_std = 1 
hurdle = 0.005

##### Generate Features - all features

In [49]:
# Instantiate the features creation subclass with the dataframe containing the cleaned data, alongside, the input
#parameters for the target variable as input during class instantiation.

#Instantiate the FeaturesCreation subclass providing the dataframe and the target parameters as inputs
feat_df = FeaturesCreation(df, short_prd, medium_prd, upper_std, lower_std, hurdle) 

# Call the create_all_features method, setting the fundamental_features and macro_features parameters are true, if provided.
# or false, otherwise. This should be linked to a variable in which an updated dataframe with all created features stored 
new_ft = feat_df.create_all_features(fundamental_features=True, macro_features=True) 
#1. Company fundamentals-related Features (Required column labels: 'PriceToEarnings', 'PriceToCash', 'PriceToBook', 'DividendYield')
#2. Macroeconomic related features (Required column labels: 'CPI', 'GDP', '2yrTreasury', '10yrTreasury')
#3. Technical Indicator features (based on pandas ta-library) (Required column label 'Open', 'High', 'Low', 'Close', 'Volume'
# this can also be in lower case format)

# The ohlcv columns are dropped after using them in the generation of the technical indicators. 
# Below is a preview of the first five row of the 339features including both, macroeconomic, fundamental and technical indicators.
new_ft.head()

 August 23, 2024 - 22:17:23 ----- [1m Fundamental Features: Price-to-Earnings-to-Dividend Ratio (PED)Ratio feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Fundamental Features: Price to Earnings and Price to Book Combined (PEPB)_Ratio feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Fundamental Features: Price to Cash & Price to Earnings Combined (PCFPER) feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Fundamental Features: Combined Valuation Metric (CVM)_feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Macro features: Yield Spread feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Macro Features: CPI/GDP Ratio feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Macro Features: CPI vs Yield Correlation feature successfully calculated [0m
 August 23, 2024 - 22:17:23 ----- [1m Macro Features: Real Interest Rates feature successf

Unnamed: 0_level_0,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,...,VWMA_10,TSV_18_10,TSVs_18_10,TSVr_18_10,WCP,WILLR_14,WMA_10,ZL_EMA_10,ZS_30,predict
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-05-27,12958.57,0.0,16.8,0.03,45.31,5.25,79.35,0.0418,0.024031,0.02,...,0.582109,93103.06701,107880.214846,0.863023,2.3204,-45.302326,0.584591,0.58383,1.071006,0
2008-05-28,13080.61,0.0,17.0,0.03,45.73,5.3,77.7,0.0418,0.024031,0.02,...,0.58704,73083.01686,106471.525047,0.686409,2.3632,-40.186047,0.586282,0.587098,1.142096,0
2008-05-29,13047.33,0.0,16.9,0.03,45.62,5.29,74.2,0.0418,0.024031,0.02,...,0.590013,43199.21691,101545.231669,0.425418,2.3439,-44.389275,0.586727,0.58668,1.047144,0
2008-05-30,13701.92,0.0,17.8,0.03,47.91,5.55,73.46,0.0418,0.024031,0.02,...,0.592045,80900.53201,96891.613573,0.834959,2.4575,-15.09434,0.591876,0.59952,1.67782,0
2008-06-02,13757.39,0.0,17.8,0.03,48.1,5.58,78.6,0.0502,0.024031,0.02,...,0.592152,72385.28831,89146.471885,0.811982,2.4754,-12.611718,0.597062,0.609789,1.644918,1


In [50]:
new_ft.shape

(4083, 339)

#### Feature Transformation & Selection

##### Transform day of the week feature

In [51]:
# Prior to starting to features selection process, the days features which consist of trading days (Monday - Friday), is transformed, to two features.
feat_transform = FeaturesTransformation(new_ft) #Instantiate the FeaturesTransformation subclass with generated features as input
new_ft2 = feat_transform.transformDaysColumn() # Invoke the transformDaysColumn method, to transform the day of week column and store in new_ft2 variable

###### To optimize the dataframe performance, the all features outside the target column are convereted to 'float64', with the target variable column converted to 'int16' datatype

In [52]:
new_ft2 = new_ft2.astype('float64')
new_ft2['predict'] = new_ft2['predict'].values.astype('int16')

In [53]:
new_ft2

Unnamed: 0_level_0,MarketValue,DividendYield,PriceToEarnings,EPS,PriceToCash,PriceToBook,VVIX,CPI,GDP,FedFundRate,...,TSVs_18_10,TSVr_18_10,WCP,WILLR_14,WMA_10,ZL_EMA_10,ZS_30,predict,dsin,dcos
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-05-27,12958.57,0.0,16.8,0.03,45.31,5.25,79.35,0.04180,0.024031,0.020,...,1.078802e+05,0.863023,2.3204,-45.302326,0.584591,0.583830,1.071006,0,0.974928,-0.222521
2008-05-28,13080.61,0.0,17.0,0.03,45.73,5.30,77.70,0.04180,0.024031,0.020,...,1.064715e+05,0.686409,2.3632,-40.186047,0.586282,0.587098,1.142096,0,0.433884,-0.900969
2008-05-29,13047.33,0.0,16.9,0.03,45.62,5.29,74.20,0.04180,0.024031,0.020,...,1.015452e+05,0.425418,2.3439,-44.389275,0.586727,0.586680,1.047144,0,-0.433884,-0.900969
2008-05-30,13701.92,0.0,17.8,0.03,47.91,5.55,73.46,0.04180,0.024031,0.020,...,9.689161e+04,0.834959,2.4575,-15.094340,0.591876,0.599520,1.677820,0,-0.974928,-0.222521
2008-06-02,13757.39,0.0,17.8,0.03,48.10,5.58,78.60,0.05020,0.024031,0.020,...,8.914647e+04,0.811982,2.4754,-12.611718,0.597062,0.609789,1.644918,1,0.781831,0.623490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-08,2582263.00,0.0,61.5,1.71,82.29,60.18,134.34,0.03042,0.016531,0.055,...,-9.173816e+06,1.144143,412.9600,-58.000000,105.131273,101.258539,-1.363151,0,-0.433884,-0.900969
2024-08-09,2576851.00,0.0,61.3,1.71,82.12,60.05,114.63,0.03042,0.016531,0.055,...,-9.648810e+06,1.048153,419.5300,-58.647059,104.713818,102.675168,-1.288077,0,-0.974928,-0.222521
2024-08-12,2681893.00,0.0,63.8,1.71,85.47,62.50,121.94,0.02966,0.016531,0.055,...,-9.657405e+06,0.565189,435.3700,-37.801154,105.223818,104.696047,-0.815332,0,0.781831,0.623490
2024-08-13,2857045.00,0.0,68.0,1.71,91.05,66.58,113.89,0.02906,0.016531,0.055,...,-9.039212e+06,0.467657,460.0900,-13.640991,107.075091,109.909493,-0.093752,0,0.974928,-0.222521


#### Feature Selection

###### Feature selection - Wrapper Method: Boruta and Recursive Forward Elimination (RFE)

In [54]:
#instantiate the FeaturesSelection subclass, providing dataframe from above, with the days column transformed 
# as required input parameters, and the testsize as an optional input parameter as well. Default testsize is 0.20.
feat_select = FeaturesSelection(new_ft2, testsize = 0.20) 
feat1 = feat_select.wrapper_boruta(max_iter=150) # Call the wrapper_boruta method within the FeaturesSelection subclass.

Pre-Boruta selection metrics: Accuracy Score:  91.31%, f1_score: 91.33% 


Using the (107)BorutaSelected Features, metrics: Accuracy Score: 95.10%, f1_score: 95.11% 


 Using Recursive Forward Elimination (RFE) approach to validate the (107)features selected by Boruta approach,(85)features which are the intersect features for both Boruta and RFE, the evaluation metrics using the (85)features are: Accuracy Score:  95.59%, f1_score: 95.60% 



##### Feature selection - Filtering Method: Addressing Multicollinearity among features

###### using the same Feature Selection class, specify the correlation coefficient Threshold of choice. (The projected tested correlation in the 0.60 - 0.90) ranges.

In [55]:
# Call the filter_correlation method, within the class providing it with the desired correlation threshold.
# The multicollinearity steps follows the Boruta and RFE intersection steps. Hence, there is no need to specify the dataframe, the code has designed such that it already takes as input the dataframe which contains the features output of Boruta and RFE intersection. 
# However, for testing purposes, there's an optionality to provide the function with both correlation coefficient and dataframe, and it will filter for multicollinearitu among features.
filtered_features = feat_select.filter_multicollinearity(corr_coeff=0.90)  


 Solving for multicollinearity of features, and applying correlation coefficient of 0.9, the (85)features selected which are the intersected features of Boruta and Recursive Forward Elimination (RFE) methods were filtered to 59features 


 After addressing the multicollinearity among features, applying RandomForestClassifier to predictthe (59)Filtered Features gives the following values for tracked metrics: Accuracy Score: 93.27%, f1_score: 93.27% 

╒═════════╤═══════════════════════════╕
│   Index │ Filtered Features Names   │
╞═════════╪═══════════════════════════╡
│       1 │ EXHC_UPa                  │
│       2 │ KSTs_9                    │
│       3 │ SMCti_14_50_20_5          │
│       4 │ AROOND_14                 │
│       5 │ QQE_14_5_4.236            │
│       6 │ UI_14                     │
│       7 │ BBP_5_2.0                 │
│       8 │ ER_10                     │
│       9 │ VHF_28                    │
│      10 │ ADOSC_3_10                │
│      11 │ SMCtp_14_50_2

##### applying K-Means clustering to Feature selection

In [56]:
data2 = new_ft2[filtered_features]
data2['predict'] = new_ft2['predict'].values.astype('int')

kmeans_features = feat_select.kmeans_selector(data2, cluster_size=len(filtered_features), upper_threshold=0.065, lower_threshold=0.03) #upper threshold should at least be above the lower_threshold

 August 23, 2024 - 22:25:14 ----- [1m 9 optimal clusters selected, which were within the threshold of 6.50%, and 3.00%. [0m

 Using K-Means selected features (9)Filtered Features gives the following values for tracked metrics: Accuracy Score: 80.78%, f1_score: 80.82% 

 August 23, 2024 - 22:25:14 ----- [1m (9)final features were selected from the (9)clusters. [0m
+---------+-----------------------------+
|   Index | K_means selected features   |
|       1 | ADOSC_3_10                  |
+---------+-----------------------------+
|       2 | CCI_14_0.015                |
+---------+-----------------------------+
|       3 | SQZ_20_2.0_20_1.5           |
+---------+-----------------------------+
|       4 | AROOND_14                   |
+---------+-----------------------------+
|       5 | EXHC_UPa                    |
+---------+-----------------------------+
|       6 | KSTs_9                      |
+---------+-----------------------------+
|       7 | BBP_5_2.0                   |


##### Finally selected Features set - KMeans

In [57]:
data3 = new_ft2[kmeans_features]
data3['predict'] = new_ft2['predict'].values.astype('int')

data3.head()

Unnamed: 0_level_0,ADOSC_3_10,CCI_14_0.015,SQZ_20_2.0_20_1.5,AROOND_14,EXHC_UPa,KSTs_9,BBP_5_2.0,SMCti_14_50_20_5,QQE_14_5_4.236,predict
Dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-05-27,19240.47098,-2112.601888,0.033,14.285714,1.0,15525.690019,0.577679,-0.0085,58.2851,0
2008-05-28,43469.914806,-2191.647542,0.027383,7.142857,2.0,15839.330065,0.740669,-0.0323,58.2851,0
2008-05-29,181817.834116,-2627.141973,0.031017,0.0,0.0,16058.740395,0.599591,-0.023,58.2851,0
2008-05-30,422015.671559,-2561.65735,0.0399,14.285714,1.0,16172.561154,0.97728,-0.0368,58.2851,0
2008-06-02,558734.700371,-2576.24865,0.046617,7.142857,2.0,16178.211756,0.823784,-0.0475,58.2851,1


In [58]:
data3.shape

(4083, 10)

### Ensemble Model - Blending Ensemble

##### Initial parameterization of basemodels and metamodel

In [59]:
cls_weight = cwts(data3) #generate class weight to treat class imbalance

# Logistic regression algorithm
lr_params = {'random_state': rnd_state(), 'class_weight': cls_weight}
lr = LogisticRegression(**lr_params)

# Decision Tree algorithm
dt_params = {'class_weight': cls_weight, 'random_state': rnd_state()}
dt = DecisionTreeClassifier(**dt_params)

# K-nearest Neighbour algorithm
knn_params = {'algorithm': 'auto', 'n_jobs': -1}
knn = KNeighborsClassifier(**knn_params)

# Gaussian Naive Bayes algorithm 
bayes_params = {}
bayes = GaussianNB()
bayes.set_params(**bayes_params)

# Support Vector Machine (SVM): Support Vector Classifier (SVC)
svc_params = {'class_weight': cls_weight,'random_state': rnd_state(), 'probability': True}
svc = SVC(**svc_params)

# Combining all the algorithms into basemodels 
basemodels = {'lr': lr, 'dte': dt, 'knn': knn, 'bayes': bayes, 'svc': svc}

# Extreme Gradient Boost algorithm
xgb_params = {'n_jobs': -1, 'class_weight': cls_weight, 'random_state': rnd_state(), 'verbose': 1}
xgb = XGBClassifier(**xgb_params)

# Extreme gradient boosting stated as metamodel or blender.
blender = xgb

##### Initial run of the blending model

In [60]:
#Separate final X and y - Features and target
X_final = data3.iloc[:,:-1].values
y_final = data3.iloc[:,-1].values

Blnd = Blending(X_final, y_final, basemodels, blender, valsize=0.20)
acc, f1score, ypred, yprob, yfull = Blnd.runBlendingEnsemble()

print(f"Accuracy Score: {acc: .1%}, f1score: {f1score:.1%}")




Classification Report
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       368
           1       0.82      0.78      0.80       449

    accuracy                           0.78       817
   macro avg       0.78      0.79      0.78       817
weighted avg       0.79      0.78      0.79       817

Accuracy Score:  78.5%, f1score: 78.5%


#### Hyperparameter Tuning

In [61]:
#Instantiate tuning
tune_model = HpTuning(X_final, y_final, n_trials=40)
tuned_lr, tuned_dt, tuned_svc, tuned_knn, tuned_bayes, tuned_xgb = tune_model.optimize_lr(), tune_model.optimize_dt(), tune_model.optimize_svc(), tune_model.optimize_knn(), tune_model.optimize_bayes(), tune_model.optimize_xgb()

print("optimal_lr:", tuned_lr.values, "\t","optimal_dt:", tuned_dt.values, "\t", "optimal_svc:", tuned_svc.values, "\t", "optimal_knn:", tuned_knn.values, "\t", "optimal_bayes:", tuned_bayes.values, "\t", "optimal_xgb:", tuned_xgb.values)

optimal_lr: [0.7961844751025863, 0.7971813725490197] 	 optimal_dt: [0.8127900250305409, 0.8143382352941176] 	 optimal_svc: [0.798613394797161, 0.8002450980392156] 	 optimal_knn: [0.8129554932733289, 0.8146446078431372] 	 optimal_bayes: [0.7062281775533863, 0.7212009803921569] 	 optimal_xgb: [0.8311156175062243, 0.8321078431372549]


##### Preview hyperparameters

In [62]:
hp_list = [tuned_lr.params, tuned_dt.params, tuned_svc.params, tuned_knn.params, tuned_bayes.params, tuned_xgb.params]
hp_names = ['tuned_lr.params', 'tuned_dt.params', 'tuned_svc.params', 'tuned_knn.params', 'tuned_bayes.params', 'tuned_xgb.params']

In [63]:
tune_model.hp_preview(hp_list, hp_names)

tuned_lr.params
+---------+-------------------+---------------------+
|   Index | Hyper-parameter   | Tuned Values        |
|       1 | C                 | 0.6469950883271244  |
+---------+-------------------+---------------------+
|       2 | tol               | 0.00936216376831341 |
+---------+-------------------+---------------------+
|       3 | solver            | liblinear           |
+---------+-------------------+---------------------+
tuned_dt.params
+---------+-------------------+---------------------+
|   Index | Hyper-parameter   | Tuned Values        |
|       1 | max_depth         | 5                   |
+---------+-------------------+---------------------+
|       2 | min_samples_split | 7                   |
+---------+-------------------+---------------------+
|       3 | min_samples_leaf  | 2                   |
+---------+-------------------+---------------------+
|       4 | ccp_alpha         | 0.01582051421834041 |
+---------+-------------------+-------------------

#### Run Ensemble Model with tuned parameters

##### Update the initial parameters dictionary with the hyperparameter tuning parameter values

In [64]:
lr_params.update(tuned_lr.params)
lr = LogisticRegression(**lr_params)

dt_params.update(tuned_dt.params)
dt = DecisionTreeClassifier(**dt_params)

knn_params.update(tuned_knn.params)
knn = KNeighborsClassifier(**knn_params)

bayes = GaussianNB()
bayes.set_params(**bayes_params)

svc_params.update(tuned_svc.params)
svc = SVC(**svc_params)

basemodel_upd = {'lre': lr, 'dte': dt, 'knn': knn, 'bayes': bayes, 'svc': svc}

xgb_params.update(tuned_xgb.params)
xgb = XGBClassifier(**xgb_params)

blender_upd = xgb

##### TunedModels Output

In [65]:
Blnd = Blending(X_final, y_final, basemodel_upd, blender_upd, valsize=0.20)
acc_tuned, f1score_tuned, ypred_tuned, yprob_tuned, yfull_tuned = Blnd.runBlendingEnsemble()

print(f"Accuracy Score: {acc_tuned: .1%}, f1score: {f1score_tuned:.1%}")




Classification Report
              precision    recall  f1-score   support

           0       0.73      0.85      0.79       368
           1       0.86      0.74      0.80       449

    accuracy                           0.79       817
   macro avg       0.79      0.80      0.79       817
weighted avg       0.80      0.79      0.79       817

Accuracy Score:  79.2%, f1score: 79.2%


### Backtest/Strategy Evaluation

#### Approach 1: Using the simple backtest class - Out of sample test

In [66]:
return_period = 1

In [67]:
btd = SimpleBacktest(df)
btdd = btd.approach1(ypred, return_period)

In [68]:
sharpe2 = btd.sharpe_ratios(btdd)

Benchmark    1.221892
Strategy     1.712015
dtype: float64


In [69]:
btd.html_report(company_name=company_name)

#### Approach 2: Using the popular Backtesting Library - Out of Sample test

In [70]:
bto_lib = Btest(df, ypred)
bto_lib.runStrategy()

<backtesting.backtesting.Backtest at 0x150302cb0>

In [71]:
btostats = bto_lib.runstats()
print(btostats)

Start                     2021-05-18 00:00:00
End                       2024-08-15 00:00:00
Duration                   1185 days 00:00:00
Exposure Time [%]                   61.689106
Equity Final [$]                 51193.964655
Equity Peak [$]                  61680.783895
Return [%]                         411.939647
Buy & Hold Return [%]              776.588397
Return (Ann.) [%]                   65.483752
Volatility (Ann.) [%]               67.402025
Sharpe Ratio                          0.97154
Sortino Ratio                        3.003571
Calmar Ratio                         1.903907
Max. Drawdown [%]                  -34.394405
Avg. Drawdown [%]                   -5.226205
Max. Drawdown Duration      309 days 00:00:00
Avg. Drawdown Duration       31 days 00:00:00
# Trades                                   82
Win Rate [%]                        53.658537
Best Trade [%]                      40.770713
Worst Trade [%]                    -14.353426
Avg. Trade [%]                    

In [72]:
bto_lib.plotstats()