# Project Topic: Application of Ensemble Learning (Blending) in developing a Predictive model for Nvidia Stock price uptrend move.
### - Caleb Fowowe

#### Import Libraries

In [1]:
from sklearn.linear_model import LogisticRegression

from src.utils_data_processing import LoadData, cwts, getpath
from src.utils_features_engineering import (FeaturesCreation, FeaturesTransformation, FeaturesSelection)
from src.utils_model_and_tuning import Blending, HpTuning

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd

import numpy as np
import quantstats as qs
from datetime import datetime

# Folder used for saving the notebook's graphics and the trading-strategy reports.
# Per the original note the project helper `getpath` creates it; the returned path
# is used further down to build the HTML report file names.
output_path = getpath()

### Load Data, EDA, Fix Null Data, and Plot Candlestick

In [None]:
# Base names of the input data files, stored as the value of the 'files' key so the
# dictionary can be unpacked as a keyword argument below.
# Judging by the names: the stock itself first, then a volatility index and macro-economic series.
data_files = {'files': ['NVDA', 'VVIX_History', 'USCPI', 'USGDP', 'FedFundRate', '2yrTreasury', '10yrTreasury']}

# Start and end of the period to keep, in case the provided data goes back further than required.
time_period = ['2008', '2024']
# The stock/company name is the first file name; it is reused in the report titles and file names.
company_name = data_files['files'][0]

# Instantiate the loader: the two period bounds are unpacked positionally, the file list as `files=...`.
ldata = LoadData(*time_period, **data_files)

# Merge all data together to form a single dataframe.
df = ldata.joinData()

##### Exploratory Data Analysis (EDA)

In [None]:
### Check and fix null data
# Report missing values in the merged dataset.
print(ldata.checkNullData(df))
# Initial step of fixing missing data: back-fill ('bfill') is used to fill the
# quarterly and monthly macro data.
df = ldata.fixNullData(df, method='bfill')
# Check again for null data after the back-fill.
print(ldata.checkNullData(df))
# Fix the remaining gaps (future data that is not available yet). Dropping those rows
# would be an alternative; the 'knnimpute' method is used here.
df = ldata.fixNullData(df, method='knnimpute')
# ldata.plotCandleStick(df)
ldata.plotPrices(df)
# Preview the last rows of the cleaned dataset.
df.tail()

In [None]:
# Summary statistics (pandas `describe`) of the cleaned, merged dataset.
df.describe()

## Feature Engineering

##### FeaturesEngineering class with its FeaturesCreation, FeaturesTransformation, and FeaturesSelection sub-classes

#### Feature Creation/Extraction

##### Define the parameters to be used in the target variable (y)/ Label

In [None]:
# The target is a trend-and-volatility play. A signal is created when the return over a
# short period (5 days) crosses over the return trend over a relatively medium period
# (10 days). There is an outperformance threshold (hurdle) by which the 5-day return must
# beat the medium-period return before this is characterised as a condition. The other
# condition is a volatility play: the standard deviation of the short-period return must be
# less than the upper-boundary standard deviation of the medium-period return, where the
# upper boundary is characterised as 2 standard deviations from the mean.
# NOTE(review): the labelling logic itself lives in FeaturesCreation, which is not shown
# here, and `lower_std` is not covered by the description above - confirm how it is used.

# Target parameters
short_prd= 5      # short return window, in days
medium_prd = 10   # medium return window, in days
upper_std=2       # upper volatility boundary, in standard deviations
lower_std = 1     # presumably the lower volatility boundary, in standard deviations - confirm
hurdle = 0.005    # outperformance threshold the short-period return must clear

##### Generate Features - all features (pandas-ta library)

In [None]:
# Instantiate the FeaturesCreation subclass with the dataframe containing the cleaned data
# and the target parameters defined above.
# No test size is passed at this stage; in this notebook it is supplied later, when
# FeaturesSelection is instantiated for the feature-selection step.
feat_df = FeaturesCreation(df, short_prd, medium_prd, upper_std, lower_std, hurdle)

# If the fundamental and macro ratios are provided, this generates a feature set of:
#1. Company fundamentals-related features (requires specific column labels)
#2. Macro-economic related features (requires specific column labels)
#3. Technical indicator features (based on the pandas-ta library) (requires the column labels 'Open', 'High', 'Low', 'Close', 'Volume')
new_ft = feat_df.create_all_features(fundamental_features=True, macro_features=True)

# Per the original note, the OHLCV columns are dropped after being used to generate the
# technical indicators. Below is a preview of the first five rows of the feature set
# (about 320 features according to the author), covering macro-economic, fundamental and
# technical indicators.
new_ft.head()

In [None]:
# (rows, columns) of the generated feature set.
new_ft.shape

#### Feature Transformation & Selection

##### Transform day feature column

In [None]:
# Before the feature-selection process starts, the 'days' feature, which consists of the
# trading days (Monday - Friday), is transformed into two features.
# Instantiate the FeaturesTransformation subclass with the generated feature set as input.
feat_transform = FeaturesTransformation(new_ft)
# Invoke transformDaysColumn, which (per the original note) uses the DaysTransformer
# subclass to transform the 'days' column; a column named 'days' must be present in the
# input dataset for this method to execute. The whole feature set, with the transformed
# 'days' column, is stored in `new_ft2`.
new_ft2 = feat_transform.transformDaysColumn()

###### To optimize dataframe performance, all features other than the target column are converted to 'float64', and the target variable column is converted to the 'int16' datatype

In [None]:
# Cast every column to float64 first, then overwrite the label column 'predict' with
# its int16 representation (converted on the underlying NumPy array).
new_ft2 = new_ft2.astype('float64').assign(
    predict=lambda frame: frame['predict'].to_numpy().astype('int16')
)

In [None]:
# Display the feature set after the 'days' transformation and the dtype conversion.
new_ft2

#### Feature Selection

###### Feature selection - Wrapper Method: Boruta and Recursive Feature Elimination (RFE)

In [None]:
# Instantiate the FeaturesSelection subclass with the dataframe from above (the one with
# the 'days' column transformed) as the required input, and the test size as an optional
# input (per the original note its default is 0.20).
feat_select = FeaturesSelection(new_ft2, testsize = 0.20)
# Invoke the wrapper_boruta method of the FeaturesSelection subclass, capped at 200 iterations.
feat1 = feat_select.wrapper_boruta(max_iter=200)

# NOTE(review): `feat1` is not referenced again in the visible part of this notebook; the
# original comment describing what wrapper_boruta returns was left unfinished - confirm
# against the FeaturesSelection implementation.

##### Feature selection - Filtering Method: Addressing Multicollinearity among features

###### Using the same FeaturesSelection class, specify the correlation coefficient threshold of choice. (The project tested correlation thresholds in the 0.60 - 0.90 range.)

In [None]:
# Call the filter_correlation method of the class, providing the desired correlation threshold.
# The multicollinearity step follows the Boruta and RFE intersection step. Hence there is no
# need to pass a dataframe: per the author, the method is designed to take as input the
# dataframe that contains the features output by the Boruta and RFE intersection.
# For testing purposes, however, the function can optionally be given both a correlation
# coefficient and a dataframe, and it will filter that dataframe for multicollinearity among features.
filtered_feature = feat_select.filter_correlation(corr_coeff=0.70)  

In [None]:
# Keep only the features that survived selection. The explicit `.copy()` gives data3 its
# own data, so adding the label column below is a plain assignment on an independent frame
# rather than a write into a selection of new_ft2 (which raises SettingWithCopyWarning).
data3 = new_ft2[filtered_feature].copy()
# Append the label as the last column; later cells split features/label by position.
data3['predict'] = new_ft2['predict'].values.astype('int')

data3.head()

### Ensemble Model - Blending Ensemble

##### Initial parameterization of basemodels and metamodel

In [None]:
# Class weights computed by the project helper `cwts` from the selected dataset; shared by
# every scikit-learn estimator below that accepts a `class_weight` argument.
cls_weight = cwts(data3)

# ---- Base models (level 0) ----
lr_params = {'random_state': 1, 'class_weight': cls_weight}
lr = LogisticRegression(**lr_params)

dt_params = {'class_weight': cls_weight, 'random_state': 1}
dt = DecisionTreeClassifier(**dt_params)

# KNN has no class-weight option.
knn_params = {'algorithm': 'auto', 'n_jobs': -1}
knn = KNeighborsClassifier(**knn_params)

# GaussianNB starts from its defaults; the empty dict is kept so it can be updated after tuning.
bayes_params = {}
bayes = GaussianNB()
bayes.set_params(**bayes_params)

# probability=True so the SVC can produce class probabilities.
svc_params = {'class_weight': cls_weight,'random_state': 1, 'probability': True}
svc = SVC(**svc_params)

basemodels = {'lr': lr, 'dte': dt, 'knn': knn, 'bayes': bayes, 'svc': svc}

# ---- Meta-model / blender (level 1) ----
# XGBClassifier has no `class_weight` or `verbose` constructor parameter: unknown keys are
# forwarded to the booster, which ignores them with a "Parameters ... are not used" warning,
# so the blender was effectively trained without any class re-weighting. The supported
# equivalents are `scale_pos_weight` (typical value: negative count / positive count) and
# `verbosity`.
# Assumes the 'predict' label is binary 0/1 (the back-test below trades on Signal == 1).
n_negative = int((data3['predict'] == 0).sum())
n_positive = int((data3['predict'] == 1).sum())
xgb_params = {
    'n_jobs': -1,
    'scale_pos_weight': n_negative / max(n_positive, 1),  # guard against a label column with no positives
    'random_state': 1,
    'verbosity': 1,
}
xgb = XGBClassifier(**xgb_params)

blender = xgb

##### Initial run of the blending model

In [None]:
# Split the selected dataset by position: every column except the last one is a feature,
# the last column is the target.
feature_part, target_part = data3.iloc[:, :-1], data3.iloc[:, -1]
X_final = feature_part.to_numpy()
y_final = target_part.to_numpy()

# Blend the untuned base models with the untuned blender, using a validation size of 0.20.
Blnd = Blending(X_final, y_final, basemodels, blender, valsize=0.20)
initial_run = Blnd.runBlendingEnsemble()
acc, f1score, ypred, yprob, yfull = initial_run

print(f"Accuracy Score: {acc: .1%}, f1score: {f1score:.1%}")

#### Hyperparameter Tuning

In [None]:
# Set up the hyper-parameter search on the final feature matrix / label vector (40 trials).
tune_model = HpTuning(X_final, y_final, n_trials=40)

# Run one optimisation per model, in the same order as before.
tuned_lr = tune_model.optimize_lr()
tuned_dt = tune_model.optimize_dt()
tuned_svc = tune_model.optimize_svc()
tuned_knn = tune_model.optimize_knn()
tuned_bayes = tune_model.optimize_bayes()
tuned_xgb = tune_model.optimize_xgb()

# Print the `.values` of each tuning result, tab-separated, on a single line.
print(
    "optimal_lr:", tuned_lr.values, "\t",
    "optimal_dt:", tuned_dt.values, "\t",
    "optimal_svc:", tuned_svc.values, "\t",
    "optimal_knn:", tuned_knn.values, "\t",
    "optimal_bayes:", tuned_bayes.values, "\t",
    "optimal_xgb:", tuned_xgb.values,
)

#### Run Ensemble Model with tuned parameters

##### Update the initial parameters dictionary with the hyperparameter tuning parameter values

In [None]:
# Overlay the tuned hyper-parameters on the initial parameter dictionaries and rebuild
# every estimator, so fixed settings (random_state, class weights, ...) are kept.
lr_params.update(tuned_lr.params)
lr = LogisticRegression(**lr_params)

dt_params.update(tuned_dt.params)
dt = DecisionTreeClassifier(**dt_params)

knn_params.update(tuned_knn.params)
knn = KNeighborsClassifier(**knn_params)

# Apply the tuned GaussianNB parameters as well. Previously `bayes_params` was reset to an
# empty dict here, so the result of optimize_bayes() was computed but never used and the
# "tuned" ensemble still contained a default GaussianNB.
bayes_params = dict(tuned_bayes.params)
bayes = GaussianNB(**bayes_params)

svc_params.update(tuned_svc.params)
svc = SVC(**svc_params)

# Same keys as the initial `basemodels` dictionary ('lr', not 'lre'), so both runs label
# the base models identically.
basemod_upd = {'lr': lr, 'dte': dt, 'knn': knn, 'bayes': bayes, 'svc': svc}

xgb_params.update(tuned_xgb.params)
xgb = XGBClassifier(**xgb_params)

blender_upd = xgb

##### TunedModels Output

In [None]:
# Re-run the blending ensemble with the tuned base models and the tuned blender,
# keeping the same validation size as the initial run.
Blnd = Blending(X_final, y_final, basemod_upd, blender_upd, valsize=0.20)
tuned_run = Blnd.runBlendingEnsemble()
acc_tuned, f1score_tuned, ypred_tuned, yprob_tuned, yfull_tuned = tuned_run

print(f"Accuracy Score: {acc_tuned: .1%}, f1score: {f1score_tuned:.1%}")

### Backtest/Strategy Evaluation

##### Trading Strategy - full period backtest

In [None]:
# Number of rows (trading days) to look ahead for the exit price, and the lag used for
# the strategy return in the back-test cells below.
return_period = 1

In [None]:
# Extract the Close and Open prices for the last len(yfull_tuned) rows, i.e. the dates
# covered by the full-period predictions of the tuned model.
# `.copy()` makes backtest_data an independent frame, so the columns added here and in the
# following cells are not written into a slice of df (avoids SettingWithCopyWarning).
backtest_data = df[['Close', 'Open']].iloc[-len(yfull_tuned):].copy()
backtest_data['Signal'] = yfull_tuned

backtest_data

In [None]:
# Entry logic: when the strategy signal is 1 we enter a trade, buying at that day's close
# (0 means no trade on that row).
backtest_data['Entry'] = np.where(backtest_data['Signal'] == 1, backtest_data['Close'], 0)

# Exit logic: look `return_period` rows ahead.
#  - if that day's open is at or below the entry close, exit at that open;
#  - if that day's open is above the entry close, exit at that day's close.
# Rows with no trade, and trailing rows with no look-ahead data (NaN comparisons are False), get 0.
entry_close = backtest_data['Close']
exit_open = backtest_data['Open'].shift(-return_period)
exit_close = backtest_data['Close'].shift(-return_period)
in_trade = backtest_data['Entry'] != 0
backtest_data['Exit'] = np.where(
    in_trade & (exit_open <= entry_close),
    exit_open,
    np.where(in_trade & (exit_open > entry_close), exit_close, 0),
)

# Calculate MTM (per-row profit and loss of the trade opened on that row)
backtest_data['P&L'] = backtest_data['Exit'] - backtest_data['Entry']

# Generate Equity Curve: cumulative P&L on top of the first close price.
# `.iloc[0]` is an explicit positional lookup; `Series[0]` on a label-indexed Series is deprecated.
backtest_data['Equity'] = backtest_data['P&L'].cumsum() + backtest_data['Close'].iloc[0]

# Calculate Benchmark Return (daily log return of the close price)
# NOTE(review): the benchmark uses log returns while the strategy below uses simple
# returns - confirm this mix is intended before comparing the two.
backtest_data['Benchmark'] = np.log(backtest_data['Close']).diff().fillna(0)

# Calculate Strategy Return (simple return of the equity curve over `return_period` rows)
backtest_data['Strategy'] = (backtest_data['Equity']/backtest_data['Equity'].shift(return_period) - 1).fillna(0)

# Drop the trailing rows that have no exit price. The last `return_period` rows cannot look
# ahead, so a trade opened there would be booked as a full loss of the entry price.
# Identical to the previous hard-coded one-row trim when return_period == 1.
backtest_data = backtest_data.iloc[:-max(return_period, 1)]

##### Calculate the Sharpe Ratio

In [None]:
# Keep just the two return series (all rows) and compute the Sharpe ratio of each with quantstats.
bts = backtest_data.loc[:, ['Benchmark', 'Strategy']]
qs.stats.sharpe(bts)

In [None]:
# Timestamp used in the report file names (also reused by the out-of-sample report below).
date_time = f"{datetime.now():%Y-%m-%d, %H%M%S}"

# Build the full-period HTML report (strategy vs. benchmark) and save it in the output folder.
full_period_report = f'{output_path}/{company_name}_backtest_report_full_period-{date_time}.html'
qs.reports.html(
    bts['Strategy'],
    bts['Benchmark'],
    title=f'Strategy BackTest Report for {company_name}',
    output=full_period_report,
)

In [None]:
# qs.reports.full(bts['Strategy'], benchmark=bts['Benchmark'], mode='full', title=f'Strategy BackTest Report for {company_name}')

#### Out of Sample Test
##### Trading Strategy - For Test Data Period alone

In [None]:
# Out-of-sample back-test: close prices for the last len(ypred_tuned) rows only.
# The tuned model's predictions are used here. The previous version took `ypred` from the
# initial untuned run, so this section evaluated a different model than the full-period
# back-test above, which is built on `yfull_tuned`.
btdata = df['Close'].iloc[-len(ypred_tuned):].to_frame()

# Benchmark: daily log return of the close price (first row set to 0).
btdata['Benchmark'] = np.log(btdata['Close']).diff().fillna(0)
btdata['Signal'] = ypred_tuned
# Strategy: earn the benchmark return on days where the previous row's signal was 1;
# the first row has no previous signal and is treated as flat.
btdata['Strategy'] = btdata['Benchmark'] * btdata['Signal'].shift(1).fillna(0)

In [None]:
# Build the test-period HTML report (strategy vs. benchmark) and save it in the output
# folder; the file name reuses the `date_time` stamp created for the full-period report.
test_period_report = f'{output_path}/{company_name}_backtesting_report_test_period-{date_time}.html'
qs.reports.html(
    btdata['Strategy'],
    btdata['Benchmark'],
    title=f'Strategy BackTest Report for {company_name}',
    output=test_period_report,
)

In [None]:
# Keep just the two test-period return series (all rows) and compute the Sharpe ratio of each.
bto = btdata.loc[:, ['Benchmark', 'Strategy']]
qs.stats.sharpe(bto)