# Using a Decision Tree model to predict daily stock returns

## Prepare data

In [1]:
import numpy as np
import pandas as pd
import warnings
import sys
import os

warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore all future warning

In [2]:
# Load the pre-processed VNINDEX table (price columns plus engineered features) from disk.
df = pd.read_csv(filepath_or_buffer='../data/processed/VNINDEX_add_features.csv')
# Print column names, dtypes and non-null counts as a quick schema check.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6423 entries, 0 to 6422
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   DTYYYYMMDD            6423 non-null   object 
 1   Ticker                6423 non-null   object 
 2   Open                  6423 non-null   float64
 3   High                  6423 non-null   float64
 4   Low                   6423 non-null   float64
 5   Close                 6423 non-null   float64
 6   Volume                6423 non-null   float64
 7   Outlier               6423 non-null   bool   
 8   daily_returns         6423 non-null   float64
 9   monthly_returns       6423 non-null   float64
 10  yearly_returns        6423 non-null   float64
 11  Net_advances          6423 non-null   int64  
 12  A/D                   6423 non-null   float64
 13  Schultz               6423 non-null   float64
 14  EMA19_net_adv         6423 non-null   float64
 15  EMA39_net_adv        

In [3]:
## Calculate the correlation matrix of the candidate features
# Identifier, raw price/volume and longer-horizon return columns are excluded, leaving
# daily_returns (the target) together with the engineered indicator columns.
non_feature_columns = [
    'DTYYYYMMDD', 'Ticker', 'Open', 'High', 'Low', 'Volume', 'Close',
    'Outlier', 'monthly_returns', 'yearly_returns',
]
# `drop` already returns a new frame, so `df` itself is left untouched.
corr_df = df.drop(columns=non_feature_columns)
corr_matrix = corr_df.corr()

## Display the correlation matrix
# These options are set globally (they persist for later cells) so the wide matrix is
# printed in full instead of being truncated or wrapped.
for option_name, option_value in (
    ('display.width', None),
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),  # keep the DataFrame from wrapping onto several lines
    ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)
print(corr_matrix)


                      daily_returns  Net_advances       A/D   Schultz  EMA19_net_adv  EMA39_net_adv  McClellan_Oscillator      TRIN  StockAboveMA50       MA5      EMA5      MA10     EMA10      MA20     EMA20      MA50     EMA50     MA100    EMA100     MA200    EMA200
daily_returns              1.000000      0.629682  0.385782  0.360136       0.304914       0.221159              0.357898 -0.045105        0.078571 -0.025212 -0.021846 -0.030325 -0.027663 -0.034325 -0.032604 -0.040646 -0.038338 -0.042650 -0.041812 -0.044733 -0.044045
Net_advances               0.629682      1.000000  0.499066  0.634140       0.443493       0.321071              0.521660 -0.027070        0.148104  0.025248  0.027990  0.022683  0.024633  0.021810  0.022581  0.019807  0.020557  0.018636  0.019296  0.017505  0.019351
A/D                        0.385782      0.499066  1.000000  0.428697       0.156179       0.102137              0.203736 -0.022393        0.111282  0.114891  0.116383  0.115240  0.116160  0.11777

In [4]:
# Feature choice based on the correlation matrix above: these four columns show the
# strongest correlation with daily_returns (about 0.63, 0.39, 0.36 and 0.36 respectively).
selected_features = [
    'Net_advances',
    'A/D',
    'Schultz',
    'McClellan_Oscillator',
]
# The prediction target, kept as a one-element list of column names.
target_feature = ['daily_returns']

In [5]:
# Resolve the project's `src` folder relative to the current working directory
# (this assumes the notebook is run from a sibling folder of `src`).
src_path = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
# Put `src` at the front of the import path. The membership guard keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if src_path not in sys.path:
    sys.path.insert(0, src_path)
# Project-local import; it has to come after the sys.path adjustment above.
from models.DecisionTree import DecisionTree, prepare_data_for_decision_tree

# Build the feature matrix X and label vector y from the selected columns.
# NOTE(review): 'up_and_down' presumably turns daily_returns into up/down class
# labels -- confirm against prepare_data_for_decision_tree.
X, y = prepare_data_for_decision_tree(
    df,
    selected_features,
    target_feature,
    method_to_create_threshold='up_and_down',
)
print(X.shape)
print(y.shape)
# The positional split below silently misaligns features and labels if the lengths differ.
assert len(X) == len(y), "X and y must contain the same number of rows"

# Positional 80/20 split without shuffling: the first 80% of rows are used for
# training and the remaining 20% for testing.
# NOTE(review): this is only a chronological hold-out if df is sorted by date -- confirm.
split_index = int(len(X) * 0.8)

# Split the data
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]


(6423, 4)
(6423,)


## Train and predict with the DecisionTree model

In [6]:
# Fit the project's DecisionTree on the training slice and predict the held-out slice.
model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# NOTE(review): the metric behind `score` is defined in the project's DecisionTree class.
test_score = model.score(y_test, y_pred)
print('Score:', test_score)

# Left-pad the predictions with one zero per training row so that `y_predict` has an
# entry for every row of the full dataset, not just the test slice.
# NOTE(review): 0 presumably means "no signal" for the trading step -- confirm.
train_length = len(y_train)
y_predict = np.pad(y_pred, pad_width=(train_length, 0), mode='constant', constant_values=0)


Score: 0.48715953307392995


## Trade with the predictions

In [7]:
# Project-local import; relies on `src` having been added to sys.path in an earlier cell.
from trade_stocks.trade import Trading

trading = Trading()

# Convert the padded prediction vector into trade signals and show them.
trade_signal = trading.generate_trade_signal(y_predict)
print(trade_signal)

# Run the signals against the closing prices (copied into a fresh NumPy array).
close_prices = np.array(df['Close'])
finally_capital = trading.execute_trade(trade_signal, close_prices)
print(finally_capital)

# Collect the summary statistics reported by the Trading object.
performance_summary = trading.performance()
total_profit, total_trade_number, win_rate, profit_factor = performance_summary

print(f"Total profit: {total_profit}")
print(f"Total trade number: {total_trade_number}")
print(f"Win rate: {win_rate}")
print(f"Profit factor: {profit_factor}")


[ 0.  0.  0. ...  1. -1.  1.]
2003605.8157666668
Total profit: 3605.815766666783
Total trade number: 238
Win rate: 0.5672268907563025
Profit factor: 1.1699820442460325
