In [13]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyfolio as pf

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [14]:
sp500 = yf.Ticker("^GSPC") ## S&P-500

startDate = "2010-01-01"
endDate = "2024-01-01"

df = sp500.history(interval="1d", start=startDate, end=endDate)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00-05:00,1116.560059,1133.869995,1116.560059,1132.989990,3991400000,0.0,0.0
2010-01-05 00:00:00-05:00,1132.660034,1136.630005,1129.660034,1136.520020,2491020000,0.0,0.0
2010-01-06 00:00:00-05:00,1135.709961,1139.189941,1133.949951,1137.140015,4972660000,0.0,0.0
2010-01-07 00:00:00-05:00,1136.270020,1142.459961,1131.319946,1141.689941,5270680000,0.0,0.0
2010-01-08 00:00:00-05:00,1140.520020,1145.390015,1136.219971,1144.979980,4389590000,0.0,0.0
...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,4753.919922,4772.939941,4736.770020,4754.629883,3046770000,0.0,0.0
2023-12-26 00:00:00-05:00,4758.859863,4784.720215,4758.450195,4774.750000,2513910000,0.0,0.0
2023-12-27 00:00:00-05:00,4773.450195,4785.390137,4768.899902,4781.580078,2748450000,0.0,0.0
2023-12-28 00:00:00-05:00,4786.439941,4793.299805,4780.979980,4783.350098,2698860000,0.0,0.0


In [15]:
# Drop Dividends column
df.drop(columns=['Dividends'], inplace=True)

# Drop Stock Splits column
df.drop(columns=['Stock Splits'], inplace=True)

In [16]:
## Calculate Simple Moving Average (SWA) using rolling mean.
df['SMA'] = df['Close'].rolling(window=14).mean()

## Calculate EMA-12 and EMA-26 using Exponential Weighing Average (EWM)
df['EMA-12'] = df['Close'].ewm(span = 12, adjust = False).mean()
df['EMA-26'] = df['Close'].ewm(span = 26, adjust = False).mean()

## Calculate MACD 
df['MACD'] = df['EMA-12'] - df['EMA-26']

In [1]:
df

NameError: name 'df' is not defined

In [None]:
## Calculate RSI using formula
## RSI = 100 – [100 ÷ ( 1 + (Average Gain During Up Periods ÷ Average Loss During Down Periods ))]

delta = df['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))

In [None]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-04 00:00:00-05:00,1116.560059,1133.869995,1116.560059,1132.989990,3991400000,,1132.989990,1132.989990,0.000000,
2010-01-05 00:00:00-05:00,1132.660034,1136.630005,1129.660034,1136.520020,2491020000,,1133.533072,1133.251474,0.281598,
2010-01-06 00:00:00-05:00,1135.709961,1139.189941,1133.949951,1137.140015,4972660000,,1134.087986,1133.539514,0.548472,
2010-01-07 00:00:00-05:00,1136.270020,1142.459961,1131.319946,1141.689941,5270680000,,1135.257518,1134.143249,1.114268,
2010-01-08 00:00:00-05:00,1140.520020,1145.390015,1136.219971,1144.979980,4389590000,,1136.753281,1134.945970,1.807311,
...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,4753.919922,4772.939941,4736.770020,4754.629883,3046770000,4673.364990,4694.952226,4617.779082,77.173143,75.219000
2023-12-26 00:00:00-05:00,4758.859863,4784.720215,4758.450195,4774.750000,2513910000,4688.191406,4707.228806,4629.406558,77.822249,77.026599
2023-12-27 00:00:00-05:00,4773.450195,4785.390137,4768.899902,4781.580078,2748450000,4704.779994,4718.667463,4640.678670,77.988793,81.131399
2023-12-28 00:00:00-05:00,4786.439941,4793.299805,4780.979980,4783.350098,2698860000,4718.905727,4728.618638,4651.246924,77.371714,79.209536


In [None]:
# Create target variable: 1 if next day's close is higher than today's, else 0
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

In [None]:
df['PriceDiff'] = df['Close'].diff()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI,Target,PriceDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04 00:00:00-05:00,1116.560059,1133.869995,1116.560059,1132.989990,3991400000,,1132.989990,1132.989990,0.000000,,1,
2010-01-05 00:00:00-05:00,1132.660034,1136.630005,1129.660034,1136.520020,2491020000,,1133.533072,1133.251474,0.281598,,1,3.530029
2010-01-06 00:00:00-05:00,1135.709961,1139.189941,1133.949951,1137.140015,4972660000,,1134.087986,1133.539514,0.548472,,1,0.619995
2010-01-07 00:00:00-05:00,1136.270020,1142.459961,1131.319946,1141.689941,5270680000,,1135.257518,1134.143249,1.114268,,1,4.549927
2010-01-08 00:00:00-05:00,1140.520020,1145.390015,1136.219971,1144.979980,4389590000,,1136.753281,1134.945970,1.807311,,1,3.290039
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,4753.919922,4772.939941,4736.770020,4754.629883,3046770000,4673.364990,4694.952226,4617.779082,77.173143,75.219000,1,7.879883
2023-12-26 00:00:00-05:00,4758.859863,4784.720215,4758.450195,4774.750000,2513910000,4688.191406,4707.228806,4629.406558,77.822249,77.026599,1,20.120117
2023-12-27 00:00:00-05:00,4773.450195,4785.390137,4768.899902,4781.580078,2748450000,4704.779994,4718.667463,4640.678670,77.988793,81.131399,1,6.830078
2023-12-28 00:00:00-05:00,4786.439941,4793.299805,4780.979980,4783.350098,2698860000,4718.905727,4728.618638,4651.246924,77.371714,79.209536,0,1.770020


In [None]:
# Remove any row with NaN value
df = df.dropna()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI,Target,PriceDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-22 00:00:00-05:00,1115.489990,1115.489990,1090.180054,1091.760010,6208650000,1135.942854,1130.206118,1133.353746,-3.147629,33.114902,1,-24.719971
2010-01-25 00:00:00-05:00,1092.400024,1102.969971,1092.400024,1096.780029,4481390000,1133.356428,1125.063642,1130.644582,-5.580940,35.756431,0,5.020020
2010-01-26 00:00:00-05:00,1095.800049,1103.689941,1089.859985,1092.170044,4731910000,1130.188572,1120.003089,1127.794616,-7.791528,32.701444,1,-4.609985
2010-01-27 00:00:00-05:00,1091.939941,1099.510010,1083.109985,1097.500000,5319120000,1127.357143,1116.541075,1125.550571,-9.009496,35.086501,0,5.329956
2010-01-28 00:00:00-05:00,1096.930054,1100.219971,1078.459961,1084.530029,5452400000,1123.274292,1111.616299,1122.512012,-10.895713,29.776400,0,-12.969971
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,4753.919922,4772.939941,4736.770020,4754.629883,3046770000,4673.364990,4694.952226,4617.779082,77.173143,75.219000,1,7.879883
2023-12-26 00:00:00-05:00,4758.859863,4784.720215,4758.450195,4774.750000,2513910000,4688.191406,4707.228806,4629.406558,77.822249,77.026599,1,20.120117
2023-12-27 00:00:00-05:00,4773.450195,4785.390137,4768.899902,4781.580078,2748450000,4704.779994,4718.667463,4640.678670,77.988793,81.131399,1,6.830078
2023-12-28 00:00:00-05:00,4786.439941,4793.299805,4780.979980,4783.350098,2698860000,4718.905727,4728.618638,4651.246924,77.371714,79.209536,0,1.770020


In [None]:
# Select features and target
features = ['SMA', 'MACD', 'RSI', 'PriceDiff']
X = df[features]
y = df['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train the model
model = RandomForestClassifier(
                                n_estimators=93,
                                max_depth=5, 
                                max_features=2, # Sqrt(n_features) 
                                random_state=42
                                )
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Test score :", model.score(X_train, y_train))
print("Test score :", model.score(X_test, y_test))

Model Accuracy: 0.52
Test score : 0.5757035981474884
Test score : 0.5170940170940171
