In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyfolio as pf

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [2]:
# sp500 = yf.Ticker("^GSPC") ## S&P-500
sp500 = yf.Ticker("^IXIC") ## NASDAQ

startDate = "2010-01-01"
endDate = "2024-01-01"

df = sp500.history(interval="1d", start=startDate, end=endDate)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00-05:00,2294.409912,2311.149902,2294.409912,2308.419922,1931380000,0.0,0.0
2010-01-05 00:00:00-05:00,2307.270020,2313.729980,2295.620117,2308.709961,2367860000,0.0,0.0
2010-01-06 00:00:00-05:00,2307.709961,2314.070068,2295.679932,2301.090088,2253340000,0.0,0.0
2010-01-07 00:00:00-05:00,2298.090088,2301.300049,2285.219971,2300.050049,2270050000,0.0,0.0
2010-01-08 00:00:00-05:00,2292.239990,2317.600098,2290.610107,2317.169922,2145390000,0.0,0.0
...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,15006.179688,15047.190430,14927.120117,14992.969727,4796600000,0.0,0.0
2023-12-26 00:00:00-05:00,15028.690430,15101.179688,15024.059570,15074.570312,6120600000,0.0,0.0
2023-12-27 00:00:00-05:00,15089.660156,15114.080078,15051.669922,15099.179688,7480170000,0.0,0.0
2023-12-28 00:00:00-05:00,15142.089844,15150.070312,15087.219727,15095.139648,5090570000,0.0,0.0


In [3]:
# Drop Dividends column
df.drop(columns=['Dividends'], inplace=True)

# Drop Stock Splits column
df.drop(columns=['Stock Splits'], inplace=True)

In [4]:
## Calculate Simple Moving Average (SWA) using rolling mean.
df['SMA'] = df['Close'].rolling(window=14).mean()

## Calculate EMA-12 and EMA-26 using Exponential Weighing Average (EWM)
df['EMA-12'] = df['Close'].ewm(span = 12, adjust = False).mean()
df['EMA-26'] = df['Close'].ewm(span = 26, adjust = False).mean()

## Calculate MACD 
df['MACD'] = df['EMA-12'] - df['EMA-26']

In [5]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04 00:00:00-05:00,2294.409912,2311.149902,2294.409912,2308.419922,1931380000,,2308.419922,2308.419922,0.000000
2010-01-05 00:00:00-05:00,2307.270020,2313.729980,2295.620117,2308.709961,2367860000,,2308.464543,2308.441406,0.023137
2010-01-06 00:00:00-05:00,2307.709961,2314.070068,2295.679932,2301.090088,2253340000,,2307.330012,2307.896864,-0.566852
2010-01-07 00:00:00-05:00,2298.090088,2301.300049,2285.219971,2300.050049,2270050000,,2306.210017,2307.315619,-1.105601
2010-01-08 00:00:00-05:00,2292.239990,2317.600098,2290.610107,2317.169922,2145390000,,2307.896157,2308.045567,-0.149410
...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,15006.179688,15047.190430,14927.120117,14992.969727,4796600000,14645.622838,14739.929221,14454.702384,285.226836
2023-12-26 00:00:00-05:00,15028.690430,15101.179688,15024.059570,15074.570312,6120600000,14705.955706,14791.412465,14500.618527,290.793938
2023-12-27 00:00:00-05:00,15089.660156,15114.080078,15051.669922,15099.179688,7480170000,14773.989258,14838.761269,14544.956391,293.804878
2023-12-28 00:00:00-05:00,15142.089844,15150.070312,15087.219727,15095.139648,5090570000,14827.928502,14878.204096,14585.710706,292.493390


In [6]:
## Calculate RSI using formula
## RSI = 100 – [100 ÷ ( 1 + (Average Gain During Up Periods ÷ Average Loss During Down Periods ))]

delta = df['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))

In [7]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-04 00:00:00-05:00,2294.409912,2311.149902,2294.409912,2308.419922,1931380000,,2308.419922,2308.419922,0.000000,
2010-01-05 00:00:00-05:00,2307.270020,2313.729980,2295.620117,2308.709961,2367860000,,2308.464543,2308.441406,0.023137,
2010-01-06 00:00:00-05:00,2307.709961,2314.070068,2295.679932,2301.090088,2253340000,,2307.330012,2307.896864,-0.566852,
2010-01-07 00:00:00-05:00,2298.090088,2301.300049,2285.219971,2300.050049,2270050000,,2306.210017,2307.315619,-1.105601,
2010-01-08 00:00:00-05:00,2292.239990,2317.600098,2290.610107,2317.169922,2145390000,,2307.896157,2308.045567,-0.149410,
...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,15006.179688,15047.190430,14927.120117,14992.969727,4796600000,14645.622838,14739.929221,14454.702384,285.226836,78.343783
2023-12-26 00:00:00-05:00,15028.690430,15101.179688,15024.059570,15074.570312,6120600000,14705.955706,14791.412465,14500.618527,290.793938,78.894673
2023-12-27 00:00:00-05:00,15089.660156,15114.080078,15051.669922,15099.179688,7480170000,14773.989258,14838.761269,14544.956391,293.804878,83.943354
2023-12-28 00:00:00-05:00,15142.089844,15150.070312,15087.219727,15095.139648,5090570000,14827.928502,14878.204096,14585.710706,292.493390,81.107136


In [8]:
# Create target variable: 1 if next day's close is higher than today's, else 0
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

In [9]:
df['PriceDiff'] = df['Close'].diff()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI,Target,PriceDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04 00:00:00-05:00,2294.409912,2311.149902,2294.409912,2308.419922,1931380000,,2308.419922,2308.419922,0.000000,,1,
2010-01-05 00:00:00-05:00,2307.270020,2313.729980,2295.620117,2308.709961,2367860000,,2308.464543,2308.441406,0.023137,,0,0.290039
2010-01-06 00:00:00-05:00,2307.709961,2314.070068,2295.679932,2301.090088,2253340000,,2307.330012,2307.896864,-0.566852,,0,-7.619873
2010-01-07 00:00:00-05:00,2298.090088,2301.300049,2285.219971,2300.050049,2270050000,,2306.210017,2307.315619,-1.105601,,1,-1.040039
2010-01-08 00:00:00-05:00,2292.239990,2317.600098,2290.610107,2317.169922,2145390000,,2307.896157,2308.045567,-0.149410,,0,17.119873
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22 00:00:00-05:00,15006.179688,15047.190430,14927.120117,14992.969727,4796600000,14645.622838,14739.929221,14454.702384,285.226836,78.343783,1,29.099609
2023-12-26 00:00:00-05:00,15028.690430,15101.179688,15024.059570,15074.570312,6120600000,14705.955706,14791.412465,14500.618527,290.793938,78.894673,1,81.600586
2023-12-27 00:00:00-05:00,15089.660156,15114.080078,15051.669922,15099.179688,7480170000,14773.989258,14838.761269,14544.956391,293.804878,83.943354,0,24.609375
2023-12-28 00:00:00-05:00,15142.089844,15150.070312,15087.219727,15095.139648,5090570000,14827.928502,14878.204096,14585.710706,292.493390,81.107136,0,-4.040039


In [10]:
# Fetch VIX index data
vix = yf.Ticker("^VIX")
vix_df = vix.history(interval="1d", start=startDate, end=endDate)


df.index = df.index.tz_convert('UTC')
vix_df.index = vix_df.index.tz_convert('UTC')

# Shift the timestamps of vix_df by one hour
vix_df.index = vix_df.index - pd.Timedelta(hours=1)
# Merge VIX close prices into the original DataFrame
# df['VIX'] = vix_df['Close']

df = df.join(vix_df[['Close']], rsuffix='_VIX')

# Rename the VIX close column
df.rename(columns={'Close_VIX': 'VIX'}, inplace=True)

vix_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 05:00:00+00:00,21.680000,21.680000,20.030001,20.040001,0,0.0,0.0
2010-01-05 05:00:00+00:00,20.049999,20.129999,19.340000,19.350000,0,0.0,0.0
2010-01-06 05:00:00+00:00,19.590000,19.680000,18.770000,19.160000,0,0.0,0.0
2010-01-07 05:00:00+00:00,19.680000,19.709999,18.700001,19.059999,0,0.0,0.0
2010-01-08 05:00:00+00:00,19.270000,19.270000,18.110001,18.129999,0,0.0,0.0
...,...,...,...,...,...,...,...
2023-12-22 05:00:00+00:00,13.720000,13.960000,13.000000,13.030000,0,0.0,0.0
2023-12-26 05:00:00+00:00,13.770000,13.800000,12.960000,12.990000,0,0.0,0.0
2023-12-27 05:00:00+00:00,13.020000,13.040000,12.370000,12.430000,0,0.0,0.0
2023-12-28 05:00:00+00:00,12.440000,12.650000,12.380000,12.470000,0,0.0,0.0


In [11]:
# Remove any row with NaN value
df = df.dropna()
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA,EMA-12,EMA-26,MACD,RSI,Target,PriceDiff,VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-01-22 05:00:00+00:00,2255.760010,2262.270020,2200.370117,2205.290039,2817620000,2294.673549,2283.929921,2295.591410,-11.661489,31.016435,1,-60.409912,27.309999
2010-01-25 05:00:00+00:00,2220.290039,2223.219971,2201.169922,2210.800049,2134350000,2287.700701,2272.679171,2289.310569,-16.631397,32.387948,0,5.510010,25.410000
2010-01-26 05:00:00+00:00,2203.439941,2227.889893,2195.439941,2203.729980,2361260000,2280.202131,2262.071604,2282.971266,-20.899662,31.512364,1,-7.070068,24.549999
2010-01-27 05:00:00+00:00,2200.300049,2225.669922,2192.590088,2221.409912,2492880000,2274.510690,2255.815959,2278.411165,-22.595207,36.448004,0,17.679932,23.139999
2010-01-28 05:00:00+00:00,2220.310059,2220.870117,2166.899902,2179.000000,2829640000,2265.864258,2243.998119,2271.047375,-27.049256,31.951651,0,-42.409912,23.730000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22 05:00:00+00:00,15006.179688,15047.190430,14927.120117,14992.969727,4796600000,14645.622838,14739.929221,14454.702384,285.226836,78.343783,1,29.099609,13.030000
2023-12-26 05:00:00+00:00,15028.690430,15101.179688,15024.059570,15074.570312,6120600000,14705.955706,14791.412465,14500.618527,290.793938,78.894673,1,81.600586,12.990000
2023-12-27 05:00:00+00:00,15089.660156,15114.080078,15051.669922,15099.179688,7480170000,14773.989258,14838.761269,14544.956391,293.804878,83.943354,0,24.609375,12.430000
2023-12-28 05:00:00+00:00,15142.089844,15150.070312,15087.219727,15095.139648,5090570000,14827.928502,14878.204096,14585.710706,292.493390,81.107136,0,-4.040039,12.470000


In [12]:
# Select features and target
features = ['SMA', 'MACD', 'RSI', 'VIX']
X = df[features]
y = df['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train the model

model = RandomForestClassifier(
                                n_estimators=21,
                                # n_estimators=5,
                                max_depth=3, 
                                # max_features=2, # Sqrt(n_features) 
                                random_state=42
                                )
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

print("Train score :", model.score(X_train, y_train))
print("Test score :", model.score(X_test, y_test))

Model Accuracy: 0.53
Train score : 0.577841111506947
Test score : 0.5256410256410257


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select features and target
features = ['SMA', 'MACD', 'RSI', 'VIX']
X = df[features]
y = df['Target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


if (False):
    # Define the range of n_estimators to explore
    estimator_range = range(10, 210, 10)  # Adjust the range as needed

    # Loop through the range of n_estimators
    for n_estimators in estimator_range:
        # Train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=3, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Number of Estimators: {n_estimators}, Model Accuracy: {accuracy:.5f}")

        # Optionally, print train and test scores
        # print("Train score :", model.score(X_train, y_train))
        # print("Test score :", model.score(X_test, y_test))
