In [1]:
import yfinance as yf
import pandas as pd
from sklearn.metrics import precision_score, accuracy_score
from sklearn.metrics import precision_score, accuracy_score

In [2]:
# Render floats with two decimals in every displayed DataFrame/Series.
pd.set_option("display.float_format", '{:.2f}'.format)

# Fetch the complete available price history of the S&P 500 index (^GSPC)
# in a single chained call, so `sp500` is only ever bound to the DataFrame.
sp500 = yf.Ticker("^GSPC").history(period="max")
# Optional local snapshot: sp500.to_csv("sp500.csv")

In [3]:
# Remove the two columns that no later cell shown here reads.
# drop(..., errors="ignore") replaces `del` so the cell can be re-run: a second
# execution (or a download that lacks these columns) no longer raises KeyError.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [4]:
# Next row's close aligned to the current row; the final row becomes NaN.
# NOTE: the column name "Tomarrow" is misspelled, but later cells reference it
# by this exact spelling, so it is kept as-is.
sp500["Tomarrow"] = sp500.loc[:, "Close"].shift(periods=-1)

In [5]:
# 1 when the next close is strictly higher than the current close, else 0.
# A NaN "Tomarrow" (the final row) compares as False and therefore yields 0.
sp500["Target"] = sp500["Tomarrow"].gt(sp500["Close"]).astype(int)

In [6]:
# Keep only the rows from the start of 1990 onwards; .copy() gives an
# independent frame for the column assignments made in later cells.
sp500 = sp500.loc[slice("1990-1-1", None)].copy()

In [7]:
import sklearn
from sklearn.ensemble import RandomForestClassifier

# Hold-out split by position: the final 100 rows form the test set and
# everything before them the training set.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

In [8]:
# Rolling-window lengths, in rows, used to build the predictor columns.
horizons = [2,5,60,250,1000]
new_predictors = []

for horizon in horizons:
    # Current close relative to its rolling mean over the window. The rolling
    # mean is taken on the "Close" Series alone rather than on the whole
    # (growing) DataFrame, which produced the same values at a higher cost.
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    # Count of rows with Target == 1 among the `horizon` rows that precede the
    # current one: shift(1) moves each Target down one row before summing, so
    # the current row's own Target is never part of its feature.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [9]:
def backtest(data, model, new_predictors, start=2500, step=250):
    """Evaluate `model` on successive, expanding training windows.

    For every fold start ``i`` in ``range(start, len(data), step)`` the rows
    ``[0, i)`` are used for training and the rows ``[i, i + step)`` for
    testing. Each fold is handed to ``predict`` (looked up in the notebook's
    global namespace when this function is called) and the per-fold results
    are concatenated in order.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in chronological order; must contain the columns that
        ``predict`` reads.
    model : estimator
        Passed through unchanged to ``predict``.
    new_predictors : list of str
        Feature column names, passed through to ``predict``.
    start : int, default 2500
        Number of rows in the first training window.
    step : int, default 250
        Number of rows in each test fold (the last fold may be shorter).

    Returns
    -------
    pandas.DataFrame
        The concatenated frames returned by ``predict`` for every fold.

    Raises
    ------
    ValueError
        If ``data`` does not have more than ``start`` rows, so that not a
        single test fold can be formed.
    """
    fold_starts = range(start, data.shape[0], step)
    if len(fold_starts) == 0:
        # Without this guard pd.concat([]) fails with the unhelpful
        # "No objects to concatenate".
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test fold, "
            f"but data has only {data.shape[0]} rows"
        )

    all_predictions = []
    for i in fold_starts:
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        all_predictions.append(predict(train, test, new_predictors, model))

    return pd.concat(all_predictions)

In [10]:
# Discard every row that still contains a NaN: the warm-up rows of the
# rolling windows and the final row, whose "Tomarrow" value is unknown.
sp500 = sp500.dropna(axis=0, how="any")
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomarrow,Target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1993-12-14 00:00:00-05:00,465.73,466.12,462.46,463.06,275050000,461.84,0,1.00,1.00,1.00,1.00,1.00,32.00,1.03,127.00,1.18,512.00
1993-12-15 00:00:00-05:00,463.06,463.69,461.84,461.84,331770000,463.34,1,1.00,0.00,1.00,1.00,1.00,32.00,1.03,126.00,1.17,512.00
1993-12-16 00:00:00-05:00,461.86,463.98,461.86,463.34,284620000,466.38,1,1.00,1.00,1.00,2.00,1.00,32.00,1.03,127.00,1.18,513.00
1993-12-17 00:00:00-05:00,463.34,466.38,463.34,466.38,363750000,465.85,0,1.00,2.00,1.00,3.00,1.01,32.00,1.03,128.00,1.18,514.00
1993-12-20 00:00:00-05:00,466.38,466.90,465.53,465.85,255900000,465.30,0,1.00,1.00,1.00,2.00,1.01,32.00,1.03,128.00,1.18,513.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-01 00:00:00-05:00,5098.51,5140.33,5094.16,5137.08,4748110000,5130.95,0,1.00,2.00,1.01,3.00,1.06,38.00,1.16,141.00,1.27,534.00
2024-03-04 00:00:00-05:00,5130.99,5149.67,5127.18,5130.95,4758440000,5078.65,0,1.00,1.00,1.01,3.00,1.06,38.00,1.16,140.00,1.27,534.00
2024-03-05 00:00:00-05:00,5110.52,5114.54,5056.82,5078.65,4418410000,5104.76,1,0.99,0.00,1.00,2.00,1.04,38.00,1.14,140.00,1.25,533.00
2024-03-06 00:00:00-05:00,5108.03,5127.97,5092.22,5104.76,4559050000,5157.36,1,1.00,1.00,1.00,3.00,1.05,38.00,1.15,140.00,1.26,534.00


In [11]:
# Random-forest configuration; random_state is pinned so repeated runs build
# the same trees.
model = RandomForestClassifier(
    random_state=1,
    min_samples_split=50,
    n_estimators=200,
)

In [12]:
def predict(train, test, predictors, model, threshold=.6):
    """Fit `model` on `train` and label each `test` row as 1 or 0.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain the `predictors` columns and a "Target" column.
    predictors : list of str
        Feature columns used for fitting and for prediction.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        re-fitted in place on every call.
    threshold : float, default 0.6
        Minimum probability in column 1 of ``predict_proba`` required to
        emit a 1; anything below yields 0.

    Returns
    -------
    pandas.DataFrame
        Indexed like `test`, with the true "Target" column and a float
        "Predictions" column holding 0.0 / 1.0.
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 of predict_proba is taken as the probability of Target == 1,
    # which holds when both 0 and 1 occur in the training targets.
    proba_up = model.predict_proba(test[predictors])[:, 1]
    # A single vectorised comparison replaces the two in-place assignments;
    # the float dtype of the resulting labels is kept.
    preds = (proba_up >= threshold).astype(float)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    return pd.concat([test["Target"], preds], axis=1)

In [13]:
# Run the backtest over the cleaned frame using the engineered predictors and
# the default start/step settings.
predictions = backtest(data=sp500, model=model, new_predictors=new_predictors)

In [14]:
# How many rows were labelled 0 versus 1 across all folds.
predictions.loc[:, "Predictions"].value_counts()

Predictions
0.00    4281
1.00     830
Name: count, dtype: int64

In [15]:
# Score the backtest output with both metrics on the same (truth, prediction)
# pair; the generator keeps the two calls visibly parallel.
prec, acc = (
    metric(predictions["Target"], predictions["Predictions"])
    for metric in (precision_score, accuracy_score)
)

In [16]:
# Report both metrics, one per line.
for metric_label, metric_value in (('precision: ', prec), ('accuracy: ', acc)):
    print(metric_label, metric_value)

precision:  0.5734939759036145
accuracy:  0.4791625904910976


In [17]:
# Share of each true class among the backtested rows; this is the base rate
# the precision figure should be compared with.
predictions["Target"].value_counts().div(len(predictions))

Target
1   0.54
0   0.46
Name: count, dtype: float64

In [18]:
# Feature importances of the model as it was last fitted, one value per
# training column in positional order (default integer index).
feature_imp = pd.Series(data=model.feature_importances_).sort_index()

In [19]:
# Predictor names as a Series, positionally aligned with the importances.
new_pred = pd.Series(data=new_predictors)

In [20]:
# Final row of the cleaned frame, kept as a one-row DataFrame.
new_data = sp500.iloc[-1:]

In [21]:
# Display the selected row before it is reduced to the predictor columns.
new_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomarrow,Target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-03-07 00:00:00-05:00,5132.38,5165.62,5128.21,5157.36,4137980000,5123.69,0,1.01,2.0,1.01,3.0,1.06,38.0,1.16,141.0,1.27,534.0


In [22]:
# Keep exactly the columns the model was trained on. Selecting by
# `new_predictors` replaces seven hard-coded `del` statements: it stays correct
# if the frame gains or loses other columns, and the cell can be re-run
# without raising KeyError.
new_data = new_data[new_predictors]

In [24]:
# Apply the same 0.6 probability cut-off that predict() used during the
# backtest. model.predict() would instead return the class with the highest
# averaged probability, so the precision measured at the 0.6 cut-off would
# not describe it.
# NOTE(review): new_data comes from the frame after dropna(), so this is
# presumably the last row whose outcome is already known - confirm.
res = (model.predict_proba(new_data)[:, 1] >= .6).astype(int)

In [25]:
# Display the predicted label array for the selected row.
res

array([0])

In [23]:
# Pair each predictor name with its importance under explicit column labels
# and let the notebook render the frame instead of print()-ing plain text with
# anonymous 0/1 column headers.
pd.concat([new_pred, feature_imp], axis=1, keys=["feature", "importance"])

                  0    1
0     Close_Ratio_2 0.16
1           Trend_2 0.01
2     Close_Ratio_5 0.15
3           Trend_5 0.03
4    Close_Ratio_60 0.15
5          Trend_60 0.06
6   Close_Ratio_250 0.14
7         Trend_250 0.08
8  Close_Ratio_1000 0.13
9        Trend_1000 0.08
