In [1]:
import yfinance as yf
import pandas as pd
from sklearn.metrics import precision_score, accuracy_score
from sklearn.metrics import precision_score, accuracy_score

In [2]:
# Render floats with two decimals in every frame/series displayed below.
pd.set_option("display.float_format", '{:.2f}'.format)
# Fetch the "^GSPC" ticker's price history for the longest period yfinance
# offers; the name `sp500` is bound straight to the resulting history frame.
sp500 = yf.Ticker("^GSPC").history(period="max")
# Optional local snapshot of the download:
# sp500.to_csv("sp500.csv")

In [3]:
# Remove the two columns that none of the cells below read.
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"])

In [4]:
# Close of the following row, aligned onto the current row; the final row has
# no successor and therefore receives NaN.
sp500["Tomarrow"] = sp500["Close"].shift(periods=-1)

In [5]:
# Binary label: 1 when the next close is strictly above the current close,
# otherwise 0. A NaN next-close compares False, so such a row is labelled 0.
sp500["Target"] = sp500["Tomarrow"].gt(sp500["Close"]).astype(int)

In [6]:
# Keep only the rows labelled "1990-1-1" or later; .copy() makes the result an
# independent frame so later column assignments do not target a slice.
sp500 = sp500.loc["1990-1-1":, :].copy()

In [7]:
import sklearn
from sklearn.ensemble import RandomForestClassifier

# Simple positional hold-out: the final 100 rows form the test set and every
# row before them forms the training set.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

In [8]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk-forward evaluation of `model` over the rows of `data`.

    Beginning at row position `start`, the frame is cut into consecutive
    windows of `step` rows (the last window may be shorter). For each window,
    all rows positioned before it are passed as the training frame and the
    window itself as the test frame to `predict(train, test, predictors, model)`.

    Args:
        data: DataFrame to evaluate on; rows are consumed in positional order.
        model: Estimator object forwarded unchanged to `predict`.
        predictors: Column names forwarded unchanged to `predict`.
        start: Number of leading rows reserved as the first training set.
        step: Number of rows in each test window; must be at least 1.

    Returns:
        The per-window results of `predict`, concatenated in window order.

    Raises:
        ValueError: If `step` is smaller than 1, or if `data` has no rows at
            or beyond position `start` (no test window could be formed).
    """
    if step < 1:
        raise ValueError(f"backtest step must be at least 1, got {step}")

    n_rows = data.shape[0]
    if n_rows <= start:
        # Without this guard the loop below never runs and pd.concat fails
        # with an unhelpful "No objects to concatenate".
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test "
            f"window, but data has only {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        # Copies keep each fold independent of the caller's frame.
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    return pd.concat(all_predictions)

In [9]:
# Rolling-window lengths, measured in rows of `sp500`.
horizons = [2,5,60,250,1000]
new_predictors = []

for horizon in horizons:
    # Current close relative to its own rolling mean over the last `horizon`
    # rows (current row included). Only the "Close" column is rolled, instead
    # of the whole frame, which gains two columns on every iteration.
    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    # Sum of "Target" over the `horizon` rows before the current one;
    # shift(1) excludes the current row's own label from the window.
    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [10]:
# Discard every row that has a NaN in any column (for example rows whose
# rolling windows are not yet full, or a row without a next-row close).
sp500 = sp500.dropna(axis="index", how="any")

In [11]:
# Random forest with 200 trees; a node needs at least 50 samples before it may
# be split, and the seed is fixed so repeated runs use the same random state.
model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=50,
    random_state=1,
)

In [12]:
def predict(train, test, new_predictors, model, threshold=0.6):
    """Fit `model` on `train` and return thresholded predictions for `test`.

    Args:
        train: DataFrame holding the `new_predictors` columns and a "Target"
            column; used to fit the model.
        test: DataFrame holding the same columns; rows to predict.
        new_predictors: Column names used as model features.
        model: Object exposing `fit(X, y)` and `predict_proba(X)`. It is
            re-fitted on every call, so the caller's object is modified.
        threshold: Minimum value in column 1 of `predict_proba` for a row to
            be predicted as 1. Defaults to 0.6.

    Returns:
        DataFrame indexed like `test` with two columns: the original "Target"
        and "Predictions" (1.0 where the probability is >= `threshold`,
        otherwise 0.0).
    """
    model.fit(train[new_predictors], train["Target"])
    # Column 1 of predict_proba is read as the probability of the positive
    # class; this presumes the training labels contained both classes.
    proba = model.predict_proba(test[new_predictors])[:, 1]
    # One vectorised comparison; cast to float so the column stays 0.0/1.0.
    preds = pd.Series(
        (proba >= threshold).astype(float),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], preds], axis=1)

In [13]:
# Run the walk-forward backtest over the engineered features, using the
# default start and step of `backtest`.
predictions = backtest(data=sp500, model=model, predictors=new_predictors)

In [14]:
# How many rows were predicted as each class.
predictions.loc[:, "Predictions"].value_counts()

Predictions
0.00    4279
1.00     830
Name: count, dtype: int64

In [15]:
# Precision of the predicted 1s: among rows predicted 1, the share whose
# Target is actually 1.
precision_score(y_true=predictions["Target"], y_pred=predictions["Predictions"])

0.5734939759036145

In [16]:
# Share of each actual class among the backtested rows; the share of 1s is the
# baseline that the precision above can be compared against.
predictions["Target"].value_counts() / len(predictions)

Target
1   0.54
0   0.46
Name: count, dtype: float64

In [22]:
# Feature importances of `model` as of its most recent fit, in a Series with a
# default positional index sorted ascending.
feature_imp = pd.Series(model.feature_importances_).sort_index()

In [18]:
# Predictor names as a Series with a default positional index.
new_pred = pd.Series(data=new_predictors)

In [24]:
# Place names and importances side by side; the two Series are joined on
# their shared positional index.
print(pd.concat([new_pred, feature_imp], axis="columns"))

                  0    1
0     Close_Ratio_2 0.16
1           Trend_2 0.01
2     Close_Ratio_5 0.15
3           Trend_5 0.03
4    Close_Ratio_60 0.15
5          Trend_60 0.06
6   Close_Ratio_250 0.14
7         Trend_250 0.08
8  Close_Ratio_1000 0.13
9        Trend_1000 0.08
