In [4]:
!pip install yfinance scikit-learn matplotlib pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# =========================
# Final, Robust ML Pipeline
# =========================

import numpy as np
import pandas as pd
import yfinance as yf

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ------------------
# 1) Download prices
# ------------------
# AAPL data
df = yf.download("AAPL", start="2015-01-01", auto_adjust=True)[['Close','Volume']].copy()

# -----------------------
# 2) Core feature set (AAPL)
# -----------------------
df['Return_5d']    = df['Close'].shift(-5) / df['Close'] - 1           # future 5-day return
df['Target']       = (df['Return_5d'] > 0).astype(int)                 # 1 if future 5-day return > 0
df['Momentum_5d']  = df['Close'].pct_change(5)                         # past 5-day momentum
df['Volume_Ratio'] = df['Volume'] / df['Volume'].rolling(10).mean()    # volume vs 10-day avg
df['Volatility']   = df['Close'].rolling(10).std()                     # 10-day std dev of close
df['MA50']         = df['Close'].rolling(50).mean()                    # 50-day moving average
df['MA200']        = df['Close'].rolling(200, min_periods=200).mean()  # 200-day moving average

# -------------------------------------------------
# 3) Market context: S&P 500 corr (30d) and beta (60d)
# -------------------------------------------------
spy = yf.download("^GSPC", start="2015-01-01", auto_adjust=True)[['Close']].rename(
    columns={'Close': 'SP500_Close'}
)

# Join on the index (dates). Keep only dates both series share.
df = df.join(spy, how='inner')

# Daily returns for AAPL and SP500
df['Ret']      = df['Close'].pct_change()
df['Mkt_Ret']  = df['SP500_Close'].pct_change()

# 30-day rolling correlation (needs at least 30 days)
df['SP500_corr_30d'] = df['Ret'].rolling(30, min_periods=30).corr(df['Mkt_Ret'])

# 60-day rolling beta = cov(AAPL, MKT) / var(MKT)
cov_60 = df['Ret'].rolling(60, min_periods=60).cov(df['Mkt_Ret'])
var_60 = df['Mkt_Ret'].rolling(60, min_periods=60).var()
df['SP500_beta_60d'] = cov_60 / var_60

# ------------------------------------------
# 4) Clean once, then compute binary flags
# ------------------------------------------
# Drop warm-up NaNs from rolling/shift/corr/beta
df = df.dropna().reset_index(drop=True)

# Row-wise comparison as 1-D arrays (prevents alignment/broadcast errors)
df['Above_MA200'] = (df['Close'].to_numpy().ravel() > df['MA200'].to_numpy().ravel()).astype(np.int8)

# Quick preview
print("\nData preview:")
print(df[['Close','MA50','MA200','Above_MA200','SP500_corr_30d','SP500_beta_60d']].head().to_string(index=False))


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
Data preview:
    Close      MA50     MA200 Above_MA200 SP500_corr_30d SP500_beta_60d
     AAPL                                                              
24.953526 25.247378 27.111634           0       0.559378       1.109902
25.108587 25.230344 27.115872           0       0.544503       1.117540
25.567030 25.203601 27.125819           0       0.443345       1.112118
25.564774 25.204815 27.135744           0       0.396032       1.118124
25.955799 25.205983 27.145971           0       0.426827       1.121300



In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd


features = ['Momentum_5d', 'Volume_Ratio', 'Volatility', 'MA50','MA200','Above_MA200']
X = df[features].values
y = df['Target'].values


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


tscv = TimeSeriesSplit(n_splits=3)   # fewer splits = faster


param_dist = {
    'n_estimators':      [50, 100, 200],     # smaller forests first (faster)
    'max_depth':         [None, 6, 10], 
    'min_samples_split': [2, 5], 
    'min_samples_leaf':  [1, 2],
    'max_features':      ['sqrt', 'log2', None]
}


scoring = {
    'accuracy':  'accuracy',
    'precision': make_scorer(precision_score, pos_label=1),
    'recall':    make_scorer(recall_score,    pos_label=1),
    'f1':        make_scorer(f1_score,        pos_label=1),
}


base_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

rand_search = RandomizedSearchCV(
    estimator=base_rf,
    param_distributions=param_dist,
    n_iter=20,                 # only test 20 random combos
    scoring=scoring,
    refit='f1',                 # pick best by F1
    cv=tscv,
    n_jobs=-1,
    verbose=1,
    random_state=42
)


rand_search.fit(X_train, y_train)

print("Best params (by F1):", rand_search.best_params_)
print("Best CV F1:", rand_search.best_score_)


best_model = rand_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=3))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params (by F1): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 6}
Best CV F1: 0.5167490970884846
Test Accuracy: 0.5441767068273092
Classification Report:
               precision    recall  f1-score   support

           0      0.510     0.231     0.318       229
           1      0.553     0.810     0.658       269

    accuracy                          0.544       498
   macro avg      0.531     0.521     0.488       498
weighted avg      0.533     0.544     0.502       498



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=347709d5-cded-40f4-823a-1111eb8b4b8b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>