In [1]:
# Step 1: Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_squared_error

In [2]:


# Step 2: Select 5 balanced stocks based on beta (high and low risk mix)
selected_stocks = ['C', 'PYPL', 'V', 'SCHW', 'MA']  # 2 high beta, 2 low beta, 1 mid beta

# Step 3: Download historical stock prices
start_date = "2019-01-01"
end_date = "2024-01-01"
data = yf.download(selected_stocks, start=start_date, end=end_date)['Close']

# Step 4: Calculate daily returns
returns = data.pct_change().dropna()

# Step 5: Feature Engineering
features = pd.DataFrame(index=returns.index)
for stock in selected_stocks:
    df = pd.DataFrame()
    df["Return"] = returns[stock]
    df["MA10"] = returns[stock].rolling(window=10).mean()
    df["MA50"] = returns[stock].rolling(window=50).mean()
    df["Volatility"] = returns[stock].rolling(window=10).std()
    df["Momentum"] = returns[stock].rolling(window=10).apply(lambda x: x[-1] - x[0], raw=True)
    df.columns = [f"{stock}_{col}" for col in df.columns]
    features = features.join(df, how="outer")

# Step 6: Drop NA values
features = features.dropna()

# Step 7: Create labels (next-day return)
labels = pd.DataFrame(index=features.index)
for stock in selected_stocks:
    labels[stock] = returns[stock].shift(-1)
labels = labels.dropna()
features = features.loc[labels.index]  # Align features with labels

# Step 8: Normalize features
scaler = StandardScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index
)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  5 of 5 completed


In [3]:
# Step 9: Train-test split (80% train, 20% test)
split_index = int(len(features_scaled) * 0.8)
X_train = features_scaled.iloc[:split_index]
X_test = features_scaled.iloc[split_index:]
y_train = labels.iloc[:split_index]
y_test = labels.iloc[split_index:]

# CROSS-VALIDATION SETUP
tscv = TimeSeriesSplit(n_splits=5)
mse_scorer = make_scorer(mean_squared_error)


In [4]:
# MODEL IMPLEMENTATION WITH CV

# Linear Regression
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

print("\n" + "="*50)
print("LINEAR REGRESSION MODEL WITH CROSS-VALIDATION")
print("="*50 + "\n")

lr_models = {}
lr_cv_scores = {}

for stock in y_train.columns:
    print(f"\n🔍 Training Linear Regression for {stock}...")
    
    # OLS for interpretation
    X_const = sm.add_constant(X_train)
    ols_model = sm.OLS(y_train[stock], X_const).fit()
    print(ols_model.summary().tables[1])
    
    # Scikit-learn for CV
    lr = LinearRegression()
    cv_scores = cross_val_score(lr, X_train, y_train[stock], cv=tscv, scoring=mse_scorer)
    lr_cv_scores[stock] = cv_scores
    print(f"CV MSE Scores: {cv_scores}")
    print(f"Mean CV MSE: {cv_scores.mean():.6f} (±{cv_scores.std():.6f})")
    
    # Train final model
    lr.fit(X_train, y_train[stock])
    lr_models[stock] = lr

# Support Vector Machine
from sklearn.svm import SVR

print("\n" + "="*50)
print("SVM MODEL WITH CROSS-VALIDATION")
print("="*50 + "\n")

svm_models = {}
svm_cv_scores = {}
svm_params = {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale', 'epsilon': 0.001}

for stock in y_train.columns:
    print(f"\n🚀 Training SVM for {stock}...")
    svr = SVR(**svm_params)
    
    cv_scores = cross_val_score(svr, X_train, y_train[stock], cv=tscv, scoring=mse_scorer)
    svm_cv_scores[stock] = cv_scores
    print(f"CV MSE Scores: {cv_scores}")
    print(f"Mean CV MSE: {cv_scores.mean():.6f} (±{cv_scores.std():.6f})")
    
    svr.fit(X_train, y_train[stock])
    svm_models[stock] = svr

# Random Forest
from sklearn.ensemble import RandomForestRegressor

print("\n" + "="*50)
print("RANDOM FOREST MODEL WITH CROSS-VALIDATION")
print("="*50 + "\n")

rf_models = {}
rf_cv_scores = {}
rf_feature_importances = {}
rf_params = {'n_estimators': 100, 'max_depth': None, 'random_state': 42}

for stock in y_train.columns:
    print(f"\n🌲 Training Random Forest for {stock}...")
    rf = RandomForestRegressor(**rf_params)
    
    cv_scores = cross_val_score(rf, X_train, y_train[stock], cv=tscv, scoring=mse_scorer)
    rf_cv_scores[stock] = cv_scores
    print(f"CV MSE Scores: {cv_scores}")
    print(f"Mean CV MSE: {cv_scores.mean():.6f} (±{cv_scores.std():.6f})")
    
    rf.fit(X_train, y_train[stock])
    rf_models[stock] = rf
    rf_feature_importances[stock] = rf.feature_importances_

#Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

print("\n" + "="*50)
print("NEURAL NETWORK MODEL WITH CROSS-VALIDATION")
print("="*50 + "\n")

def build_nn():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

nn_models = {}
nn_cv_scores = {}

for stock in y_train.columns:
    print(f"\n🤖 Training Neural Network for {stock}...")

    cv_scores = []

    for train_idx, val_idx in tscv.split(X_train):
        X_train_fold = X_train.iloc[train_idx]
        y_train_fold = y_train[stock].iloc[train_idx]
        X_val_fold = X_train.iloc[val_idx]
        y_val_fold = y_train[stock].iloc[val_idx]

        model = build_nn()
        model.fit(X_train_fold, y_train_fold, epochs=50, batch_size=32, verbose=0)

        preds = model.predict(X_val_fold).flatten()
        mse = mean_squared_error(y_val_fold, preds)
        cv_scores.append(mse)

    nn_cv_scores[stock] = np.array(cv_scores)
    print(f"CV MSE Scores: {cv_scores}")
    print(f"Mean CV MSE: {np.mean(cv_scores):.6f} (±{np.std(cv_scores):.6f})")

    # Final model training on the full training set
    final_model = build_nn()
    final_model.fit(X_train, y_train[stock], epochs=50, batch_size=32, verbose=0)
    nn_models[stock] = final_model


LINEAR REGRESSION MODEL WITH CROSS-VALIDATION


🔍 Training Linear Regression for C...
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.0002      0.001     -0.210      0.834      -0.002       0.002
C_Return           -0.0008      0.002     -0.450      0.653      -0.004       0.003
C_MA10             -0.0004      0.002     -0.278      0.781      -0.003       0.003
C_MA50             -0.0023      0.002     -1.473      0.141      -0.005       0.001
C_Volatility        0.0022      0.002      1.231      0.219      -0.001       0.006
C_Momentum         -0.0009      0.002     -0.520      0.603      -0.004       0.002
PYPL_Return         0.0003      0.002      0.182      0.856      -0.003       0.004
PYPL_MA10          -0.0012      0.001     -0.963      0.336      -0.004       0.001
PYPL_MA50           0.0008      0.001      0.669      0.504      -0.002  

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
CV MSE Scores: [0.1297009202927026, 0.011101125113548676, 0.006697613057135908, 0.012254645560812431, 0.0045329003060640225]
Mean CV MSE: 0.032857 (±0.048504)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



🤖 Training Neural Network for PYPL...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step  


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
CV MSE Scores: [0.12129350807291184, 0.023767284871454183, 0.007837789862917674, 0.02648218486079274, 0.00973709937266055]
Mean CV MSE: 0.037824 (±0.042383)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



🤖 Training Neural Network for V...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
CV MSE Scores: [0.06036964688770604, 0.023763792781634683, 0.006764663298885939, 0.02034977030215495, 0.004819122487524663]
Mean CV MSE: 0.023213 (±0.019990)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



🤖 Training Neural Network for SCHW...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
CV MSE Scores: [0.1470744492052106, 0.023606658323960722, 0.009559084512035373, 0.025447583409563334, 0.004627330890492068]
Mean CV MSE: 0.042063 (±0.053108)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



🤖 Training Neural Network for MA...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
CV MSE Scores: [0.25857336755607446, 0.02371683321587096, 0.006518256364410199, 0.05705943029286727, 0.0054572693107067015]
Mean CV MSE: 0.070265 (±0.095984)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
# FINAL EVALUATION
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model_dict, model_name):
    print(f"\n⭐ {model_name.upper()} TEST SET PERFORMANCE ⭐")
    results = {}
    for stock in y_test.columns:
        preds = model_dict[stock].predict(X_test)
        mse = mean_squared_error(y_test[stock], preds)
        mae = mean_absolute_error(y_test[stock], preds)
        r2 = r2_score(y_test[stock], preds)
        results[stock] = {'MSE': mse, 'MAE': mae, 'R2': r2}
        print(f"{stock} - MSE: {mse:.6f}, MAE: {mae:.6f}, R²: {r2:.4f}")
    return results

# Evaluate all models
lr_results = evaluate_model(lr_models, "Linear Regression")
svm_results = evaluate_model(svm_models, "SVM")
rf_results = evaluate_model(rf_models, "Random Forest")
nn_results = evaluate_model(nn_models, "Neural Network")

#CV RESULTS SUMMARY
print("\n" + "="*50)
print("CROSS-VALIDATION RESULTS SUMMARY")
print("="*50)

def print_cv_summary(scores_dict, model_name):
    print(f"\n📊 {model_name} CV MSE:")
    for stock in scores_dict:
        mean_score = np.mean(scores_dict[stock])
        std_score = np.std(scores_dict[stock])
        print(f"{stock}: {mean_score:.6f} (±{std_score:.6f})")

print_cv_summary(lr_cv_scores, "Linear Regression")
print_cv_summary(svm_cv_scores, "SVM")
print_cv_summary(rf_cv_scores, "Random Forest")
print_cv_summary(nn_cv_scores, "Neural Network")


⭐ LINEAR REGRESSION TEST SET PERFORMANCE ⭐
C - MSE: 0.000282, MAE: 0.012648, R²: -0.0869
PYPL - MSE: 0.000563, MAE: 0.017610, R²: -0.0662
V - MSE: 0.000103, MAE: 0.007906, R²: -0.1222
SCHW - MSE: 0.000733, MAE: 0.018178, R²: -0.0219
MA - MSE: 0.000115, MAE: 0.008176, R²: -0.0678

⭐ SVM TEST SET PERFORMANCE ⭐
C - MSE: 0.000523, MAE: 0.017954, R²: -1.0156
PYPL - MSE: 0.001023, MAE: 0.024914, R²: -0.9376
V - MSE: 0.000278, MAE: 0.013058, R²: -2.0399
SCHW - MSE: 0.001028, MAE: 0.023296, R²: -0.4346
MA - MSE: 0.000382, MAE: 0.014815, R²: -2.5594

⭐ RANDOM FOREST TEST SET PERFORMANCE ⭐
C - MSE: 0.000526, MAE: 0.015552, R²: -1.0270
PYPL - MSE: 0.000708, MAE: 0.019608, R²: -0.3417
V - MSE: 0.000208, MAE: 0.010494, R²: -1.2734
SCHW - MSE: 0.000777, MAE: 0.018622, R²: -0.0844
MA - MSE: 0.000241, MAE: 0.011511, R²: -1.2500

⭐ NEURAL NETWORK TEST SET PERFORMANCE ⭐
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
C - MSE: 0.004089, MAE: 0.049376, R²: -14.7477
[1m8/8[0m [3