# Stock Price Prediction with Random Forest and Technical Indicators

## 1. Setup and Imports

In [1]:
# Install necessary packages (uncomment if not already installed)
%pip install pandas scikit-learn matplotlib numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import sys
import pathlib

# Add the 'Main' directory to the system path to import custom modules
main_path = pathlib.Path("..", "Main").resolve()
sys.path.append(str(main_path))

from data_collection import StockDataCollector
from data_cleaning import StockDataCleaner
from technical_indicators import TechnicalIndicators

## 2. Data Loading and Preparation

In [3]:
# 1. Collect data
collector = StockDataCollector(historical_data_path='../Datasets/Historical Data') # specify the path of dataset dir
collector.collect_data()

# 2. Clean data
cleaner = StockDataCleaner()
cleaned_data = cleaner.clean_all(collector)

# Using Apple data (or any other ticker available)
AAPL = cleaned_data["AAPL"].copy() # .copy() to avoid SettingWithCopyWarning

print(f"Apple Stock (first 5 rows):\n{AAPL.head()}\n")
print(f"Apple Stock (last 5 rows):\n{AAPL.tail()}\n")

Looking for data in: c:\Users\LENOVO\Documents\GISMA\Data Mining\Stock-Predictions\Datasets\Historical Data
Successfully loaded 5 tickers
Apple Stock (first 5 rows):
        Date   Close    Volume    Open    High       Low
0 2025-06-13  196.45  51447350  199.73  200.37  195.7000
1 2025-06-12  199.20  43904640  199.08  199.68  197.3601
2 2025-06-11  198.78  60989860  203.50  204.50  198.4100
3 2025-06-10  202.67  54672610  200.60  204.35  200.5700
4 2025-06-09  201.45  72862560  204.39  206.00  200.0200

Apple Stock (last 5 rows):
           Date   Close     Volume     Open     High      Low
2511 2015-06-19  31.650  217446120  31.9275  31.9550  31.6000
2512 2015-06-18  31.970  141455960  31.8075  32.0775  31.8050
2513 2015-06-17  31.825  131435600  31.9300  31.9700  31.6850
2514 2015-06-16  31.900  125774600  31.7575  31.9625  31.5925
2515 2015-06-15  31.730  175587840  31.5250  31.8100  31.4275



## 3. Feature Engineering (Lagged Prices & Technical Indicators)

In [4]:
# Create lagged closing price as a baseline and feature (avoiding data leakage)
AAPL['Previous_Close'] = AAPL['Close'].shift(1)

# Calculate all technical indicators
aapl_with_indicators = TechnicalIndicators.calculate_all_indicators(AAPL)

# Drop rows with NaN values introduced by shifting and indicator calculations
aapl_with_indicators.dropna(inplace=True)
# Temporarily shorten the data for faster testing
aapl_with_indicators = aapl_with_indicators.loc['2023-01-01':'2025-01-01'].copy()

print(f"AAPL data with indicators (first 5 rows after dropping NaNs):\n{aapl_with_indicators.head()}\n")
print(f"Columns available for features: {aapl_with_indicators.columns.tolist()}\n")

AAPL data with indicators (first 5 rows after dropping NaNs):
           Date    Close    Volume     Open     High      Low  Previous_Close  \
2024 2017-05-25  38.4675  76871240  38.4325  38.5875  38.2575         38.4025   
2025 2017-05-24  38.3350  76807280  38.4600  38.5425  38.1675         38.4675   

        SMA_50    SMA_200    EMA_12     EMA_26      MACD  MACD_Signal  \
2024  37.25670  40.537262  38.09554  37.653040  0.442499     0.312869   
2025  37.24145  40.501787  38.13238  37.703556  0.428824     0.336060   

      MACD_Hist        RSI   Stoch_%K   Stoch_%D   BB_Upper   BB_Lower  \
2024   0.129631  59.693137  84.335561  83.453602  39.700122  35.220878   
2025   0.092764  57.893092  80.400891  82.508374  39.772714  35.341036   

           ATR  
2024  0.663188  
2025  0.642603  

Columns available for features: ['Date', 'Close', 'Volume', 'Open', 'High', 'Low', 'Previous_Close', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26', 'MACD', 'MACD_Signal', 'MACD_Hist', 'RSI', 'Stoch_%K', 'S

## 4. Model Building and Walk-Forward Validation

In [5]:
# Define features and target
features = [
    'Previous_Close', 'Volume', 'Open', 'High', 'Low',
    'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26',
    'MACD', 'MACD_Signal', 'MACD_Hist',
    'RSI', 'Stoch_%K', 'Stoch_%D',
    'BB_Upper', 'BB_Lower', 'ATR'
]
target = 'Close'

# Ensure all features exist in the DataFrame
existing_features = [f for f in features if f in aapl_with_indicators.columns]
if len(existing_features) != len(features):
    print("Warning: Some specified features are not in the DataFrame and will be skipped.")
    features = existing_features

# Parameters for walk-forward validation
train_window_size = 60 # Approximately 3 months of trading days
test_window_size = 20  # Predict for the next 20 trading days (approx 1 month)

rf_predictions = []
baseline_predictions = []
actual_prices = []

daily_rf_mae = []
daily_baseline_mae = []

# Perform walk-forward validation
for i in range(train_window_size, len(aapl_with_indicators) - test_window_size):
    train_data = aapl_with_indicators.iloc[i - train_window_size : i]
    test_data = aapl_with_indicators.iloc[i : i + test_window_size]

    # Check if train_data has enough samples for fitting
    if len(train_data) < train_window_size:
        continue

    X_train = train_data[features]
    y_train = train_data[target]

    # Random Forest Model
    rf_model = RandomForestRegressor(n_estimators=20, random_state=42, n_jobs=-1) # n_jobs=-1 to use all cores
    rf_model.fit(X_train, y_train)

    for j in range(len(test_data)):
        current_test_features = test_data[features].iloc[[j]] # Need to pass as DataFrame
        current_actual = test_data[target].iloc[j]
        current_previous_close = test_data['Previous_Close'].iloc[j]

        # Get prediction from RF model
        rf_pred = rf_model.predict(current_test_features)[0]

        # Get prediction from baseline model (previous day's close)
        baseline_pred = current_previous_close

        rf_predictions.append(rf_pred)
        baseline_predictions.append(baseline_pred)
        actual_prices.append(current_actual)

        daily_rf_mae.append(mean_absolute_error([current_actual], [rf_pred]))
        daily_baseline_mae.append(mean_absolute_error([current_actual], [baseline_pred]))

## 5. Evaluation and Visualization

In [6]:
# Convert lists to Series for easier plotting and analysis
rf_preds_series = pd.Series(rf_predictions, index=aapl_with_indicators.index[train_window_size : len(aapl_with_indicators) - test_window_size + len(rf_predictions) - 1])
baseline_preds_series = pd.Series(baseline_predictions, index=aapl_with_indicators.index[train_window_size : len(aapl_with_indicators) - test_window_size + len(baseline_predictions) - 1])
actual_prices_series = pd.Series(actual_prices, index=aapl_with_indicators.index[train_window_size : len(aapl_with_indicators) - test_window_size + len(actual_prices) - 1])

daily_rf_mae_series = pd.Series(daily_rf_mae, index=actual_prices_series.index)
daily_baseline_mae_series = pd.Series(daily_baseline_mae, index=actual_prices_series.index)

In [7]:
# Calculate overall MAE and MSE
overall_rf_mae = mean_absolute_error(actual_prices, rf_predictions)
overall_rf_mse = mean_squared_error(actual_prices, rf_predictions)

overall_baseline_mae = mean_absolute_error(actual_prices, baseline_predictions)
overall_baseline_mse = mean_squared_error(actual_prices, baseline_predictions)

print(f"Overall Random Forest MAE: {overall_rf_mae:.3f}")
print(f"Overall Random Forest MSE: {overall_rf_mse:.3f}")
print(f"Overall Baseline (Previous Close) MAE: {overall_baseline_mae:.3f}")
print(f"Overall Baseline (Previous Close) MSE: {overall_baseline_mse:.3f}")

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
# Plotting predictions vs actuals
plt.figure(figsize=(15, 7))
plt.style.use('dark_background')
plt.plot(actual_prices_series.index, actual_prices_series, label='Actual Close Price', color='white', linewidth=2)
plt.plot(rf_preds_series.index, rf_preds_series, label='Random Forest Prediction', color='cyan', linestyle='--')
plt.plot(baseline_preds_series.index, baseline_preds_series, label='Baseline Prediction (Previous Close)', color='yellow', linestyle=':')
plt.title('AAPL Stock Price Prediction: Random Forest vs Baseline')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid(True, linestyle=':', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Plotting Daily MAE for comparison
plt.figure(figsize=(15, 6))
plt.style.use('dark_background')
plt.plot(daily_rf_mae_series.index, daily_rf_mae_series, label='Random Forest Daily MAE', color='cyan')
plt.plot(daily_baseline_mae_series.index, daily_baseline_mae_series, label='Baseline Daily MAE', color='yellow', linestyle='--')
plt.title('Daily Mean Absolute Error: Random Forest vs Baseline (AAPL)')
plt.xlabel('Date')
plt.ylabel('MAE')
plt.legend()
plt.grid(True, linestyle=':', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# (Optional) Feature Importance from Random Forest
if 'rf_model' in locals():
    importances = rf_model.feature_importances_
    feature_names = features # Use the list of features directly
    
    # Check if lengths match before zipping
    if len(importances) == len(feature_names):
        feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

        plt.figure(figsize=(12, 7))
        plt.style.use('dark_background')
        plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='skyblue')
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature')
        plt.title('Random Forest Feature Importance')
        plt.gca().invert_yaxis() # Invert y-axis to have the most important feature at the top
        plt.tight_layout()
        plt.show()
    else:
        print("Error: Mismatch between feature_importances_ and feature names length.")
