 Import Required Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima.arima.utils import ndiffs
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import joblib
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error

Load & Inspect Data

In [None]:
import pandas as pd

# Read the processed dataset; it is expected to carry a 'ticker' and a 'close' column.
df = pd.read_parquet("../data/processed/model_ready.parquet")

# Restrict the frame to one ticker and keep only its closing price.
# The copy makes `stock_data` independent of `df`.
ticker = "AAPL"
is_selected_ticker = df['ticker'] == ticker
stock_data = df.loc[is_selected_ticker, ['close']].copy()

print(f"First 5 rows for {ticker}:")
print(stock_data.head())

Exploratory Data Analysis (EDA)

 Plot Historical Prices

In [None]:
# pandas returns the Axes it drew on, so the y-label can be set on it directly.
price_ax = stock_data.plot(title=f"{ticker} Daily Closing Price", grid=True)
price_ax.set_ylabel("Price ($)")
plt.show()

Check for Stationarity

In [None]:
# Augmented Dickey-Fuller test on the raw closing prices.
# Element 1 of the returned tuple is the p-value.
close_prices = stock_data['close']
adf_result = adfuller(close_prices)
print(f"ADF p-value: {adf_result[1]:.4f}")

# Number of differences the ADF-based heuristic recommends for stationarity.
optimal_diff = ndiffs(close_prices, test='adf')
print(f"Recommended differencing (d): {optimal_diff}")

# d-th order differencing means applying the first difference d times.
# `DataFrame.diff(periods=d)` is NOT equivalent: it computes one lag-d
# difference (x_t - x_{t-d}), which is wrong for d >= 2 and yields an
# all-zero series for d == 0.
stock_diff = stock_data
for _ in range(optimal_diff):
    stock_diff = stock_diff.diff()
stock_diff = stock_diff.dropna()

stock_diff.plot(title=f"Differenced Series (d={optimal_diff})", grid=True)
plt.show()

ACF & PACF Analysis

In [None]:
# Draw both correlograms on one figure. Without an explicit `ax`, each
# statsmodels plot creates its own figure, and a trailing
# `plt.tight_layout()` would only adjust the last one.
fig, (acf_ax, pacf_ax) = plt.subplots(2, 1, figsize=(10, 8))
plot_acf(stock_diff['close'], lags=20, title="ACF Plot", ax=acf_ax)
plot_pacf(stock_diff['close'], lags=20, title="PACF Plot", ax=pacf_ax)
fig.tight_layout()
plt.show()

ARIMA Model Training

Auto-ARIMA for Parameter Selection

In [None]:
# `pm` and `ndiffs` are already imported in the notebook's import cell.

# Select d and the ARMA orders using ONLY the rows that will later form the
# training set. Selecting them on the full series lets the hold-out rows
# influence the model specification (look-ahead leakage) and makes the test
# metrics optimistic. The 0.8 fraction must match the train/test split used
# in the validation cell.
selection_end = int(0.8 * len(stock_data))
selection_series = stock_data['close'].iloc[:selection_end]

optimal_diff = ndiffs(selection_series, test='adf')
print(f"Optimal differencing (d): {optimal_diff}")

# Stepwise, non-seasonal search over p, q in [0, 3] with d fixed above.
model = pm.auto_arima(
    selection_series,
    seasonal=False,
    d=optimal_diff,
    start_p=1,
    start_q=1,
    max_p=3,
    max_q=3,
    trace=True,               # print every candidate model as it is tried
    suppress_warnings=True,
    error_action="ignore",    # skip candidate orders that fail to fit
    stepwise=True
)

print("\n=== Model Summary ===")
print(model.summary())

Train-Test Split & Validation

In [None]:
# Chronological split: the leading 80% of rows train the model, the rest is held out.
train_size = int(0.8 * len(stock_data))
train = stock_data.iloc[:train_size]
test = stock_data.iloc[train_size:]

# Re-estimate the already selected specification on the training rows.
model.fit(train['close'])

# One forecast step per held-out row, plus the interval bounds.
horizon = len(test)
forecast, conf_int = model.predict(n_periods=horizon, return_conf_int=True)
lower_bound = conf_int[:, 0]
upper_bound = conf_int[:, 1]

# Overlay training history, realised prices and the forecast with its band.
plt.plot(train.index, train['close'], label="Training Data")
plt.plot(test.index, test['close'], label="Actual Prices")
plt.plot(test.index, forecast, label="ARIMA Forecast", linestyle="--")
plt.fill_between(
    test.index, lower_bound, upper_bound,
    color='gray', alpha=0.2, label="95% Confidence",
)
plt.title(f"{ticker}: ARIMA Forecast vs Actual")
plt.legend()
plt.grid(True)
plt.show()

# A model with neither AR nor MA coefficients has no dynamics to forecast with.
has_arma_terms = model.arparams().size != 0 or model.maparams().size != 0
if has_arma_terms:
    print("\n✅ Model has AR/MA terms. Proceed to forecasting.")
else:
    print("\n⚠️ WARNING: Model has NO AR/MA terms! Try LSTM instead.")

# Standard residual diagnostic panel for the fitted model.
model.plot_diagnostics(figsize=(12, 8))
plt.suptitle("Model Diagnostics", y=1.02)
plt.tight_layout()
plt.show()

Model Evaluation

In [None]:
# Compare positionally on plain arrays. The forecast may carry its own index,
# and label-aligned arithmetic against `test` would then produce NaNs.
actual = test['close'].to_numpy()
predicted = np.asarray(forecast)

mae = mean_absolute_error(actual, predicted)
rmse = np.sqrt(mean_squared_error(actual, predicted))

# True MAPE: the mean of the per-observation absolute percentage errors.
# MAE / mean(actual) is a different metric (a weighted variant) and must not
# be reported under the MAPE label. Assumes prices are strictly positive.
mape = 100 * np.mean(np.abs((actual - predicted) / actual))

print("Performance Metrics:")
print(f"- MAE: ${mae:.2f}")
print(f"- RMSE: ${rmse:.2f}")
print(f"- MAPE: {mape:.2f}%")

 Model Diagnostics

Residual Analysis

In [None]:
from statsmodels.stats.diagnostic import acorr_ljungbox

# (p, d, q) of the fitted model.
p_order, d_order, q_order = model.order

# With d > 0 the first d in-sample residuals come from the state-space
# initialisation (they are roughly the raw price level, not a one-step
# error). Left in, they dominate autocorrelation and normality tests,
# so they are dropped here.
residuals = np.asarray(model.resid())[d_order:]

model.plot_diagnostics(figsize=(12, 8))
plt.tight_layout()
plt.show()

# Ljung-Box on fitted-model residuals: the estimated ARMA coefficients
# consume p + q degrees of freedom, which `model_df` accounts for.
lb_test = acorr_ljungbox(residuals, lags=[10], model_df=p_order + q_order)
print("Ljung-Box Test (H₀: residuals are random):")
print(f"P-value: {lb_test['lb_pvalue'].values[0]:.4f}")

Normality Check

In [None]:
from scipy.stats import normaltest

# Omnibus normality test on the model residuals; a small p-value rejects normality.
norm_test = normaltest(residuals)
normality_p_value = norm_test.pvalue

print("\nNormality Test (H₀: residuals are normal):")
print(f"P-value: {normality_p_value:.4f}")

Save Model for Production

In [None]:
# Persist the fitted estimator under a per-ticker file name.
# NOTE(review): `model` is whatever was fitted last in this kernel — confirm
# it was trained on the intended data before shipping the artefact.
model_dir = "../src/models"
os.makedirs(model_dir, exist_ok=True)

model_path = f"{model_dir}/arima_{ticker.lower()}.pkl"
joblib.dump(model, model_path)

print(f"\nModel saved to {model_path}")

Generate Final Forecast

In [None]:
from sklearn.base import clone

# `model` was last fitted on the training split only, so predicting from it
# yields values for the period right after the training rows, not for the
# days after the last observation. Refit an unfitted copy of the same
# specification on the full history; `model` itself is left untouched so the
# earlier evaluation cells stay reproducible.
final_model = clone(model).fit(stock_data['close'])

forecast, conf_int = final_model.predict(
    n_periods=7,
    return_conf_int=True,
    alpha=0.05  # 95% interval
)
conf_int = np.asarray(conf_int)

# Each forecast step is one trading session, so label the steps with
# business days rather than calendar days. Exchange holidays are not
# skipped. Assumes `stock_data` has a datetime index — TODO confirm.
last_observed = stock_data.index[-1]
forecast_dates = pd.bdate_range(start=last_observed + pd.offsets.BDay(1), periods=7)

print("\n=== 7-Day Forecast ===")
forecast_df = pd.DataFrame({
    "Date": forecast_dates,
    # Plain array: the forecast may carry its own index, which would
    # otherwise drive the frame's row labels.
    "Forecast": np.asarray(forecast),
    "Lower CI": conf_int[:, 0],
    "Upper CI": conf_int[:, 1]
})
print(forecast_df.round(2))

# Last 30 observations for context, followed by the forecast and its band.
recent_close = stock_data['close'].iloc[-30:]
plt.figure(figsize=(12, 5))
plt.plot(recent_close.index, recent_close, label="Historical")
plt.plot(forecast_df["Date"], forecast_df["Forecast"], label="Forecast", marker="o")
plt.fill_between(forecast_df["Date"], forecast_df["Lower CI"], forecast_df["Upper CI"], alpha=0.2)
plt.title(f"{ticker} Price Forecast")
plt.legend()
plt.grid(True)
plt.show()

ARIMA Modeling Outcome Report
🔴 ARIMA Model Failed
The ARIMA model did not produce meaningful forecasts for this stock price data.

❌ Why ARIMA Failed
No Significant Autocorrelation

The model (ARIMA(0,1,0)) found no AR (AutoRegressive) or MA (Moving Average) patterns to leverage.

This means:

Past price changes do not linearly predict future price changes; the best linear forecast of tomorrow's price is simply today's price.

The stock behaves like a random walk (today’s price ≈ yesterday’s price + noise).

Flat Forecasts

Predictions were identical for all future days because:

ARIMA defaulted to a naive model (Tomorrow = Today + Random Noise).

No drift (trend) term was selected, and seasonality was never modelled because the search was run with seasonal=False.

Data Characteristics

Stock prices may be:

Too volatile for linear models.

Influenced by external factors (news, events) that ARIMA can’t capture.