In [13]:
%pip install yfinance pandas plotly statsmodels numpy scikit-learn nbformat


You should consider upgrading via the '/usr/local/bin/python3.10 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
import yfinance as yf
import pandas as pd

def get_stock_data(symbol: str, start="2020-01-01", end=None):
    """Fetch daily stock data from Yahoo Finance."""
    if end is None:
        end = pd.Timestamp.today().strftime('%Y-%m-%d')
    
    print(f"Fetching {symbol}...")
    df = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=False)
    
    if df.empty:
        print(f"⚠️ Error fetching {symbol}: No data returned")
        return pd.DataFrame()
    
    # Flatten column names if they're multi-level
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    
    print(f"✓ Fetched {len(df)} days of data for {symbol}")
    return df

# Fetch stock data (no throttling needed with yfinance!)
aapl = get_stock_data("AAPL")
nvda = get_stock_data("NVDA")
lyft = get_stock_data("LYFT")

print("\nAAPL Sample:")
print(aapl.head())
print("\nColumns:", aapl.columns.tolist())

Fetching AAPL...
✓ Fetched 1453 days of data for AAPL
Fetching NVDA...
✓ Fetched 1453 days of data for NVDA
Fetching LYFT...
✓ Fetched 1453 days of data for LYFT

AAPL Sample:
Price       Adj Close      Close       High        Low       Open     Volume
Date                                                                        
2020-01-02  72.538506  75.087502  75.150002  73.797501  74.059998  135480400
2020-01-03  71.833282  74.357498  75.144997  74.125000  74.287498  146322800
2020-01-06  72.405685  74.949997  74.989998  73.187500  73.447502  118387200
2020-01-07  72.065132  74.597504  75.224998  74.370003  74.959999  108872000
2020-01-08  73.224411  75.797501  76.110001  74.290001  74.290001  132079200

Columns: ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']


In [15]:
def add_ema_dema(df, span=20):
    """Add EMA and DEMA columns to the DataFrame."""
    df[f"EMA_{span}"] = df["Close"].ewm(span=span, adjust=False).mean()
    ema = df[f"EMA_{span}"]
    df[f"DEMA_{span}"] = 2*ema - ema.ewm(span=span, adjust=False).mean()
    return df

# Apply to all stocks
aapl = add_ema_dema(aapl, 20)
nvda = add_ema_dema(nvda, 20)
lyft = add_ema_dema(lyft, 20)


In [16]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_stock(df, symbol, span=20):
    fig = go.Figure()
    
    # Add traces
    fig.add_trace(go.Scatter(x=df.index, y=df["Close"], 
                             name="Close Price", 
                             line=dict(color='blue', width=2)))
    fig.add_trace(go.Scatter(x=df.index, y=df[f"EMA_{span}"], 
                             name=f"EMA {span}", 
                             line=dict(color='orange', dash='dash')))
    fig.add_trace(go.Scatter(x=df.index, y=df[f"DEMA_{span}"], 
                             name=f"DEMA {span}", 
                             line=dict(color='green', dash='dot')))
    
    # Update layout with range slider
    fig.update_layout(
        title=f"{symbol} Stock with EMA & DEMA",
        xaxis_title="Date",
        yaxis_title="Price ($)",
        hovermode='x unified',
        height=600,
        xaxis=dict(
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1d", step="day", stepmode="backward"),
                    dict(count=7, label="1w", step="day", stepmode="backward"),
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=3, label="3m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(count=1, label="1y", step="year", stepmode="backward"),
                    dict(count=5, label="5y", step="year", stepmode="backward"),
                    dict(step="all", label="All")
                ])
            ),
            rangeslider=dict(visible=True),
            type="date"
        )
    )
    
    fig.show()

plot_stock(aapl, "AAPL")
plot_stock(nvda, "NVDA")
plot_stock(lyft, "LYFT")

In [17]:
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def decompose_series(df, symbol):
    decomposition = seasonal_decompose(df["Adj Close"], model="multiplicative", period=252)
    
    # Create subplots with shared x-axis
    fig = make_subplots(
        rows=4, cols=1,
        subplot_titles=('Observed', 'Trend', 'Seasonal', 'Residual'),
        vertical_spacing=0.08,
        shared_xaxes=True
    )
    
    # Add traces for each component
    fig.add_trace(go.Scatter(x=df.index, y=decomposition.observed, 
                             name='Observed', line=dict(color='blue', width=2)),
                  row=1, col=1)
    fig.add_trace(go.Scatter(x=df.index, y=decomposition.trend, 
                             name='Trend', line=dict(color='orange', width=2)),
                  row=2, col=1)
    fig.add_trace(go.Scatter(x=df.index, y=decomposition.seasonal, 
                             name='Seasonal', line=dict(color='green', width=2)),
                  row=3, col=1)
    fig.add_trace(go.Scatter(x=df.index, y=decomposition.resid, 
                             name='Residual', line=dict(color='red', width=1)),
                  row=4, col=1)
    
    # Update layout
    fig.update_layout(
        title_text=f"{symbol} - Time Series Decomposition",
        height=1000,
        showlegend=False,
        hovermode='x unified',
        template='plotly_white'
    )
    
    # Add y-axis labels
    fig.update_yaxes(title_text="Price ($)", row=1, col=1)
    fig.update_yaxes(title_text="Multiplier", row=2, col=1)
    fig.update_yaxes(title_text="Multiplier", row=3, col=1)
    fig.update_yaxes(title_text="Multiplier", row=4, col=1)
    
    # Add x-axis label to bottom
    fig.update_xaxes(title_text="Date", row=4, col=1)
    
    # Add range selector to top (row 1) and range slider to bottom (row 4)
    fig.update_xaxes(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1d", step="day", stepmode="backward"),
                dict(count=7, label="1w", step="day", stepmode="backward"),
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=3, label="3m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(count=5, label="5y", step="year", stepmode="backward"),
                dict(step="all", label="All")
            ]),
            bgcolor="lightgray",
            activecolor="gray",
            y=1.15,
            yanchor="top"
        ),
        type="date",
        row=1, col=1
    )
    
    fig.update_xaxes(
        rangeslider=dict(visible=True),
        type="date",
        row=4, col=1
    )
    
    fig.show()

decompose_series(aapl, "AAPL")
decompose_series(nvda, "NVDA")
decompose_series(lyft, "LYFT")

In [18]:
# Step 4: Classical Time Series Models (MA, AR, ARIMA)

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import plotly.graph_objects as go
import numpy as np

# Function to prepare train/test split
def prepare_data(df, train_ratio=0.8):
    data = df["Adj Close"].dropna()
    train_size = int(len(data) * train_ratio)
    train, test = data[:train_size], data[train_size:]
    return train, test

# Prepare data for all stocks
aapl_train, aapl_test = prepare_data(aapl)
nvda_train, nvda_test = prepare_data(nvda)
lyft_train, lyft_test = prepare_data(lyft)

print(f"AAPL - Training: {len(aapl_train)} days, Test: {len(aapl_test)} days")
print(f"NVDA - Training: {len(nvda_train)} days, Test: {len(nvda_test)} days")
print(f"LYFT - Training: {len(lyft_train)} days, Test: {len(lyft_test)} days")

AAPL - Training: 1162 days, Test: 291 days
NVDA - Training: 1162 days, Test: 291 days
LYFT - Training: 1162 days, Test: 291 days


In [19]:
# Moving Average (MA) Model - MA(q)
# MA models use past forecast errors in a regression-like model

from sklearn.metrics import mean_squared_error, mean_absolute_error

def fit_ma_model(train, test, q=20):
    """Fit MA model and return forecasts and metrics"""
    model = ARIMA(train, order=(0, 0, q))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    
    return fitted, forecast, rmse, mae

# Fit MA(20) for all stocks
print("="*60)
print("MOVING AVERAGE (MA) MODEL - MA(20)")
print("="*60)

aapl_ma_fitted, aapl_ma_forecast, aapl_ma_rmse, aapl_ma_mae = fit_ma_model(aapl_train, aapl_test)
print(f"\nAAPL MA(20) - RMSE: ${aapl_ma_rmse:.2f}, MAE: ${aapl_ma_mae:.2f}")

nvda_ma_fitted, nvda_ma_forecast, nvda_ma_rmse, nvda_ma_mae = fit_ma_model(nvda_train, nvda_test)
print(f"NVDA MA(20) - RMSE: ${nvda_ma_rmse:.2f}, MAE: ${nvda_ma_mae:.2f}")

lyft_ma_fitted, lyft_ma_forecast, lyft_ma_rmse, lyft_ma_mae = fit_ma_model(lyft_train, lyft_test)
print(f"LYFT MA(20) - RMSE: ${lyft_ma_rmse:.2f}, MAE: ${lyft_ma_mae:.2f}")

MOVING AVERAGE (MA) MODEL - MA(20)



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provi


AAPL MA(20) - RMSE: $79.67, MAE: $77.42



Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-invertible starting MA parameters found. Using zeros as starting parameters.



NVDA MA(20) - RMSE: $111.28, MAE: $108.13



Maximum Likelihood optimization failed to converge. Check mle_retvals



LYFT MA(20) - RMSE: $83.13, MAE: $30.04



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



In [20]:
# Autoregressive (AR) Model - AR(p)
# AR models use past values of the series itself to predict future values

def fit_ar_model(train, test, p=20):
    """Fit AR model and return forecasts and metrics"""
    model = ARIMA(train, order=(p, 0, 0))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    
    return fitted, forecast, rmse, mae

# Fit AR(20) for all stocks
print("="*60)
print("AUTOREGRESSIVE (AR) MODEL - AR(20)")
print("="*60)

aapl_ar_fitted, aapl_ar_forecast, aapl_ar_rmse, aapl_ar_mae = fit_ar_model(aapl_train, aapl_test)
print(f"\nAAPL AR(20) - RMSE: ${aapl_ar_rmse:.2f}, MAE: ${aapl_ar_mae:.2f}")

nvda_ar_fitted, nvda_ar_forecast, nvda_ar_rmse, nvda_ar_mae = fit_ar_model(nvda_train, nvda_test)
print(f"NVDA AR(20) - RMSE: ${nvda_ar_rmse:.2f}, MAE: ${nvda_ar_mae:.2f}")

lyft_ar_fitted, lyft_ar_forecast, lyft_ar_rmse, lyft_ar_mae = fit_ar_model(lyft_train, lyft_test)
print(f"LYFT AR(20) - RMSE: ${lyft_ar_rmse:.2f}, MAE: ${lyft_ar_mae:.2f}")


# ARIMA Model - ARIMA(p, d, q)
# Combines AR and MA with differencing (d) for non-stationary data

def fit_arima_model(train, test, p=20, d=1, q=20):
    """Fit ARIMA model and return forecasts and metrics"""
    model = ARIMA(train, order=(p, d, q))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    
    return fitted, forecast, rmse, mae

# Fit ARIMA(20,1,20) for all stocks
print("\n" + "="*60)
print("ARIMA MODEL - ARIMA(20,1,20)")
print("="*60)

aapl_arima_fitted, aapl_arima_forecast, aapl_arima_rmse, aapl_arima_mae = fit_arima_model(aapl_train, aapl_test)
print(f"\nAAPL ARIMA(20,1,20) - RMSE: ${aapl_arima_rmse:.2f}, MAE: ${aapl_arima_mae:.2f}")

nvda_arima_fitted, nvda_arima_forecast, nvda_arima_rmse, nvda_arima_mae = fit_arima_model(nvda_train, nvda_test)
print(f"NVDA ARIMA(20,1,20) - RMSE: ${nvda_arima_rmse:.2f}, MAE: ${nvda_arima_mae:.2f}")

lyft_arima_fitted, lyft_arima_forecast, lyft_arima_rmse, lyft_arima_mae = fit_arima_model(lyft_train, lyft_test)
print(f"LYFT ARIMA(20,1,20) - RMSE: ${lyft_arima_rmse:.2f}, MAE: ${lyft_arima_mae:.2f}")


# Compare all models for all stocks
print("\n" + "="*60)
print("MODEL COMPARISON - ALL STOCKS")
print("="*60)

for symbol, ma_rmse, ma_mae, ar_rmse, ar_mae, arima_rmse, arima_mae in [
    ("AAPL", aapl_ma_rmse, aapl_ma_mae, aapl_ar_rmse, aapl_ar_mae, aapl_arima_rmse, aapl_arima_mae),
    ("NVDA", nvda_ma_rmse, nvda_ma_mae, nvda_ar_rmse, nvda_ar_mae, nvda_arima_rmse, nvda_arima_mae),
    ("LYFT", lyft_ma_rmse, lyft_ma_mae, lyft_ar_rmse, lyft_ar_mae, lyft_arima_rmse, lyft_arima_mae)
]:
    print(f"\n{symbol}:")
    print(f"  MA(20)         - RMSE: ${ma_rmse:.2f}, MAE: ${ma_mae:.2f}")
    print(f"  AR(20)         - RMSE: ${ar_rmse:.2f}, MAE: ${ar_mae:.2f}")
    print(f"  ARIMA(20,1,20) - RMSE: ${arima_rmse:.2f}, MAE: ${arima_mae:.2f}")


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



AUTOREGRESSIVE (AR) MODEL - AR(20)



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.




AAPL AR(20) - RMSE: $23.63, MAE: $18.15



Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



NVDA AR(20) - RMSE: $39.94, MAE: $30.50



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.



LYFT AR(20) - RMSE: $3.31, MAE: $2.75

ARIMA MODEL - ARIMA(20,1,20)



Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.




AAPL ARIMA(20,1,20) - RMSE: $17.27, MAE: $14.23



Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.



NVDA ARIMA(20,1,20) - RMSE: $30.91, MAE: $23.99



Maximum Likelihood optimization failed to converge. Check mle_retvals



LYFT ARIMA(20,1,20) - RMSE: $5.18, MAE: $4.39

MODEL COMPARISON - ALL STOCKS

AAPL:
  MA(20)         - RMSE: $79.67, MAE: $77.42
  AR(20)         - RMSE: $23.63, MAE: $18.15
  ARIMA(20,1,20) - RMSE: $17.27, MAE: $14.23

NVDA:
  MA(20)         - RMSE: $111.28, MAE: $108.13
  AR(20)         - RMSE: $39.94, MAE: $30.50
  ARIMA(20,1,20) - RMSE: $30.91, MAE: $23.99

LYFT:
  MA(20)         - RMSE: $83.13, MAE: $30.04
  AR(20)         - RMSE: $3.31, MAE: $2.75
  ARIMA(20,1,20) - RMSE: $5.18, MAE: $4.39



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



In [21]:
# Visualize Model Predictions vs Actual

def plot_forecasts(train, test, ma_forecast, ar_forecast, arima_forecast, symbol):
    fig = go.Figure()
    
    # Training data
    fig.add_trace(go.Scatter(x=train.index, y=train, 
                             name='Training Data', 
                             line=dict(color='blue', width=2)))
    
    # Actual test data
    fig.add_trace(go.Scatter(x=test.index, y=test, 
                             name='Actual Test Data', 
                             line=dict(color='black', width=2)))
    
    # MA forecast
    fig.add_trace(go.Scatter(x=ma_forecast.index, y=ma_forecast, 
                             name='MA(20) Forecast', 
                             line=dict(color='red', dash='dash')))
    
    # AR forecast
    fig.add_trace(go.Scatter(x=ar_forecast.index, y=ar_forecast, 
                             name='AR(20) Forecast', 
                             line=dict(color='orange', dash='dash')))
    
    # ARIMA forecast
    fig.add_trace(go.Scatter(x=arima_forecast.index, y=arima_forecast, 
                             name='ARIMA(20,1,20) Forecast', 
                             line=dict(color='green', dash='dash')))
    
    # Update layout
    fig.update_layout(
        title=f"{symbol} - Model Forecasts Comparison (20 Lags)",
        xaxis_title="Date",
        yaxis_title="Price ($)",
        hovermode='x unified',
        height=600,
        template='plotly_white',
        xaxis=dict(
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=3, label="3m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(count=1, label="1y", step="year", stepmode="backward"),
                    dict(step="all", label="All")
                ]),
                bgcolor="lightgray",
                activecolor="gray"
            ),
            rangeslider=dict(visible=True),
            type="date"
        )
    )
    
    fig.show()

# Plot for all stocks
plot_forecasts(aapl_train, aapl_test, aapl_ma_forecast, aapl_ar_forecast, aapl_arima_forecast, "AAPL")
plot_forecasts(nvda_train, nvda_test, nvda_ma_forecast, nvda_ar_forecast, nvda_arima_forecast, "NVDA")
plot_forecasts(lyft_train, lyft_test, lyft_ma_forecast, lyft_ar_forecast, lyft_arima_forecast, "LYFT")

In [22]:
# Stationarity Diagnostic Tests
# ACF, PACF, and Augmented Dickey-Fuller Tests

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def perform_stationarity_tests(data, symbol):
    """
    Perform comprehensive stationarity tests including:
    - Augmented Dickey-Fuller Test
    - ACF and PACF analysis
    - Visual inspection of plots
    """
    print(f"\n{'='*80}")
    print(f"STATIONARITY DIAGNOSTIC TESTS FOR {symbol}")
    print(f"{'='*80}")
    
    # 1. Augmented Dickey-Fuller Test
    print("\n1. AUGMENTED DICKEY-FULLER TEST:")
    print("-" * 50)
    
    adf_result = adfuller(data.dropna())
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]
    
    print(f"ADF Statistic: {adf_statistic:.6f}")
    print(f"p-value: {p_value:.6f}")
    print("Critical Values:")
    for key, value in critical_values.items():
        print(f"  {key}: {value:.6f}")
    
    # Interpretation
    if p_value <= 0.05:
        print("✓ Result: Series is STATIONARY (reject null hypothesis)")
    else:
        print("✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)")
    
    # 2. ACF and PACF Analysis
    print(f"\n2. AUTOCORRELATION ANALYSIS:")
    print("-" * 50)
    
    # Calculate ACF and PACF
    acf_values = acf(data.dropna(), nlags=40, fft=True)
    pacf_values = pacf(data.dropna(), nlags=40, method='ols')
    
    # Check for significant autocorrelations
    significant_acf = np.sum(np.abs(acf_values[1:]) > 0.1)  # Beyond lag 0
    significant_pacf = np.sum(np.abs(pacf_values[1:]) > 0.1)  # Beyond lag 0
    
    print(f"Significant ACF lags (|correlation| > 0.1): {significant_acf}")
    print(f"Significant PACF lags (|correlation| > 0.1): {significant_pacf}")
    
    if significant_acf > 5:
        print("→ High autocorrelation suggests non-stationarity")
    else:
        print("→ Low autocorrelation suggests stationarity")
    
    return {
        'adf_statistic': adf_statistic,
        'p_value': p_value,
        'critical_values': critical_values,
        'is_stationary': p_value <= 0.05,
        'acf_values': acf_values,
        'pacf_values': pacf_values,
        'significant_acf': significant_acf,
        'significant_pacf': significant_pacf
    }

def plot_stationarity_diagnostics(data, symbol, test_results):
    """
    Create comprehensive stationarity diagnostic plots
    """
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=[
            f'{symbol} - Time Series',
            f'{symbol} - ACF Plot',
            f'{symbol} - PACF Plot', 
            f'{symbol} - Rolling Mean & Std',
            f'{symbol} - First Differences',
            f'{symbol} - ACF of First Differences'
        ],
        vertical_spacing=0.08,
        horizontal_spacing=0.1
    )
    
    # 1. Original Time Series
    fig.add_trace(
        go.Scatter(x=data.index, y=data, name='Original Series', 
                  line=dict(color='blue', width=2)),
        row=1, col=1
    )
    
    # 2. ACF Plot
    lags = range(len(test_results['acf_values']))
    fig.add_trace(
        go.Bar(x=list(lags), y=test_results['acf_values'], 
               name='ACF', marker_color='lightblue'),
        row=1, col=2
    )
    
    # Add confidence intervals for ACF
    n = len(data.dropna())
    conf_int = 1.96 / np.sqrt(n)
    fig.add_hline(y=conf_int, line_dash="dash", line_color="red", 
                  annotation_text="95% CI", row=1, col=2)
    fig.add_hline(y=-conf_int, line_dash="dash", line_color="red", row=1, col=2)
    
    # 3. PACF Plot
    fig.add_trace(
        go.Bar(x=list(lags), y=test_results['pacf_values'], 
               name='PACF', marker_color='lightgreen'),
        row=2, col=1
    )
    
    # Add confidence intervals for PACF
    fig.add_hline(y=conf_int, line_dash="dash", line_color="red", 
                  annotation_text="95% CI", row=2, col=1)
    fig.add_hline(y=-conf_int, line_dash="dash", line_color="red", row=2, col=1)
    
    # 4. Rolling Statistics
    rolling_mean = data.rolling(window=30).mean()
    rolling_std = data.rolling(window=30).std()
    
    fig.add_trace(
        go.Scatter(x=data.index, y=data, name='Original', 
                  line=dict(color='blue', width=1)),
        row=2, col=2
    )
    fig.add_trace(
        go.Scatter(x=rolling_mean.index, y=rolling_mean, name='Rolling Mean', 
                  line=dict(color='red', width=2)),
        row=2, col=2
    )
    fig.add_trace(
        go.Scatter(x=rolling_std.index, y=rolling_std, name='Rolling Std', 
                  line=dict(color='green', width=2)),
        row=2, col=2
    )
    
    # 5. First Differences
    first_diff = data.diff().dropna()
    fig.add_trace(
        go.Scatter(x=first_diff.index, y=first_diff, name='First Differences', 
                  line=dict(color='purple', width=2)),
        row=3, col=1
    )
    
    # 6. ACF of First Differences
    acf_diff = acf(first_diff, nlags=40, fft=True)
    fig.add_trace(
        go.Bar(x=list(range(len(acf_diff))), y=acf_diff, 
               name='ACF of Differences', marker_color='orange'),
        row=3, col=2
    )
    
    # Add confidence intervals for ACF of differences
    n_diff = len(first_diff)
    conf_int_diff = 1.96 / np.sqrt(n_diff)
    fig.add_hline(y=conf_int_diff, line_dash="dash", line_color="red", 
                  annotation_text="95% CI", row=3, col=2)
    fig.add_hline(y=-conf_int_diff, line_dash="dash", line_color="red", row=3, col=2)
    
    # Update layout
    fig.update_layout(
        title_text=f"{symbol} - Comprehensive Stationarity Diagnostics",
        height=1200,
        showlegend=True,
        template='plotly_white'
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_xaxes(title_text="Lag", row=1, col=2)
    fig.update_xaxes(title_text="Lag", row=2, col=1)
    fig.update_xaxes(title_text="Date", row=2, col=2)
    fig.update_xaxes(title_text="Date", row=3, col=1)
    fig.update_xaxes(title_text="Lag", row=3, col=2)
    
    fig.update_yaxes(title_text="Price ($)", row=1, col=1)
    fig.update_yaxes(title_text="ACF", row=1, col=2)
    fig.update_yaxes(title_text="PACF", row=2, col=1)
    fig.update_yaxes(title_text="Price ($)", row=2, col=2)
    fig.update_yaxes(title_text="Difference", row=3, col=1)
    fig.update_yaxes(title_text="ACF", row=3, col=2)
    
    fig.show()

# Perform stationarity tests for all stocks
stocks_data = {
    'AAPL': aapl['Adj Close'],
    'NVDA': nvda['Adj Close'], 
    'LYFT': lyft['Adj Close']
}

test_results = {}
for symbol, data in stocks_data.items():
    results = perform_stationarity_tests(data, symbol)
    test_results[symbol] = results
    plot_stationarity_diagnostics(data, symbol, results)



STATIONARITY DIAGNOSTIC TESTS FOR AAPL

1. AUGMENTED DICKEY-FULLER TEST:
--------------------------------------------------
ADF Statistic: -1.274950
p-value: 0.640553
Critical Values:
  1%: -3.434862
  5%: -2.863533
  10%: -2.567831
✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)

2. AUTOCORRELATION ANALYSIS:
--------------------------------------------------
Significant ACF lags (|correlation| > 0.1): 40
Significant PACF lags (|correlation| > 0.1): 1
→ High autocorrelation suggests non-stationarity



STATIONARITY DIAGNOSTIC TESTS FOR NVDA

1. AUGMENTED DICKEY-FULLER TEST:
--------------------------------------------------
ADF Statistic: 1.264074
p-value: 0.996400
Critical Values:
  1%: -3.434925
  5%: -2.863560
  10%: -2.567846
✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)

2. AUTOCORRELATION ANALYSIS:
--------------------------------------------------
Significant ACF lags (|correlation| > 0.1): 40
Significant PACF lags (|correlation| > 0.1): 3
→ High autocorrelation suggests non-stationarity



STATIONARITY DIAGNOSTIC TESTS FOR LYFT

1. AUGMENTED DICKEY-FULLER TEST:
--------------------------------------------------
ADF Statistic: -1.765002
p-value: 0.397970
Critical Values:
  1%: -3.434925
  5%: -2.863560
  10%: -2.567846
✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)

2. AUTOCORRELATION ANALYSIS:
--------------------------------------------------
Significant ACF lags (|correlation| > 0.1): 40
Significant PACF lags (|correlation| > 0.1): 1
→ High autocorrelation suggests non-stationarity


In [31]:
# Summary Analysis of Stationarity Tests

print("\n" + "="*100)
print("COMPREHENSIVE STATIONARITY ANALYSIS SUMMARY")
print("="*100)

# Create summary table
summary_data = []
for symbol, results in test_results.items():
    summary_data.append({
        'Stock': symbol,
        'ADF Statistic': f"{results['adf_statistic']:.4f}",
        'p-value': f"{results['p_value']:.6f}",
        'Stationary': "✓ YES" if results['is_stationary'] else "✗ NO",
        'Significant ACF': results['significant_acf'],
        'Significant PACF': results['significant_pacf']
    })

# Display summary table
import pandas as pd
summary_df = pd.DataFrame(summary_data)
print("\nSTATIONARITY TEST RESULTS:")
print(summary_df.to_string(index=False))

# Detailed interpretation
print(f"\n{'='*100}")
print("DETAILED INTERPRETATION:")
print(f"{'='*100}")

for symbol, results in test_results.items():
    print(f"\n{symbol} ANALYSIS:")
    print("-" * 50)
    
    # ADF Test Interpretation
    if results['is_stationary']:
        print("✓ ADF Test: Series is STATIONARY")
        print("  → The null hypothesis of unit root is rejected")
        print("  → No differencing needed for ARIMA models")
    else:
        print("✗ ADF Test: Series is NON-STATIONARY")
        print("  → The null hypothesis of unit root cannot be rejected")
        print("  → Differencing (d=1) is recommended for ARIMA models")
    
    # ACF/PACF Interpretation
    print(f"\nAutocorrelation Analysis:")
    print(f"  • Significant ACF lags: {results['significant_acf']}")
    print(f"  • Significant PACF lags: {results['significant_pacf']}")
    
    if results['significant_acf'] > 10:
        print("  → High autocorrelation indicates strong trend/seasonality")
        print("  → Suggests non-stationarity in the series")
    elif results['significant_acf'] > 5:
        print("  → Moderate autocorrelation")
        print("  → May need differencing or detrending")
    else:
        print("  → Low autocorrelation suggests stationarity")
    
    # Recommendations
    print(f"\nModeling Recommendations:")
    if results['is_stationary']:
        print("  • Can use ARIMA(p,0,q) models (no differencing)")
        print("  • Focus on AR and MA components")
    else:
        print("  • Use ARIMA(p,1,q) models (first differencing)")
        print("  • Consider higher order differencing if needed")
        print("  • May need seasonal differencing for seasonal patterns")

# Overall conclusions
print(f"\n{'='*100}")
print("OVERALL CONCLUSIONS:")
print(f"{'='*100}")

stationary_count = sum(1 for results in test_results.values() if results['is_stationary'])
total_stocks = len(test_results)

print(f"• {stationary_count}/{total_stocks} stocks show stationarity in their price levels")
print(f"• Most financial time series are typically non-stationary")
print(f"• This is expected behavior for stock prices, which tend to have trends")

if stationary_count == 0:
    print("\n→ ALL stocks are NON-STATIONARY")
    print("→ This confirms that stock prices generally require differencing")
    print("→ ARIMA models with d=1 (first differencing) are appropriate")
    print("→ Consider log returns for better stationarity properties")
elif stationary_count == total_stocks:
    print("\n→ ALL stocks are STATIONARY")
    print("→ This is unusual for stock prices - verify data quality")
    print("→ ARIMA models with d=0 may be appropriate")
else:
    print(f"\n→ MIXED results: {stationary_count} stationary, {total_stocks - stationary_count} non-stationary")
    print("→ Use appropriate differencing based on individual stock results")

print(f"\n→ Next steps: Apply appropriate transformations and re-test stationarity")
print(f"→ Consider log returns: log(price_t) - log(price_{t-1}) for better stationarity")



COMPREHENSIVE STATIONARITY ANALYSIS SUMMARY

STATIONARITY TEST RESULTS:
Stock ADF Statistic  p-value Stationary  Significant ACF  Significant PACF
 AAPL       -1.2750 0.640553       ✗ NO               40                 1
 NVDA        1.2641 0.996400       ✗ NO               40                 3
 LYFT       -1.7650 0.397970       ✗ NO               40                 1

DETAILED INTERPRETATION:

AAPL ANALYSIS:
--------------------------------------------------
✗ ADF Test: Series is NON-STATIONARY
  → The null hypothesis of unit root cannot be rejected
  → Differencing (d=1) is recommended for ARIMA models

Autocorrelation Analysis:
  • Significant ACF lags: 40
  • Significant PACF lags: 1
  → High autocorrelation indicates strong trend/seasonality
  → Suggests non-stationarity in the series

Modeling Recommendations:
  • Use ARIMA(p,1,q) models (first differencing)
  • Consider higher order differencing if needed
  • May need seasonal differencing for seasonal patterns

NVDA ANALYSIS:

NameError: name 't' is not defined

In [25]:
# Stationarity Tests on Log Returns
# Log returns are often more stationary than price levels

print("\n" + "="*100)
print("STATIONARITY TESTS ON LOG RETURNS")
print("="*100)

def calculate_log_returns(prices):
    """Calculate log returns: log(price_t) - log(price_{t-1})"""
    return np.log(prices).diff().dropna()

# Calculate log returns for all stocks
log_returns = {}
for symbol, data in stocks_data.items():
    log_returns[symbol] = calculate_log_returns(data)
    print(f"\n{symbol} Log Returns:")
    print(f"  Mean: {log_returns[symbol].mean():.6f}")
    print(f"  Std: {log_returns[symbol].std():.6f}")
    print(f"  Min: {log_returns[symbol].min():.6f}")
    print(f"  Max: {log_returns[symbol].max():.6f}")

# Test stationarity of log returns
print(f"\n{'='*80}")
print("AUGMENTED DICKEY-FULLER TESTS ON LOG RETURNS")
print(f"{'='*80}")

log_return_results = {}
for symbol, returns in log_returns.items():
    print(f"\n{symbol} LOG RETURNS:")
    print("-" * 50)
    
    adf_result = adfuller(returns.dropna())
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]
    
    print(f"ADF Statistic: {adf_statistic:.6f}")
    print(f"p-value: {p_value:.6f}")
    print("Critical Values:")
    for key, value in critical_values.items():
        print(f"  {key}: {value:.6f}")
    
    is_stationary = p_value <= 0.05
    print(f"Result: {'✓ STATIONARY' if is_stationary else '✗ NON-STATIONARY'}")
    
    log_return_results[symbol] = {
        'adf_statistic': adf_statistic,
        'p_value': p_value,
        'is_stationary': is_stationary
    }

# Visualize log returns and their stationarity properties
def plot_log_returns_analysis(symbol, prices, log_returns):
    """Plot log returns with stationarity diagnostics"""
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f'{symbol} - Price Levels',
            f'{symbol} - Log Returns',
            f'{symbol} - ACF of Log Returns',
            f'{symbol} - PACF of Log Returns'
        ],
        vertical_spacing=0.1,
        horizontal_spacing=0.1
    )
    
    # 1. Original prices
    fig.add_trace(
        go.Scatter(x=prices.index, y=prices, name='Price', 
                  line=dict(color='blue', width=2)),
        row=1, col=1
    )
    
    # 2. Log returns
    fig.add_trace(
        go.Scatter(x=log_returns.index, y=log_returns, name='Log Returns', 
                  line=dict(color='red', width=1)),
        row=1, col=2
    )
    
    # 3. ACF of log returns
    acf_returns = acf(log_returns.dropna(), nlags=40, fft=True)
    fig.add_trace(
        go.Bar(x=list(range(len(acf_returns))), y=acf_returns, 
               name='ACF', marker_color='lightblue'),
        row=2, col=1
    )
    
    # Add confidence intervals
    n = len(log_returns.dropna())
    conf_int = 1.96 / np.sqrt(n)
    fig.add_hline(y=conf_int, line_dash="dash", line_color="red", row=2, col=1)
    fig.add_hline(y=-conf_int, line_dash="dash", line_color="red", row=2, col=1)
    
    # 4. PACF of log returns
    pacf_returns = pacf(log_returns.dropna(), nlags=40, method='ols')
    fig.add_trace(
        go.Bar(x=list(range(len(pacf_returns))), y=pacf_returns, 
               name='PACF', marker_color='lightgreen'),
        row=2, col=2
    )
    
    # Add confidence intervals
    fig.add_hline(y=conf_int, line_dash="dash", line_color="red", row=2, col=2)
    fig.add_hline(y=-conf_int, line_dash="dash", line_color="red", row=2, col=2)
    
    # Update layout
    fig.update_layout(
        title_text=f"{symbol} - Log Returns Stationarity Analysis",
        height=800,
        showlegend=True,
        template='plotly_white'
    )
    
    # Update axes labels
    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_xaxes(title_text="Date", row=1, col=2)
    fig.update_xaxes(title_text="Lag", row=2, col=1)
    fig.update_xaxes(title_text="Lag", row=2, col=2)
    
    fig.update_yaxes(title_text="Price ($)", row=1, col=1)
    fig.update_yaxes(title_text="Log Returns", row=1, col=2)
    fig.update_yaxes(title_text="ACF", row=2, col=1)
    fig.update_yaxes(title_text="PACF", row=2, col=2)
    
    fig.show()

# Create plots for all stocks
for symbol in stocks_data.keys():
    plot_log_returns_analysis(symbol, stocks_data[symbol], log_returns[symbol])

# Summary of log returns stationarity
print(f"\n{'='*100}")
print("LOG RETURNS STATIONARITY SUMMARY")
print(f"{'='*100}")

log_stationary_count = sum(1 for results in log_return_results.values() if results['is_stationary'])
print(f"• {log_stationary_count}/{len(log_return_results)} stocks have stationary log returns")

if log_stationary_count == len(log_return_results):
    print("✓ ALL log returns are STATIONARY")
    print("→ This is the expected behavior for financial returns")
    print("→ Log returns are suitable for ARIMA modeling")
    print("→ No differencing needed for log returns")
elif log_stationary_count > 0:
    print(f"→ MIXED results: {log_stationary_count} stationary, {len(log_return_results) - log_stationary_count} non-stationary")
    print("→ Most log returns should be stationary")
else:
    print("✗ NO log returns are stationary")
    print("→ This is unusual - investigate data quality")
    print("→ Consider additional transformations")

print(f"\n→ Log returns are generally more stationary than price levels")
print(f"→ This makes them better suited for time series modeling")
print(f"→ Consider using log returns for ARIMA models instead of price levels")



STATIONARITY TESTS ON LOG RETURNS

AAPL Log Returns:
  Mean: 0.000846
  Std: 0.020271
  Min: -0.137708
  Max: 0.142617

NVDA Log Returns:
  Mean: 0.002377
  Std: 0.033657
  Min: -0.203979
  Max: 0.218088

LYFT Log Returns:
  Mean: -0.000537
  Std: 0.045532
  Min: -0.453131
  Max: 0.300990

AUGMENTED DICKEY-FULLER TESTS ON LOG RETURNS

AAPL LOG RETURNS:
--------------------------------------------------
ADF Statistic: -12.740294
p-value: 0.000000
Critical Values:
  1%: -3.434890
  5%: -2.863545
  10%: -2.567837
Result: ✓ STATIONARY

NVDA LOG RETURNS:
--------------------------------------------------
ADF Statistic: -12.025553
p-value: 0.000000
Critical Values:
  1%: -3.434890
  5%: -2.863545
  10%: -2.567837
Result: ✓ STATIONARY

LYFT LOG RETURNS:
--------------------------------------------------
ADF Statistic: -9.083592
p-value: 0.000000
Critical Values:
  1%: -3.434918
  5%: -2.863558
  10%: -2.567844
Result: ✓ STATIONARY



LOG RETURNS STATIONARITY SUMMARY
• 3/3 stocks have stationary log returns
✓ ALL log returns are STATIONARY
→ This is the expected behavior for financial returns
→ Log returns are suitable for ARIMA modeling
→ No differencing needed for log returns

→ Log returns are generally more stationary than price levels
→ This makes them better suited for time series modeling
→ Consider using log returns for ARIMA models instead of price levels


In [26]:
# Fix: Define all necessary variables for plotting
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA

# Function to prepare train/test split
def prepare_data(df, train_ratio=0.8):
    data = df["Adj Close"].dropna()
    train_size = int(len(data) * train_ratio)
    train, test = data[:train_size], data[train_size:]
    return train, test

# Function to fit MA model
def fit_ma_model(train, test, q=20):
    model = ARIMA(train, order=(0, 0, q))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    return fitted, forecast, rmse, mae

# Function to fit AR model
def fit_ar_model(train, test, p=20):
    model = ARIMA(train, order=(p, 0, 0))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    return fitted, forecast, rmse, mae

# Function to fit ARIMA model
def fit_arima_model(train, test, p=20, d=1, q=20):
    model = ARIMA(train, order=(p, d, q))
    fitted = model.fit()
    forecast = fitted.forecast(steps=len(test))
    forecast.index = test.index
    rmse = np.sqrt(mean_squared_error(test, forecast))
    mae = mean_absolute_error(test, forecast)
    return fitted, forecast, rmse, mae

# Prepare data for all stocks
print("Preparing train/test data...")
aapl_train, aapl_test = prepare_data(aapl)
nvda_train, nvda_test = prepare_data(nvda)
lyft_train, lyft_test = prepare_data(lyft)

print(f"AAPL - Training: {len(aapl_train)} days, Test: {len(aapl_test)} days")
print(f"NVDA - Training: {len(nvda_train)} days, Test: {len(nvda_test)} days")
print(f"LYFT - Training: {len(lyft_train)} days, Test: {len(lyft_test)} days")

# Fit MA models
print("\nFitting MA models...")
aapl_ma_fitted, aapl_ma_forecast, aapl_ma_rmse, aapl_ma_mae = fit_ma_model(aapl_train, aapl_test)
nvda_ma_fitted, nvda_ma_forecast, nvda_ma_rmse, nvda_ma_mae = fit_ma_model(nvda_train, nvda_test)
lyft_ma_fitted, lyft_ma_forecast, lyft_ma_rmse, lyft_ma_mae = fit_ma_model(lyft_train, lyft_test)

# Fit AR models
print("Fitting AR models...")
aapl_ar_fitted, aapl_ar_forecast, aapl_ar_rmse, aapl_ar_mae = fit_ar_model(aapl_train, aapl_test)
nvda_ar_fitted, nvda_ar_forecast, nvda_ar_rmse, nvda_ar_mae = fit_ar_model(nvda_train, nvda_test)
lyft_ar_fitted, lyft_ar_forecast, lyft_ar_rmse, lyft_ar_mae = fit_ar_model(lyft_train, lyft_test)

# Fit ARIMA models
print("Fitting ARIMA models...")
aapl_arima_fitted, aapl_arima_forecast, aapl_arima_rmse, aapl_arima_mae = fit_arima_model(aapl_train, aapl_test)
nvda_arima_fitted, nvda_arima_forecast, nvda_arima_rmse, nvda_arima_mae = fit_arima_model(nvda_train, nvda_test)
lyft_arima_fitted, lyft_arima_forecast, lyft_arima_rmse, lyft_arima_mae = fit_arima_model(lyft_train, lyft_test)

print("\nAll models fitted successfully!")
print("Variables are now defined and ready for plotting.")

# Print some results
print(f"\nAAPL MA(20) - RMSE: ${aapl_ma_rmse:.2f}, MAE: ${aapl_ma_mae:.2f}")
print(f"AAPL AR(20) - RMSE: ${aapl_ar_rmse:.2f}, MAE: ${aapl_ar_mae:.2f}")
print(f"AAPL ARIMA(20,1,20) - RMSE: ${aapl_arima_rmse:.2f}, MAE: ${aapl_arima_mae:.2f}")


Preparing train/test data...
AAPL - Training: 1162 days, Test: 291 days
NVDA - Training: 1162 days, Test: 291 days
LYFT - Training: 1162 days, Test: 291 days

Fitting MA models...



A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provi

Fitting AR models...



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model 

Fitting ARIMA models...



Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


A date index has been provided, but it has no associated frequency information and so will be ignored when e.g. forecasting.


Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals


No supported index is available. Predictio


All models fitted successfully!
Variables are now defined and ready for plotting.

AAPL MA(20) - RMSE: $79.67, MAE: $77.42
AAPL AR(20) - RMSE: $23.63, MAE: $18.15
AAPL ARIMA(20,1,20) - RMSE: $17.27, MAE: $14.23



No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



In [27]:
# Visualize Model Predictions vs Actual

def plot_forecasts(train, test, ma_forecast, ar_forecast, arima_forecast, symbol):
    fig = go.Figure()
    
    # Training data
    fig.add_trace(go.Scatter(x=train.index, y=train, 
                             name='Training Data', 
                             line=dict(color='blue', width=2)))
    
    # Actual test data
    fig.add_trace(go.Scatter(x=test.index, y=test, 
                             name='Actual Test Data', 
                             line=dict(color='black', width=2)))
    
    # MA forecast
    fig.add_trace(go.Scatter(x=ma_forecast.index, y=ma_forecast, 
                             name='MA(20) Forecast', 
                             line=dict(color='red', dash='dash')))
    
    # AR forecast
    fig.add_trace(go.Scatter(x=ar_forecast.index, y=ar_forecast, 
                             name='AR(20) Forecast', 
                             line=dict(color='orange', dash='dash')))
    
    # ARIMA forecast
    fig.add_trace(go.Scatter(x=arima_forecast.index, y=arima_forecast, 
                             name='ARIMA(20,1,20) Forecast', 
                             line=dict(color='green', dash='dash')))
    
    # Update layout
    fig.update_layout(
        title=f"{symbol} - Model Forecasts Comparison (20 Lags)",
        xaxis_title="Date",
        yaxis_title="Price ($)",
        hovermode='x unified',
        height=600,
        template='plotly_white',
        xaxis=dict(
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=3, label="3m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(count=1, label="1y", step="year", stepmode="backward"),
                    dict(step="all", label="All")
                ]),
                bgcolor="lightgray",
                activecolor="gray"
            ),
            rangeslider=dict(visible=True),
            type="date"
        )
    )
    
    fig.show()

# Plot for all stocks
plot_forecasts(aapl_train, aapl_test, aapl_ma_forecast, aapl_ar_forecast, aapl_arima_forecast, "AAPL")
plot_forecasts(nvda_train, nvda_test, nvda_ma_forecast, nvda_ar_forecast, nvda_arima_forecast, "NVDA")
plot_forecasts(lyft_train, lyft_test, lyft_ma_forecast, lyft_ar_forecast, lyft_arima_forecast, "LYFT")


In [28]:
# Stationarity Tests and Analysis
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def perform_stationarity_tests(data, symbol):
    """
    Perform comprehensive stationarity tests including:
    - Augmented Dickey-Fuller Test
    - ACF and PACF analysis
    - Visual inspection of plots
    """
    print(f"\n{'='*80}")
    print(f"STATIONARITY DIAGNOSTIC TESTS FOR {symbol}")
    print(f"{'='*80}")
    
    # 1. Augmented Dickey-Fuller Test
    print("\n1. AUGMENTED DICKEY-FULLER TEST:")
    print("-" * 50)
    
    adf_result = adfuller(data.dropna())
    adf_statistic = adf_result[0]
    p_value = adf_result[1]
    critical_values = adf_result[4]
    
    print(f"ADF Statistic: {adf_statistic:.6f}")
    print(f"p-value: {p_value:.6f}")
    print("Critical Values:")
    for key, value in critical_values.items():
        print(f"  {key}: {value:.6f}")
    
    # Interpretation
    if p_value <= 0.05:
        print("✓ Result: Series is STATIONARY (reject null hypothesis)")
    else:
        print("✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)")
    
    # 2. ACF and PACF Analysis
    print(f"\n2. AUTOCORRELATION ANALYSIS:")
    print("-" * 50)
    
    # Calculate ACF and PACF
    acf_values = acf(data.dropna(), nlags=40, fft=True)
    pacf_values = pacf(data.dropna(), nlags=40, method='ols')
    
    # Check for significant autocorrelations
    significant_acf = np.sum(np.abs(acf_values[1:]) > 0.1)  # Beyond lag 0
    significant_pacf = np.sum(np.abs(pacf_values[1:]) > 0.1)  # Beyond lag 0
    
    print(f"Significant ACF lags (|correlation| > 0.1): {significant_acf}")
    print(f"Significant PACF lags (|correlation| > 0.1): {significant_pacf}")
    
    if significant_acf > 5:
        print("→ High autocorrelation suggests non-stationarity")
    else:
        print("→ Low autocorrelation suggests stationarity")
    
    return {
        'adf_statistic': adf_statistic,
        'p_value': p_value,
        'critical_values': critical_values,
        'is_stationary': p_value <= 0.05,
        'acf_values': acf_values,
        'pacf_values': pacf_values,
        'significant_acf': significant_acf,
        'significant_pacf': significant_pacf
    }

# Perform stationarity tests for all stocks
stocks_data = {
    'AAPL': aapl['Adj Close'],
    'NVDA': nvda['Adj Close'], 
    'LYFT': lyft['Adj Close']
}

test_results = {}
for symbol, data in stocks_data.items():
    results = perform_stationarity_tests(data, symbol)
    test_results[symbol] = results

print("\nAll stationarity tests completed!")



STATIONARITY DIAGNOSTIC TESTS FOR AAPL

1. AUGMENTED DICKEY-FULLER TEST:
--------------------------------------------------
ADF Statistic: -1.274950
p-value: 0.640553
Critical Values:
  1%: -3.434862
  5%: -2.863533
  10%: -2.567831
✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)

2. AUTOCORRELATION ANALYSIS:
--------------------------------------------------
Significant ACF lags (|correlation| > 0.1): 40
Significant PACF lags (|correlation| > 0.1): 1
→ High autocorrelation suggests non-stationarity

STATIONARITY DIAGNOSTIC TESTS FOR NVDA

1. AUGMENTED DICKEY-FULLER TEST:
--------------------------------------------------
ADF Statistic: 1.264074
p-value: 0.996400
Critical Values:
  1%: -3.434925
  5%: -2.863560
  10%: -2.567846
✗ Result: Series is NON-STATIONARY (fail to reject null hypothesis)

2. AUTOCORRELATION ANALYSIS:
--------------------------------------------------
Significant ACF lags (|correlation| > 0.1): 40
Significant PACF lags (|correlation| > 0.1): 

In [29]:
# Summary Analysis of Stationarity Tests

print("\n" + "="*100)
print("COMPREHENSIVE STATIONARITY ANALYSIS SUMMARY")
print("="*100)

# Create summary table
summary_data = []
for symbol, results in test_results.items():
    summary_data.append({
        'Stock': symbol,
        'ADF Statistic': f"{results['adf_statistic']:.4f}",
        'p-value': f"{results['p_value']:.6f}",
        'Stationary': "✓ YES" if results['is_stationary'] else "✗ NO",
        'Significant ACF': results['significant_acf'],
        'Significant PACF': results['significant_pacf']
    })

# Display summary table
import pandas as pd
summary_df = pd.DataFrame(summary_data)
print("\nSTATIONARITY TEST RESULTS:")
print(summary_df.to_string(index=False))

# Detailed interpretation
print(f"\n{'='*100}")
print("DETAILED INTERPRETATION:")
print(f"{'='*100}")

for symbol, results in test_results.items():
    print(f"\n{symbol} ANALYSIS:")
    print("-" * 50)
    
    # ADF Test Interpretation
    if results['is_stationary']:
        print("✓ ADF Test: Series is STATIONARY")
        print("  → The null hypothesis of unit root is rejected")
        print("  → No differencing needed for ARIMA models")
    else:
        print("✗ ADF Test: Series is NON-STATIONARY")
        print("  → The null hypothesis of unit root cannot be rejected")
        print("  → Differencing (d=1) is recommended for ARIMA models")
    
    # ACF/PACF Interpretation
    print(f"\nAutocorrelation Analysis:")
    print(f"  • Significant ACF lags: {results['significant_acf']}")
    print(f"  • Significant PACF lags: {results['significant_pacf']}")
    
    if results['significant_acf'] > 10:
        print("  → High autocorrelation indicates strong trend/seasonality")
        print("  → Suggests non-stationarity in the series")
    elif results['significant_acf'] > 5:
        print("  → Moderate autocorrelation")
        print("  → May need differencing or detrending")
    else:
        print("  → Low autocorrelation suggests stationarity")
    
    # Recommendations
    print(f"\nModeling Recommendations:")
    if results['is_stationary']:
        print("  • Can use ARIMA(p,0,q) models (no differencing)")
        print("  • Focus on AR and MA components")
    else:
        print("  • Use ARIMA(p,1,q) models (first differencing)")
        print("  • Consider higher order differencing if needed")
        print("  • May need seasonal differencing for seasonal patterns")

# Overall conclusions
print(f"\n{'='*100}")
print("OVERALL CONCLUSIONS:")
print(f"{'='*100}")

stationary_count = sum(1 for results in test_results.values() if results['is_stationary'])
total_stocks = len(test_results)

print(f"• {stationary_count}/{total_stocks} stocks show stationarity in their price levels")
print(f"• Most financial time series are typically non-stationary")
print(f"• This is expected behavior for stock prices, which tend to have trends")

if stationary_count == 0:
    print("\n→ ALL stocks are NON-STATIONARY")
    print("→ This confirms that stock prices generally require differencing")
    print("→ ARIMA models with d=1 (first differencing) are appropriate")
    print("→ Consider log returns for better stationarity properties")
elif stationary_count == total_stocks:
    print("\n→ ALL stocks are STATIONARY")
    print("→ This is unusual for stock prices - verify data quality")
    print("→ ARIMA models with d=0 may be appropriate")
else:
    print(f"\n→ MIXED results: {stationary_count} stationary, {total_stocks - stationary_count} non-stationary")
    print("→ Use appropriate differencing based on individual stock results")

print(f"\n→ Next steps: Apply appropriate transformations and re-test stationarity")
print(f"→ Consider log returns: log(price_t) - log(price_{{t-1}}) for better stationarity")



COMPREHENSIVE STATIONARITY ANALYSIS SUMMARY

STATIONARITY TEST RESULTS:
Stock ADF Statistic  p-value Stationary  Significant ACF  Significant PACF
 AAPL       -1.2750 0.640553       ✗ NO               40                 1
 NVDA        1.2641 0.996400       ✗ NO               40                 3
 LYFT       -1.7650 0.397970       ✗ NO               40                 1

DETAILED INTERPRETATION:

AAPL ANALYSIS:
--------------------------------------------------
✗ ADF Test: Series is NON-STATIONARY
  → The null hypothesis of unit root cannot be rejected
  → Differencing (d=1) is recommended for ARIMA models

Autocorrelation Analysis:
  • Significant ACF lags: 40
  • Significant PACF lags: 1
  → High autocorrelation indicates strong trend/seasonality
  → Suggests non-stationarity in the series

Modeling Recommendations:
  • Use ARIMA(p,1,q) models (first differencing)
  • Consider higher order differencing if needed
  • May need seasonal differencing for seasonal patterns

NVDA ANALYSIS: