# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

# ARIMA
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA

# LSTM
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

# Load TSLA data (cleaned from Task 1)
# Input and output folders are resolved relative to the working directory;
# the plots folder is created on demand so later savefig calls cannot fail
# on a missing directory.
data_folder = os.path.join("..", "data")
plots_folder = os.path.join("..", "plots")
os.makedirs(plots_folder, exist_ok=True)

# The first CSV column is parsed as dates and used as the index.
tsla_csv = os.path.join(data_folder, "TSLA.csv")
df = pd.read_csv(tsla_csv, parse_dates=[0], index_col=0)
df["Daily_Return"] = df["Close"].pct_change()
# pct_change leaves NaN in the first row; drop it along with any other row
# that has a missing value in any column.
df = df.dropna()

# Train-test split
# Label-based date slices are inclusive on both ends.
train_df = df["Close"].loc["2015-07-01":"2023-12-31"]
test_df = df["Close"].loc["2024-01-01":"2025-07-31"]

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# --- ARIMA ---
print("\n--- Training ARIMA ---")
# pmdarima searches for a non-seasonal (p, d, q) order on the training closes;
# that order is then refit with statsmodels to produce the forecast.
model_arima = pm.auto_arima(train_df, seasonal=False, stepwise=True, suppress_warnings=True)
print("Best ARIMA order:", model_arima.order)

arima_fit = ARIMA(train_df, order=model_arima.order).fit()
# Multi-step forecast: every test-period value is predicted from the end of
# the training window, without seeing any test observations.
#
# The forecast is re-indexed onto the test dates. When the training index has
# no frequency information the forecast may come back with a different index
# (e.g. integer positions); Series arithmetic would then align on labels and
# produce NaN for every element, turning the MAPE into NaN.
arima_pred = pd.Series(
    np.asarray(arima_fit.forecast(steps=len(test_df))),
    index=test_df.index,
)

# Compute all metrics on plain positional arrays so none of them depends on
# index alignment.
arima_actual_values = test_df.to_numpy()
arima_pred_values = arima_pred.to_numpy()

arima_mae = mean_absolute_error(arima_actual_values, arima_pred_values)
arima_rmse = math.sqrt(mean_squared_error(arima_actual_values, arima_pred_values))
arima_mape = np.mean(np.abs((arima_actual_values - arima_pred_values) / arima_actual_values)) * 100

plt.figure(figsize=(12,6))
plt.plot(train_df, label="Train")
plt.plot(test_df, label="Test")
plt.plot(test_df.index, arima_pred_values, label="ARIMA Forecast")
plt.legend()
plt.title("ARIMA Forecast vs Actual (TSLA)")
plt.savefig(os.path.join(plots_folder, "task2_arima_forecast.png"))
plt.show()

print(f"ARIMA MAE: {arima_mae:.4f}, RMSE: {arima_rmse:.4f}, MAPE: {arima_mape:.2f}%")

# --- LSTM ---
print("\n--- Training LSTM ---")
# The scaler is fitted on the training closes only, so the min/max used for
# scaling come from the training window alone.
scaler = MinMaxScaler(feature_range=(0, 1))
train_close_column = train_df.values.reshape(-1, 1)  # one feature column
scaled_train = scaler.fit(train_close_column).transform(train_close_column)

def create_sequences(data, seq_length=60):
    """Slice ``data`` into overlapping windows and their next-step targets.

    Args:
        data: Sequence supporting ``len`` and slicing (e.g. a NumPy array).
        seq_length: Number of consecutive items in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays where ``X[k]`` is
        ``data[k:k + seq_length]`` and ``y[k]`` is ``data[k + seq_length]``.
        Both arrays are empty when ``data`` has no more than ``seq_length``
        items.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

seq_length = 60
X_train, y_train = create_sequences(scaled_train, seq_length)

# Single LSTM layer feeding one dense output: predicts the next scaled close
# from the previous `seq_length` scaled closes.
model_lstm = Sequential([
    LSTM(50, return_sequences=False, input_shape=(X_train.shape[1], 1)),
    Dense(1)
])
model_lstm.compile(optimizer="adam", loss="mean_squared_error")
model_lstm.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)

# Test data for LSTM
# Build the evaluation series from the train and test slices themselves
# rather than from positions in `df`: slicing `df` by `len(train_df)` is only
# correct when `df` contains no rows outside the two date ranges.
full_series = pd.concat([train_df, test_df])
full_scaled = scaler.transform(full_series.to_numpy().reshape(-1, 1))
# Prepend the last `seq_length` training points so the first window predicts
# the first test day; this yields exactly one prediction per test day.
X_test, y_test = create_sequences(full_scaled[len(train_df) - seq_length:], seq_length)
y_test_actual = test_df.to_numpy()

# One-step-ahead predictions: each window is made of actual (scaled) closes.
lstm_pred_scaled = model_lstm.predict(X_test)
lstm_pred = scaler.inverse_transform(lstm_pred_scaled)
lstm_pred_values = lstm_pred.flatten()

lstm_mae = mean_absolute_error(y_test_actual, lstm_pred_values)
lstm_rmse = math.sqrt(mean_squared_error(y_test_actual, lstm_pred_values))
lstm_mape = np.mean(np.abs((y_test_actual - lstm_pred_values) / y_test_actual)) * 100

plt.figure(figsize=(12,6))
plt.plot(train_df.index, train_df, label="Train")
plt.plot(test_df.index, test_df, label="Test")
# There is one prediction per test day, so plot against the full test index
# (skipping the first `seq_length` dates would make x and y lengths differ).
plt.plot(test_df.index, lstm_pred_values, label="LSTM Forecast")
plt.legend()
plt.title("LSTM Forecast vs Actual (TSLA)")
plt.savefig(os.path.join(plots_folder, "task2_lstm_forecast.png"))
plt.show()

print(f"LSTM MAE: {lstm_mae:.4f}, RMSE: {lstm_rmse:.4f}, MAPE: {lstm_mape:.2f}%")

# --- Model Comparison ---
# ARIMA is chosen only when its RMSE is strictly lower; a tie (or an RMSE
# that does not compare as smaller) selects LSTM.
best_model = "ARIMA" if arima_rmse < lstm_rmse else "LSTM"

print(f"\nBest model based on RMSE: {best_model}")
