In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from math import sqrt
import shap
from lime import lime_tabular

# Load the data
data = pd.read_csv('../data/stock_data_blk.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Define start and end dates for rolling experiments
rolling_windows = pd.date_range(start='2018-01-01', end='2023-09-01', freq='Y')  # yearly increments

rmse_list = []  # Store RMSEs for each period
all_predicted_prices = []  # To store predicted prices over multiple periods

# Loop through each rolling window
for i in range(len(rolling_windows)-1):
    # Set the current rolling window for training and prediction
    start_train = rolling_windows[i]
    end_train = rolling_windows[i+1] - pd.DateOffset(days=1)
    start_test = rolling_windows[i+1]
    end_test = min(rolling_windows[i+2] - pd.DateOffset(days=1), data.index[-1]) if i+2 <= len(rolling_windows) else data.index[-1]
    
    print(f"Training from {start_train.date()} to {end_train.date()}, predicting {start_test.date()} to {end_test.date()}")
    
    # Split the data into train and future
    stock_data = data.loc[start_train:end_train]
    future_data = data.loc[start_test:end_test]
    
    features = stock_data.drop(columns=['Adj Close', 'Open']).fillna(method='ffill').fillna(method='bfill')
    target = stock_data['Adj Close'].fillna(method='ffill').fillna(method='bfill')
    
    X_train, X_last = features.iloc[:-1], features.iloc[-1:]
    y_train = target.iloc[:-1]

    # Adjust the model's parameters for quicker training
    lgb_model = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=100)
    lgb_model.fit(X_train, y_train)

    future_features = future_data.drop(columns=['Adj Close', 'Open'])
    X_recent = stock_data.iloc[-1:].drop(columns=['Adj Close', 'Open'])

    predicted_prices = []

    # Predict the next day's price using the most recent data
    predicted_price = lgb_model.predict(X_recent)[0]
    predicted_prices.append(predicted_price)
    
    for i in range(len(future_features)):
        X_future = future_features.iloc[i:i+1]  # Select the features for the current day
        predicted_price = lgb_model.predict(X_future)[0]
        predicted_prices.append(predicted_price)
    
    # Ensure the lengths of predicted and actual prices match
    actual_prices = future_data['Adj Close'][:len(predicted_prices)]
    dates = future_data.index[:len(predicted_prices)]

    # Calculate RMSE for this period
    rmse = sqrt(mean_squared_error(actual_prices, predicted_prices))
    print(f'RMSE for {start_test.date()} to {end_test.date()}: {rmse}')
    rmse_list.append(rmse)
    
    # Store predicted prices
    all_predicted_prices.extend(predicted_prices)

# Plotting predictions for all periods combined
data_long = pd.DataFrame({
    'Date': data.index[len(data)-len(all_predicted_prices):],
    'actual price': data['Adj Close'].iloc[len(data)-len(all_predicted_prices):].values,
    'predicted price': all_predicted_prices
})
data_long_melted = pd.melt(data_long, id_vars=['Date'], value_vars=['actual price', 'predicted price'], var_name='Variable', value_name='Value')

# Create the plot
sns.set_theme()  # Set the Seaborn theme
plt.figure(figsize=(10, 6))  # Set the figure size

# Plot the data using seaborn lineplot
sns.lineplot(data=data_long_melted, x='Date', y='Value', hue='Variable')

# Customize the plot
plt.title('Comparison of actual and predicted price Over Time')
plt.xlabel('Date')
plt.ylabel('Value')
plt.xticks(rotation=45)  # Rotate the x-axis labels for better readability

# Show the plot
plt.show()

# RMSE over the years
rmse_df = pd.DataFrame({
    'Window': rolling_windows[:-1].strftime('%Y'),
    'RMSE': rmse_list
})

print(rmse_df)

Training from 2018-12-31 to 2019-12-30, predicting 2019-12-31 to 2020-12-30
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000387 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 251, number of used features: 15
[LightGBM] [Info] Start training from score 390.714423


  rolling_windows = pd.date_range(start='2018-01-01', end='2023-09-01', freq='Y')  # yearly increments
  features = stock_data.drop(columns=['Adj Close', 'Open']).fillna(method='ffill').fillna(method='bfill')
  target = stock_data['Adj Close'].fillna(method='ffill').fillna(method='bfill')


ValueError: Found input variables with inconsistent numbers of samples: [253, 254]