# Data Exploration

## Preliminaries

In [None]:
# imports
import duckdb
import matplotlib.pyplot as plt
from merlion.models.factory import ModelFactory
from online_conformal.dataset import M4
from online_conformal.visualize import plot_simulated_forecast

## Reading in data

In [None]:
# Query the gzipped CSV directly with DuckDB: keep only the rows whose DATE is
# 2023-01-03, ordered by TIME_M, and materialise the result as a DataFrame.
trades_query = """select * 
       from 'test_data/regn_20230103_to_20230331.csv.gz' 
       where DATE = '2023-01-03'
       order by TIME_M """
regn_data = duckdb.sql(trades_query).df()

In [None]:
# Display the loaded DataFrame as the cell output for a quick visual check.
regn_data

In [None]:
# Add lag_1, lag_2 and lag_3: the PRICE column shifted down by 1, 2 and 3 rows,
# so each row carries the three preceding prices. The first rows have no
# predecessor and therefore get NaN in the corresponding lag columns.
for n_periods in (1, 2, 3):
    regn_data[f'lag_{n_periods}'] = regn_data['PRICE'].shift(n_periods)

In [None]:
# Remove rows with a missing value in any lag column, then keep only the first
# 16000 remaining rows. The original note says this is to "cut off closing" —
# presumably to exclude the end-of-session rows; confirm the cutoff.
regn_data = (
    regn_data
    .dropna(subset=['lag_1', 'lag_2', 'lag_3'])
    .iloc[:16000]
)

Simple visuals of REGN's stock price (`PRICE`) and of the `SIZE` column, after first adding a per-row dollar volume (`PRICE * SIZE`):

In [None]:
# Per-row dollar volume: element-wise product of the PRICE and SIZE columns.
regn_data["dollar_volume"] = regn_data["PRICE"].mul(regn_data["SIZE"])

In [None]:
# Line plot of the PRICE column against the DataFrame index.
regn_data["PRICE"].plot()

In [None]:
# Line plot of the SIZE column against the DataFrame index.
regn_data["SIZE"].plot()

## Test Mondrian Forest

In [None]:
# Load a small regression dataset to smoke-test the Mondrian forest before
# applying it to the REGN data.
# NOTE(review): load_boston is no longer available in recent scikit-learn
# releases — confirm the pinned scikit-learn version still provides it.
from sklearn.datasets import load_boston

boston = load_boston()
X = boston.data
y = boston.target

In [None]:
### Use MondrianForests for variance estimation
# Fit a Mondrian forest with default settings on the full (X, y) and predict on
# the same X, so these are in-sample predictions. With return_std=True,
# predict() yields two values, unpacked here as y_mean and y_std.
from skgarden import MondrianForestRegressor
mfr = MondrianForestRegressor()
mfr.fit(X, y)
y_mean, y_std = mfr.predict(X, return_std=True)

In [None]:
# Display the first value returned by predict() (y_mean) as the cell output.
y_mean

In [None]:
# Feature matrix: the three lagged prices plus SIZE, one row per observation.
# Target vector: the PRICE of the same row. Both are converted to NumPy arrays.
# Note that this overwrites any X / y defined earlier in the notebook.
X = regn_data.loc[:, ["lag_1", "lag_2", "lag_3", "SIZE"]].to_numpy()
y = regn_data["PRICE"].to_numpy()

In [None]:
# Number of rows in the feature matrix (shown as the cell output).
len(X)

In [None]:
# Ordered hold-out split with no shuffling: the first 14000 rows are used for
# training and every row after that for testing.
X_train, X_test = X[:14000], X[14000:]
y_train, y_test = y[:14000], y[14000:]

In [None]:
### Use MondrianForests for variance estimation
# Fit a Mondrian forest with default settings on the training split and predict
# on the held-out split. With return_std=True, predict() yields two values,
# unpacked here as y_mean and y_std (overwriting the earlier values).
from skgarden import MondrianForestRegressor

mfr = MondrianForestRegressor()
mfr.fit(X_train, y_train)
y_mean, y_std = mfr.predict(X_test, return_std=True)

In [None]:
import numpy as np

# Mean absolute error of the predicted mean on the held-out split.
np.mean(np.abs(y_mean - y_test))

In [None]:
# Display y_mean, the first value returned by predict() on the test split.
y_mean

In [None]:
# Display y_std, the second value returned by predict(..., return_std=True).
y_std

In [None]:
# Display the held-out target values for comparison with the predictions.
y_test

In [None]:
# Number of held-out observations (shown as the cell output).
len(y_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Position of each held-out observation, used as the shared x-axis.
# Kept as a plain list named `x` because later cells reuse it.
x = list(range(len(y_test)))

# Static figure: actual values, predicted mean, and a band of one predicted
# standard deviation on either side of the mean.
fig, ax = plt.subplots(figsize=(16, 7))

# Actual held-out values in green, predicted mean in red.
ax.plot(x, y_test, color='green', label='y_test')
ax.plot(x, y_mean, color='red', label='y_mean')

# Translucent red band spanning y_mean - y_std to y_mean + y_std.
lower_bound = y_mean - y_std
upper_bound = y_mean + y_std
ax.fill_between(x, lower_bound, upper_bound, color='red', alpha=0.2, label='y_mean ± y_std')

# Axis labels, title, legend and grid.
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_title('y_test, y_mean, and Confidence Bounds')
ax.legend()
ax.grid(True)

# Render the figure.
plt.show()


In [None]:
import numpy as np
import plotly.graph_objects as go

# Half-width of the shaded band, in multiples of the predicted standard
# deviation. The legend label below is derived from this value so the two
# cannot drift apart (the band was drawn at 3 std but labelled as 1 std).
n_sigma = 3
band_label = f'y_mean ± {n_sigma}·y_std'

# Position of each held-out observation, used as the shared x-axis. Defined
# here so the cell does not depend on a previous plotting cell having run.
x = list(range(len(y_test)))

# Create Plotly figure
fig = go.Figure()

# Add y_test in green
fig.add_trace(go.Scatter(
    x=x, y=y_test,
    mode='lines',
    line=dict(color='green'),
    name='y_test'
))

# Add y_mean in red
fig.add_trace(go.Scatter(
    x=x, y=y_mean,
    mode='lines',
    line=dict(color='red'),
    name='y_mean'
))

# Add shaded area for y_mean ± n_sigma * y_std. The band is drawn as one closed
# polygon: the upper bound traversed left to right, followed by the lower
# bound traversed right to left, filled with fill='toself'.
upper_bound = y_mean + n_sigma * y_std
lower_bound = y_mean - n_sigma * y_std
fig.add_trace(go.Scatter(
    x=np.concatenate([x, x[::-1]]),
    y=np.concatenate([upper_bound, lower_bound[::-1]]),
    fill='toself',
    fillcolor='rgba(255, 0, 0, 0.2)',  # Transparent red
    line=dict(color='rgba(255, 0, 0, 0)'),  # No border
    name=band_label
))

# Customize layout
fig.update_layout(
    title='Interactive Plot: y_test, y_mean, and Confidence Bounds',
    xaxis_title='X-axis',
    yaxis_title='Y-axis',
    legend=dict(title='Legend'),
    template='plotly_white',
)

# Save the interactive plot as a standalone HTML file (nothing is displayed
# inline by this call).
fig.write_html("interactive_plot.html")