In [84]:
import pandas as pd

In [85]:
import copy

import plotly.io as pio

# Render figures with the VS Code notebook renderer; switch to "notebook"
# (or another entry of `pio.renderers`) when running outside VS Code.
pio.renderers.default = "vscode"

# Customise a deep copy: `pio.templates["plotly_dark"]` returns the registered
# template object itself, so editing it directly would also change the
# built-in "plotly_dark" theme for every later figure that asks for it.
template = copy.deepcopy(pio.templates["plotly_dark"])

# Backgrounds
template.layout.plot_bgcolor = "#2B2B2B"   # inside the axes
template.layout.paper_bgcolor = "#2B2B2B"  # around the plot

# X axis
template.layout.xaxis.color = "#A9B7C6"    # tick labels + title
template.layout.xaxis.gridcolor = "#7B7E82"
template.layout.xaxis.showline = False
template.layout.xaxis.linecolor = "#A9B7C6"
template.layout.xaxis.tickcolor = "#A9B7C6"  # color of tick lines
template.layout.xaxis.zeroline = False
template.layout.xaxis.zerolinecolor = "#A9B7C6"

# Y axis (same settings as the X axis)
template.layout.yaxis.color = "#A9B7C6"
template.layout.yaxis.gridcolor = "#7B7E82"
template.layout.yaxis.showline = False
template.layout.yaxis.linecolor = "#A9B7C6"
template.layout.yaxis.tickcolor = "#A9B7C6"
template.layout.yaxis.zeroline = False
template.layout.yaxis.zerolinecolor = "#A9B7C6"

# Register the customised copy under its own name and make it the default
pio.templates["custom_dark"] = template
pio.templates.default = "custom_dark"

### **1. Load raw data from the CSV**

In [86]:
# Read the hourly OHLCV export: the first two CSV rows become a two-level
# column header and the first column becomes the row index.
df_raw = pd.read_csv(
    "raw/raw_OHLCV_730d_1hr.csv",
    index_col=0,
    header=[0, 1],
)
df_raw

Price,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Close,Close,Close,...,Open,Open,Open,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,AMZN,META,MSFT,NVDA,SPY,TSLA,AAPL,AMZN,META,...,NVDA,SPY,TSLA,AAPL,AMZN,META,MSFT,NVDA,SPY,TSLA
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2022-12-27 14:30:00+00:00,130.165207,84.300003,117.675598,237.800003,14.733500,382.410004,114.959999,130.165207,84.300003,117.675598,...,15.074000,382.790009,117.495003,19654999,13371541,5044900,3576945,11449954,12365271,62349883
2022-12-27 15:30:00+00:00,129.725693,84.480003,118.330002,237.860001,14.553999,382.349915,113.174698,129.725693,84.480003,118.330002,...,14.735950,382.399994,114.959999,10795118,7642949,3222341,2120949,6190604,6916905,35422505
2022-12-27 16:30:00+00:00,129.929993,84.040001,117.589996,237.679993,14.372000,382.130005,112.360100,129.929993,84.040001,117.589996,...,14.554500,382.350006,113.210899,7518424,5681291,2028363,1599512,5267371,4665338,23816752
2022-12-27 17:30:00+00:00,130.005005,83.599998,117.129997,237.210007,14.297999,381.969910,112.791199,130.005005,83.599998,117.129997,...,14.372991,382.129913,112.360001,5515019,4866664,1798389,1180473,4094220,4164240,20311348
2022-12-27 18:30:00+00:00,129.929901,83.275002,116.477600,236.369995,14.186000,380.730011,112.419998,129.929901,83.275002,116.477600,...,14.297000,381.970001,112.809898,6673932,4967037,1586244,1371487,5007248,5187465,16735713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-21 16:30:00+00:00,270.549988,219.639999,592.979980,474.260010,181.070007,660.380005,400.375000,270.549988,219.639999,592.979980,...,178.854706,655.929993,391.470001,2949033,6048938,1489299,1872762,29829294,17590697,12205100
2025-11-21 17:30:00+00:00,270.980011,218.722000,591.625000,472.820007,179.334702,658.359985,394.640015,270.980011,218.722000,591.625000,...,181.080002,660.382507,400.390015,2023820,3049149,1093645,1260590,19462417,10519402,8240202
2025-11-21 18:30:00+00:00,272.609985,222.050003,597.548584,476.570007,184.190002,664.229980,401.700012,272.609985,222.050003,597.548584,...,179.320007,658.349976,394.640015,3562846,3506398,1107144,1386177,30711523,13617205,8102757
2025-11-21 19:30:00+00:00,271.369995,220.184998,594.070007,473.390015,180.389999,659.630005,395.790497,271.369995,220.184998,594.070007,...,184.193497,664.210022,401.739990,3135966,3265276,1133638,1441643,22276052,13077513,6210947


### **2. Stack the multi-index so that we have columns:**
[index, timestamp, ticker, close, high, low, open, volume] (the `Adj Close` column is dropped)

In [87]:
# Move the ticker level of the column header into the rows (one row per
# timestamp/ticker pair), normalise the column names, drop "Adj Close",
# and order the rows by ticker and then by time.
column_names = {
    "Datetime": "timestamp",
    "Ticker":"ticker",
    "Close": "close",
    "Open":"open",
    "High":"high",
    "Low":"low",
    "Volume":"volume",
}
df = (
    df_raw
    .stack(level=1, future_stack=True)
    .reset_index()
    .rename(columns=column_names)
    .drop(columns=["Adj Close"])
    .sort_values(["ticker", "timestamp"])
    .reset_index(drop=True)
)
# Clear the leftover name on the columns axis so it is not shown in the display
df.columns.name = None
df

Unnamed: 0,timestamp,ticker,close,high,low,open,volume
0,2022-12-27 14:30:00+00:00,AAPL,130.165207,131.410004,128.720001,131.380005,19654999
1,2022-12-27 15:30:00+00:00,AAPL,129.725693,130.389999,129.179993,130.160004,10795118
2,2022-12-27 16:30:00+00:00,AAPL,129.929993,130.529907,129.509995,129.720001,7518424
3,2022-12-27 17:30:00+00:00,AAPL,130.005005,130.369995,129.679993,129.925003,5515019
4,2022-12-27 18:30:00+00:00,AAPL,129.929901,130.160004,129.619995,129.994995,6673932
...,...,...,...,...,...,...,...
35597,2025-11-21 16:30:00+00:00,TSLA,400.375000,401.209991,389.748199,391.470001,12205100
35598,2025-11-21 17:30:00+00:00,TSLA,394.640015,402.250000,393.170197,400.390015,8240202
35599,2025-11-21 18:30:00+00:00,TSLA,401.700012,402.320007,394.380005,394.640015,8102757
35600,2025-11-21 19:30:00+00:00,TSLA,395.790497,402.010010,395.709991,401.739990,6210947


In [88]:
# Distinct ticker symbols, in order of first appearance in `df`
TICKERS = df["ticker"].drop_duplicates().tolist()
TICKERS

['AAPL', 'AMZN', 'META', 'MSFT', 'NVDA', 'SPY', 'TSLA']

### **3. Reindex for just trading hours**
Sometimes bars are missing from the raw data, so we put every ticker on the same hourly grid:
- Keep only data in the regular trading-hours range 9:30-16:00 (New York time)
- If a bar is missing for any hour of a trading day, insert an empty (NaN) row for it
- Filling in those empty rows is handled in the next step

In [100]:
import pandas as pd
from datetime import time

# df: columns = timestamp, ticker, close, high, low, open, volume

NY_TZ = "America/New_York"
SESSION_OPEN = pd.Timedelta(hours=9, minutes=30)
SESSION_CLOSE = pd.Timedelta(hours=16)

# 1. Ensure timezone is NY.
#    `assign` builds a new frame, so the frame produced by the earlier cell
#    is not modified in place and this cell can be re-run safely.
df = df.assign(
    timestamp=pd.to_datetime(df["timestamp"]).dt.tz_convert(NY_TZ)
)

# 2. Keep only regular trading hours (`between_time` filters on the index,
#    so the timestamp is moved into the index for the filter and back out).
df = (
    df.set_index("timestamp")
    .between_time("09:30", "16:00")
    .reset_index()
)

# 3. Get *actual* trading days from the data (no weekends, no holidays):
#    every distinct calendar day, at midnight New York time, that has a bar.
trading_days = (
    df["timestamp"]
    .dt.normalize()
    .drop_duplicates()
    .sort_values()
)

# 4. Build hourly timestamps only for those trading days.
#    The offsets run 09:30, 10:30, ... and stop at the last hourly step that
#    does not pass 16:00. The index is built in one go instead of being grown
#    with `union` once per day, which re-sorted the whole index every time.
bar_offsets = pd.timedelta_range(start=SESSION_OPEN, end=SESSION_CLOSE, freq="1h")
full_index = pd.DatetimeIndex(
    [day + offset for day in trading_days for offset in bar_offsets],
    tz=NY_TZ,
    name="timestamp",
)

# 5. Reindex each ticker to this trading-hours-only index.
#    Grid slots with no bar become all-NaN rows; the ticker label is
#    re-applied so those rows still identify their ticker.
df_clean = pd.concat(
    [
        df[df["ticker"] == t]
        .set_index("timestamp")
        .reindex(full_index)
        .assign(ticker=t)
        for t in TICKERS
    ]
).reset_index()

# Preview the first rows that were inserted for missing bars
df_clean[df_clean["high"].isna()].head(10)

Unnamed: 0,timestamp,ticker,close,high,low,open,volume
899,2023-07-03 12:30:00-04:00,AAPL,,,,,
900,2023-07-03 13:30:00-04:00,AAPL,,,,,
901,2023-07-03 14:30:00-04:00,AAPL,,,,,
902,2023-07-03 15:30:00-04:00,AAPL,,,,,
1606,2023-11-24 12:30:00-05:00,AAPL,,,,,
1607,2023-11-24 13:30:00-05:00,AAPL,,,,,
1608,2023-11-24 14:30:00-05:00,AAPL,,,,,
1609,2023-11-24 15:30:00-05:00,AAPL,,,,,
2663,2024-07-03 12:30:00-04:00,AAPL,,,,,
2664,2024-07-03 13:30:00-04:00,AAPL,,,,,


### 4. **Handle true missing values during trading days**

In [107]:
# Forward-fill prices per ticker
# Forward-fill prices per ticker, so a gap never borrows another ticker's values.
# NOTE(review): each price column is filled from its own previous value, so a
# filled bar repeats the prior bar's open/high/low/close rather than being flat
# at the prior close — confirm this is the intended treatment.
price_columns = ["open", "high", "low", "close"]
filled_prices = df_clean.groupby("ticker")[price_columns].ffill()
df_clean[price_columns] = filled_prices

# Volume: a bar that was missing is recorded as zero volume
df_clean["volume"] = df_clean["volume"].fillna(value=0)

# Final frame, ordered by ticker and then by time
df = (
    df_clean
    .sort_values(by=["ticker", "timestamp"])
    .reset_index(drop=True)
)
df

Unnamed: 0,timestamp,ticker,close,high,low,open,volume
0,2022-12-27 09:30:00-05:00,AAPL,130.165207,131.410004,128.720001,131.380005,19654999.0
1,2022-12-27 10:30:00-05:00,AAPL,129.725693,130.389999,129.179993,130.160004,10795118.0
2,2022-12-27 11:30:00-05:00,AAPL,129.929993,130.529907,129.509995,129.720001,7518424.0
3,2022-12-27 12:30:00-05:00,AAPL,130.005005,130.369995,129.679993,129.925003,5515019.0
4,2022-12-27 13:30:00-05:00,AAPL,129.929901,130.160004,129.619995,129.994995,6673932.0
...,...,...,...,...,...,...,...
35765,2025-11-21 11:30:00-05:00,TSLA,400.375000,401.209991,389.748199,391.470001,12205100.0
35766,2025-11-21 12:30:00-05:00,TSLA,394.640015,402.250000,393.170197,400.390015,8240202.0
35767,2025-11-21 13:30:00-05:00,TSLA,401.700012,402.320007,394.380005,394.640015,8102757.0
35768,2025-11-21 14:30:00-05:00,TSLA,395.790497,402.010010,395.709991,401.739990,6210947.0


In [104]:
import plotly.express as px
# Line chart of the NVDA close price over time
fig = px.line(
    data_frame=df.loc[df["ticker"] == "NVDA"],
    x="timestamp",
    y="close",
)
fig.show()

In [99]:
# NOTE(review): scratch arithmetic (presumably hours in a week); the result is
# not referenced by any other visible cell — consider removing this cell.
24*7

168