In [5]:
import pandas as pd
from pathlib import Path

PROC = Path("../data/processed")

# Select the most recent preprocessed export.
# The export names appear to end in a sortable YYYYMMDD-HHMMSS stamp
# (e.g. prices_preprocessed_20250821-111452.csv), so ordering by file name
# gives the same answer on every machine. Filesystem modification times do
# not: they are rewritten whenever the data folder is copied, cloned or synced.
paths = sorted(PROC.glob("prices_preprocessed_*.csv"), key=lambda p: p.name, reverse=True)
assert paths, "No processed files found in ../data/processed"
src_path = paths[0]

# NOTE(review): the date strings carry a UTC offset (e.g. -04:00). If the
# offset varies across rows, parse_dates may leave this column as object
# dtype instead of datetime64 -- confirm with df["date"].dtype.
df = pd.read_csv(src_path, parse_dates=["date"])
print("Loaded:", src_path.name, "rows:", len(df))
df.head()


Loaded: prices_preprocessed_20250821-111452.csv rows: 250


Unnamed: 0,date,open,high,low,close,volume,Dividends,Stock Splits,daily_range,gap,ret_1d,ret_1d_z
0,2024-08-16 00:00:00-04:00,222.8827,225.779224,222.613947,225.002838,44340200,0.0,0.0,3.165277,,,
1,2024-08-19 00:00:00-04:00,224.674371,224.943125,222.006778,224.843582,40687800,0.0,0.0,2.936347,-0.00146,-0.07078,-0.050741
2,2024-08-20 00:00:00-04:00,224.724146,226.117655,224.405621,225.460709,30299000,0.0,0.0,1.712033,-0.000531,0.274469,0.120104
3,2024-08-21 00:00:00-04:00,225.470651,226.923879,224.007459,225.351196,34765500,0.0,0.0,2.916419,4.4e-05,-0.048573,-0.039752
4,2024-08-22 00:00:00-04:00,226.734776,227.282232,222.862797,223.489883,43695300,0.0,0.0,4.419435,0.00614,-0.825961,-0.424439


In [6]:
import os
import sys

# Make the parent of the current working directory importable so the
# project-local `src` package resolves. The membership check keeps this cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)  # allow from src import ...

from src.features import add_basic_features

# Build the engineered columns from the loaded frame, then preview them
# alongside the price/volume columns for the first few rows.
df_feat = add_basic_features(df)
preview_cols = [
    "date", "close", "ret_1d", "high", "low", "volume",
    "volatility_ratio", "volume_zscore", "ret_vol_interaction",
]
df_feat[preview_cols].head()


Unnamed: 0,date,close,ret_1d,high,low,volume,volatility_ratio,volume_zscore,ret_vol_interaction
0,2024-08-16 00:00:00-04:00,225.002838,,225.779224,222.613947,44340200,0.014068,-0.344144,
1,2024-08-19 00:00:00-04:00,224.843582,-0.07078,224.943125,222.006778,40687800,0.01306,-0.476076,0.033696
2,2024-08-20 00:00:00-04:00,225.460709,0.274469,226.117655,224.405621,30299000,0.007593,-0.85134,-0.233667
3,2024-08-21 00:00:00-04:00,225.351196,-0.048573,226.923879,224.007459,34765500,0.012942,-0.690001,0.033515
4,2024-08-22 00:00:00-04:00,223.489883,-0.825961,227.282232,222.862797,43695300,0.019775,-0.367439,0.30349


In [7]:
# Summary statistics for the three engineered feature columns.
engineered_cols = ["volatility_ratio", "volume_zscore", "ret_vol_interaction"]
feature_summary = df_feat[engineered_cols].describe()
display(feature_summary)

Unnamed: 0,volatility_ratio,volume_zscore,ret_vol_interaction
count,250.0,250.0,249.0
mean,0.021984,5.684342e-17,0.057122
std,0.015543,1.002006,5.250494
min,0.007593,-1.106517,-18.967864
25%,0.014353,-0.5186035,-0.316194
50%,0.018386,-0.25737,-0.034879
75%,0.024274,0.1006081,0.161636
max,0.14443,9.565556,72.274775


## Homework 09 — Feature Engineering (Project Data)

**Feature 1 — `volatility_ratio = (high - low) / close`**  
*Reasoning:* Measures intraday volatility relative to price level; less scale-dependent than raw range.

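**Feature 2 — `volume_zscore = (volume - mean(volume)) / std(volume)`**  
*Reasoning:* Flags abnormal activity days by standardizing volume against its mean and standard deviation over the loaded sample; aligns with the EDA observation that volume spikes co-occur with large ranges.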

**Feature 3 (optional) — `ret_vol_interaction = ret_1d * volume_zscore`**  
*Reasoning:* Combines the size/direction of return with unusual trading activity to highlight impactful sessions.
