# 02_feature_engineering

### Setup

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' console output so wide feature tables stay readable.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# All paths hang off the parent of the current working directory, so the
# notebook must be launched from a direct sub-folder of the repository root.
REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_PROCESSED = os.path.join(REPO_ROOT, "data", "processed")
DATA_RAW = os.path.join(REPO_ROOT, "data", "raw")

market_path = os.path.join(DATA_PROCESSED, "market_merged.parquet")
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would turn this check into a no-op.
if not os.path.exists(market_path):
    raise FileNotFoundError(f"Missing {market_path}. Run Notebook 01 first.")

df = pd.read_parquet(market_path)
df["date"] = pd.to_datetime(df["date"])
# Order rows by ticker, then date: the per-ticker rolling / shift features
# computed later in this notebook operate on rows in their stored order.
df = df.sort_values(["ticker", "date"]).reset_index(drop=True)

df.head()


### Helper functions

EMA, Stochastic, Candlestick patterns

In [None]:
def add_ema(df, price_col="adj_close", span=5, out_col=None):
    """Add a per-ticker exponential moving average column to ``df``.

    The average is computed independently for every ``ticker`` group with
    ``Series.ewm(span=span, adjust=False).mean()``, following the row order
    already present in ``df``.

    Args:
        df: DataFrame containing a ``ticker`` column and ``price_col``.
        price_col: Name of the column to smooth.
        span: EMA span passed to ``ewm``.
        out_col: Name of the output column; when falsy it defaults to
            ``f"ema_{span}"``.

    Returns:
        The same ``df`` object, modified in place with the new column.
    """
    target = out_col if out_col else f"ema_{span}"

    def _smooth(prices):
        # adjust=False gives the recursive form of the EMA.
        return prices.ewm(span=span, adjust=False).mean()

    per_ticker_prices = df.groupby("ticker")[price_col]
    df[target] = per_ticker_prices.transform(_smooth)
    return df

def add_stochastic(df, high_col="high", low_col="low", close_col="adj_close", k=14, d=3):
    """
    Add the stochastic oscillator (%K and %D) to ``df``, computed per ticker.

    %K = 100 * (Close - LowestLow(k)) / (HighestHigh(k) - LowestLow(k))
    %D = simple moving average of %K over ``d`` rows

    The results are written to the columns ``stoch_k_{k}`` and
    ``stoch_d_{k}_{d}``. Windows must be full (``min_periods`` equals the
    window size), so the leading rows of every ticker are NaN. A window whose
    highest high equals its lowest low yields NaN rather than a division by
    zero. ``df`` is modified in place and returned.

    NOTE(review): the default close column is ``adj_close`` while the default
    high/low columns are ``high``/``low``; if those are not on the same
    (adjusted) price basis, %K can leave the 0-100 band. Confirm upstream.
    """
    k_name = f"stoch_k_{k}"
    d_name = f"stoch_d_{k}_{d}"

    def _per_ticker(column, reducer):
        # Apply `reducer` to `column` separately within every ticker.
        grouped = df.groupby("ticker", group_keys=False)
        return grouped[column].transform(reducer)

    lowest_low = _per_ticker(low_col, lambda s: s.rolling(k, min_periods=k).min())
    highest_high = _per_ticker(high_col, lambda s: s.rolling(k, min_periods=k).max())

    # A flat window (high == low) would divide by zero; turn it into NaN.
    window_span = (highest_high - lowest_low).replace(0, np.nan)

    df[k_name] = 100 * ((df[close_col] - lowest_low) / window_span)
    df[d_name] = _per_ticker(k_name, lambda s: s.rolling(d, min_periods=d).mean())
    return df

def add_candlestick_features(df, open_col="open", high_col="high", low_col="low", close_col="close"):
    """
    Add candle geometry columns and 0/1 pattern flags to ``df``.

    Geometry columns: ``candle_body`` (|close - open|), ``candle_range``
    (high - low, with 0 replaced by NaN), ``upper_wick``, ``lower_wick`` and
    ``body_to_range``.

    Pattern flags (stored as integers):
    - ``is_doji``: body is at most 10% of the range.
    - ``is_hammer``: lower wick >= 2x body, upper wick <= 0.5x body, and
      body is at most 30% of the range.
    - ``is_bull_engulf`` / ``is_bear_engulf``: the current body engulfs the
      previous candle's body, with the two candles pointing in opposite
      directions.

    "Previous candle" means the preceding row of the same ticker, so rows are
    expected to be in chronological order within each ticker. Comparisons
    against NaN are False, so rows without a range or without a previous
    candle get a flag of 0. ``df`` is modified in place and returned.
    """
    def _as_flag(mask):
        # Boolean mask -> 0/1 integer column.
        return mask.astype(int)

    open_px = df[open_col].astype(float)
    high_px = df[high_col].astype(float)
    low_px = df[low_col].astype(float)
    close_px = df[close_col].astype(float)

    # --- geometry -------------------------------------------------------
    real_body = (close_px - open_px).abs()
    # Zero-range candles become NaN so the ratio below never divides by 0.
    full_range = (high_px - low_px).replace(0, np.nan)
    wick_above = high_px - np.maximum(close_px, open_px)
    wick_below = np.minimum(close_px, open_px) - low_px
    body_share = real_body / full_range

    df["candle_body"] = real_body
    df["candle_range"] = full_range
    df["upper_wick"] = wick_above
    df["lower_wick"] = wick_below
    df["body_to_range"] = body_share

    # --- single-candle patterns ------------------------------------------
    df["is_doji"] = _as_flag(body_share <= 0.10)

    long_lower_wick = wick_below >= 2.0 * real_body
    short_upper_wick = wick_above <= 0.5 * real_body
    small_body = body_share <= 0.30
    df["is_hammer"] = _as_flag(long_lower_wick & short_upper_wick & small_body)

    # --- two-candle patterns ---------------------------------------------
    # Shift within each ticker so the first row of a ticker never "sees" the
    # last candle of the ticker stored just above it.
    by_ticker = df.groupby("ticker", group_keys=False)
    last_open = by_ticker[open_col].shift(1).astype(float)
    last_close = by_ticker[close_col].shift(1).astype(float)

    last_was_up = last_close > last_open
    last_was_down = last_close < last_open
    now_up = close_px > open_px
    now_down = close_px < open_px

    # Bullish: down candle followed by an up candle whose body covers it.
    covers_from_below = (open_px <= last_close) & (close_px >= last_open)
    df["is_bull_engulf"] = _as_flag(last_was_down & now_up & covers_from_below)

    # Bearish: up candle followed by a down candle whose body covers it.
    covers_from_above = (open_px >= last_close) & (close_px <= last_open)
    df["is_bear_engulf"] = _as_flag(last_was_up & now_down & covers_from_above)

    return df


### Add technical indicators

In [None]:
# EMA5 is the fast line; EMA20 is the slower baseline used for the regime flag.
for ema_span, ema_name in ((5, "ema5"), (20, "ema20")):
    df = add_ema(df, price_col="adj_close", span=ema_span, out_col=ema_name)

# 14-row %K smoothed by a 3-row %D -> columns stoch_k_14 and stoch_d_14_3.
df = add_stochastic(df, high_col="high", low_col="low", close_col="adj_close", k=14, d=3)

# Simple trend regime: 1 when the adjusted close sits above its EMA20, else 0.
df["regime_up_ema20"] = df["adj_close"].gt(df["ema20"]).astype(int)

df[
    [
        "date",
        "ticker",
        "adj_close",
        "ema5",
        "ema20",
        "stoch_k_14",
        "stoch_d_14_3",
        "regime_up_ema20",
    ]
].head(20)


### Add candlestick features

In [None]:
# Append candle geometry and pattern flags using the default OHLC column names.
df = add_candlestick_features(df)

# Preview: identifiers, raw OHLC, geometry, then the pattern flags.
cols = (
    ["date", "ticker"]
    + ["open", "high", "low", "close"]
    + ["candle_body", "candle_range"]
    + ["is_doji", "is_hammer", "is_bull_engulf", "is_bear_engulf"]
)
df[cols].head(25)


### Fundamental features: z-scores + simple composites

In [None]:
fund_cols = [
    "trailingPE",
    "forwardPE",
    "priceToBook",
    "profitMargins",
    "revenueGrowth",
    "debtToEquity",
    "dividendYield",
]

# Coerce every fundamental to a numeric dtype; values that cannot be parsed
# become NaN (errors="coerce") instead of raising.
numeric_fundamentals = {
    name: pd.to_numeric(df[name], errors="coerce") for name in fund_cols
}
for name, values in numeric_fundamentals.items():
    df[name] = values

# Helper: z-score safely (ignores NaNs)
def zscore(series):
    """Standardize ``series`` to zero mean and unit standard deviation.

    Mean and standard deviation skip NaNs; NaN entries stay NaN in the
    result. When the standard deviation is zero or undefined (NaN), an
    all-NaN series of the same shape is returned instead of dividing by it.
    """
    spread = series.std(skipna=True)
    degenerate = np.isnan(spread) or spread == 0
    if degenerate:
        # Multiplying by NaN keeps the original index while blanking values.
        return series * np.nan
    center = series.mean(skipna=True)
    return (series - center) / spread

# Cross-sectional z-scores: every fundamental is standardized across the rows
# that share the same date (one distribution per date, not one global one).
for name in fund_cols:
    same_date = df.groupby("date")[name]
    df[f"z_{name}"] = same_date.transform(zscore)

# Simple, interpretable composites built from the z-scores (weights and
# ingredients are meant to be tweaked later). A NaN in any ingredient
# propagates to the composite that uses it.

# Value: a lower P/E and a lower P/B are better, so both z-scores are negated.
df["value_score"] = -df["z_trailingPE"] - df["z_priceToBook"]

# Quality: higher profit margins are better, lower debt-to-equity is better.
df["quality_score"] = df["z_profitMargins"] - df["z_debtToEquity"]

# Growth: higher revenue growth is better.
df["growth_score"] = df["z_revenueGrowth"]

# Income: higher dividend yield is better.
df["income_score"] = df["z_dividendYield"]

# Overall fundamental score: weighted sum (35% value, 35% quality,
# 20% growth, 10% income).
df["fundamental_score"] = (
    df["value_score"].mul(0.35)
    .add(df["quality_score"].mul(0.35))
    .add(df["growth_score"].mul(0.20))
    .add(df["income_score"].mul(0.10))
)

df[
    [
        "date",
        "ticker",
        "value_score",
        "quality_score",
        "growth_score",
        "income_score",
        "fundamental_score",
    ]
].head(10)


### Basic feature sanity checks

In [None]:
# Share of missing values per key feature, worst offenders first.
feature_cols = [
    "ema5",
    "ema20",
    "stoch_k_14",
    "stoch_d_14_3",
    "is_doji",
    "is_hammer",
    "is_bull_engulf",
    "is_bear_engulf",
    "fundamental_score",
]

# The mean of a boolean NaN mask is the fraction of rows that are missing.
nan_mask = df[feature_cols].isna()
missing = nan_mask.mean().sort_values(ascending=False)
missing


### Quick visualization

In [None]:
ticker = "SPY"
# Work on a date-indexed copy of this ticker's rows only.
tmp = df.loc[df["ticker"].eq(ticker)].copy().set_index("date")

# Panel 1: adjusted close with both EMAs.
fig = plt.figure(figsize=(12, 4))
for column, label in (
    ("adj_close", "Adj Close"),
    ("ema5", "EMA5"),
    ("ema20", "EMA20"),
):
    plt.plot(tmp.index, tmp[column], label=label)
plt.title(f"{ticker}: Price + EMA")
plt.legend()
plt.show()

# Panel 2: stochastic oscillator with dashed guide lines at 80 and 20.
fig = plt.figure(figsize=(12, 3))
for column, label in (
    ("stoch_k_14", "%K (14)"),
    ("stoch_d_14_3", "%D (3)"),
):
    plt.plot(tmp.index, tmp[column], label=label)
for level in (80, 20):
    plt.axhline(level, linestyle="--")
plt.title(f"{ticker}: Stochastic Oscillator")
plt.legend()
plt.show()


### Save feature dataset

In [None]:
# Persist the full feature table next to the merged market data; the index
# is a plain RangeIndex after the earlier reset, so it is not written.
out_path = os.path.join(DATA_PROCESSED, "features.parquet")
df.to_parquet(path=out_path, index=False)
print("Saved:", out_path)
