# Feature Engineering

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
import warnings

# Ignore every FutureWarning and DeprecationWarning from this point on.
# Caveat: this also hides deprecation notices raised by calls made in the
# cells below, so they will not surface while the filter is active.
for ignored_category in (FutureWarning, DeprecationWarning):
    warnings.simplefilter(action="ignore", category=ignored_category)

In [3]:
# Directory used for both the input CSV and the output written at the end of
# the notebook.
data_dir = "data"

# Input table. Later cells read the columns symbol, date, adj_close, change,
# volume, high, low and market_cap from it.
file_path = os.path.join(data_dir, "cleaned_merged_data.csv")
stock_data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Running row number of each record within its symbol (0, 1, 2, ...), in the
# order the rows appear in stock_data.
stock_data["day_index"] = stock_data.groupby("symbol").cumcount()

# High-low spread of every row; averaged per symbol further down.
stock_data["intraday_range"] = stock_data["high"] - stock_data["low"]

rows_by_symbol = stock_data.groupby("symbol")

# One row per symbol. Named aggregation binds each output column to its source
# column and statistic, so the labels cannot drift out of step with the
# aggregation spec the way a positional rename list can.
features_data = rows_by_symbol.agg(
    avg_adj_close=("adj_close", "mean"),
    avg_daily_return=("change", "mean"),
    avg_volume=("volume", "mean"),
    volume_volatility=("volume", "std"),
    max_high=("high", "max"),
    min_low=("low", "min"),
).reset_index()

# Price volatility: standard deviation of adj_close per symbol.
volatility = rows_by_symbol["adj_close"].std().reset_index(name="volatility")
features_data = features_data.merge(volatility, on="symbol")

# Cumulative return from the first to the last adj_close of each symbol.
# NOTE(review): "first"/"last" follow row order, so this assumes the rows of
# each symbol are already in chronological order - confirm upstream.
first_last = rows_by_symbol["adj_close"].agg(["first", "last"]).reset_index()

# Join on symbol rather than assigning by index: a plain column assignment
# would pair rows by position and silently misattribute returns if the two
# frames ever stopped sharing the same row order.
features_data = features_data.merge(
    first_last.assign(
        cumulative_return=(first_last["last"] / first_last["first"]) - 1
    )[["symbol", "cumulative_return"]],
    on="symbol",
)

# Mean high-low spread per symbol.
avg_intraday_range = rows_by_symbol["intraday_range"].mean().reset_index()
features_data = features_data.merge(
    avg_intraday_range.rename(columns={"intraday_range": "avg_intraday_range"}),
    on="symbol",
)

In [5]:
# Market proxy: for every date, the plain mean of "change" over all rows with
# that date (each symbol's own change is part of that mean).
daily_avg_change = (
    stock_data.groupby("date")["change"].mean().rename("market_change").reset_index()
)

# Attach the market-wide change of the same date to every stock row.
stock_with_market = pd.merge(stock_data, daily_avg_change, on="date")

In [6]:
def safe_corr(group):
    """Correlate a group's ``change`` column with its ``market_change`` column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must provide ``change`` and ``market_change``
        when it has at least two rows.

    Returns
    -------
    int or float
        ``0`` when the group has fewer than two rows or when either column has
        a standard deviation of exactly zero; otherwise the result of
        ``Series.corr`` between the two columns, passed through unchanged.
    """
    # A single observation cannot be correlated with anything.
    if len(group) < 2:
        return 0

    # A constant series has no variance, so the correlation is undefined.
    stock_moves = group["change"]
    if stock_moves.std() == 0:
        return 0

    market_moves = group["market_change"]
    if market_moves.std() == 0:
        return 0

    return stock_moves.corr(market_moves)

In [7]:
# Per-symbol correlation between the stock's daily change and the cross-symbol
# average change of the same date. Despite the column name, this is a
# correlation coefficient, not a regression beta.
# Only the two columns safe_corr reads are handed to apply, so the grouping
# column is never passed to the helper (pandas deprecated passing it).
beta_proxy = (
    stock_with_market.groupby("symbol")[["change", "market_change"]]
    .apply(safe_corr)
    .reset_index(name="beta_proxy")
)
features_data = features_data.merge(beta_proxy, on="symbol")


def calc_max_drawdown(group):
    """Return the deepest relative drop of ``adj_close`` below its running peak.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must provide ``adj_close`` when it has at
        least two rows. Rows are processed in the order given.

    Returns
    -------
    int or float
        ``0`` when the group has fewer than two rows or ``adj_close`` has a
        standard deviation of exactly zero; otherwise the minimum of
        ``adj_close / cummax(adj_close) - 1``, which is zero or negative.
    """
    # Too little history to measure a decline.
    if len(group) < 2:
        return 0

    # A constant price never falls below its peak.
    prices = group["adj_close"]
    if prices.std() == 0:
        return 0

    running_peak = prices.cummax()
    return ((prices / running_peak) - 1).min()


# Worst decline from a running peak per symbol. Only adj_close is handed to
# the helper, so the grouping column is never passed to it.
max_drawdown = (
    stock_data.groupby("symbol")[["adj_close"]]
    .apply(calc_max_drawdown)
    .reset_index(name="max_drawdown")
)

features_data = features_data.merge(max_drawdown, on="symbol")

# max_high / min_low are not part of the final feature table. Rebinding the
# name instead of dropping in place keeps the statement free of hidden mutation.
features_data = features_data.drop(columns=["max_high", "min_low"])

# Market-cap label per symbol: the first non-null value found for the symbol.
market_cap = stock_data.groupby("symbol")["market_cap"].first().reset_index()
features_data = features_data.merge(market_cap, on="symbol")

# Handle NaN values
# Fill on the frame itself. `features_data[col].fillna(0, inplace=True)` is
# chained in-place assignment: it acts on a temporary column object, raises a
# FutureWarning in pandas 2.x and stops updating the frame under Copy-on-Write.
features_data = features_data.fillna(
    {
        "volatility": 0,
        "volume_volatility": 0,
        "beta_proxy": 0,
        "max_drawdown": 0,
    }
)

In [8]:
# Spot-check ten randomly chosen rows. No random_state is passed, so the rows
# shown differ from run to run.
features_data.sample(n=10)

Unnamed: 0,symbol,avg_adj_close,avg_daily_return,avg_volume,volume_volatility,volatility,cumulative_return,avg_intraday_range,beta_proxy,max_drawdown,market_cap
248,SPOT,457.5075,0.005888,2301401.82,2934179.0,75.245904,0.729438,13.1559,0.324648,-0.109479,Large-Cap
58,BWXT,119.2255,0.00061,901044.12,406159.7,7.291792,0.013279,3.2675,0.390932,-0.198454,Mid-Cap
170,MOBX,1.1127,0.009698,903101.89,1896194.0,0.369742,0.242991,0.1875,0.123455,-0.493088,Nano-Cap
28,ASTH,44.3695,-0.003236,308383.6,153535.9,10.224501,-0.316851,1.7332,0.24238,-0.528622,Small-Cap
205,PCYO,11.9599,0.00153,41472.66,27539.45,1.183787,0.139775,0.359,0.455291,-0.237184,Micro-Cap
228,RUN,11.8811,-0.006444,9254072.42,4949961.0,3.054081,-0.53358,0.7614,0.139361,-0.577167,Small-Cap
221,REFR,1.8616,-0.004355,27816.5,18994.53,0.250235,-0.36,0.1109,0.140414,-0.393939,Nano-Cap
262,TM,179.6121,0.000159,389021.67,158883.8,7.369679,0.008208,2.1247,0.407475,-0.093925,Mega-Cap
295,WYNN,90.586,0.001456,2843035.44,1966691.0,7.181292,0.110389,2.4784,0.231065,-0.274406,Mid-Cap
198,PAC,182.4745,0.000793,77572.12,46221.18,7.710505,0.074253,5.1231,0.124441,-0.115587,Mid-Cap


In [9]:
# Print the column dtypes and non-null counts of the final feature table.
features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   symbol              300 non-null    object 
 1   avg_adj_close       300 non-null    float64
 2   avg_daily_return    300 non-null    float64
 3   avg_volume          300 non-null    float64
 4   volume_volatility   300 non-null    float64
 5   volatility          300 non-null    float64
 6   cumulative_return   300 non-null    float64
 7   avg_intraday_range  300 non-null    float64
 8   beta_proxy          300 non-null    float64
 9   max_drawdown        300 non-null    float64
 10  market_cap          300 non-null    object 
dtypes: float64(9), object(2)
memory usage: 25.9+ KB


In [10]:
# Count the missing values remaining in each column after the NaN fill.
features_data.isna().sum()

symbol                0
avg_adj_close         0
avg_daily_return      0
avg_volume            0
volume_volatility     0
volatility            0
cumulative_return     0
avg_intraday_range    0
beta_proxy            0
max_drawdown          0
market_cap            0
dtype: int64

In [11]:
# Write the per-symbol feature table next to the input data, without the
# DataFrame index column.
save_to_path = os.path.join(data_dir, "feature_engineering_data.csv")
features_data.to_csv(path_or_buf=save_to_path, index=False)

---