In [1]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# 1) Wipe out your namespace
#    -f forces the reset without asking for confirmation. Everything defined or
#    imported before this line is gone afterwards, which is why the modules
#    used below are imported only after the reset.
%reset -f

# 2) Clear Jupyter’s stored outputs (and inputs if you like)
#    `Out` and `In` are the history containers IPython places in the namespace.
#    Emptying them drops the references they hold to earlier cell results so the
#    garbage collector can reclaim those objects.
#    NameError is swallowed on purpose: the cleanup is best-effort and should not
#    abort the cell when one of the names is not defined.
try:
    Out.clear()
except NameError:
    pass

try:
    In.clear()
except NameError:
    pass

# 3) Run an explicit garbage-collection pass now that the old references are gone
import gc
gc.collect()

# 4) When a CUDA device is available, ask PyTorch to empty its CUDA cache
import torch
has_cuda = torch.cuda.is_available()
if has_cuda:
    torch.cuda.empty_cache()


import importlib
from libs import trades, plots, params, feats

# Re-execute the project modules so that edits made on disk since they were
# first imported in this kernel are picked up. The reload order (trades, plots,
# params, feats) is unchanged. Using a loop instead of ending the cell on a bare
# `importlib.reload(...)` call also stops the module repr from being echoed as
# cell output.
for _lib in (trades, plots, params, feats):
    importlib.reload(_lib)

<module 'libs.feats' from '/workspace/my_models/Trading/_Stock_Analysis_/libs/feats.py'>

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

import glob
import os
import json
import re

import time
import datetime as dt
from tqdm.auto import tqdm
from IPython.display import clear_output, display

In [3]:
# Load the table stored at params.sign_csv; its first column is parsed as the
# datetime index (per the original note, pre-market rows are included).
df_sign = pd.read_csv(params.sign_csv, index_col=0, parse_dates=True)


def _indicators_for_day(day_df):
    """Return one day's standard + engineered indicators, without incomplete rows."""
    # 1) raw standard + custom-window features
    stand_inds = feats.standard_indicators(df=day_df, mult_inds_win=params.mult_inds_win)
    # 2) eng_* signals derived from the standard ones
    eng_inds = feats.engineered_indicators(stand_inds)
    # place both sets side by side and drop every row that still contains a NaN
    return pd.concat([stand_inds, eng_inds], axis=1).dropna()


print("Generating all indicator (standard + engineered) …")

# One group per calendar day (index timestamps normalized to midnight)
daily_groups = df_sign.groupby(df_sign.index.normalize())
all_inds = [
    _indicators_for_day(day_df)
    for _, day_df in tqdm(daily_groups, desc="Days", unit="day")
]

# Stitch the per-day frames back into one long, chronologically sorted DataFrame
df_inds_unsc = pd.concat(all_inds).sort_index()

Generating all indicator (standard + engineered) …


Days:   0%|          | 0/5400 [00:00<?, ?day/s]

In [4]:
# # quick engineered features visualization

# # 1) Identify engineered‐feature columns and the raw close price column
# eng_features = [c for c in df_feat_unsc.columns if c.startswith("eng_")]

# # 2) Pick a random trading day and convert to pandas Timestamp
# all_days   = df_feat_unsc.index.normalize().unique()
# random_day = np.random.choice(all_days)
# day_ts     = pd.to_datetime(random_day)

# # 3) Subset the DataFrame to that single day
# mask         = df_feat_unsc.index.normalize() == random_day
# df_day_feats = df_feat_unsc.loc[mask, eng_features]
# df_day_close = df_feat_unsc.loc[mask, 'close']

# # 4) Create one subplot per engineered feature
# fig, axes = plt.subplots(
#     nrows   = len(eng_features),
#     ncols   = 1,
#     figsize = (12, 2 * len(eng_features)),
#     sharex  = True
# )
# fig.suptitle(f"Engineered Features vs. Close on {day_ts.date()}", fontsize=16)

# # 5) Plot each feature on its own left‐y axis, and close price on a right‐y axis
# for ax, feat in zip(axes, eng_features):
#     # left axis: engineered feature
#     ax.plot(df_day_feats.index, df_day_feats[feat],
#             color="C0", label=feat)
#     ax.set_ylabel(feat, color="C0")
#     ax.tick_params(axis="y", colors="C0")
#     ax.grid(True)

#     # right axis: raw close price
#     ax2 = ax.twinx()
#     ax2.plot(df_day_close.index, df_day_close,
#              color="k", alpha=0.6, label="close")
#     ax2.set_ylabel("close", color="k")
#     ax2.tick_params(axis="y", colors="k")

# # 6) Final formatting
# axes[-1].set_xlabel("Time of Day")
# plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# plt.show()


In [5]:
# # importlib.reload(feats) ###################

# feat_cols    = [c for c in df_feat_unsc.columns if c!="signal"]

# overrides = {
#   # raw geometry & MACD → light tails
#   "body":              "robust_tails_light",
#   "macd_line_12_26_9": "robust_tails_light",
#   "macd_signal_12_26_9":"robust_tails_light",
#   "macd_diff_12_26_9": "robust_tails_light",
#   # OBV & related → heavy tails
#   "obv":         "robust_tails_heavy",
#   "obv_diff_14": "robust_tails_heavy",
#   "obv_sma_14":  "robust_tails_heavy",
#   "eng_obv":     "robust_tails_heavy",
#   "volume":      "robust_tails_heavy",
#   # continuous ratio/unbounded
#   "ret":        "unbounded",
#   "log_ret":    "unbounded",
#   "body_pct":   "unbounded",
#   "range_pct":  "unbounded",
#   "sma_pct_14": "unbounded",
#   "sma_pct_28": "unbounded",
#   "bb_w_20":    "unbounded",
#   "atr_pct_14": "unbounded",
# }


# assign_df = feats.assign_feature_groups(
#     df        = df_feat_unsc,
#     cols      = feat_cols,
#     # ratio_range:     float = 0.15,
#     # heavy_thresh:    float = 1e7,
#     # skew_thresh:     float = 3.0,
#     # kurtosis_thresh: float = 5.0,
#     # discrete_thresh: int   = 10,
#     overrides    = overrides
# )

# # print group lists
# for group, features in assign_df['group_final'].groupby(assign_df['group_final']).groups.items():
#     print(f"{group:10s}", len(features), ":\n", features, "\n")

# assign_df


In [6]:
# # -----------------------------------------------------------------------------
# #   This plot shows, for each trading day, the first 1-minute bar when *every* feature has a non-null value.  
# #   Plotting the histogram of those hours tells how long the indicators take to “warm up” each morning before the model can run.
# # -----------------------------------------------------------------------------

# #  For each calendar day, find the first timestamp where *all* feat_ cols are non-null
# first_valid = (
#     df_feat_unsc
#       .groupby(df_feat_unsc.index.normalize())
#       .apply(lambda grp: grp.dropna(subset=feat_cols).index.min())
# )

# #  Extract the hour (0–23) of that first fully-populated bar
# first_valid_hours = first_valid.dt.hour

# #  Plot the histogram
# plt.figure(figsize=(12, 5))
# plt.hist(
#     first_valid_hours,
#     bins=range(0, 25),       # 24 one-hour bins
#     align='left',
#     color='skyblue',
#     edgecolor='black'
# )
# plt.xticks(range(0, 24))
# plt.xlabel('Hour of Day (0 = midnight)')
# plt.ylabel('Number of Days')
# plt.title('Histogram of First Fully-Populated Feature Bar per Session')
# plt.tight_layout()
# plt.show()


In [7]:
# # candidate raw-level names/patterns to consider dropping
# drop_exact = {"open", "high", "low", "close", "volume", "body", "upper_shad", "lower_shad"}
# drop_patterns = [
#     r"^ema_\d+$",        # raw EMA levels
#     r"^sma_\d+$",        # raw SMA levels (keep sma_*_pct)
#     r"^atr_\d+$",        # raw ATR levels (keep atr_pct_*)
#     r"^bb_lband_", r"^bb_hband_",  # raw BB levels (keep bb_w_)
#     r"^vwap_(?!.*_dev)", # raw VWAP levels (keep vwap_dev_*)
#     r"^rolling_max_close_", r"^rolling_min_close_"
# ]

# # find candidates present in df
# cands = set(c for c in df_feat_unsc.columns if c in drop_exact)
# for p in drop_patterns:
#     cands.update([c for c in df_feat_unsc.columns if re.match(p, c)])
# cands = sorted(cands)

# # helper checks
# def has_col(name):
#     return name in df_feat_unsc.columns

# def has_any(pattern):
#     pat = pattern.replace("*", ".*")
#     return any(re.match(pat, col) for col in df_feat_unsc.columns)

# safe = []
# keep = []

# for col in cands:
#     ok = False

#     # raw EMA -> require ema_dev_N OR both eng_ema_cross flags
#     m = re.match(r"^ema_(\d+)$", col)
#     if m:
#         w = m.group(1)
#         ok = (f"ema_dev_{w}" in df_feat_unsc.columns) or ("eng_ema_cross_up" in df_feat_unsc.columns and "eng_ema_cross_down" in df_feat_unsc.columns)

#     # raw SMA -> require sma_N_pct
#     elif re.match(r"^sma_(\d+)$", col):
#         w = re.search(r"_(\d+)$", col).group(1)
#         ok = (f"sma_{w}_pct" in df_feat_unsc.columns) or has_any("sma_*_pct")

#     # ATR raw -> require atr_pct_N
#     elif re.match(r"^atr_(\d+)$", col):
#         w = re.search(r"_(\d+)$", col).group(1)
#         ok = (f"atr_pct_{w}" in df_feat_unsc.columns) or has_any("atr_pct_")

#     # BB raw bands -> require bb_w_N or eng_bb
#     elif re.match(r"^bb_(lband|hband)_", col):
#         ok = has_any("bb_w_") or ("eng_bb" in df_feat_unsc.columns)

#     # VWAP raw -> require vwap_dev_N or eng_vwap
#     elif re.match(r"^vwap_(?!.*_dev)", col):
#         ok = has_any("vwap_dev_") or ("eng_vwap" in df_feat_unsc.columns)

#     # rolling max/min -> require dist_high_*/dist_low_*
#     elif re.match(r"^rolling_(max|min)_close_", col):
#         ok = has_any("dist_high_") or has_any("dist_low_")

#     # OBV raw -> require obv_pct_*, obv_z_* or obv_diff_*
#     elif col == "obv":
#         ok = has_any("obv_pct_") or has_any("obv_z_") or has_any("obv_diff_")

#     # body/shadows -> require body_pct or range_pct
#     elif col in {"body", "upper_shad", "lower_shad"}:
#         ok = has_col("body_pct") or has_col("range_pct")

#     # open/high/low/close/volume -> require returns/normalized volume exist
#     elif col in {"open", "high", "low", "close"}:
#         ok = has_col("ret") or has_col("log_ret") or has_any("ret_")
#     elif col == "volume":
#         ok = has_any("vol_z_") or has_any("vol_spike_") or has_any("obv_") or has_any("obv_pct_")

#     if ok:
#         safe.append(col)
#     else:
#         keep.append(col)

# # report
# print("SAFE TO DROP:", len(safe))
# for c in safe: print("  -", c)
# print("\nKEEP FOR NOW (missing derived alternatives):", len(keep))
# for c in keep: print("  -", c)

# # If safe list looks correct, uncomment to drop:
# # df_feat_unsc = df_feat_unsc.drop(columns=safe, errors="ignore")
# # print("Dropped", len(safe), "columns; new shape:", df_feat_unsc.shape)


In [None]:
importlib.reload(feats)  # re-import libs.feats so recent edits to it take effect

# Prune the unscaled indicators. The call returns the pruned frame, the list of
# dropped features and a per-feature diagnostics table.
df_feat_unsc, to_drop, diag = feats.prune_and_percentiles(
    df_unsc=df_inds_unsc,
    train_prop=params.train_prop,
)

print("dropped features:\n", to_drop)

# Keep only the features that were not dropped and show the 25 rows with the
# largest "pct_pair" values first.
# NOTE(review): the previous comment said "sort by pct_low (ascending)", which
# the code never did — confirm which ordering is actually wanted.
is_alive = diag["status"] != "DROP"
diag_alive = diag.loc[is_alive].copy()
top_by_pct_pair = diag_alive.sort_values("pct_pair", ascending=False).head(25)
display(top_by_pct_pair)

n_alive = diag_alive.shape[0]
print("features remaining:", n_alive, "dropped:", len(to_drop))

prune_and_percentiles:   0%|          | 0/177 [00:00<?, ?feat/s]

In [None]:
importlib.reload(feats)  # re-import libs.feats so recent edits to it take effect

# Scale the pruned features. `diag` is the diagnostics table returned by
# feats.prune_and_percentiles; the train/validation proportions and the label
# column come from libs.params.
df_feat_scal = feats.scaling_with_percentiles(
    df=df_feat_unsc,
    diag=diag,
    label_col=params.label_col,
    train_prop=params.train_prop,
    val_prop=params.val_prop,
    winsorize=True,
)

# Display the scaled DataFrame as the cell output
df_feat_scal


In [None]:
importlib.reload(feats)  # re-import libs.feats so recent edits to it take effect

# Compare the unscaled and the scaled feature tables.
# The result is bound to `diag_scal` rather than `diag`: `diag` is the table
# returned by feats.prune_and_percentiles and is passed to
# feats.scaling_with_percentiles, so overwriting it here would hand the wrong
# table to the scaling cell whenever that cell is re-run after this one.
diag_scal = feats.scaling_diagnostics(
    df_unscaled=df_feat_unsc,
    df_scaled=df_feat_scal,
    train_prop=params.train_prop,
    clip_thresh=0.05,
)

# Show only the features whose suggested action is something other than "ok"
diag_scal[diag_scal["suggested_action"] != "ok"]

In [None]:
# importlib.reload(plots)  # uncomment after editing libs.plots

# Draw the histograms of the features before (unscaled) and after (scaled) scaling
plots.plot_dual_histograms(df_before=df_feat_unsc, df_after=df_feat_scal)

In [None]:
importlib.reload(params)  # re-import libs.params so edited settings take effect
importlib.reload(plots)   # re-import libs.plots so edited plotting helpers take effect

# Scaled features plus the unscaled close price (aligned on the index), so the
# plots can show the close from df_inds_unsc next to the scaled columns.
df_plot = df_feat_scal.assign(close_raw=df_inds_unsc['close'])

# Keep only the rows whose calendar month equals params.month_to_check
df_month = df_plot[
    df_plot.index.to_period('M') == params.month_to_check
].copy()

# One plot per calendar day of the selected month.
# A single groupby pass replaces the former per-day boolean mask over the whole
# month; it only yields non-empty groups, so the old emptiness check is no
# longer needed. sort=False keeps the days in order of first appearance, as the
# previous `unique()`-based loop did.
for _, df_day in df_month.groupby(df_month.index.normalize(), sort=False):
    plots.plot_trades(
        df_day,
        col_signal1=params.label_col,
        col_close='close_raw',
        start_plot=None,
    )

In [None]:
# Write the scaled feature table to the CSV path configured in params.feat_all_csv,
# printing a message before and after because the write may take a while.
print("saving df …")
out_csv = params.feat_all_csv
df_feat_scal.to_csv(out_csv)
print("saved df")