In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
RAW_DATA_OUTPUT_DIR="../data/raw"
PROCESSED_DATA_OUTPUT_DIR="../data/processed"

In [4]:
df=pd.read_parquet(f"{RAW_DATA_OUTPUT_DIR}/stock_and_market_data_2015_2025.parquet"
                   )

In [5]:
def get_df_details(df):
    print("Printing DF details \n")
    print(f"Shape {df.shape}")
    print(f"Missing {df.isna().sum()}")
    print(f"Columns {df.columns}")
    print(f"Duplicated {df.duplicated().sum()}")


In [6]:
get_df_details(df)

Printing DF details 

Shape (1316543, 12)
Missing Date            0
Ticker          0
Open            0
High            0
Low             0
Close           0
Volume          0
sp500_Open      0
sp500_High      0
sp500_Low       0
sp500_Close     0
sp500_Volume    0
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume'],
      dtype='object')
Duplicated 0


In [None]:
def calculate_rsi(prices,window=14):
    """Calculate RSI - momentum oscillator measuring speed of price movements"""
    delta=prices.diff()
    gain=(delta.where(delta>0,0)).rolling(window=window).mean()
    loss=(-delta.where(delta<0,0)).rolling(window=window).mean()

    rs=gain/loss
    rsi=100-(100/(1+rs))
    return rsi 



   

def feature_engineer(df):
    df_copy=df.copy()
    df_copy=df_copy.sort_values(by=["Ticker","Date"])
    #return for 5d would be 
    df_copy['return_5d'] = df_copy.groupby("Ticker")["Close"].transform(
        lambda x: (x.shift(1)/x.shift(6)-1)
    )
    #rsi 
    df_copy['rsi_14d']=df_copy.groupby("Ticker")["Close"].transform(
        lambda x:calculate_rsi(x).shift(1)
    )
    

    #volatility 
    #how much a stock goes up or down per time or its movement? check daily return see how it much it has fluctuated
    df_copy['daily_return'] = df_copy.groupby('Ticker')['Close'].pct_change().shift(1)
    df_copy["volatility_10d"]=df_copy.groupby("Ticker")["daily_return"].transform(
        lambda x:x.rolling(window=10).std()
    )
    df_copy["volatility_20d"]=df_copy.groupby("Ticker")["daily_return"].transform(
        lambda x:x.rolling(window=20).std()
    )

    #market 5d percentage return
    df_copy["sp500_return_5d"]=(df_copy["sp500_Close"].shift(1)/df_copy["sp500_Close"].shift(6)-1)

    #relative strength
    df_copy['relative_strength_5d']=df_copy['return_5d']-df_copy['sp500_return_5d']

    #more trend based signals 
    ema_12 = df_copy.groupby('Ticker')['Close'].transform(lambda x: x.ewm(span=12).mean())
    ema_26 = df_copy.groupby('Ticker')['Close'].transform(lambda x: x.ewm(span=26).mean())
    macd = ema_12 - ema_26
    df_copy['macd']=macd
    macd_signal = df_copy.groupby('Ticker')['macd'].transform(lambda x: x.ewm(span=9).mean())
    macd_histogram = macd - macd_signal
    df_copy['macd_histogram'] = macd_histogram.shift(1)

    # Shift all final feature avoid collinearity
    df_copy=df_copy.drop(columns=['macd'])

    df_copy['ema_8d']=df_copy.groupby("Ticker")["Close"].transform(lambda x: x.ewm(span=8).mean()).shift(1)

    df_copy['ema_21d']=df_copy.groupby("Ticker")["Close"].transform(lambda x: x.ewm(span=21).mean()).shift(1)

    df_copy['ema_8_21_cross'] = (df_copy['ema_8d'] > df_copy['ema_21d']).astype(int)

    #stochastic oscillator
    low_14 = df_copy.groupby('Ticker')['Low'].transform(lambda x: x.rolling(14, min_periods=14).min())
    high_14 = df_copy.groupby('Ticker')['High'].transform(lambda x: x.rolling(14, min_periods=14).max())
    df_copy['stochastic_k']=100*((df_copy["Close"]-low_14)/(high_14-low_14)).shift(1)
    df_copy['stochastic_d'] = df_copy.groupby('Ticker')['stochastic_k'].transform(lambda x: x.rolling(3).mean()).shift(1)

    #bollinger band
    sma_20 = df_copy.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=20).mean())
    std_20 = df_copy.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=20).std())
    upper_band = sma_20 + (2 * std_20)
    lower_band = sma_20 - (2 * std_20)
    df_copy['bollinger_percent_b'] = ((df_copy['Close'] - lower_band) / (upper_band - lower_band)).shift(1)

    #roc
    #the percentage change in price between the current price and the price n periods ago. It's a pure momentum oscillator.
    df_copy['roc_21d'] = df_copy.groupby("Ticker")["Close"].transform(
    lambda x: (x / x.shift(21) - 1)
    ).shift(1)

    #obv 
    direction = df_copy.groupby('Ticker')['Close'].transform(lambda x: x.diff()).fillna(0).apply(np.sign)
    obv = (df_copy['Volume'] * direction).groupby(df_copy['Ticker']).cumsum()
    df_copy['obv_scaled'] = obv.groupby(df_copy['Ticker']).transform(
        lambda x: (x - x.rolling(window=21).mean()) / x.rolling(window=21).std()
    ).shift(1)


    #atr 
    high_low=df_copy['High']- df_copy['Low']
    high_close=np.abs(df_copy['High']-df_copy.groupby("Ticker")['Close'].shift(1))
    low_close=np.abs(df_copy['Low']-df_copy.groupby("Ticker")["Close"].shift(1))
    tr=pd.concat([high_low,high_close,low_close], axis=1).max(axis=1)
    df_copy['atr_14d']=tr.groupby(df_copy['Ticker']).transform(
        lambda x:x.ewm(span=14, adjust=False).mean()
    ).shift(1)



    #target
    df_copy["target_5d"]=df_copy.groupby("Ticker")["Close"].transform(
        lambda x:(x.shift(-5)/x-1)
    )

    df_copy["target_regression"]=(df_copy["target_5d"].clip(
        df_copy["target_5d"].quantile(0.01),
        df_copy["target_5d"].quantile(0.99)
    ))

    #need stock that out performs the market
    df_copy['target_binary']=(df_copy['target_5d']>
                              df_copy.groupby("Date")['target_5d'].transform('median'))
    
    df_copy = df_copy.replace([np.inf, -np.inf], np.nan)

    return df_copy




In [9]:
df_with_features=feature_engineer(df)

In [10]:
get_df_details(df_with_features)

Printing DF details 

Shape (1316543, 28)
Missing Date                        0
Ticker                      0
Open                        0
High                        0
Low                         0
Close                       0
Volume                      0
sp500_Open                  0
sp500_High                  0
sp500_Low                   0
sp500_Close                 0
sp500_Volume                0
return_5d                3018
rsi_14d                  7350
daily_return              504
volatility_10d           5533
volatility_20d          10563
sp500_return_5d             6
relative_strength_5d     3018
macd_histogram              1
ema_8d                      1
ema_21d                     1
ema_8_21_cross              0
stochastic_k             6881
stochastic_d             8466
target_5d                2515
target_regression        2515
target_binary               0
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', '

In [11]:
# List your feature columns
feature_columns = ['return_5d', 'rsi_14d', 'volatility_10d', 'volatility_20d', 
                   'sp500_return_5d', 'relative_strength_5d',"target_5d",
                   "stochastic_k","stochastic_d",'ema_8_21_cross','ema_21d',
                   'ema_8d','macd_histogram',
                   "target_binary","target_regression"]

# Drop rows missing any of these features
df_clean = df_with_features.dropna(subset=feature_columns)

print(f"Original: {len(df):,} rows")
print(f"Clean: {len(df_clean):,} rows") 
print(f"Removed: {len(df) - len(df_clean):,} rows ({(len(df) - len(df_clean))/len(df):.1%})")

Original: 1,316,543 rows
Clean: 1,303,018 rows
Removed: 13,525 rows (1.0%)


In [12]:
df_clean.columns

Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume',
       'return_5d', 'rsi_14d', 'daily_return', 'volatility_10d',
       'volatility_20d', 'sp500_return_5d', 'relative_strength_5d',
       'macd_histogram', 'ema_8d', 'ema_21d', 'ema_8_21_cross', 'stochastic_k',
       'stochastic_d', 'target_5d', 'target_regression', 'target_binary'],
      dtype='object')

In [13]:
df_clean.to_parquet(f"{PROCESSED_DATA_OUTPUT_DIR}/swing_trading_model_data.parquet", index=False)

In [14]:
df_clean["Ticker"].nunique()

503