In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [29]:
RAW_DATA_OUTPUT_DIR="../data/raw"
PROCESSED_DATA_OUTPUT_DIR="../data/processed"

In [4]:
df=pd.read_parquet(f"{RAW_DATA_OUTPUT_DIR}/stock_and_market_data_2015_2025.parquet"
                   )

In [5]:
def get_df_details(df):
    print("Printing DF details \n")
    print(f"Shape {df.shape}")
    print(f"Missing {df.isna().sum()}")
    print(f"Columns {df.columns}")
    print(f"Duplicated {df.duplicated().sum()}")


In [6]:
get_df_details(df)

Printing DF details 

Shape (1316543, 12)
Missing Date            0
Ticker          0
Open            0
High            0
Low             0
Close           0
Volume          0
sp500_Open      0
sp500_High      0
sp500_Low       0
sp500_Close     0
sp500_Volume    0
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume'],
      dtype='object')
Duplicated 0


In [None]:
def calculate_rsi(prices,window=14):
    """Calculate RSI - momentum oscillator measuring speed of price movements"""
    delta=prices.diff()
    gain=(delta.where(delta>0,0)).rolling(window=window).mean()
    loss=(-delta.where(delta<0,0)).rolling(window=window).mean()

    rs=gain/loss
    rsi=100-(100/(1+rs))
    return rsi 
    

def feature_engineer(df):
    df_copy=df.copy()
    #return for 5d would be 
    df_copy['return_5d'] = df_copy.groupby("Ticker")["Close"].transform(
        lambda x: (x.shift(1)/x.shift(6)-1)
    )
    #rsi 
    df_copy['rsi_14d']=df_copy.groupby("Ticker")["Close"].transform(
        lambda x:calculate_rsi(x).shift(1)
    )

    #volatility 
    #how much a stock goes up or down per time or its movement? check daily return see how it much it has fluctuated
    df_copy['daily_return'] = df_copy.groupby('Ticker')['Close'].pct_change().shift(1)
    df_copy["volatility_10d"]=df_copy.groupby("Ticker")["daily_return"].transform(
        lambda x:x.rolling(window=10).std()
    )
    df_copy["volatility_20d"]=df_copy.groupby("Ticker")["daily_return"].transform(
        lambda x:x.rolling(window=20).std()
    )

    #market 5d percentage return
    df_copy["sp500_return_5d"]=(df_copy["sp500_Close"].shift(1)/df_copy["sp500_Close"].shift(6)-1)

    #relative strength
    df_copy['relative_strength_5d']=df_copy['return_5d']-df_copy['sp500_return_5d']


    #target
    df_copy["target_5d"]=df_copy.groupby("Ticker")["Close"].transform(
        lambda x:(x.shift(-5)/x-1)
    )
    return df_copy




In [25]:
df_with_features=feature_engineer(df)

In [26]:
get_df_details(df_with_features)

Printing DF details 

Shape (1316543, 20)
Missing Date                        0
Ticker                      0
Open                        0
High                        0
Low                         0
Close                       0
Volume                      0
sp500_Open                  0
sp500_High                  0
sp500_Low                   0
sp500_Close                 0
sp500_Volume                0
return_5d                3018
rsi_14d                  7350
daily_return              504
volatility_10d           5533
volatility_20d          10563
sp500_return_5d             6
relative_strength_5d     3018
target_5d                2515
dtype: int64
Columns Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume',
       'return_5d', 'rsi_14d', 'daily_return', 'volatility_10d',
       'volatility_20d', 'sp500_return_5d', 'relative_strength_5d',
       'target_5d'],
      dtype='object')
Duplica

In [27]:
# List your feature columns
feature_columns = ['return_5d', 'rsi_14d', 'volatility_10d', 'volatility_20d', 
                   'sp500_return_5d', 'relative_strength_5d',"target_5d"]

# Drop rows missing any of these features
df_clean = df_with_features.dropna(subset=feature_columns)

print(f"Original: {len(df):,} rows")
print(f"Clean: {len(df_clean):,} rows") 
print(f"Removed: {len(df) - len(df_clean):,} rows ({(len(df) - len(df_clean))/len(df):.1%})")

Original: 1,316,543 rows
Clean: 1,303,157 rows
Removed: 13,386 rows (1.0%)


In [30]:
df_clean.to_parquet(f"{PROCESSED_DATA_OUTPUT_DIR}/swing_trading_model_data.parquet", index=False)