In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load EDA-processed data
df = pd.read_csv("C:/Users/ashua/Desktop/Stock Market Recommendation System/data/AAPL_after_EDA.csv")

# The Date strings carry a UTC offset that changes with daylight saving
# (e.g. -04:00 in April). read_csv(parse_dates=...) cannot build a
# DatetimeIndex from mixed offsets and silently leaves the column as
# object dtype, so parse explicitly via UTC instead.
# NOTE(review): America/New_York is assumed from the -04:00 April offsets
# in the data (US-listed ticker); it keeps the original wall-clock
# timestamps. Confirm, or drop the tz_convert to keep the index in UTC.
df["Date"] = pd.to_datetime(df["Date"], utc=True).dt.tz_convert("America/New_York")
df = df.set_index("Date")

In [None]:
# 14-day Relative Strength Index, using simple moving averages of the
# daily up-moves and down-moves of the closing price.
rsi_window = 14
close_change = df["Close"].diff()

# Split each daily change into its positive part (gain) and the magnitude
# of its negative part (loss); the other side is zero on any given day.
up_moves = close_change.clip(lower=0)
down_moves = -close_change.clip(upper=0)

mean_up = up_moves.rolling(rsi_window).mean()
mean_down = down_moves.rolling(rsi_window).mean()

# Relative strength; a window with no losses gives inf, which maps to RSI = 100.
relative_strength = mean_up / mean_down
df["RSI_14"] = 100 - (100 / (1 + relative_strength))

In [None]:
# MACD: difference between the 12- and 26-span exponential moving averages
# of the close, plus a 9-span EMA of that difference as the signal line.
close = df["Close"]
fast_ema, slow_ema = (
    close.ewm(span=span, adjust=False).mean() for span in (12, 26)
)

macd_line = fast_ema - slow_ema
df["MACD"] = macd_line
df["MACD_Signal"] = macd_line.ewm(span=9, adjust=False).mean()

In [None]:
# Bollinger Bands: 20-day rolling mean of the close, +/- two rolling
# standard deviations; the width is the distance between the two bands.
bb_window = df["Close"].rolling(20)
middle_band = bb_window.mean()
band_offset = 2 * bb_window.std()

upper_band = middle_band + band_offset
lower_band = middle_band - band_offset

df["BB_Upper"] = upper_band
df["BB_Lower"] = lower_band
df["BB_Width"] = upper_band - lower_band

In [None]:
# Calculate Average True Range (ATR)
# True range for a day is the largest of:
#   - the day's high-low span,
#   - |high - previous close|,
#   - |low  - previous close|.
prev_close = df["Close"].shift()
high_low = df["High"] - df["Low"]
high_close = (df["High"] - prev_close).abs()
low_close = (df["Low"] - prev_close).abs()

# Row-wise max over the three candidates. This is vectorized (no per-element
# Python callback) and skips NaN, so the first row -- which has no previous
# close -- falls back to the plain high-low span.
true_range = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)

# ATR here is the simple 14-day rolling mean of the true range.
df["ATR_14"] = true_range.rolling(14).mean()

In [6]:
# Target: next-day simple return of the close, and a binary label that is
# 1 when that return is strictly positive and 0 otherwise.
next_close = df["Close"].shift(-1)
df["Future_Return"] = next_close / df["Close"] - 1

# The last row has no next close, so its Future_Return is NaN and the
# comparison below yields 0 for it.
df["Signal"] = df["Future_Return"].gt(0).astype(int)

In [7]:
# Final cleanup: keep only rows where every column has a value. This removes
# the warm-up rows of the rolling indicators and the last row, whose
# Future_Return is NaN.
df = df.dropna()

In [8]:
# Report (rows, columns) of the cleaned feature frame.
final_shape = df.shape
print("Final dataframe shape:", final_shape)

Final dataframe shape: (2445, 19)


In [9]:
# List the column labels present in the final frame.
final_columns = df.columns
print("Final dataframe columns:", final_columns)

Final dataframe columns: Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits',
       'Return', 'SMA_20', 'SMA_50', 'Volatility', 'RSI_14', 'MACD',
       'MACD_Signal', 'BB_Upper', 'BB_Lower', 'BB_Width', 'Future_Return',
       'Signal'],
      dtype='object')


In [10]:
# Show the first five rows as a quick sanity check of the values.
preview = df.head()
print(preview)

                                Open       High        Low      Close  \
Date                                                                    
2016-04-06 00:00:00-04:00  24.988303  25.158322  24.754809  25.153788   
2016-04-07 00:00:00-04:00  24.924822  25.031367  24.509976  24.605186   
2016-04-08 00:00:00-04:00  24.689068  24.884022  24.521314  24.632395   
2016-04-11 00:00:00-04:00  24.702667  25.074442  24.670930  24.714001   
2016-04-12 00:00:00-04:00  24.786545  25.049509  24.632396  25.035908   

                              Volume  Dividends  Stock Splits    Return  \
Date                                                                      
2016-04-06 00:00:00-04:00  105616400        0.0           0.0  0.010473   
2016-04-07 00:00:00-04:00  127207600        0.0           0.0 -0.021810   
2016-04-08 00:00:00-04:00   94326800        0.0           0.0  0.001106   
2016-04-11 00:00:00-04:00  117630000        0.0           0.0  0.003313   
2016-04-12 00:00:00-04:00  108929200  

In [11]:
# Persist the feature frame. The index is labelled "Date" so that the
# first CSV column gets that header.
df.index = df.index.rename("Date")

# Destination of the final processed dataset.
output_csv = "C:/Users/ashua/Desktop/Stock Market Recommendation System/data/final_processed_data.csv"
df.to_csv(output_csv)

print("Final processed data saved successfully")

Final processed data saved successfully
