In [13]:
import pandas as pd
import numpy as np

In [14]:
# load data
apple_df = pd.read_csv('data/AAPL.csv')

apple_df.head()

Unnamed: 0,Date,Close,Volume,Open,High,Low
0,2023-07-21,191.94,71904040,194.1,194.97,191.23
1,2023-07-20,193.13,59581200,195.09,196.47,192.495
2,2023-07-19,195.1,80507320,193.1,198.23,192.65
3,2023-07-18,193.73,48353770,193.35,194.33,192.415
4,2023-07-17,193.99,50520160,191.9,194.32,191.81


In [15]:
# daily returns feature
apple_df['Daily_Return'] = apple_df['Close'].pct_change( periods = 1 )
apple_df.head()

# 5 day rolling average for close price
apple_df['5_day_mean_close_price'] = apple_df['Close'].rolling(5).mean()
apple_df.head(10)

# 5 day rolling average of volume
apple_df['5_day_mean_volume'] = apple_df['Volume'].rolling(5).mean()
apple_df.head(10)


# Daily_Range = High - Low
apple_df['Daily_Range'] = apple_df['High'] - apple_df['Low']


# Volaity = 5 day rolling standard deviation of daily returns
apple_df['Volatility'] = apple_df['Daily_Return'].rolling(5).std()
apple_df.head(10)


Unnamed: 0,Date,Close,Volume,Open,High,Low,Daily_Return,5_day_mean_close_price,5_day_mean_volume,Daily_Range,Volatility
0,2023-07-21,191.94,71904040,194.1,194.97,191.23,,,,3.74,
1,2023-07-20,193.13,59581200,195.09,196.47,192.495,0.0062,,,3.975,
2,2023-07-19,195.1,80507320,193.1,198.23,192.65,0.0102,,,5.58,
3,2023-07-18,193.73,48353770,193.35,194.33,192.415,-0.007022,,,1.915,
4,2023-07-17,193.99,50520160,191.9,194.32,191.81,0.001342,193.578,62173298.0,2.51,
5,2023-07-14,190.69,41616240,190.23,191.1799,189.63,-0.017011,193.328,56115738.0,1.5499,0.010898
6,2023-07-13,190.54,41342340,190.5,191.19,189.78,-0.000787,192.81,52467966.0,1.41,0.010123
7,2023-07-12,189.77,60750250,189.68,191.7,188.47,-0.004041,191.744,48516552.0,3.23,0.007176
8,2023-07-11,188.08,46638120,189.16,189.3,186.6,-0.008906,190.614,48173422.0,2.7,0.007324
9,2023-07-10,188.61,59922160,189.26,189.99,187.035,0.002818,189.538,50053822.0,2.955,0.007707


In [16]:
# create new column called Quarter
apple_df['Quarter'] = pd.PeriodIndex(apple_df['Date'], freq='Q')

In [17]:
apple_df.head(35)

# impuute missing values
apple_df['5_day_mean_close_price'] = apple_df['5_day_mean_close_price'].fillna(0)
apple_df['5_day_mean_volume'] = apple_df['5_day_mean_volume'].fillna(0)
apple_df['Volatility'] = apple_df['Volatility'].fillna(0)
apple_df['Daily_Return'] = apple_df['Daily_Return'].fillna(0)

apple_df.head(10)

Unnamed: 0,Date,Close,Volume,Open,High,Low,Daily_Return,5_day_mean_close_price,5_day_mean_volume,Daily_Range,Volatility,Quarter
0,2023-07-21,191.94,71904040,194.1,194.97,191.23,0.0,0.0,0.0,3.74,0.0,2023Q3
1,2023-07-20,193.13,59581200,195.09,196.47,192.495,0.0062,0.0,0.0,3.975,0.0,2023Q3
2,2023-07-19,195.1,80507320,193.1,198.23,192.65,0.0102,0.0,0.0,5.58,0.0,2023Q3
3,2023-07-18,193.73,48353770,193.35,194.33,192.415,-0.007022,0.0,0.0,1.915,0.0,2023Q3
4,2023-07-17,193.99,50520160,191.9,194.32,191.81,0.001342,193.578,62173298.0,2.51,0.0,2023Q3
5,2023-07-14,190.69,41616240,190.23,191.1799,189.63,-0.017011,193.328,56115738.0,1.5499,0.010898,2023Q3
6,2023-07-13,190.54,41342340,190.5,191.19,189.78,-0.000787,192.81,52467966.0,1.41,0.010123,2023Q3
7,2023-07-12,189.77,60750250,189.68,191.7,188.47,-0.004041,191.744,48516552.0,3.23,0.007176,2023Q3
8,2023-07-11,188.08,46638120,189.16,189.3,186.6,-0.008906,190.614,48173422.0,2.7,0.007324,2023Q3
9,2023-07-10,188.61,59922160,189.26,189.99,187.035,0.002818,189.538,50053822.0,2.955,0.007707,2023Q3


In [18]:
# 5-day and 20-day exponential moving averages for closing price
apple_df['EMA_Close_5'] = apple_df['Close'].ewm(span=5, adjust=False).mean()
apple_df['EMA_Close_20'] = apple_df['Close'].ewm(span=20, adjust=False).mean()

In [19]:
print(apple_df.columns)

Index(['Date', 'Close', 'Volume', 'Open', 'High', 'Low', 'Daily_Return',
       '5_day_mean_close_price', '5_day_mean_volume', 'Daily_Range',
       'Volatility', 'Quarter', 'EMA_Close_5', 'EMA_Close_20'],
      dtype='object')


In [20]:
from sklearn.preprocessing import StandardScaler
# features needed to scale

# Extract numerical features
# we don't want to scale the target variable
numerical_features = ['Volume', 'Open', 'High', 'Low', 'Daily_Return',
                     '5_day_mean_close_price', '5_day_mean_volume', 'Daily_Range',
                     'Volatility', 'EMA_Close_5', 'EMA_Close_20']

# Initialize the scaler
scaler = StandardScaler()

# Scale the numerical features
apple_df[numerical_features] = scaler.fit_transform(apple_df[numerical_features])


In [21]:
# save to csv
apple_df.to_csv('data/clean/AAPL_feature_engineered.csv', index=False)

In [22]:
apple_df.head(10)

Unnamed: 0,Date,Close,Volume,Open,High,Low,Daily_Return,5_day_mean_close_price,5_day_mean_volume,Daily_Range,Volatility,Quarter,EMA_Close_5,EMA_Close_20
0,2023-07-21,191.94,-0.799352,2.351164,2.323634,2.337356,0.047208,-1.339424,-1.76425,1.362139,-1.519428,2023Q3,2.300955,2.280669
1,2023-07-20,193.13,-0.926764,2.369995,2.351814,2.361696,0.391079,-1.339424,-1.76425,1.506983,-1.519428,2023Q3,2.308483,2.28281
2,2023-07-19,195.1,-0.710398,2.332142,2.384878,2.364679,0.612965,-1.339424,-1.76425,2.496239,-1.519428,2023Q3,2.325964,2.288292
3,2023-07-18,193.73,-1.04285,2.336898,2.31161,2.360157,-0.342264,-1.339424,-1.76425,0.237284,-1.519428,2023Q3,2.328951,2.290787
4,2023-07-17,193.99,-1.020451,2.309316,2.311422,2.348516,0.121645,2.344107,-1.027362,0.604017,-1.519428,2023Q3,2.332587,2.293511
5,2023-07-14,190.69,-1.112513,2.27755,2.25243,2.306569,-0.896305,2.33935,-1.099157,0.012251,-0.434731,2023Q3,2.314136,2.290039
6,2023-07-13,190.54,-1.115345,2.282686,2.25262,2.309456,0.003579,2.329493,-1.142391,-0.073978,-0.511865,2023Q3,2.300886,2.286628
7,2023-07-12,189.77,-0.914677,2.267088,2.262201,2.284249,-0.176931,2.309209,-1.189224,1.047796,-0.805181,2023Q3,2.287182,2.282156
8,2023-07-11,188.08,-1.060589,2.257197,2.217113,2.248268,-0.44673,2.287706,-1.193291,0.721125,-0.790496,2023Q3,2.267355,2.275069
9,2023-07-10,188.61,-0.923239,2.259099,2.230076,2.256638,0.203504,2.267232,-1.171004,0.878297,-0.752318,2023Q3,2.25749,2.269611
