In [1]:
# Import the libraries used throughout this notebook

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys
import os

# Resolve the parent of the kernel's working directory and expose it on the
# import path, so the `src` package imported by later cells can be found.
project_root = os.path.abspath("..")
# Guard the append: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Load the raw stock quotes CSV into `stock` and preview the first rows.
stock = pd.read_csv('../data/01_raw/stocks.csv')

# End on a bare expression so the frame gets the notebook's rich display
# instead of the wrapped plain-text table that print() produces.
stock.head()

             timestamp         name    last    high     low  chg_   chg_%  \
0  2025-11-12 20:30:02    Coca-Cola   71.71   71.89   71.34  0.10  +0.13%   
1  2025-11-12 20:30:02           3M  171.48  171.60  168.65  2.81  +1.67%   
2  2025-11-12 20:30:02  Walt Disney  116.35  116.42  114.53  1.50  +1.31%   
3  2025-11-12 20:30:02   Amazon.com  245.14  250.37  243.93 -3.96  -1.59%   
4  2025-11-12 20:30:02    Microsoft  501.35  509.25  499.12 -7.33  -1.44%   

      vol_      time  
0    4.78M  12:28:51  
1  864.41K  12:27:53  
2    5.19M  12:28:17  
3   14.07M  12:28:54  
4    8.81M  12:28:11  


In [3]:
# Summarise the raw frame: column names, non-null counts, dtypes and memory usage.
stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218398 entries, 0 to 218397
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   timestamp  218398 non-null  object 
 1   name       218398 non-null  object 
 2   last       218398 non-null  float64
 3   high       218398 non-null  float64
 4   low        218398 non-null  float64
 5   chg_       218398 non-null  float64
 6   chg_%      218398 non-null  object 
 7   vol_       218398 non-null  object 
 8   time       218398 non-null  object 
dtypes: float64(4), object(5)
memory usage: 15.0+ MB


In [4]:
# Load the processed stocks CSV (02_processed/stocks_clean.csv) into `cleaned_df`.
cleaned_df = pd.read_csv("../data/02_processed/stocks_clean.csv")

In [5]:
# Preview the last rows of the processed data to sanity-check the cleaning step.
# NOTE(review): the saved output below shows vol_ = 4.78e8 for the Coca-Cola row
# at 2025-11-12 20:30:02, while the raw preview shows "4.78M" for that same row.
# That looks like a 100x scaling error in the cleaning step (decimal point lost
# while parsing the K/M suffix?) -- TODO confirm against the cleaning code.
cleaned_df.tail()

                  timestamp       name    last    high     low  chg_   chg_%  \
218393  2025-11-12 20:30:02   JPMorgan  321.27  322.25  316.21  5.65  0.0179   
218394  2025-11-12 20:30:02      Cisco   73.13   73.27   71.72  1.42  0.0198   
218395  2025-11-12 20:30:02  Citigroup  102.85  103.86  101.18  2.09  0.0207   
218396  2025-11-12 20:30:02      Apple  274.36  275.24  271.87 -0.89 -0.0032   
218397  2025-11-12 20:30:02  Coca-Cola   71.71   71.89   71.34  0.10  0.0013   

                vol_  
218393  3.560000e+08  
218394  1.542000e+09  
218395  6.910000e+08  
218396  2.079000e+09  
218397  4.780000e+08  


In [6]:
# Try out the timestamp feature function and preview the columns it adds.
# pandas is already imported in the first cell and is not used directly here,
# so it is not re-imported.
from src.features import add_time_features

# NOTE(review): cleaned_df is rebound to the function's result, so re-running
# this cell feeds an already-processed frame back in -- confirm that
# add_time_features tolerates being applied twice.
cleaned_df = add_time_features(cleaned_df)

cleaned_df[["timestamp","hour","weekday","month","is_weekend"]].tail()

Unnamed: 0,timestamp,hour,weekday,month,is_weekend
218393,2025-11-12 20:30:02,20,2,11,0
218394,2025-11-12 20:30:02,20,2,11,0
218395,2025-11-12 20:30:02,20,2,11,0
218396,2025-11-12 20:30:02,20,2,11,0
218397,2025-11-12 20:30:02,20,2,11,0


In [7]:
# Try out the price feature function and preview the columns it adds.
# pandas is already imported in the first cell and is not used directly here,
# so it is not re-imported.
from src.features import add_price_features

# NOTE(review): cleaned_df is rebound to the function's result, so re-running
# this cell feeds an already-processed frame back in -- confirm that
# add_price_features tolerates being applied twice.
cleaned_df = add_price_features(cleaned_df)

cleaned_df[["timestamp","name","last","high_low_range","last_lag_1","return_1h","ma_last_3h","volatility_3h"]].tail(20)

Unnamed: 0,timestamp,name,last,high_low_range,last_lag_1,return_1h,ma_last_3h,volatility_3h
217803,2025-11-12 10:59:59,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217839,2025-11-12 11:30:01,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217874,2025-11-12 12:00:03,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217892,2025-11-12 12:30:02,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217936,2025-11-12 13:00:03,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217974,2025-11-12 13:30:02,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
217991,2025-11-12 14:00:03,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
218024,2025-11-12 14:30:02,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
218050,2025-11-12 15:00:01,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0
218096,2025-11-12 15:30:03,Walt Disney,114.85,2.55,114.85,0.0,114.85,0.0


In [8]:
# Exercise add_volume_features and inspect its output for a single company
from src.features import add_volume_features

cleaned_df = add_volume_features(cleaned_df)

# Keep only the Walt Disney rows, sorted by timestamp (ascending)
df_stock = cleaned_df.loc[cleaned_df["name"].eq("Walt Disney")].sort_values(by="timestamp")

# Raw volume next to the derived volume columns, first 15 rows
df_stock.loc[
    :,
    ["timestamp", "vol_", "vol_lag_1", "vol_change_1h", "vol_ma_3h", "vol_ma_6h"],
].head(n=15)

Unnamed: 0,timestamp,vol_,vol_lag_1,vol_change_1h,vol_ma_3h,vol_ma_6h
18,2025-03-17 00:45:59,102000000.0,,,102000000.0,102000000.0
36,2025-03-17 01:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0
69,2025-03-17 02:46:01,102000000.0,102000000.0,0.0,102000000.0,102000000.0
98,2025-03-17 03:45:59,102000000.0,102000000.0,0.0,102000000.0,102000000.0
126,2025-03-17 04:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0
151,2025-03-17 05:46:01,102000000.0,102000000.0,0.0,102000000.0,102000000.0
203,2025-03-17 06:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0
210,2025-03-17 07:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0
253,2025-03-17 08:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0
295,2025-03-17 09:46:00,102000000.0,102000000.0,0.0,102000000.0,102000000.0


In [10]:
from src.features import build_features_dataset

# NOTE(review): cleaned_df has already been passed through add_time_features,
# add_price_features and add_volume_features in the cells above. If
# build_features_dataset applies those same steps, they run a second time
# here -- TODO confirm this is intended, or feed it the freshly loaded frame.
df_with_features = build_features_dataset(cleaned_df)

# End on a bare expression so the frame gets the notebook's rich display,
# matching the earlier feature cells, instead of a wrapped print() table.
df_with_features[[
    "timestamp", "name", "last", "vol_",
    "vol_lag_1", "vol_change_1h",
    "vol_ma_3h", "vol_ma_6h",
    "return_1h", "high_low_range"
]].head(20)

              timestamp name    last         vol_    vol_lag_1  vol_change_1h  \
31  2025-03-17 01:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
74  2025-03-17 02:46:01   3M  150.41  409000000.0  409000000.0       0.000000   
117 2025-03-17 03:45:59   3M  150.41  409000000.0  409000000.0       0.000000   
141 2025-03-17 04:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
150 2025-03-17 05:46:01   3M  150.41  409000000.0  409000000.0       0.000000   
185 2025-03-17 06:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
227 2025-03-17 07:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
265 2025-03-17 08:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
298 2025-03-17 09:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
314 2025-03-17 10:45:59   3M  150.41  409000000.0  409000000.0       0.000000   
346 2025-03-17 11:46:00   3M  150.41  409000000.0  409000000.0       0.000000   
387 2025-03-17 12:46:00   3M