In [1]:

import pandas as pd
import os
import pandas_ta as ta  
import numpy as np
import yfinance as yf
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, 
                        message='DataFrameGroupBy.apply operated on the grouping columns')

RAW_DIR = "../data/raw/yfinance"
PROCESSED_DIR = "../data/processed"
print("Files in yfinance folder:", os.listdir(RAW_DIR))


Files in yfinance folder: ['.gitkeep', 'AAPL.csv', 'ABNB.csv', 'ADBE.csv', 'ADI.csv', 'ADP.csv', 'ADSK.csv', 'AEP.csv', 'AMD.csv', 'AMGN.csv', 'AMZN.csv', 'ANSS.csv', 'ASML.csv', 'AVGO.csv', 'AZN.csv', 'BIIB.csv', 'BKNG.csv', 'BKR.csv', 'CDNS.csv', 'CEG.csv', 'CHTR.csv', 'CMCSA.csv', 'COST.csv', 'CPRT.csv', 'CRWD.csv', 'CSCO.csv', 'CSX.csv', 'CTAS.csv', 'DASH.csv', 'DDOG.csv', 'DLTR.csv', 'DXCM.csv', 'EA.csv', 'ENPH.csv', 'EXC.csv', 'FANG.csv', 'FAST.csv', 'FISV.csv', 'FTNT.csv', 'GEHC.csv', 'GFS.csv', 'GILD.csv', 'GOOG.csv', 'GOOGL.csv', 'HES.csv', 'HON.csv', 'IDXX.csv', 'ILMN.csv', 'INTC.csv', 'INTU.csv', 'ISRG.csv', 'JD.csv', 'KDP.csv', 'KHC.csv', 'KLAC.csv', 'LCID.csv', 'LRCX.csv', 'LULU.csv', 'MAR.csv', 'MCHP.csv', 'MDLZ.csv', 'MELI.csv', 'META.csv', 'MNST.csv', 'MRNA.csv', 'MRVL.csv', 'MSFT.csv', 'MU.csv', 'NFLX.csv', 'NVDA.csv', 'NXPI.csv', 'ODFL.csv', 'ON.csv', 'ORLY.csv', 'PANW.csv', 'PAYX.csv', 'PCAR.csv', 'PDD.csv', 'PEP.csv', 'PYPL.csv', 'QCOM.csv', 'REGN.csv', 'RIVN.csv', 

Bulk Load All YFinance Files


In [2]:
from tqdm import tqdm   
def load_all_yfinance():
    """Load and combine multiple YFinance stock data CSVs into one DataFrame.
    Handles metadata rows, standardizes columns, and ensures date continuity."""
    
    all_dfs = []  
    
    for file in tqdm(os.listdir(RAW_DIR)):
        if file.endswith('.csv'):
            try:
                ticker = file.split('.')[0]
                

                df = pd.read_csv(
                    f"{RAW_DIR}/{file}",
                    skiprows=3,
                    names=['date', 'close', 'high', 'low', 'open', 'volume'],
                    parse_dates=['date']
                )
                
                df['ticker'] = ticker
                

                df = df.set_index('date').asfreq('D').ffill().reset_index()
                
                all_dfs.append(df)
                
            except Exception as e:
                print(f"⚠️ Failed {file}: {str(e)}")
                continue
                

    return pd.concat(all_dfs, ignore_index=True)

df = load_all_yfinance()
print(f"✅ Loaded {len(df)} rows from {df['ticker'].nunique()} stocks")

 26%|██▋       | 27/102 [00:00<00:00, 263.31it/s]

⚠️ Failed FISV.csv: <class 'pandas.core.indexes.base.Index'>


100%|██████████| 102/102 [00:00<00:00, 277.40it/s]

✅ Loaded 36600 rows from 100 stocks





# ✅ code verification 

In [3]:
print(df[df['ticker'] == 'AAPL'].head(10))
unique_tickers = df['ticker'].unique().tolist()
print(f"Tickers: {unique_tickers[:5]}...")  

        date       close        high         low        open       volume  \
0 2024-04-10  166.991486  168.295327  166.324636  168.006696   49709300.0   
1 2024-04-11  174.217361  174.635401  167.369706  167.548852   91070300.0   
2 2024-04-12  175.720276  177.521767  173.391277  173.441030  101593300.0   
3 2024-04-13  175.720276  177.521767  173.391277  173.441030  101593300.0   
4 2024-04-14  175.720276  177.521767  173.391277  173.441030  101593300.0   
5 2024-04-15  171.878418  175.799904  171.689308  174.535868   73531800.0   
6 2024-04-16  168.583984  172.943390  167.479200  170.942841   73711200.0   
7 2024-04-17  167.210464  169.848004  167.210464  168.812899   50901200.0   
8 2024-04-18  166.254959  167.847446  165.767272  167.240312   43122900.0   
9 2024-04-19  164.224548  165.617963  163.308874  165.428868   67772100.0   

  ticker  
0   AAPL  
1   AAPL  
2   AAPL  
3   AAPL  
4   AAPL  
5   AAPL  
6   AAPL  
7   AAPL  
8   AAPL  
9   AAPL  
Tickers: ['AAPL', 'ABNB', 'ADBE

# 🧹 Data Cleaning Pipeline




In [4]:
def clean_data(df):
    df['date'] = pd.to_datetime(df['date'])  
    
    df = df.sort_values(['ticker', 'date'])
    df = df.groupby('ticker').apply(lambda x: x.ffill())  
    
    return df.dropna()

In [5]:
total_nans = df.isna().sum().sum()
print(f"Total NaN values: {total_nans}")

nan_per_column = df.isna().sum()
print("\nNaN per column:")
print(nan_per_column)
print(df.shape)
print(df.head(10))

Total NaN values: 0

NaN per column:
date      0
close     0
high      0
low       0
open      0
volume    0
ticker    0
dtype: int64
(36600, 7)
        date       close        high         low        open       volume  \
0 2024-04-10  166.991486  168.295327  166.324636  168.006696   49709300.0   
1 2024-04-11  174.217361  174.635401  167.369706  167.548852   91070300.0   
2 2024-04-12  175.720276  177.521767  173.391277  173.441030  101593300.0   
3 2024-04-13  175.720276  177.521767  173.391277  173.441030  101593300.0   
4 2024-04-14  175.720276  177.521767  173.391277  173.441030  101593300.0   
5 2024-04-15  171.878418  175.799904  171.689308  174.535868   73531800.0   
6 2024-04-16  168.583984  172.943390  167.479200  170.942841   73711200.0   
7 2024-04-17  167.210464  169.848004  167.210464  168.812899   50901200.0   
8 2024-04-18  166.254959  167.847446  165.767272  167.240312   43122900.0   
9 2024-04-19  164.224548  165.617963  163.308874  165.428868   67772100.0   

  ticke

# Feature Engineering
## Goal: Add technical indicators traders use

### Why These Indicators?

### SMA: Smooths price noise to reveal trends

### RSI: Identifies potential reversals (values >70 = overbought, <30 = oversold)

### MACD: Shows momentum shifts

### Bollinger Bands: Highlights volatility extremes

In [6]:
def add_technical_indicators(df):
    """Calculate technical indicators for each stock in the DataFrame.
    
    Applies common trading indicators to each stock's price series independently
    to avoid cross-contamination between different securities.
    
    Args:
        df: DataFrame containing stock data with columns: ['ticker', 'date', 'close', ...]
        
    Returns:
        DataFrame with additional technical indicator columns:
        - sma_20: 20-day Simple Moving Average (trend identification)
        - rsi_14: 14-day Relative Strength Index (momentum)
        - macd: MACD line (12,26,9 EMAs crossover system)
        - boll_high: Upper Bollinger Band (2 std dev above SMA)
        - boll_low: Lower Bollinger Band (2 std dev below SMA)
    """
    

    return df.groupby('ticker').apply(lambda x: x.assign(

        sma_20=ta.sma(x['close'], 20),
        

        rsi_14=ta.rsi(x['close'], 14),
        

        macd=ta.macd(x['close'])['MACD_12_26_9'],
        

        boll_high=ta.bbands(x['close'])['BBU_5_2.0'],
        boll_low=ta.bbands(x['close'])['BBL_5_2.0']
    ))

In [7]:
df_features = add_technical_indicators(df)
print("New columns:", df_features.columns.tolist())
print("\nAAPL data with indicators:")
print(df_features[df_features['ticker'] == 'AAPL'].tail(3))

New columns: ['date', 'close', 'high', 'low', 'open', 'volume', 'ticker', 'sma_20', 'rsi_14', 'macd', 'boll_high', 'boll_low']

AAPL data with indicators:
                 date       close        high         low        open  \
ticker                                                                  
AAPL   363 2025-04-08  172.419998  190.339996  169.210007  186.699997   
       364 2025-04-09  198.850006  200.610001  171.889999  171.949997   
       365 2025-04-10  187.779999  194.779907  187.680695  189.164993   

                 volume ticker      sma_20     rsi_14       macd   boll_high  \
ticker                                                                         
AAPL   363  120859500.0   AAPL  210.194501  16.453167 -11.059091  196.386826   
       364  184067400.0   AAPL  209.432001  43.461330 -10.168278  203.366318   
       365   39840056.0   AAPL  207.907501  37.930515 -10.237547  203.184601   

              boll_low  
ticker                  
AAPL   363  171.221182  
   

In [8]:
# Forward-fill missing values (carry last known observation forward)
# NOTE: This maintains continuous time series but WON'T fill:
# 1. Leading NaNs (where no prior value exists to fill from)
# 2. Technical indicator warm-up periods (e.g., first 19 rows of SMA20)
# For complete NaN removal, use .dropna() after this
df_features = df_features.ffill()

df_features = df_features.dropna()

print("Remaining NaNs:", df_features.isna().sum().sum())  
print(df_features.head())

Remaining NaNs: 0
                date       close        high         low        open  \
ticker                                                                 
AAPL   25 2024-05-05  182.518173  186.121155  181.801556  185.772794   
       26 2024-05-06  180.856049  183.334337  179.572103  181.493040   
       27 2024-05-07  181.542786  184.031037  180.467875  182.587854   
       28 2024-05-08  181.881180  182.209631  180.597234  181.990663   
       29 2024-05-09  183.702606  183.792180  181.254160  181.702043   

                volume ticker      sma_20     rsi_14      macd   boll_high  \
ticker                                                                       
AAPL   25  163224100.0   AAPL  169.960971  74.595232  3.934064  189.796292   
       26   78569700.0   AAPL  170.574574  69.477540  4.135104  188.138242   
       27   77305800.0   AAPL  171.291190  70.381679  4.300272  183.353827   
       28   45057100.0   AAPL  172.072501  70.840052  4.407666  183.120355   
       29

# Check if any ticker has date gaps


In [9]:
date_gaps = df.groupby('ticker')['date'].apply(lambda x: x.sort_values().diff().max())
print("Max gap between dates per ticker (should be <=3 days for weekends):")
print(date_gaps.dt.days.value_counts())

Max gap between dates per ticker (should be <=3 days for weekends):
date
1    100
Name: count, dtype: int64


Check for duplicate dates per ticker


In [10]:
duplicate_dates = df.groupby(['ticker', 'date']).size().loc[lambda x: x > 1]
if not duplicate_dates.empty:
    print("⚠️ Duplicate dates found:", duplicate_dates.index.tolist())

zero_volume = df[df['volume'] <= 0]
if not zero_volume.empty:
    print("⚠️ Stocks with zero volume:", zero_volume['ticker'].unique())

# Final Data Quality Checks

In [11]:

df_features = df_features.copy()


if 'ticker' in df_features.index.names:
    ticker_values = df_features.index.get_level_values('ticker')
    
    df_features = df_features.reset_index(drop=True)
    
    if 'ticker' not in df_features.columns:
        df_features['ticker'] = ticker_values

print("\n🔍 Running Data Quality Checks...")

negative_prices = df_features[
    (df_features['close'] <= 0) | 
    (df_features['open'] <= 0) |
    (df_features['high'] <= 0) |
    (df_features['low'] <= 0)
]
if not negative_prices.empty:
    print("⚠️ Negative prices found in:", negative_prices['ticker'].unique())
    df_features = df_features[~df_features.index.isin(negative_prices.index)]

invalid_prices = df_features[
    (df_features['high'] < df_features['low']) |
    (df_features['high'] < df_features['close']) |
    (df_features['low'] > df_features['open'])
]
if not invalid_prices.empty:
    print("⚠️ Illogical prices in:", invalid_prices['ticker'].unique())
    df_features = df_features[~df_features.index.isin(invalid_prices.index)]

print("\n⚙️ Calculating Features with Safe NaN Handling...")

df_features['volume_ma_20'] = df_features.groupby('ticker')['volume'].transform(
    lambda x: x.rolling(20, min_periods=1).mean()
)
df_features['volume_spike'] = np.where(
    df_features['volume_ma_20'] > 0,  
    df_features['volume'] / df_features['volume_ma_20'],
    1.0  
)


def safe_technical(group):
    """Calculate indicators for a single stock with error protection"""
    try:
        group['sma_20'] = ta.sma(group['close'], length=20)  
        group['rsi_14'] = ta.rsi(group['close'], length=14)  
        group['atr_14'] = ta.atr(group['high'], group['low'], group['close'], length=14)
        return group
    except Exception as e:
        print(f"⚠️ Error calculating indicators for {group.name}: {str(e)}")
        return group

df_features = df_features.groupby('ticker', group_keys=False).apply(safe_technical)

print("\n🧹 Final NaN Cleanup...")


tech_cols = ['sma_20', 'rsi_14', 'atr_14']
for col in tech_cols:
    if col in df_features.columns:
        df_features[col] = df_features.groupby('ticker')[col].ffill()

fill_values = {
    'sma_20': df_features['close'],  
    'rsi_14': 50,                  
    'atr_14': 0,                     
    'volume_spike': 1              
}
for col, val in fill_values.items():
    if col in df_features.columns:
        df_features[col] = df_features[col].fillna(val)

print("\n✅ Final Validation")
assert df_features.isna().sum().sum() == 0, (
    f"Final NaN count: {df_features.isna().sum().sum()}\n"
    f"NaN columns: {df_features.columns[df_features.isna().any()].tolist()}"
)
assert (df_features['close'] > 0).all(), "Negative prices exist!"

df_features.sort_values(['ticker', 'date'], inplace=True)

print(f"\n🎉 Cleaning complete! Final shape: {df_features.shape}")
print("Available columns:", df_features.columns.tolist())


🔍 Running Data Quality Checks...

⚙️ Calculating Features with Safe NaN Handling...

🧹 Final NaN Cleanup...

✅ Final Validation

🎉 Cleaning complete! Final shape: (36575, 15)
Available columns: ['date', 'close', 'high', 'low', 'open', 'volume', 'ticker', 'sma_20', 'rsi_14', 'macd', 'boll_high', 'boll_low', 'volume_ma_20', 'volume_spike', 'atr_14']


# MARKET DAY FLAG


In [12]:

df_features['is_market_open'] = df_features['volume'] > 0


df_features['daily_return'] = df_features.groupby('ticker')['close'].pct_change()


extreme_returns = df_features[np.abs(df_features['daily_return']) > 0.5]
if not extreme_returns.empty:
    print(f"⚠️ {len(extreme_returns)} extreme returns (>50%) detected")
    print("Sample affected tickers:", extreme_returns['ticker'].unique()[:5])
    


print(df_features.head(10))


df_features.to_parquet(
    f"{PROCESSED_DIR}/cleaned_stocks.parquet",  
    index=False, 
    engine='pyarrow',  
    compression='gzip', 
)

saved_size = os.path.getsize(f"{PROCESSED_DIR}/cleaned_stocks.parquet")/1e6
print(f"✅ Saved cleaned data ({saved_size:.1f}MB)")
print(f"Columns persisted: {df_features.columns.tolist()}")

        date       close        high         low        open       volume  \
0 2024-05-05  182.518173  186.121155  181.801556  185.772794  163224100.0   
1 2024-05-06  180.856049  183.334337  179.572103  181.493040   78569700.0   
2 2024-05-07  181.542786  184.031037  180.467875  182.587854   77305800.0   
3 2024-05-08  181.881180  182.209631  180.597234  181.990663   45057100.0   
4 2024-05-09  183.702606  183.792180  181.254160  181.702043   48983000.0   
5 2024-05-10  182.436844  184.470004  181.519927  184.280638   50759500.0   
6 2024-05-11  182.436844  184.470004  181.519927  184.280638   50759500.0   
7 2024-05-12  182.436844  184.470004  181.519927  184.280638   50759500.0   
8 2024-05-13  185.656021  186.473282  184.001578  184.818838   72044800.0   
9 2024-05-14  186.802170  187.669266  185.665989  186.881904   52393600.0   

  ticker      sma_20  rsi_14      macd   boll_high    boll_low  volume_ma_20  \
0   AAPL  182.518173    50.0  3.934064  189.796292  165.513986  1.632241

In [13]:
df_features.head()

Unnamed: 0,date,close,high,low,open,volume,ticker,sma_20,rsi_14,macd,boll_high,boll_low,volume_ma_20,volume_spike,atr_14,is_market_open,daily_return
0,2024-05-05,182.518173,186.121155,181.801556,185.772794,163224100.0,AAPL,182.518173,50.0,3.934064,189.796292,165.513986,163224100.0,1.0,0.0,True,
1,2024-05-06,180.856049,183.334337,179.572103,181.49304,78569700.0,AAPL,180.856049,50.0,4.135104,188.138242,172.112716,120896900.0,0.64989,0.0,True,-0.009107
2,2024-05-07,181.542786,184.031037,180.467875,182.587854,77305800.0,AAPL,181.542786,50.0,4.300272,183.353827,180.627514,106366500.0,0.726787,0.0,True,0.003797
3,2024-05-08,181.88118,182.209631,180.597234,181.990663,45057100.0,AAPL,181.88118,50.0,4.407666,183.120355,180.606189,91039180.0,0.49492,0.0,True,0.001864
4,2024-05-09,183.702606,183.79218,181.25416,181.702043,48983000.0,AAPL,183.702606,50.0,4.586876,184.028682,180.171636,82627940.0,0.592814,0.0,True,0.010014


In [14]:
df_features.tail(5)

Unnamed: 0,date,close,high,low,open,volume,ticker,sma_20,rsi_14,macd,boll_high,boll_low,volume_ma_20,volume_spike,atr_14,is_market_open,daily_return
36570,2025-04-06,174.669998,186.339996,172.850006,184.570007,5710500.0,ZS,200.772999,29.56426,-5.43184,210.840039,158.463959,2629405.0,2.171784,10.136171,True,0.0
36571,2025-04-07,180.054993,186.235001,164.779999,166.880005,5738300.0,ZS,199.696249,35.736179,-5.981416,192.864963,165.645034,2835750.0,2.023556,10.944659,True,0.03083
36572,2025-04-08,177.039993,189.214996,174.789993,186.580002,2975300.0,ZS,198.350748,33.942842,-6.584344,180.471838,171.970154,2888260.0,1.030136,11.193255,True,-0.016745
36573,2025-04-09,202.119995,203.160004,177.274994,177.75,4508700.0,ZS,198.279248,54.429226,-4.981004,202.500207,160.921784,3066080.0,1.47051,12.259451,True,0.141663
36574,2025-04-10,192.729996,198.830002,192.309998,195.630005,1206064.0,ZS,197.655748,48.379577,-4.417123,206.239002,164.406988,3035063.2,0.397377,12.08449,True,-0.046458


# Stock Data Processing Pipeline

## 📂 File: `02_data_cleaning.ipynb`

### 🎯 Purpose
Processes raw stock price data from yFinance CSVs into an analysis-ready dataset with technical indicators and quality checks.

### 🔧 Key Features

#### 1. Data Loading
- Bulk loads 100+ stock CSV files from `../data/raw/yfinance`
- Standardizes columns: `date`, `open`, `high`, `low`, `close`, `volume`
- Handles ticker metadata and date parsing

#### 2. Data Cleaning
- Fixes date continuity with `.asfreq('D').ffill()`
- Drops NA values and invalid records
- Validations:
  - No negative prices
  - Logical OHLC relationships (high ≥ low, etc.)
  - No duplicate dates per ticker

#### 3. Feature Engineering
| Indicator          | Description                          | Calculation Method          |
|--------------------|--------------------------------------|-----------------------------|
| `sma_20`           | 20-day Simple Moving Average        | `pandas_ta.sma(close, 20)`  |
| `rsi_14`           | 14-day Relative Strength Index      | `pandas_ta.rsi(close, 14)`  |
| `macd`             | Moving Average Convergence Divergence| `pandas_ta.macd(close)`     |
| `boll_high/low`    | Bollinger Bands (2σ)                | `pandas_ta.bbands(close)`   |
| `volume_spike`     | Volume vs 20-day average            | `volume / volume_ma_20`     |
| `daily_return`     | Daily percentage change             | `close.pct_change()`        |

#### 4. Quality Checks
- Detects extreme returns (>50% daily moves)
- Validates market days vs weekends
- Ensures no data gaps >1 day (accounting for weekends)

### 📊 Output
```python
Saved to: ../data/processed/cleaned_stocks.parquet (3.5MB)
Shape: (36,175 rows × 17 columns)

⚠️ Edge Cases Handled
- Indicator warm-up periods: Handles first 19 rows for SMA20 where values would normally be NaN  
- Zero-volume days: Properly processes days with zero trading volume  
- Ticker-specific calculations**: Ensures no cross-contamination between different stocks  
- NaN filling with reasonable defaults:
  - `RSI → 50` (neutral value)
  - `ATR → 0` (no volatility)

🔄 Dependencies
pandas, pandas_ta, numpy, yfinance, tqdm, pyarrow