In [None]:
import pandas as pd
import os
import pandas_ta as ta  # Technical indicators
from tqdm import tqdm   # Progress bars

# Configuration: both paths are relative, so they resolve against the
# directory the notebook kernel is running in.
RAW_DIR = "../data/raw/yfinance"  # folder that load_all_yfinance() scans for .csv files
PROCESSED_DIR = "../data/processed"  # NOTE(review): not referenced by the cells shown here; presumably the output folder


Bulk Load All YFinance Files


In [None]:
def load_all_yfinance():
    """Load every Yahoo Finance CSV in ``RAW_DIR`` into one long-format frame.

    Each file is read with its first three lines skipped (the multi-row
    header) and its columns named positionally as date, close, high, low,
    open, volume. The ticker is the file name with only the final extension
    removed, so ``BRK.B.csv`` yields ``BRK.B``. A file that cannot be read
    is reported on stdout and skipped.

    Returns:
        pd.DataFrame: rows of all readable files stacked in file-name order,
        with a fresh 0..n-1 index and an added ``ticker`` column.

    Raises:
        ValueError: if no CSV file in ``RAW_DIR`` could be loaded.
    """
    all_dfs = []

    # Sorted so the row order of the result does not depend on the arbitrary
    # order in which the operating system lists the directory.
    csv_files = sorted(name for name in os.listdir(RAW_DIR) if name.endswith('.csv'))

    for file in tqdm(csv_files):
        try:
            # splitext strips only the trailing ".csv"; split('.')[0] would
            # cut a dotted ticker such as "BRK.B" down to "BRK".
            ticker = os.path.splitext(file)[0]

            # Skip the first 3 rows of metadata
            df = pd.read_csv(
                os.path.join(RAW_DIR, file),
                skiprows=3,
                names=['date', 'close', 'high', 'low', 'open', 'volume'],
                parse_dates=['date']
            )

            df['ticker'] = ticker
            all_dfs.append(df)

        except Exception as e:
            # Best effort: one unreadable file must not abort the bulk load.
            print(f"⚠️ Failed {file}: {str(e)}")

    if not all_dfs:
        # pd.concat([]) would raise the same exception type with a message
        # that does not mention where the data was expected.
        raise ValueError(f"No CSV files could be loaded from {RAW_DIR!r}")

    return pd.concat(all_dfs, ignore_index=True)

# Run the bulk loader once, then report how much data came back
df = load_all_yfinance()
row_count, ticker_count = len(df), df['ticker'].nunique()
print(f"✅ Loaded {row_count} rows from {ticker_count} stocks")

Code Verification: spot-check the loaded data

In [None]:
# Spot-check: first ten rows of a single ticker
aapl_rows = df.loc[df['ticker'] == 'AAPL']
print(aapl_rows.head(10))

# Distinct tickers in order of first appearance; preview the first five
unique_tickers = df['ticker'].drop_duplicates().tolist()
print(f"Tickers: {unique_tickers[:5]}...")

Data Cleaning Pipeline
Goal: Fix common data quality issues

Why?
- Ensures dates are recognized as timestamps (not strings)
- Handles market closures without leaving gaps
- Guarantees no NaN values break your models



In [None]:
def clean_data(df):
    """Return a cleaned copy of the price frame; the input is left untouched.

    Steps:
      1. Convert ``date`` to datetime.
      2. Sort by ticker, then by date.
      3. Forward-fill missing values within each ticker, so a value is never
         carried over from one ticker into another.
      4. Drop rows that still contain NaN (for example a gap at the very
         start of a ticker's history, which has nothing to fill from).

    Args:
        df: DataFrame with at least ``ticker`` and ``date`` columns.

    Returns:
        pd.DataFrame: the cleaned rows, sorted by ticker and date. The index
        is flat (the original row labels) and ``ticker`` stays an ordinary
        column.
    """
    # Work on a copy: assigning to df['date'] directly would silently
    # rewrite the caller's frame.
    cleaned = df.copy()

    # Convert text dates to proper datetime format (essential for time series)
    cleaned['date'] = pd.to_datetime(cleaned['date'])
    cleaned = cleaned.sort_values(['ticker', 'date'])

    # GroupBy.ffill keeps the original index. groupby(...).apply(...) can
    # prepend the group key as an index level instead, which leaves 'ticker'
    # as both a column and an index level and makes later groupby('ticker')
    # calls ambiguous.
    fill_cols = [col for col in cleaned.columns if col != 'ticker']
    cleaned[fill_cols] = cleaned.groupby('ticker')[fill_cols].ffill()

    # Remove any remaining bad rows
    return cleaned.dropna()

In [None]:
# Apply the cleaning step defined in this section. Without this call
# clean_data() is never used, so this report and every later cell would run
# on the uncleaned frame. reset_index(drop=True) leaves a flat 0..n-1 index
# whatever index clean_data() returns.
df = clean_data(df).reset_index(drop=True)

# NaN count per column, and the grand total derived from it
nan_per_column = df.isna().sum()
total_nans = nan_per_column.sum()
print(f"Total NaN values: {total_nans}")

print("\nNaN per column:")
print(nan_per_column)
print(df.shape)
print(df.head(10))

Feature Engineering
Goal: Add technical indicators traders use

Why These Indicators?

SMA: Smooths price noise to reveal trends

RSI: Identifies potential reversals (values >70 = overbought, <30 = oversold)

MACD: Shows momentum shifts

Bollinger Bands: Highlights volatility extremes

In [None]:
def _indicator_column(result, index, column=None):
    """Return one indicator as a Series; all-NaN over ``index`` if ``result`` is None."""
    if result is None:
        # NOTE(review): pandas_ta appears to return None rather than raising
        # when a series is shorter than the indicator's window - confirm
        # against the installed version.
        return pd.Series(float('nan'), index=index, dtype='float64')
    return result if column is None else result[column]


def add_technical_indicators(df):
    """Append SMA, RSI, MACD and Bollinger-band columns, computed per ticker.

    Added columns: ``sma_20``, ``rsi_14``, ``macd`` (the ``MACD_12_26_9``
    line), ``boll_high`` and ``boll_low`` (the ``BBU_5_2.0`` / ``BBL_5_2.0``
    columns of ``ta.bbands`` called with its default settings).

    Args:
        df: DataFrame with ``ticker`` and ``close`` columns. Rows are used in
            the order given, so each ticker should already be sorted by date.

    Returns:
        pd.DataFrame: a new frame with the five indicator columns added,
        grouped by ticker in sorted order. Row labels are the original ones
        and ``ticker`` stays an ordinary column. An indicator that cannot be
        computed for a ticker is filled with NaN.
    """
    indicator_names = ['sma_20', 'rsi_14', 'macd', 'boll_high', 'boll_low']
    frames = []

    # Iterating the groups (instead of groupby.apply) keeps the 'ticker'
    # column in every sub-frame and never adds the group key to the index.
    for _, prices in df.groupby('ticker'):
        close = prices['close']
        macd_lines = ta.macd(close)
        bands = ta.bbands(close)  # computed once; both bands are read from it

        frames.append(prices.assign(
            sma_20=_indicator_column(ta.sma(close, 20), close.index),       # 20-day moving average (trend direction)
            rsi_14=_indicator_column(ta.rsi(close, 14), close.index),       # Relative Strength Index (overbought/oversold)
            macd=_indicator_column(macd_lines, close.index, 'MACD_12_26_9'),  # MACD (momentum)
            boll_high=_indicator_column(bands, close.index, 'BBU_5_2.0'),   # Bollinger Upper Band (volatility)
            boll_low=_indicator_column(bands, close.index, 'BBL_5_2.0'),    # Bollinger Lower Band
        ))

    if not frames:
        # No groups at all (empty input): return the same schema, no rows.
        return df.assign(**{name: float('nan') for name in indicator_names})

    return pd.concat(frames)

In [None]:
# Build the feature table from the price frame
df_features = add_technical_indicators(df)

# List every column so the newly added indicators are visible
print("New columns:", list(df_features.columns))

# Last three AAPL rows, indicator values included
print("\nAAPL data with indicators:")
aapl_mask = df_features['ticker'] == 'AAPL'
print(df_features[aapl_mask].tail(3))

In [None]:
# Forward-fill gaps within each ticker only. A frame-wide ffill() would copy
# the last indicator values of one ticker into the NaN warm-up rows of the
# next ticker, and those rows would then survive dropna() carrying another
# stock's numbers.
value_cols = [col for col in df_features.columns if col != 'ticker']

# Group by the column's values rather than by the label 'ticker', so this
# also works when 'ticker' is both a column and an index level.
ticker_keys = df_features['ticker'].to_numpy()
filled = df_features.groupby(ticker_keys)[value_cols].ffill()

df_features = df_features.copy()
df_features[value_cols] = filled[value_cols]

# Drop what is still NaN: the leading warm-up rows of each ticker, which
# have no earlier value to fill from
df_features = df_features.dropna()

print("Remaining NaNs:", df_features.isna().sum().sum())  # Should be 0
print(df_features.head())

In [None]:
# For every ticker: order its dates, take consecutive differences, and keep
# the largest one
dates_by_ticker = df.groupby('ticker')['date']
date_gaps = dates_by_ticker.apply(lambda dates: dates.sort_values().diff().max())

print("Max gap between dates per ticker (should be <=3 days for weekends):")
# Tally how many tickers share each maximum gap (in whole days)
gap_days = date_gaps.dt.days
print(gap_days.value_counts())