In [247]:
import pandas as pd
import os
import pandas_ta as ta  # Technical indicators
from tqdm import tqdm   # Progress bars

# Configuration
RAW_DIR = "../data/raw/yfinance"
PROCESSED_DIR = "../data/processed"


Bulk Load All YFinance Files


In [248]:
def load_all_yfinance():
    """Load Yahoo Finance CSVs with multi-row headers"""
    all_dfs = []
    
    for file in tqdm(os.listdir(RAW_DIR)):
        if file.endswith('.csv'):
            try:
                ticker = file.split('.')[0]
                
                # Skip the first 3 rows of metadata
                df = pd.read_csv(
                    f"{RAW_DIR}/{file}",
                    skiprows=3,
                    names=['date', 'close', 'high', 'low', 'open', 'volume'],
                    parse_dates=['date']
                )
                
                df['ticker'] = ticker
                all_dfs.append(df)
                
            except Exception as e:
                print(f"⚠️ Failed {file}: {str(e)}")
                continue
                
    return pd.concat(all_dfs, ignore_index=True)

df = load_all_yfinance()
print(f"✅ Loaded {len(df)} rows from {df['ticker'].nunique()} stocks")

  0%|          | 0/102 [00:00<?, ?it/s]

100%|██████████| 102/102 [00:00<00:00, 438.48it/s]

✅ Loaded 25000 rows from 100 stocks



  return pd.concat(all_dfs, ignore_index=True)


code verification 

In [249]:
print(df[df['ticker'] == 'AAPL'].head(10))
unique_tickers = df['ticker'].unique().tolist()
print(f"Tickers: {unique_tickers[:5]}...")  

        date       close        high         low        open     volume ticker
0 2024-04-01  169.230927  170.445194  168.683508  170.385479   46240500   AAPL
1 2024-04-02  168.046494  168.544144  167.439360  168.285371   49329500   AAPL
2 2024-04-03  168.852692  169.877850  167.787728  167.996733   47691700   AAPL
3 2024-04-04  168.026611  171.112033  168.026611  169.489689   53704400   AAPL
4 2024-04-05  168.783051  169.589241  168.156006  168.792998   42055200   AAPL
5 2024-04-08  167.658356  168.404831  167.449351  168.235632   37425500   AAPL
6 2024-04-09  168.872620  169.280696  167.558831  167.907177   42451200   AAPL
7 2024-04-10  166.991486  168.295327  166.324636  168.006696   49709300   AAPL
8 2024-04-11  174.217361  174.635401  167.369706  167.548852   91070300   AAPL
9 2024-04-12  175.720276  177.521767  173.391277  173.441030  101593300   AAPL
Tickers: ['AAPL', 'ABNB', 'ADBE', 'ADI', 'ADP']...


Data Cleaning Pipeline
Goal: Fix common data quality issues

Why?
Ensures dates are recognized as timestamps (not strings)
, Handles market closures without leaving gaps
, Guarantees no NaN values break your models



In [250]:
def clean_data(df):
    # Convert text dates to proper datetime format (essential for time series)
    df['date'] = pd.to_datetime(df['date'])  
    
    # Forward-fill missing values (e.g., weekends/holidays when markets are closed)
    df = df.sort_values(['ticker', 'date'])
    df = df.groupby('ticker').apply(lambda x: x.ffill())  # Carry last known value forward
    
    # Remove any remaining bad rows
    return df.dropna()

In [251]:
# Total NaN values in entire DataFrame
total_nans = df.isna().sum().sum()
print(f"Total NaN values: {total_nans}")

# NaN count per column
nan_per_column = df.isna().sum()
print("\nNaN per column:")
print(nan_per_column)
print(df.shape)
print(df.head(10))

Total NaN values: 0

NaN per column:
date      0
close     0
high      0
low       0
open      0
volume    0
ticker    0
dtype: int64
(25000, 7)
        date       close        high         low        open     volume ticker
0 2024-04-01  169.230927  170.445194  168.683508  170.385479   46240500   AAPL
1 2024-04-02  168.046494  168.544144  167.439360  168.285371   49329500   AAPL
2 2024-04-03  168.852692  169.877850  167.787728  167.996733   47691700   AAPL
3 2024-04-04  168.026611  171.112033  168.026611  169.489689   53704400   AAPL
4 2024-04-05  168.783051  169.589241  168.156006  168.792998   42055200   AAPL
5 2024-04-08  167.658356  168.404831  167.449351  168.235632   37425500   AAPL
6 2024-04-09  168.872620  169.280696  167.558831  167.907177   42451200   AAPL
7 2024-04-10  166.991486  168.295327  166.324636  168.006696   49709300   AAPL
8 2024-04-11  174.217361  174.635401  167.369706  167.548852   91070300   AAPL
9 2024-04-12  175.720276  177.521767  173.391277  173.441030  101

Feature Engineering
Goal: Add technical indicators traders use

Why These Indicators?

SMA: Smooths price noise to reveal trends

RSI: Identifies potential reversals (values >70 = overbought, <30 = oversold)

MACD: Shows momentum shifts

Bollinger Bands: Highlights volatility extremes

In [252]:
def add_technical_indicators(df):
    # Calculate indicators PER STOCK (groupby ensures no cross-contamination)
    return df.groupby('ticker').apply(lambda x: x.assign(
        sma_20=ta.sma(x['close'], 20),       # 20-day moving average (trend direction)
        rsi_14=ta.rsi(x['close'], 14),       # Relative Strength Index (overbought/oversold)
        macd=ta.macd(x['close'])['MACD_12_26_9'],  # MACD (momentum)
        boll_high=ta.bbands(x['close'])['BBU_5_2.0'],  # Bollinger Upper Band (volatility)
        boll_low=ta.bbands(x['close'])['BBL_5_2.0']    # Bollinger Lower Band
    ))

In [253]:
df_features = add_technical_indicators(df)
# Check new columns
print("New columns:", df_features.columns.tolist())
# See sample data for AAPL
print("\nAAPL data with indicators:")
print(df_features[df_features['ticker'] == 'AAPL'].tail(3))

New columns: ['date', 'close', 'high', 'low', 'open', 'volume', 'ticker', 'sma_20', 'rsi_14', 'macd', 'boll_high', 'boll_low']

AAPL data with indicators:
                 date       close        high         low        open  \
ticker                                                                  
AAPL   247 2025-03-26  221.529999  225.020004  220.470001  223.509995   
       248 2025-03-27  223.850006  224.990005  220.559998  221.389999   
       249 2025-03-28  217.899994  223.809998  217.679993  221.669998   

              volume ticker    sma_20     rsi_14      macd   boll_high  \
ticker                                                                   
AAPL   247  34532700   AAPL  224.6010  44.768952 -4.701965  226.261696   
       248  37094800   AAPL  223.9285  47.775590 -4.108380  225.776406   
       249  39784100   AAPL  222.7315  41.531256 -4.071146  225.945208   

              boll_low  
ticker                  
AAPL   247  213.090306  
       248  217.475596  
       2

  return df.groupby('ticker').apply(lambda x: x.assign(


In [254]:
# Forward-fill missing values (carry last known value forward)
df_features = df_features.ffill()

# OR drop early rows with NaN
df_features = df_features.dropna()

print("Remaining NaNs:", df_features.isna().sum().sum())  # Should be 0
print(df_features.head())

Remaining NaNs: 0
                date       close        high         low        open  \
ticker                                                                 
AAPL   25 2024-05-06  180.856033  183.334321  179.572087  181.493025   
       26 2024-05-07  181.542770  184.031021  180.467859  182.587839   
       27 2024-05-08  181.881195  182.209646  180.597249  181.990679   
       28 2024-05-09  183.702591  183.792164  181.254145  181.702028   
       29 2024-05-10  182.436859  184.470019  181.519943  184.280653   

             volume ticker      sma_20     rsi_14      macd   boll_high  \
ticker                                                                    
AAPL   25  78569700   AAPL  170.363071  66.934952  2.860254  186.397304   
       26  77305800   AAPL  170.996579  67.682981  3.347573  188.474161   
       27  45057100   AAPL  171.741064  68.066382  3.718223  187.464700   
       28  48983000   AAPL  172.215326  70.120928  4.111543  184.028681   
       29  50759500   AAPL 

  df_features = df_features.ffill()


In [255]:
# Check if any ticker has date gaps
date_gaps = df.groupby('ticker')['date'].apply(lambda x: x.sort_values().diff().max())
print("Max gap between dates per ticker (should be <=3 days for weekends):")
print(date_gaps.dt.days.value_counts())

Max gap between dates per ticker (should be <=3 days for weekends):
date
4    100
Name: count, dtype: int64
