In [1]:
from pathlib import Path
import sys, os
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
sys.path.append(str(PROJECT_ROOT / "src"))  # so we can `from cleaning import ...`

from cleaning import (
    sort_and_cast_ohlcv,
    ffill_ohlcv_by_date,
    add_returns,
    fill_missing_median,
    drop_missing,
    normalize_data,
    clip_extreme_zscores,
)

RAW  = PROJECT_ROOT / "data" / "raw"
PROC = PROJECT_ROOT / "data" / "processed"
PROC.mkdir(parents=True, exist_ok=True)

RAW, PROC


(PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/raw'),
 PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/processed'))

In [2]:
def latest(pattern: str) -> Path | None:
    files = sorted(RAW.glob(pattern))
    return files[-1] if files else None

raw_prices = latest("api_yfinance_*.csv")
raw_prices


PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/raw/api_yfinance_AAPL_20250817-2321.csv')

In [3]:
df = pd.read_csv(raw_prices)
df.head(), df.shape, df.isna().sum()


(                        date        open        high         low       close  \
 0  2024-08-16 00:00:00-04:00  222.882700  225.779224  222.613947  225.002838   
 1  2024-08-19 00:00:00-04:00  224.674371  224.943125  222.006778  224.843582   
 2  2024-08-20 00:00:00-04:00  224.724146  226.117655  224.405621  225.460709   
 3  2024-08-21 00:00:00-04:00  225.470651  226.923879  224.007459  225.351196   
 4  2024-08-22 00:00:00-04:00  226.734776  227.282232  222.862797  223.489883   
 
      volume  Dividends  Stock Splits  
 0  44340200        0.0           0.0  
 1  40687800        0.0           0.0  
 2  30299000        0.0           0.0  
 3  34765500        0.0           0.0  
 4  43695300        0.0           0.0  ,
 (250, 8),
 date            0
 open            0
 high            0
 low             0
 close           0
 volume          0
 Dividends       0
 Stock Splits    0
 dtype: int64)

In [4]:
# 1) normalize schema & sort
df1 = sort_and_cast_ohlcv(df)

# 2) forward-fill tiny gaps in OHLCV (safe for occasional NaNs)
df2 = ffill_ohlcv_by_date(df1)

# 3) compute 1-day return from close
df3 = add_returns(df2, price_col="close", out_col="ret_1d")

# 4) optional: drop rows with too many NaNs (should be rare after ffill)
df4 = drop_missing(df3, threshold=0.6, axis="rows")

# 5) optional: clip extreme z-scores on returns to dampen huge outliers
df5 = clip_extreme_zscores(df4, columns=["ret_1d"], z=5.0)

df5.head(), df5.isna().sum()


  out["date"] = pd.to_datetime(out["date"], errors="coerce")


(                       date        open        high         low       close  \
 0 2024-08-16 00:00:00-04:00  222.882700  225.779224  222.613947  225.002838   
 1 2024-08-19 00:00:00-04:00  224.674371  224.943125  222.006778  224.843582   
 2 2024-08-20 00:00:00-04:00  224.724146  226.117655  224.405621  225.460709   
 3 2024-08-21 00:00:00-04:00  225.470651  226.923879  224.007459  225.351196   
 4 2024-08-22 00:00:00-04:00  226.734776  227.282232  222.862797  223.489883   
 
      volume  Dividends  Stock Splits    ret_1d  
 0  44340200        0.0           0.0       NaN  
 1  40687800        0.0           0.0 -0.056532  
 2  30299000        0.0           0.0  0.114122  
 3  34765500        0.0           0.0 -0.045556  
 4  43695300        0.0           0.0 -0.429814  ,
 date            84
 open             0
 high             0
 low              0
 close            0
 volume           0
 Dividends        0
 Stock Splits     0
 ret_1d           1
 dtype: int64)

In [6]:
from datetime import datetime
def ts(): return datetime.now().strftime("%Y%m%d-%H%M%S")

clean_path = PROC / f"prices_preprocessed_{ts()}.csv"
df5.to_csv(clean_path, index=False)
clean_path


PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/processed/prices_preprocessed_20250819-223535.csv')

In [7]:
summary = {
    "orig_shape": tuple(df.shape),
    "clean_shape": tuple(df5.shape),
    "orig_na_counts": df.isna().sum().to_dict(),
    "clean_na_counts": df5.isna().sum().to_dict(),
    "dtypes_after": df5.dtypes.astype(str).to_dict(),
    "date_span": (str(df5["date"].min()), str(df5["date"].max())) if "date" in df5 else None,
}
summary


{'orig_shape': (250, 8),
 'clean_shape': (250, 9),
 'orig_na_counts': {'date': 0,
  'open': 0,
  'high': 0,
  'low': 0,
  'close': 0,
  'volume': 0,
  'Dividends': 0,
  'Stock Splits': 0},
 'clean_na_counts': {'date': 84,
  'open': 0,
  'high': 0,
  'low': 0,
  'close': 0,
  'volume': 0,
  'Dividends': 0,
  'Stock Splits': 0,
  'ret_1d': 1},
 'dtypes_after': {'date': 'datetime64[ns, UTC-04:00]',
  'open': 'float64',
  'high': 'float64',
  'low': 'float64',
  'close': 'float64',
  'volume': 'Int64',
  'Dividends': 'float64',
  'Stock Splits': 'float64',
  'ret_1d': 'float64'},
 'date_span': ('2024-08-16 00:00:00-04:00', '2025-08-15 00:00:00-04:00')}