In [6]:
from pathlib import Path
import sys, os
import pandas as pd

# Resolve the project root relative to the notebook's working directory and
# expose <root>/src on the import path so `from cleaning import ...` works.
PROJECT_ROOT = Path("..").resolve()
src_dir = str(PROJECT_ROOT / "src")
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from cleaning import (
    sort_and_cast_ohlcv,
    ffill_ohlcv_by_date,
    add_returns,
    winsorize_zscores,
)

# Input and output data folders; only the processed folder is created on demand.
RAW = PROJECT_ROOT.joinpath("data", "raw")
PROC = PROJECT_ROOT.joinpath("data", "processed")
PROC.mkdir(parents=True, exist_ok=True)

RAW, PROC


(PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/raw'),
 PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/processed'))

In [7]:
def latest(pattern: str) -> Path | None:
    files = sorted(RAW.glob(pattern))
    return files[-1] if files else None

# Select the yfinance download to clean. Fail here with a clear message
# instead of passing None on to pd.read_csv in a later cell.
raw_prices = latest("api_yfinance_*.csv")
if raw_prices is None:
    raise FileNotFoundError(f"No file matching 'api_yfinance_*.csv' found in {RAW}")
raw_prices


PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/raw/api_yfinance_AAPL_20250817-2321.csv')

In [8]:
df = pd.read_csv(raw_prices)

# Quick sanity triple: first rows, (rows, cols), and per-column missing-value counts.
preview_rows = df.head()
missing_per_col = df.isna().sum()
preview_rows, df.shape, missing_per_col


(                        date        open        high         low       close  \
 0  2024-08-16 00:00:00-04:00  222.882700  225.779224  222.613947  225.002838   
 1  2024-08-19 00:00:00-04:00  224.674371  224.943125  222.006778  224.843582   
 2  2024-08-20 00:00:00-04:00  224.724146  226.117655  224.405621  225.460709   
 3  2024-08-21 00:00:00-04:00  225.470651  226.923879  224.007459  225.351196   
 4  2024-08-22 00:00:00-04:00  226.734776  227.282232  222.862797  223.489883   
 
      volume  Dividends  Stock Splits  
 0  44340200        0.0           0.0  
 1  40687800        0.0           0.0  
 2  30299000        0.0           0.0  
 3  34765500        0.0           0.0  
 4  43695300        0.0           0.0  ,
 (250, 8),
 date            0
 open            0
 high            0
 low             0
 close           0
 volume          0
 Dividends       0
 Stock Splits    0
 dtype: int64)

In [10]:
# Cleaning pipeline: each step takes the previous frame and returns the next,
# so the raw `df` stays available for the before/after comparison.

# 1) Normalize schema & dtypes.
df1 = sort_and_cast_ohlcv(df)

# 2) Forward-fill small gaps (OHLCV only).
df2 = ffill_ohlcv_by_date(df1)

# 3) Returns & z-scores.
df3 = add_returns(df2, price_col="close", ret_col="ret_1d", ret_z_col="ret_1d_z")

# 4) Winsorize extreme outliers (e.g. z=5).
# NOTE(review): only "ret_1d" is passed here, so "ret_1d_z" from step 3 is
# presumably left as computed on the un-winsorized returns; confirm that is intended.
df4 = winsorize_zscores(df3, columns=["ret_1d"], z=5.0)

df4.head(), df4.shape, df4.isna().sum()

  dt = pd.to_datetime(s, format="%Y-%m-%d %H:%M:%S%z", errors="coerce")
  dt = dt.fillna(pd.to_datetime(s, errors="coerce", utc=False))


(                        date        open        high         low       close  \
 0  2024-08-16 00:00:00-04:00  222.882700  225.779224  222.613947  225.002838   
 1  2024-08-19 00:00:00-04:00  224.674371  224.943125  222.006778  224.843582   
 2  2024-08-20 00:00:00-04:00  224.724146  226.117655  224.405621  225.460709   
 3  2024-08-21 00:00:00-04:00  225.470651  226.923879  224.007459  225.351196   
 4  2024-08-22 00:00:00-04:00  226.734776  227.282232  222.862797  223.489883   
 
      volume  Dividends  Stock Splits    ret_1d  ret_1d_z  
 0  44340200        0.0           0.0       NaN       NaN  
 1  40687800        0.0           0.0 -0.070780 -0.050741  
 2  30299000        0.0           0.0  0.274469  0.120104  
 3  34765500        0.0           0.0 -0.048573 -0.039752  
 4  43695300        0.0           0.0 -0.825961 -0.424439  ,
 (250, 10),
 date            0
 open            0
 high            0
 low             0
 close           0
 volume          0
 Dividends       0
 Stock Splits    0
 ret_1d          1
 ret_1d_z        1
 dtype: int64)

In [11]:
from datetime import datetime
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string."""
    right_now = datetime.now()
    return right_now.strftime("%Y%m%d-%H%M%S")

# Write the cleaned frame to the processed folder under a timestamped name,
# so each run (to one-second resolution) produces its own file.
clean_path = PROC.joinpath(f"prices_preprocessed_{ts()}.csv")
df4.to_csv(clean_path, index=False)
clean_path


PosixPath('/Users/brian/bootcamp_Brian_Chang/project/data/processed/prices_preprocessed_20250820-120204.csv')

In [12]:
# Before/after report comparing the raw frame (`df`) with the cleaned one (`df4`).
summary = {}
summary["orig_shape"] = tuple(df.shape)
summary["clean_shape"] = tuple(df4.shape)
summary["orig_na_counts"] = df.isna().sum().to_dict()
summary["clean_na_counts"] = df4.isna().sum().to_dict()
summary["dtypes_after"] = df4.dtypes.astype(str).to_dict()

# The date span is only reported when the cleaned frame has a "date" column.
if "date" in df4:
    date_col = df4["date"]
    summary["date_span"] = (str(date_col.min()), str(date_col.max()))
else:
    summary["date_span"] = None

summary


{'orig_shape': (250, 8),
 'clean_shape': (250, 10),
 'orig_na_counts': {'date': 0,
  'open': 0,
  'high': 0,
  'low': 0,
  'close': 0,
  'volume': 0,
  'Dividends': 0,
  'Stock Splits': 0},
 'clean_na_counts': {'date': 0,
  'open': 0,
  'high': 0,
  'low': 0,
  'close': 0,
  'volume': 0,
  'Dividends': 0,
  'Stock Splits': 0,
  'ret_1d': 1,
  'ret_1d_z': 1},
 'dtypes_after': {'date': 'object',
  'open': 'float64',
  'high': 'float64',
  'low': 'float64',
  'close': 'float64',
  'volume': 'Int64',
  'Dividends': 'float64',
  'Stock Splits': 'float64',
  'ret_1d': 'float64',
  'ret_1d_z': 'float64'},
 'date_span': ('2024-08-16 00:00:00-04:00', '2025-08-15 00:00:00-04:00')}