In [2]:
from pathlib import Path
import os, datetime as dt
import pandas as pd
from dotenv import load_dotenv


# Project root is one level up from this notebook
PROJECT_ROOT = Path("..").resolve()

# Load .env from project root
load_dotenv(PROJECT_ROOT / ".env")

# Read dirs from env; they are relative to project root
RAW_DIR  = PROJECT_ROOT / os.getenv("DATA_DIR_RAW", "data/raw")
PROC_DIR = PROJECT_ROOT / os.getenv("DATA_DIR_PROCESSED", "data/processed")

RAW_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

def ts() -> str:
    return dt.datetime.now().strftime("%Y%m%d-%H%M%S")

print("RAW_DIR:", RAW_DIR)
print("PROC_DIR:", PROC_DIR)


RAW_DIR: /Users/brian/bootcamp_Brian_Chang/project/data/raw
PROC_DIR: /Users/brian/bootcamp_Brian_Chang/project/data/processed


In [3]:
from typing import Optional, List

def latest_raw(pattern: str = "api_yfinance_*.csv") -> Optional[Path]:
    files: List[Path] = sorted(RAW_DIR.glob(pattern))
    return files[-1] if files else None

latest = latest_raw()
if latest and latest.exists():
    print("Using latest raw file:", latest.name)
    df = pd.read_csv(latest, parse_dates=["date"])
else:
    print("No Stage 04 raw found; using a small sample DataFrame.")
    import numpy as np
    dates = pd.date_range("2024-01-01", periods=10, freq="D")
    df = pd.DataFrame({
        "date": dates,
        "ticker": ["AAPL"]*10,
        "price": 150 + np.random.randn(10).cumsum()
    })

df.info()
df.head()


Using latest raw file: api_yfinance_AAPL_20250817-2321.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          250 non-null    object 
 1   open          250 non-null    float64
 2   high          250 non-null    float64
 3   low           250 non-null    float64
 4   close         250 non-null    float64
 5   volume        250 non-null    int64  
 6   Dividends     250 non-null    float64
 7   Stock Splits  250 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 15.8+ KB


Unnamed: 0,date,open,high,low,close,volume,Dividends,Stock Splits
0,2024-08-16 00:00:00-04:00,222.8827,225.779224,222.613947,225.002838,44340200,0.0,0.0
1,2024-08-19 00:00:00-04:00,224.674371,224.943125,222.006778,224.843582,40687800,0.0,0.0
2,2024-08-20 00:00:00-04:00,224.724146,226.117655,224.405621,225.460709,30299000,0.0,0.0
3,2024-08-21 00:00:00-04:00,225.470651,226.923879,224.007459,225.351196,34765500,0.0,0.0
4,2024-08-22 00:00:00-04:00,226.734776,227.282232,222.862797,223.489883,43695300,0.0,0.0


In [4]:
# Save CSV in RAW
csv_path = RAW_DIR / f"storage_demo_{ts()}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV →", csv_path)

# Save Parquet in PROCESSED
parq_path = PROC_DIR / f"storage_demo_{ts()}.parquet"
try:
    df.to_parquet(parq_path)  # requires pyarrow or fastparquet
    print("Saved Parquet →", parq_path)
except Exception as e:
    print("Parquet save failed (engine missing?).", e)


Saved CSV → /Users/brian/bootcamp_Brian_Chang/project/data/raw/storage_demo_20250818-185020.csv
Saved Parquet → /Users/brian/bootcamp_Brian_Chang/project/data/processed/storage_demo_20250818-185020.parquet


In [5]:
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype

def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=("date","ticker")):
    checks = {
        "shape_equal": original.shape == reloaded.shape,
        "cols_present": all(c in reloaded.columns for c in cols),
    }
    if "date" in reloaded.columns:
        checks["date_is_datetime"] = is_datetime64_any_dtype(reloaded["date"])
    # check any numeric columns you care about:
    for col in ("price","open","high","low","close","volume"):
        if col in reloaded.columns:
            checks[f"{col}_is_numeric"] = is_numeric_dtype(reloaded[col])
    return checks

# reload CSV (parse 'date' if present)
csv_cols = pd.read_csv(csv_path, nrows=0).columns
parse = ["date"] if "date" in csv_cols else None
df_csv = pd.read_csv(csv_path, parse_dates=parse)
print("CSV validation:", validate_loaded(df, df_csv))

# reload Parquet (if created)
if parq_path.exists():
    try:
        df_parq = pd.read_parquet(parq_path)
        print("Parquet validation:", validate_loaded(df, df_parq))
    except Exception as e:
        print("Parquet read failed:", e)
else:
    print("Parquet file not present (skipped earlier).")


CSV validation: {'shape_equal': True, 'cols_present': False, 'date_is_datetime': False, 'open_is_numeric': True, 'high_is_numeric': True, 'low_is_numeric': True, 'close_is_numeric': True, 'volume_is_numeric': True}
Parquet validation: {'shape_equal': True, 'cols_present': False, 'date_is_datetime': True, 'open_is_numeric': True, 'high_is_numeric': True, 'low_is_numeric': True, 'close_is_numeric': True, 'volume_is_numeric': True}


In [6]:
import sys
sys.path.append(str(PROJECT_ROOT / "src"))

from utils_storage import write_df, read_df

# demo
csv2 = RAW_DIR  / f"storage_util_{ts()}.csv"
pq2  = PROC_DIR / f"storage_util_{ts()}.parquet"

write_df(df, csv2)
print("Reloaded CSV via util:", read_df(csv2).shape)

try:
    write_df(df, pq2)
    print("Reloaded Parquet via util:", read_df(pq2).shape)
except RuntimeError as e:
    print("Parquet util demo skipped:", e)


Reloaded CSV via util: (250, 8)
Reloaded Parquet via util: (250, 8)
