In [None]:
from pathlib import Path
import os, datetime as dt
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables from the .env file one directory above the
# working directory.
load_dotenv(dotenv_path=Path("..", ".env"))

# Data directories come from the environment, falling back to paths relative
# to the working directory.
RAW_DIR  = Path(os.environ.get("DATA_DIR_RAW", "../data/raw"))
PROC_DIR = Path(os.environ.get("DATA_DIR_PROCESSED", "../data/processed"))

# Create both directories up front so later writes cannot fail on a missing folder.
for _data_dir in (RAW_DIR, PROC_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

def ts():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = dt.datetime.now()
    return now.strftime("%Y%m%d-%H%M%S")

# Show the absolute locations so it is clear where files will be written.
for _label, _directory in (("RAW_DIR:", RAW_DIR), ("PROC_DIR:", PROC_DIR)):
    print(_label, _directory.resolve())


RAW_DIR: /Users/brian/bootcamp_Brian_Chang/homework/homework5/data/raw
PROC_DIR: /Users/brian/bootcamp_Brian_Chang/homework/homework5/data/processed


In [None]:
# Sample data: ten daily rows of a synthetic AAPL price series (random walk
# starting near 150). The generator is seeded so the frame is identical on
# every Restart & Run All.
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)

dates = pd.date_range("2024-01-01", periods=10, freq="D")
df = pd.DataFrame({
    "date": dates,
    "ticker": ["AAPL"] * len(dates),
    "price": 150 + rng.standard_normal(len(dates)).cumsum(),
})
df.info()

# Take the timestamp once so the CSV and Parquet copies of this run share the
# same stem even if the two writes straddle a second boundary.
run_stamp = ts()

csv_path = RAW_DIR / f"sample_{run_stamp}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV →", csv_path)

parq_path = PROC_DIR / f"sample_{run_stamp}.parquet"
try:
    df.to_parquet(parq_path)
    print("Saved Parquet →", parq_path)
except Exception as e:
    # Best effort: Parquet needs an optional engine, so report the failure
    # and let the rest of the notebook continue with the CSV copy.
    print("Parquet save failed (engine missing?).", e)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    10 non-null     datetime64[ns]
 1   ticker  10 non-null     object        
 2   price   10 non-null     float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 372.0+ bytes
Saved CSV → ../data/raw/sample_20250818-182252.csv
Saved Parquet → ../data/processed/sample_20250818-182252.parquet


In [3]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=("date","ticker","price")):
    """Run basic sanity checks on a frame that was written out and read back.

    Args:
        original: The frame as it was before being saved.
        reloaded: The frame obtained by reading the saved file.
        cols: Column names that must all be present in ``reloaded``.

    Returns:
        A dict mapping check name to bool. ``shape_equal`` and
        ``cols_present`` are always included; ``price_is_numeric`` and
        ``date_is_datetime`` are included only when ``reloaded`` has a
        ``price`` / ``date`` column respectively.
    """
    absent = [name for name in cols if name not in reloaded.columns]
    report = {
        "shape_equal": original.shape == reloaded.shape,
        "cols_present": not absent,
    }

    # (column, result key, dtype predicate) — evaluated only if the column exists.
    dtype_rules = (
        ("price", "price_is_numeric", pd.api.types.is_numeric_dtype),
        ("date", "date_is_datetime", pd.api.types.is_datetime64_any_dtype),
    )
    for column, key, predicate in dtype_rules:
        if column in reloaded:
            report[key] = predicate(reloaded[column])
    return report

# Read the CSV back, parsing "date" as datetime, and report the checks.
df_csv = pd.read_csv(csv_path, parse_dates=["date"])
csv_checks = validate_loaded(df, df_csv)
print("CSV validation:", csv_checks)

# Only attempt the Parquet round-trip when a Parquet file is actually on disk.
if parq_path.exists():
    try:
        df_parq = pd.read_parquet(parq_path)
        parquet_checks = validate_loaded(df, df_parq)
        print("Parquet validation:", parquet_checks)
    except Exception as err:
        print("Parquet read failed:", err)


CSV validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}


In [4]:
from typing import Union

def ensure_dir(path: Union[str, Path]) -> None:
    """Create the parent directory of ``path`` if it does not already exist.

    Accepts either a string or a ``Path``. Only the parent directory is
    created (including any missing ancestors); ``path`` itself is not touched.

    Args:
        path: File path whose containing directory should exist.
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, Path]):
    """Infer the storage format from the file extension (case-insensitive).

    Args:
        path: File name or path to inspect.

    Returns:
        ``"csv"`` for ``.csv``; ``"parquet"`` for ``.parquet``, ``.pq`` or ``.parq``.

    Raises:
        ValueError: If the path ends with none of the recognised extensions.
    """
    lowered = str(path).lower()
    suffix_table = (
        ("csv", (".csv",)),
        ("parquet", (".parquet", ".pq", ".parq")),
    )
    for fmt, suffixes in suffix_table:
        # str.endswith accepts a tuple and matches any of its members.
        if lowered.endswith(suffixes):
            return fmt
    raise ValueError(f"Unsupported format: {path}")

def write_df(df: pd.DataFrame, path: Union[str, Path]) -> Path:
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by file extension.

    The parent directory is created if needed. CSV output omits the index.

    Args:
        df: Frame to persist.
        path: Destination file; its extension selects the format.

    Returns:
        The destination as a ``Path``.

    Raises:
        ValueError: If the extension is not a supported format.
        RuntimeError: If the Parquet write fails. The message distinguishes a
            missing engine from any other write error; the original exception
            is chained as ``__cause__``.
    """
    path = Path(path)
    # Validate the extension first so a rejected path does not leave a stray
    # directory behind.
    fmt = detect_format(path)
    ensure_dir(path)
    if fmt == "csv":
        df.to_csv(path, index=False)
    else:  # parquet
        try:
            df.to_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
            raise RuntimeError("Parquet engine not available. Install 'pyarrow' or 'fastparquet'.") from e
        except Exception as e:
            # Keep RuntimeError for callers, but do not blame the engine for
            # unrelated failures (permissions, unsupported dtypes, ...).
            raise RuntimeError(f"Failed to write Parquet file {path}: {e}") from e
    return path

def read_df(path: Union[str, Path]) -> pd.DataFrame:
    """Read a CSV or Parquet file into a DataFrame, chosen by file extension.

    For CSV files, a ``date`` column is parsed as datetime when the header
    contains one; other columns use pandas' default inference.

    Args:
        path: File to load; its extension selects the format.

    Returns:
        The loaded frame.

    Raises:
        ValueError: If the extension is not a supported format.
        RuntimeError: If the Parquet read fails. The message distinguishes a
            missing engine from any other read error (missing or corrupt
            file, ...); the original exception is chained as ``__cause__``.
    """
    path = Path(path)
    fmt = detect_format(path)
    if fmt == "csv":
        # Read only the header first to learn whether a 'date' column exists.
        cols = pd.read_csv(path, nrows=0).columns
        parse = ["date"] if "date" in cols else None
        return pd.read_csv(path, parse_dates=parse)
    else:
        try:
            return pd.read_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet is usable.
            raise RuntimeError("Parquet engine not available. Install 'pyarrow' or 'fastparquet'.") from e
        except Exception as e:
            # Keep RuntimeError for callers, but report the real cause rather
            # than claiming the engine is missing.
            raise RuntimeError(f"Failed to read Parquet file {path}: {e}") from e

# Demo: round-trip the sample frame through the helpers for each format.
csv2 = RAW_DIR / f"sample_util_{ts()}.csv"
pq2  = PROC_DIR / f"sample_util_{ts()}.parquet"

# write_df returns the path it wrote, so it can feed read_df directly.
print("Reload CSV via util:", read_df(write_df(df, csv2)).shape)

try:
    print("Reload Parquet via util:", read_df(write_df(df, pq2)).shape)
except RuntimeError as err:
    # The helpers raise RuntimeError on Parquet failure; report it and move on.
    print("Parquet util demo skipped:", err)


Reload CSV via util: (10, 3)
Reload Parquet via util: (10, 3)
