# Stage 5

---
## Step 1: Save in Two Formats (CSV and Parquet)

In [1]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment
# so the os.getenv lookups for DATA_DIR_RAW / DATA_DIR_PROCESSED can pick them up.
load_dotenv()

False

In [12]:
# Storage locations: taken from the environment when set, otherwise
# relative defaults under the current working directory.
RAW = pathlib.Path(os.environ.get('DATA_DIR_RAW', 'data/raw'))
PROC = pathlib.Path(os.environ.get('DATA_DIR_PROCESSED', 'data/processed'))

# Create both directories up front; existing directories are left untouched.
for _folder in (RAW, PROC):
    _folder.mkdir(parents=True, exist_ok=True)
del _folder  # do not leave the loop variable in the notebook namespace

# Show the absolute locations so it is clear where files will land.
print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

RAW -> /Users/shiyuxuan/bootcamp-Yuxuan-Shi/homework/homework5/notebooks/data/raw
PROC -> /Users/shiyuxuan/bootcamp-Yuxuan-Shi/homework/homework5/notebooks/data/processed


In [8]:
# Create a small synthetic dataset: 20 daily rows for a single ticker.
import numpy as np

# Seed the generator so Restart & Run All reproduces the same prices.
SEED = 42
rng = np.random.default_rng(SEED)

dates = pd.date_range('20230101', periods=20)
# NOTE(review): cumprod of standard-normal draws is not a random walk (the
# running product tends to collapse toward 0); cumsum may have been intended — confirm.
df = pd.DataFrame({
    'date': dates,
    'ticker': ['META'] * 20,
    'price': 150 + rng.standard_normal(20).cumprod(),
})
df.head()

Unnamed: 0,date,ticker,price
0,2023-01-01,META,151.764261
1,2023-01-02,META,152.036332
2,2023-01-03,META,153.994215
3,2023-01-04,META,157.288663
4,2023-01-05,META,147.232428


In [None]:
def ts():
    """Return the current local time as a 'YYYYMMDD-HHMMSS' string (used in file names)."""
    now = dt.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')
# Take one timestamp for both files so the CSV/Parquet pair always share the
# same name stem (two separate ts() calls can straddle a second boundary).
stamp = ts()

# Save as csv
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)

# Save as parquet. This is best effort: on failure pq_path is set to None so
# later cells can tell the Parquet step was skipped.
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure (permissions, unsupported dtype, ...) is reported as
    # what it is instead of being mislabelled as a missing engine.
    print('Parquet write failed:', e)
    pq_path = None
csv_path, pq_path

(PosixPath('data/raw/sample_20250818-005657.csv'),
 PosixPath('data/processed/sample_20250818-005657.parquet'))

---
## Step 2: Reload and Validate

In [22]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=('date','ticker','price')):
    """Run basic sanity checks on a frame that was written to disk and read back.

    Parameters
    ----------
    original : the frame that was saved.
    reloaded : the frame obtained by reading the saved file.
    cols     : column names that must all be present in ``reloaded``.

    Returns
    -------
    dict mapping check name -> bool. ``shape_equal`` and ``cols_present`` are
    always reported; ``price_is_numeric`` / ``date_is_datetime`` are reported
    only when the corresponding column exists in ``reloaded``.
    """
    report = {}
    report['shape_equal'] = original.shape == reloaded.shape
    report['cols_present'] = not any(name not in reloaded.columns for name in cols)

    # dtype sanity checks: (column, report key, dtype predicate)
    dtype_probes = (
        ('price', 'price_is_numeric', pd.api.types.is_numeric_dtype),
        ('date', 'date_is_datetime', pd.api.types.is_datetime64_any_dtype),
    )
    for column, key, probe in dtype_probes:
        if column in reloaded.columns:
            report[key] = probe(reloaded[column])
    return report

# Reload the CSV; dates come back as strings unless parsed explicitly.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
print('CSV validation:', validate_loaded(df, df_csv))

# The save step sets pq_path to None when the Parquet write failed, so check
# for None before calling .exists() (None.exists() would raise AttributeError).
if pq_path is not None and pq_path.exists():
    try:
        df_parq = pd.read_parquet(pq_path)
        print('Parquet validation:', validate_loaded(df, df_parq))
    except Exception as e:
        print('Parquet read failed:', e)
else:
    print('Parquet file not present (skipped earlier).')

CSV validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'price_is_numeric': True, 'date_is_datetime': True}


---
## Step 3: Refactor to Utilities

In [24]:
from typing import Union

def ensure_dir(path: pathlib.Path):
    """Create the parent directory of ``path`` (and any missing ancestors).

    ``path`` itself is treated as a file path and is not created. Existing
    directories are left as they are.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, pathlib.Path]):
    """Map a file path to a storage format name based on its ending.

    The comparison is case-insensitive. Returns ``'csv'`` for ``.csv`` and
    ``'parquet'`` for ``.parquet``, ``.pq`` or ``.parq``.

    Raises
    ------
    ValueError if the path ends with none of the recognised extensions.
    """
    lowered = str(path).lower()
    known_endings = (
        ('.csv', 'csv'),
        ('.parquet', 'parquet'),
        ('.pq', 'parquet'),
        ('.parq', 'parquet'),
    )
    for ending, fmt in known_endings:
        if lowered.endswith(ending):
            return fmt
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df: pd.DataFrame, path: Union[str, pathlib.Path]):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file extension.

    Parameters
    ----------
    df   : frame to persist. CSV output omits the index.
    path : destination file; missing parent directories are created.

    Returns
    -------
    The destination as a ``pathlib.Path``.

    Raises
    ------
    ValueError   if the extension is not a supported format.
    RuntimeError if the Parquet write fails; the message distinguishes a
                 missing engine from any other failure.
    """
    path = pathlib.Path(path)
    # Validate the format first so an unsupported path does not create directories.
    fmt = detect_format(path)
    ensure_dir(path)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet is importable.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Still a RuntimeError for callers, but reporting the real cause.
            raise RuntimeError(f'Failed to write Parquet file {path}: {e}') from e
    return path

def read_df(path: Union[str, pathlib.Path]):
    """Read a CSV or Parquet file into a DataFrame, chosen by the file extension.

    For CSV files, a column named ``date`` (if present in the header) is
    parsed as datetimes; other columns use pandas' default inference.

    Raises
    ------
    ValueError   if the extension is not a supported format.
    RuntimeError if the Parquet read fails; the message distinguishes a
                 missing engine from any other failure (e.g. a missing file).
    """
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        # Read only the header row to decide whether there is a date column to parse.
        header = pd.read_csv(path, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(path, parse_dates=['date'])
        return pd.read_csv(path)
    elif fmt == 'parquet':
        try:
            return pd.read_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when neither pyarrow nor fastparquet is importable.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
        except Exception as e:
            # Still a RuntimeError for callers, but reporting the real cause.
            raise RuntimeError(f'Failed to read Parquet file {path}: {e}') from e

# Demo utility usage
# Round-trip the sample frame through write_df/read_df, once per format.
# NOTE(review): the Parquet demo file is written under RAW, whereas Step 1
# wrote its Parquet file under PROC — confirm which directory is intended.
csv2 = RAW / f"prices_util_{ts()}.csv"
pq2  = RAW / f"prices_util_{ts()}.parquet"
write_df(df, csv2)
df2 = read_df(csv2)
print('Reloaded CSV via util, shape:', df2.shape)

# write_df/read_df raise RuntimeError on Parquet failures, so the demo
# reports the problem and carries on instead of stopping the notebook.
try:
    write_df(df, pq2)
    df3 = read_df(pq2)
    print('Reloaded Parquet via util, shape:', df3.shape)
except RuntimeError as e:
    print('Parquet util demo skipped:', e)

Reloaded CSV via util, shape: (20, 3)
Reloaded Parquet via util, shape: (20, 3)
