# Homework Starter — Stage 05: Data Storage
Name: Haochen Zou
Date: 8/18/2025

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [1]:
import sys
import os
from pathlib import Path
import requests
import pandas as pd
from dotenv import load_dotenv

# Add the project root to sys.path so the `src` package can be imported.
# The root is taken to be the parent of the current working directory.
project_root = Path().resolve().parent
_root_str = str(project_root)
if _root_str not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(_root_str)

# Import the project's storage helpers (must come after the sys.path tweak).
from src.storage import write_df, read_df, detect_format

# Load .env and read the data directories from the environment.

from dotenv import load_dotenv
load_dotenv()

# Fall back to the conventional layout when the variables are not set.
RAW = Path(os.getenv("DATA_DIR_RAW", "data/raw"))
PROC = Path(os.getenv("DATA_DIR_PROCESSED", "data/processed"))
# Relative paths are anchored at the project root rather than the notebook's cwd.
RAW = (project_root / RAW).resolve()
PROC = (project_root / PROC).resolve()

# Create both directories up front so later writes cannot fail on a missing folder.
RAW.mkdir(parents=True, exist_ok=True)
PROC.mkdir(parents=True, exist_ok=True)

print('RAW ->', RAW)
print('PROC ->', PROC)

RAW -> D:\bootcamp_Haochen_Zou\homework\homework5\data\raw
PROC -> D:\bootcamp_Haochen_Zou\homework\homework5\data\processed


## 1) Create or Load a Sample DataFrame (randomly generated)
You may reuse data from prior stages or create a small synthetic dataset.

In [6]:
import numpy as np

# Twenty consecutive calendar days starting on 2024-01-01.
n_days = 20
dates = pd.date_range('2024-01-01', periods=n_days, freq='D')

# Synthetic price path: cumulative sum of standard-normal steps, offset by 150.
steps = np.random.randn(n_days)
df = pd.DataFrame(
    {
        'date': dates,
        'ticker': ['AAPL'] * n_days,
        'price': 150 + steps.cumsum(),
    }
)
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,148.435769
1,2024-01-02,AAPL,147.845628
2,2024-01-03,AAPL,148.452326
3,2024-01-04,AAPL,147.43077
4,2024-01-05,AAPL,146.661019


## 2) Save CSV to data/raw/ and Parquet to data/processed/ 
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [7]:
import datetime as dt


def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string for use in filenames."""
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')


# Take the timestamp once so the CSV and Parquet files from one run share the
# same suffix (two separate ts() calls can straddle a second boundary).
stamp = ts()

# Save the CSV copy under the raw-data directory, without the index column.
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)

# Save the Parquet copy under the processed-data directory. On any failure
# pq_path is reset to None so that later cells can tell the file is missing.
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Any other failure is reported with its real cause instead of being
    # mislabelled as a missing engine.
    print('Parquet write failed:', e)
    pq_path = None
pq_path

WindowsPath('D:/bootcamp_Haochen_Zou/homework/homework5/data/processed/sample_20250818-035014.parquet')

## 3) Reload and Validate 
- Compare shapes and key dtypes.

In [8]:
def validate_loaded(original, reloaded):
    """Run basic round-trip checks on a reloaded DataFrame.

    Args:
        original: The DataFrame that was written to disk.
        reloaded: The DataFrame read back from disk.

    Returns:
        A dict with three boolean-valued entries:
        ``shape_equal`` (both frames have the same shape),
        ``date_is_datetime`` (``reloaded['date']`` has a datetime64 dtype) and
        ``price_is_numeric`` (``reloaded['price']`` has a numeric dtype).
        A dtype check is False when its column is absent from ``reloaded``.
    """
    same_shape = original.shape == reloaded.shape

    # A missing column counts as a failed check rather than raising KeyError.
    date_ok = False
    if 'date' in reloaded.columns:
        date_ok = pd.api.types.is_datetime64_any_dtype(reloaded['date'])

    price_ok = False
    if 'price' in reloaded.columns:
        price_ok = pd.api.types.is_numeric_dtype(reloaded['price'])

    return {
        'shape_equal': same_shape,
        'date_is_datetime': date_ok,
        'price_is_numeric': price_ok,
    }

# Reload the CSV, parsing the 'date' column back into datetimes, then run the
# round-trip checks against the in-memory frame.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
validate_loaded(df, df_csv)

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [None]:
# Reload the Parquet file and validate it. pq_path is None when the Parquet
# write step did not produce a file.
if pq_path is not None:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        # Print explicitly: an expression nested inside if/try is not shown as
        # cell output, so the result would otherwise be silently discarded.
        print(validate_loaded(df, df_pq))
else:
    print('No Parquet file was written; skipping Parquet validation.')

## 4) Utilities (DEMO)

In [9]:
# Demo: round-trip the sample frame through the src.storage helpers.
# Use one timestamp so the CSV and Parquet demo files share the same suffix.
util_stamp = ts()
p_csv = RAW / f"util_{util_stamp}.csv"
p_pq = PROC / f"util_{util_stamp}.parquet"

# Print the previews explicitly; only the last bare expression of a cell is
# displayed, and expressions inside try blocks never are.
write_df(df, p_csv)
print(read_df(p_csv).head())

try:
    write_df(df, p_pq)
    print(read_df(p_pq).head())
except RuntimeError as e:
    # NOTE(review): presumably the storage helpers raise RuntimeError when no
    # Parquet engine is installed — confirm against src/storage.py.
    print('Skipping Parquet util demo:', e)