# Homework Starter — Stage 05: Data Storage
Name: Student
Date: December 2024

Objectives:
- Env-driven paths to `data/raw/` and `data/processed/`
- Save CSV and Parquet; reload and validate
- Abstract IO with utility functions; document choices

In [1]:
import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv

print("Setting up environment-driven paths...")
load_dotenv()  # load variables from a local .env file, if any, before reading them below

# Resolve each storage folder from its env var, falling back to a relative default.
RAW, PROC = (
    pathlib.Path(os.getenv(env_var, fallback))
    for env_var, fallback in (
        ('DATA_DIR_RAW', 'data/raw'),
        ('DATA_DIR_PROCESSED', 'data/processed'),
    )
)

# Create both folders up front so later writes do not fail on a missing directory.
for _folder in (RAW, PROC):
    _folder.mkdir(parents=True, exist_ok=True)

print('RAW ->', RAW.resolve())
print('PROC ->', PROC.resolve())

Setting up environment-driven paths...
RAW -> C:\Users\papne\OneDrive\Desktop\FRE\Bootcamp 4\nyu_frebootcamp\homework\homework5\notebooks\data\raw
PROC -> C:\Users\papne\OneDrive\Desktop\FRE\Bootcamp 4\nyu_frebootcamp\homework\homework5\notebooks\data\processed


## 1) Create or Load a Sample DataFrame
You may reuse data from prior stages or create a small synthetic dataset.

In [3]:
print("Loading real Alpha Vantage data from homework 4...")
csv_file = RAW / "api_source-alpha_symbol-AAPL_20250817-230500.csv"

# Parse 'date' as datetime on load, then expose 'adj_close' under the shorter name 'price'.
df = pd.read_csv(csv_file, parse_dates=['date']).rename(columns={'adj_close': 'price'})

print(f"Loaded {len(df)} rows of AAPL stock data")
first_day, last_day = df['date'].min(), df['date'].max()
print(f"Date range: {first_day} to {last_day}")
print(f"Columns: {list(df.columns)}")
df.head()


Loading real Alpha Vantage data from homework 4...
Loaded 100 rows of AAPL stock data
Date range: 2025-03-25 00:00:00 to 2025-08-15 00:00:00
Columns: ['date', 'price']


Unnamed: 0,date,price
0,2025-03-25,223.75
1,2025-03-26,221.53
2,2025-03-27,223.85
3,2025-03-28,217.9
4,2025-03-31,222.13


In [4]:
import numpy as np
# Synthetic dataset: 20 daily rows for a single ticker with a random-walk price around 150.
# NOTE: this rebinds `df`, replacing whatever frame was loaded earlier in the session.
dates = pd.date_range(start='2024-01-01', periods=20, freq='D')
random_walk = np.cumsum(np.random.randn(len(dates)))
df = pd.DataFrame({
    'date': dates,
    'ticker': ['AAPL'] * len(dates),
    'price': 150 + random_walk,
})
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,148.713928
1,2024-01-02,AAPL,147.964757
2,2024-01-03,AAPL,147.692187
3,2024-01-04,AAPL,147.565747
4,2024-01-05,AAPL,147.70428


## 2) Save CSV to data/raw/ and Parquet to data/processed/ (TODO)
- Use timestamped filenames.
- Handle missing Parquet engine gracefully.

In [5]:
def ts():
    """Return the current local time as a 'YYYYMMDD-HHMMSS' string for filenames."""
    now = dt.datetime.now()
    return now.strftime('%Y%m%d-%H%M%S')

print("Task 1: Saving data in two formats...")

# Take the timestamp once so the CSV and Parquet files written by this cell share
# the same suffix even if the clock ticks over between the two writes.
stamp = ts()

print("Saving CSV to data/raw/...")
csv_path = RAW / f"aapl_stock_{stamp}.csv"
df.to_csv(csv_path, index=False)
print(f"✓ CSV saved: {csv_path}")

print("Saving Parquet to data/processed/...")
pq_path = PROC / f"aapl_stock_{stamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError as e:
    # pandas signals a missing pyarrow/fastparquet engine with ImportError.
    print(f'✗ Parquet engine not available: {e}')
    print('Install pyarrow or fastparquet: pip install pyarrow')
    pq_path = None
except Exception as e:
    # Stay best-effort, but do not blame the engine for unrelated failures
    # (permissions, unsupported dtypes, full disk, ...).
    print(f'✗ Parquet write failed: {e}')
    pq_path = None
else:
    print(f"✓ Parquet saved: {pq_path}")

# pq_path is None here when the Parquet step was skipped; later cells check for that.
print("\nSummary:")
print(f"CSV: {csv_path}")
print(f"Parquet: {pq_path}")


Task 1: Saving data in two formats...
Saving CSV to data/raw/...
✓ CSV saved: data\raw\aapl_stock_20250818-090532.csv
Saving Parquet to data/processed/...
✓ Parquet saved: data\processed\aapl_stock_20250818-090532.parquet

Summary:
CSV: data\raw\aapl_stock_20250818-090532.csv
Parquet: data\processed\aapl_stock_20250818-090532.parquet


In [6]:
def validate_loaded(original, reloaded, format_name):
    """Compare a reloaded DataFrame with its source and print a per-check report.

    Args:
        original: The DataFrame that was written to disk.
        reloaded: The DataFrame read back from disk.
        format_name: Label used in the report heading (e.g. "CSV").

    Returns:
        dict mapping check name to its result. Every value is a boolean except
        'row_count', which is the number of rows in ``reloaded``.
    """
    reloaded_cols = reloaded.columns
    dtype_api = pd.api.types

    # A column that is absent fails its dtype check instead of raising KeyError.
    if 'date' in reloaded_cols:
        date_ok = dtype_api.is_datetime64_any_dtype(reloaded['date'])
    else:
        date_ok = False
    if 'price' in reloaded_cols:
        price_ok = dtype_api.is_numeric_dtype(reloaded['price'])
    else:
        price_ok = False

    checks = dict(
        shape_equal=original.shape == reloaded.shape,
        date_is_datetime=date_ok,
        price_is_numeric=price_ok,
        columns_match=list(original.columns) == list(reloaded_cols),
        row_count=len(reloaded),
    )

    print(f"\n{format_name} Validation Results:")
    for name in checks:
        outcome = checks[name]
        # The mark follows truthiness, so 'row_count' shows ✓ for any non-zero count.
        mark = "✓" if outcome else "✗"
        print(f"  {mark} {name}: {outcome}")
    return checks

print("Task 2: Reload and validate both formats...")

print("1. Loading and validating CSV...")
# CSV stores dates as text, so ask pandas to parse 'date' back into datetimes on load.
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
# Compare the reloaded frame against the in-memory `df`; the report is printed
# and the per-check results are kept in `csv_validation`.
csv_validation = validate_loaded(df, df_csv, "CSV")

print(f"\nCSV data sample:")
print(df_csv.head())


Task 2: Reload and validate both formats...
1. Loading and validating CSV...

CSV Validation Results:
  ✓ shape_equal: True
  ✓ date_is_datetime: True
  ✓ price_is_numeric: True
  ✓ columns_match: True
  ✓ row_count: 20

CSV data sample:
        date ticker       price
0 2024-01-01   AAPL  148.713928
1 2024-01-02   AAPL  147.964757
2 2024-01-03   AAPL  147.692187
3 2024-01-04   AAPL  147.565747
4 2024-01-05   AAPL  147.704280


In [7]:
print("2. Loading and validating Parquet...")
if not pq_path:
    # pq_path is None when the earlier Parquet write did not succeed.
    print("✗ Skipping Parquet validation - file was not created")
else:
    try:
        df_pq = pd.read_parquet(pq_path)
        pq_validation = validate_loaded(df, df_pq, "Parquet")
        print("\nParquet data sample:")
        print(df_pq.head())
        print("\nParquet dtypes:")
        print(df_pq.dtypes)
    except Exception as e:
        print(f'✗ Parquet read failed: {e}')
        print('Install pyarrow: pip install pyarrow')


2. Loading and validating Parquet...

Parquet Validation Results:
  ✓ shape_equal: True
  ✓ date_is_datetime: True
  ✓ price_is_numeric: True
  ✓ columns_match: True
  ✓ row_count: 20

Parquet data sample:
        date ticker       price
0 2024-01-01   AAPL  148.713928
1 2024-01-02   AAPL  147.964757
2 2024-01-03   AAPL  147.692187
3 2024-01-04   AAPL  147.565747
4 2024-01-05   AAPL  147.704280

Parquet dtypes:
date      datetime64[ns]
ticker            object
price            float64
dtype: object


In [8]:
import typing as t

print("Task 3: Creating utility functions...")

def detect_format(path: t.Union[str, pathlib.Path]):
    """Infer the storage format from the end of a path, ignoring case.

    Returns:
        'csv' for ``.csv``; 'parquet' for ``.parquet``, ``.pq`` or ``.parq``.

    Raises:
        ValueError: If the path ends in none of the supported extensions.
    """
    lowered = str(path).lower()
    suffix_table = (
        (('.csv',), 'csv'),
        (('.parquet', '.pq', '.parq'), 'parquet'),
    )
    for suffixes, fmt in suffix_table:
        # str.endswith accepts a tuple, so each row is a single check.
        if lowered.endswith(suffixes):
            return fmt
    raise ValueError('Unsupported format: ' + lowered)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file extension.

    Parent directories are created if missing. CSV is written without the index.

    Args:
        df: DataFrame to persist.
        path: Destination file; its extension is routed through ``detect_format``.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: If the extension is unsupported (raised by ``detect_format``).
        RuntimeError: If the Parquet write fails. The message distinguishes a
            missing engine from any other write failure; the original error is
            chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    # Validate the extension before touching the filesystem so an unsupported
    # path does not leave a freshly created directory behind.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing {fmt.upper()} to {p}")
    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p

    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas signals a missing pyarrow/fastparquet engine with ImportError.
        raise RuntimeError('Parquet engine not available. Install: pip install pyarrow') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but with an
        # accurate message instead of blaming the engine.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    """Read a CSV or Parquet file into a DataFrame, chosen by the file extension.

    For CSV, a column named 'date' is parsed as datetime when present.

    Args:
        path: File to read; its extension is routed through ``detect_format``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is unsupported (raised by ``detect_format``).
        RuntimeError: If the Parquet read fails. The message distinguishes a
            missing engine from any other read failure (e.g. a missing file);
            the original error is chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    print(f"Reading {fmt.upper()} from {p}")
    if fmt == 'csv':
        # Peek at the header only (nrows=0) to learn whether there is a 'date'
        # column to parse before doing the real read.
        header = pd.read_csv(p, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(p, parse_dates=['date'])
        return pd.read_csv(p)

    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas signals a missing pyarrow/fastparquet engine with ImportError.
        raise RuntimeError('Parquet engine not available. Install: pip install pyarrow') from e
    except Exception as e:
        # Still RuntimeError so existing callers keep working, but with an
        # accurate message instead of blaming the engine.
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

print("✓ Utility functions created successfully!")


Task 3: Creating utility functions...
✓ Utility functions created successfully!


In [9]:
print("Testing utility functions...")

# --- CSV round trip -------------------------------------------------------
print("\n1. Testing CSV utilities:")
p_csv = RAW / f"util_test_{ts()}.csv"
write_df(df, p_csv)
df_test_csv = read_df(p_csv)
print(f"✓ CSV test successful - shape: {df_test_csv.shape}")

# --- Parquet round trip ---------------------------------------------------
# write_df/read_df raise RuntimeError on Parquet problems, which is reported
# here instead of aborting the cell.
print("\n2. Testing Parquet utilities:")
p_pq = PROC / f"util_test_{ts()}.parquet"
try:
    write_df(df, p_pq)
    df_test_pq = read_df(p_pq)
except RuntimeError as e:
    print(f'✗ Parquet test failed: {e}')
else:
    print(f"✓ Parquet test successful - shape: {df_test_pq.shape}")

print("\n✓ All utility tests completed!")


Testing utility functions...

1. Testing CSV utilities:
Writing CSV to data\raw\util_test_20250818-090610.csv
Reading CSV from data\raw\util_test_20250818-090610.csv
✓ CSV test successful - shape: (20, 3)

2. Testing Parquet utilities:
Writing PARQUET to data\processed\util_test_20250818-090610.parquet
Reading PARQUET from data\processed\util_test_20250818-090610.parquet
✓ Parquet test successful - shape: (20, 3)

✓ All utility tests completed!


In [10]:
def ts():
    """Build a 'YYYYMMDD-HHMMSS' stamp from the current local time."""
    current = dt.datetime.now()
    return f"{current:%Y%m%d-%H%M%S}"

# Take the timestamp once so both files written by this cell share the same suffix.
stamp = ts()

# Save CSV under data/raw/ (index omitted).
csv_path = RAW / f"sample_{stamp}.csv"
df.to_csv(csv_path, index=False)

# Save Parquet under data/processed/. On failure pq_path becomes None so that
# later cells can skip the Parquet steps.
pq_path = PROC / f"sample_{stamp}.parquet"
try:
    df.to_parquet(pq_path)
except ImportError:
    # pandas signals a missing pyarrow/fastparquet engine with ImportError.
    print('Parquet engine not available. Install pyarrow or fastparquet to complete this step.')
    pq_path = None
except Exception as e:
    # Stay best-effort, but report the real cause rather than blaming the engine.
    print('Parquet write failed:', e)
    pq_path = None
pq_path

WindowsPath('data/processed/sample_20250818-090617.parquet')

## 3) Reload and Validate (TODO)
- Compare shapes and key dtypes.

In [11]:
def validate_loaded(original, reloaded):
    """Check that a reloaded DataFrame matches its source in shape and key dtypes.

    Args:
        original: The DataFrame that was written to disk.
        reloaded: The DataFrame read back from disk.

    Returns:
        dict with boolean entries 'shape_equal', 'date_is_datetime' and
        'price_is_numeric'.
    """
    def column_passes(name, dtype_test):
        # A column that is absent fails its check instead of raising KeyError.
        return dtype_test(reloaded[name]) if name in reloaded.columns else False

    same_shape = original.shape == reloaded.shape
    return {
        'shape_equal': same_shape,
        'date_is_datetime': column_passes('date', pd.api.types.is_datetime64_any_dtype),
        'price_is_numeric': column_passes('price', pd.api.types.is_numeric_dtype),
    }

# Reload the CSV at csv_path, parsing 'date' back into datetimes (CSV stores it as text).
df_csv = pd.read_csv(csv_path, parse_dates=['date'])
# Last expression of the cell, so the notebook displays the dict of check results.
validate_loaded(df, df_csv)

{'shape_equal': True, 'date_is_datetime': True, 'price_is_numeric': True}

In [None]:
# Reload the Parquet file (when one was written) and show the validation result.
# The result is printed explicitly: a bare expression nested inside if/try is
# not displayed by the notebook, so it would otherwise be silently discarded.
if pq_path:
    try:
        df_pq = pd.read_parquet(pq_path)
    except Exception as e:
        print('Parquet read failed:', e)
    else:
        print(validate_loaded(df, df_pq))
else:
    # pq_path is None when the earlier Parquet write did not succeed.
    print('Skipping Parquet validation: no Parquet file was written.')

## 4) Utilities (TODO)
- Implement `detect_format`, `write_df`, `read_df`.
- Use suffix to route; create parent dirs if needed; friendly errors for Parquet.

In [13]:
import typing as t, pathlib

def detect_format(path: t.Union[str, pathlib.Path]):
    """Map a path to 'csv' or 'parquet' from its ending, ignoring case.

    Recognised endings are ``.csv`` for CSV and ``.parquet``, ``.pq``, ``.parq``
    for Parquet.

    Raises:
        ValueError: If the path has none of the recognised endings.
    """
    name = str(path).lower()
    if name.endswith('.csv'):
        return 'csv'
    # str.endswith accepts a tuple of candidates.
    if name.endswith(('.parquet', '.pq', '.parq')):
        return 'parquet'
    raise ValueError('Unsupported format: ' + name)

def write_df(df: pd.DataFrame, path: t.Union[str, pathlib.Path]):
    """Persist ``df`` at ``path`` as CSV or Parquet, routed by file extension.

    Parent directories are created if missing. CSV is written without the index.

    Args:
        df: DataFrame to persist.
        path: Destination file; its extension is routed through ``detect_format``.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: If the extension is unsupported (raised by ``detect_format``).
        RuntimeError: If the Parquet write fails. The message distinguishes a
            missing engine from any other write failure; the original error is
            chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    # Validate the extension first so an unsupported path creates no directory.
    fmt = detect_format(p)
    p.parent.mkdir(parents=True, exist_ok=True)

    if fmt == 'csv':
        df.to_csv(p, index=False)
        return p

    try:
        df.to_parquet(p)
    except ImportError as e:
        # pandas signals a missing pyarrow/fastparquet engine with ImportError.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError for existing callers, but with the real cause named.
        raise RuntimeError(f'Failed to write Parquet file {p}: {e}') from e
    return p

def read_df(path: t.Union[str, pathlib.Path]):
    """Load a CSV or Parquet file into a DataFrame, routed by file extension.

    For CSV, a column named 'date' is parsed as datetime when present.

    Args:
        path: File to read; its extension is routed through ``detect_format``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is unsupported (raised by ``detect_format``).
        RuntimeError: If the Parquet read fails. The message distinguishes a
            missing engine from any other read failure (e.g. a missing file);
            the original error is chained as ``__cause__``.
    """
    p = pathlib.Path(path)
    fmt = detect_format(p)

    if fmt == 'csv':
        # Read only the header (nrows=0) to see whether a 'date' column exists.
        has_date = 'date' in pd.read_csv(p, nrows=0).columns
        if has_date:
            return pd.read_csv(p, parse_dates=['date'])
        return pd.read_csv(p)

    try:
        return pd.read_parquet(p)
    except ImportError as e:
        # pandas signals a missing pyarrow/fastparquet engine with ImportError.
        raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    except Exception as e:
        # Still RuntimeError for existing callers, but with the real cause named.
        raise RuntimeError(f'Failed to read Parquet file {p}: {e}') from e

# Demo: round-trip `df` through both formats using write_df / read_df.
p_csv = RAW / f"util_{ts()}.csv"
p_pq = PROC / f"util_{ts()}.parquet"

# CSV round trip.
write_df(df, p_csv)
read_df(p_csv).head()

# Parquet round trip; a RuntimeError from the helpers is reported, not raised.
try:
    write_df(df, p_pq)
    read_df(p_pq).head()
except RuntimeError as e:
    print('Skipping Parquet util demo:', e)

## 5) Documentation (TODO)
- Update README with a **Data Storage** section (folders, formats, env usage).
- Summarize validation checks and any assumptions.