## Student Homework Sheet — Stage 05: Data Storage

In the lecture, we learned how to save/load CSV and Parquet with environment-driven paths, organize
data/raw/ vs data/processed/, and document storage choices.
Now, you will adapt these patterns to implement a reproducible storage layer for one dataset.

In [1]:
#import 

import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv
import numpy as np

In [2]:
# Build the starter dataset: one row per calendar day for a single ticker.
# The price column is a random walk around 150 (unseeded, so values differ per run).
n_days = 20
dates = pd.date_range(start='2024-01-01', periods=n_days, freq='D')
price_walk = np.random.randn(n_days).cumsum()
df = pd.DataFrame(
    {
        'date': dates,
        'ticker': ['AAPL'] * n_days,
        'price': 150 + price_walk,
    }
)
# Last expression of the cell, so the notebook displays the first five rows.
df.head()

Unnamed: 0,date,ticker,price
0,2024-01-01,AAPL,150.008515
1,2024-01-02,AAPL,147.311955
2,2024-01-03,AAPL,146.960687
3,2024-01-04,AAPL,146.144038
4,2024-01-05,AAPL,147.797597


In [3]:

import os

import pandas as pd

# Resolve the storage directories from the environment. load_dotenv (imported in the
# first cell) reads a local .env file if one exists; when a variable is unset or empty
# we fall back to the conventional data/raw and data/processed layout.
load_dotenv()
DATA_DIR_RAW = os.getenv("DATA_DIR_RAW") or "data/raw"
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED") or "data/processed"

# to_csv / to_parquet do not create missing parent directories, so make sure they exist.
os.makedirs(DATA_DIR_RAW, exist_ok=True)
os.makedirs(DATA_DIR_PROCESSED, exist_ok=True)

# Save to the raw directory as CSV and to the processed directory as Parquet.
df.to_csv(os.path.join(DATA_DIR_RAW, "AAPL.csv"), index=False)
df.to_parquet(
    os.path.join(DATA_DIR_PROCESSED, "AAPL.parquet"),
    engine='fastparquet',
    index=False,
)


In [4]:
#Reload both files.
import os

import pandas as pd
import numpy as np

# --- 1. Create sample DataFrame ---
# 20 daily rows for one ticker; the price is an unseeded random walk around 150.
dates = pd.date_range("2024-01-01", periods=20, freq="D")
df = pd.DataFrame({
    "date": dates,
    "ticker": ["AAPL"] * 20,
    "price": 150 + np.random.randn(20).cumsum()
})

# --- 2. Save CSV & Parquet (avoid pyarrow bug by using fastparquet) ---
# Each path is defined once so the write and the reload cannot drift apart.
raw_csv_path = "data/raw/AAPL.csv"
processed_parquet_path = "data/processed/AAPL.parquet"

# pandas does not create missing parent directories; without this a fresh checkout
# fails with an OSError on the first write.
for target_path in (raw_csv_path, processed_parquet_path):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

df.to_csv(raw_csv_path, index=False)
df.to_parquet(processed_parquet_path, index=False, engine="fastparquet")

# --- 3. Reload both files ---
# CSV stores dates as text, so the date column must be parsed explicitly on read.
df_raw = pd.read_csv(raw_csv_path, parse_dates=["date"])
df_processed = pd.read_parquet(processed_parquet_path, engine="fastparquet")

# --- 4. Align dtypes before validation ---
# NOTE: this casts the CSV frame to the Parquet frame's dtypes, so a later dtype
# comparison between the two frames passes by construction.
df_raw = df_raw.astype(df_processed.dtypes.to_dict())

# --- 5. Validation function ---
def validate_dataframes(df1, df2):
    """Report whether two DataFrames agree in shape and column dtypes.

    The shape is compared first; dtypes are only compared when the shapes are
    equal. Cell values are not compared. The outcome is printed, and on a
    mismatch both sides of the failing property are printed as well.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        True when shape and dtypes both match, otherwise False.
    """
    mismatch = None
    if df1.shape != df2.shape:
        mismatch = ("❌ Shapes differ:", df1.shape, df2.shape)
    elif not df1.dtypes.equals(df2.dtypes):
        mismatch = ("❌ Dtypes differ:", df1.dtypes, df2.dtypes)

    if mismatch is None:
        print("✅ DataFrames match in shape and dtypes.")
        return True

    headline, first_value, second_value = mismatch
    print(headline)
    print(f" - df1: {first_value}")
    print(f" - df2: {second_value}")
    return False

# --- 6. Run validation ---
# Compares the reloaded CSV frame with the reloaded Parquet frame; the function prints
# the outcome and returns True/False. df_raw was already cast to df_processed's dtypes
# in step 4, so this mainly confirms that both files hold the same shape.
validate_dataframes(df_raw, df_processed)


✅ DataFrames match in shape and dtypes.


True

In [5]:
#Implement write_df and read_df that route by file suffix (csv/parquet).
#Handle missing directories and missing Parquet engine with a clear message.

import os
import pandas as pd

def write_df(df, path):
    """Write *df* to *path*, picking the format from the file suffix.

    ``.csv`` is written with ``DataFrame.to_csv`` and ``.parquet`` with
    ``DataFrame.to_parquet``; the index is never written. Missing parent
    directories are created. Period-typed columns are written as strings, but
    the conversion happens on a copy so the caller's DataFrame is untouched.

    For Parquet, pyarrow is tried first; if it fails for any reason the write
    is retried with fastparquet.

    Args:
        df: DataFrame to persist.
        path: Destination path ending in ``.csv`` or ``.parquet``
            (case-insensitive). A bare filename writes to the current
            working directory.

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.
        ImportError: If the fastparquet fallback cannot be imported after
            pyarrow has already failed.
    """
    ext = os.path.splitext(path)[1].lower()
    # Reject unsupported suffixes before touching the filesystem.
    if ext not in (".csv", ".parquet"):
        raise ValueError(f"Unsupported file extension: {ext}")

    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Detect Period columns from the dtype objects themselves and convert them
    # on a new frame (astype returns a copy) instead of mutating the argument.
    period_cols = [
        name
        for name, dtype in df.dtypes.items()
        if isinstance(dtype, pd.PeriodDtype)
    ]
    if period_cols:
        df = df.astype(dict.fromkeys(period_cols, str))

    if ext == ".csv":
        df.to_csv(path, index=False)
        return

    try:
        df.to_parquet(path, index=False, engine="pyarrow")
    except Exception as pyarrow_err:
        # Deliberately broad: any pyarrow problem (missing or misbehaving)
        # triggers the fastparquet fallback.
        print(f"⚠ pyarrow failed ({pyarrow_err}), trying fastparquet...")
        try:
            df.to_parquet(path, index=False, engine="fastparquet")
        except ImportError as fastparquet_err:
            raise ImportError(
                "Neither pyarrow nor fastparquet is available. "
                "Install one with: pip install pyarrow OR pip install fastparquet"
            ) from fastparquet_err


def read_df(path):
    """Load a DataFrame from *path*, picking the reader from the file suffix.

    ``.csv`` is read with ``pandas.read_csv`` using default options (no date
    parsing). ``.parquet`` is read with pyarrow first; if pyarrow fails for a
    reason other than a missing file, the read is retried with fastparquet.

    Args:
        path: Source path ending in ``.csv`` or ``.parquet``
            (case-insensitive).

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the suffix is neither ``.csv`` nor ``.parquet``.
        FileNotFoundError: If the file does not exist.
        ImportError: If the fastparquet fallback cannot be imported after
            pyarrow has already failed.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".csv":
        return pd.read_csv(path)
    if ext != ".parquet":
        raise ValueError(f"Unsupported file extension: {ext}")

    try:
        return pd.read_parquet(path, engine="pyarrow")
    except FileNotFoundError:
        # A missing file is not an engine problem; retrying with another
        # engine would only hide the real cause behind a second error.
        raise
    except Exception as pyarrow_err:
        # Deliberately broad: any other pyarrow problem (missing or
        # misbehaving) triggers the fastparquet fallback.
        print(f"⚠ pyarrow failed ({pyarrow_err}), trying fastparquet...")
        try:
            return pd.read_parquet(path, engine="fastparquet")
        except ImportError as fastparquet_err:
            raise ImportError(
                "Neither pyarrow nor fastparquet is available. "
                "Install one with: pip install pyarrow OR pip install fastparquet"
            ) from fastparquet_err
