In [12]:
from pathlib import Path
from datetime import datetime
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import sys
project_root = Path.cwd().parent
sys.path.append(str(project_root / "src"))
from utils import write_df, read_df, get_summary_stats

In [4]:

# --- Project configuration -------------------------------------------------
# Find the .env file, load it into the process environment, and treat the
# directory that contains it as the project root.
# NOTE(review): find_dotenv() is documented to return an empty string when no
# .env file is found; Path("") then resolves to the current working directory,
# so PROJECT_ROOT silently becomes the *parent* of the cwd — confirm that this
# is the intended fallback.
ENV_PATH = find_dotenv()
load_dotenv(ENV_PATH)
PROJECT_ROOT = Path(ENV_PATH).resolve().parent

# Data folders come from the environment when set, otherwise from the
# defaults below. Joining onto PROJECT_ROOT keeps relative settings anchored
# to the project; an absolute setting replaces PROJECT_ROOT entirely.
raw_setting = os.environ.get("DATA_DIR_RAW", "data/raw")
processed_setting = os.environ.get("DATA_DIR_PROCESSED", "data/processed")
RAW_DIR = (PROJECT_ROOT / raw_setting).resolve()
PROCESSED_DIR = (PROJECT_ROOT / processed_setting).resolve()

# Create both folders up front so later cells can write without checking.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)

PROJECT_ROOT: C:\Users\K\bootcamp_chengyu_gong
RAW_DIR: C:\Users\K\bootcamp_chengyu_gong\data\raw
PROCESSED_DIR: C:\Users\K\bootcamp_chengyu_gong\data\processed


In [5]:
# Build a small demo frame: two integer columns, each holding 0..10.
# Note that "index" is an ordinary data column here, not the frame's index.
df = pd.DataFrame({column: range(11) for column in ("index", "value")})
print(df.head())

   index  value
0      0      0
1      1      1
2      2      2
3      3      3
4      4      4


In [14]:
# Target files: the CSV goes to the raw folder, the Parquet copy to processed.
csv_path, parquet_path = RAW_DIR / "mydata.csv", PROCESSED_DIR / "mydata.parquet"

# Write the same frame in both formats; index=False keeps the positional
# row index out of the files.
df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path, index=False)

print("Saved to:", csv_path, parquet_path)

Saved to: C:\Users\K\bootcamp_chengyu_gong\data\raw\mydata.csv C:\Users\K\bootcamp_chengyu_gong\data\processed\mydata.parquet


In [8]:
# Read both files back with pandas directly (Parquet via the pyarrow engine)
# and report their shapes as a quick sanity check.
df_csv, df_parq = (
    pd.read_csv(csv_path),
    pd.read_parquet(parquet_path, engine="pyarrow"),
)

print("CSV shape:", df_csv.shape)
print("Parquet shape:", df_parq.shape)

CSV shape: (11, 2)
Parquet shape: (11, 2)


In [9]:
def validate_df(df_loaded, df_original):
    """Compare a reloaded DataFrame against the frame it was written from.

    Two checks are performed:

    * shape: ``df_loaded.shape`` must equal ``df_original.shape``;
    * dtypes: every column of ``df_original`` must exist in ``df_loaded``
      with the same dtype, compared by the dtype's string name.

    Parameters
    ----------
    df_loaded : pandas.DataFrame
        The frame read back from storage.
    df_original : pandas.DataFrame
        The reference frame that was written.

    Returns
    -------
    dict
        Report with the keys ``shape_expected``, ``shape_loaded``,
        ``shape_ok``, ``dtypes_expected``, ``dtypes_loaded``, ``dtypes_ok``,
        ``dtype_mismatches`` and ``all_ok``. ``dtype_mismatches`` maps each
        offending column to ``{"expected": <dtype name>, "actual": <dtype
        name or None>}``; ``actual`` is ``None`` when the column is missing
        from ``df_loaded``.
    """
    report = {}

    report["shape_expected"] = df_original.shape
    report["shape_loaded"] = df_loaded.shape
    report["shape_ok"] = (df_loaded.shape == df_original.shape)

    expected_dtypes = {col: str(dtype) for col, dtype in df_original.dtypes.items()}
    loaded_dtypes = {col: str(dtype) for col, dtype in df_loaded.dtypes.items()}

    # A column absent from the loaded frame yields None from .get(), which
    # never equals the expected dtype name, so it is reported as a mismatch
    # instead of being silently skipped (a renamed column with an unchanged
    # shape would otherwise pass validation).
    dtype_mismatches = {
        col: {"expected": exp, "actual": loaded_dtypes.get(col)}
        for col, exp in expected_dtypes.items()
        if loaded_dtypes.get(col) != exp
    }

    report["dtypes_expected"] = expected_dtypes
    report["dtypes_loaded"] = loaded_dtypes
    report["dtypes_ok"] = (len(dtype_mismatches) == 0)
    report["dtype_mismatches"] = dtype_mismatches

    report["all_ok"] = report["shape_ok"] and report["dtypes_ok"]
    return report

In [10]:
# Validate each reloaded frame against the in-memory original; each print
# emits the heading on one line and the report dict on the next.
print("CSV validation report:", validate_df(df_csv, df), sep="\n")

print("\nParquet validation report:", validate_df(df_parq, df), sep="\n")


CSV validation report:
{'shape_expected': (11, 2), 'shape_loaded': (11, 2), 'shape_ok': True, 'dtypes_expected': {'index': 'int64', 'value': 'int64'}, 'dtypes_loaded': {'index': 'int64', 'value': 'int64'}, 'dtypes_ok': True, 'dtype_mismatches': {}, 'all_ok': True}

Parquet validation report:
{'shape_expected': (11, 2), 'shape_loaded': (11, 2), 'shape_ok': True, 'dtypes_expected': {'index': 'int64', 'value': 'int64'}, 'dtypes_loaded': {'index': 'int64', 'value': 'int64'}, 'dtypes_ok': True, 'dtype_mismatches': {}, 'all_ok': True}


In [16]:
# Write: save the frame to both paths through the project's write_df helper.
write_df(df, csv_path)
write_df(df, parquet_path)

# Read: load both files back through read_df. This rebinds df_csv / df_parq,
# replacing the frames loaded earlier with pandas directly.
df_csv, df_parq = read_df(csv_path), read_df(parquet_path)

print("CSV loaded shape:", df_csv.shape)
print("Parquet loaded shape:", df_parq.shape)

CSV loaded shape: (11, 2)
Parquet loaded shape: (11, 2)
