### Configure data directories from `.env`

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Load settings from a local `.env` file (if one exists) into the process
# environment so the directory locations below can be configured per machine.
load_dotenv()

DATA_DIR_RAW = os.getenv("DATA_DIR_RAW")
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED")

# os.getenv returns None for an unset variable, and os.makedirs(None) fails
# with an unhelpful TypeError. Fail fast with a message that names what is
# missing; an empty value is treated as missing too.
_missing_dirs = [
    name
    for name, value in (
        ("DATA_DIR_RAW", DATA_DIR_RAW),
        ("DATA_DIR_PROCESSED", DATA_DIR_PROCESSED),
    )
    if not value
]
if _missing_dirs:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_dirs)
        + ". Define them in the environment or in a .env file."
    )

# Create both folders up front; exist_ok makes re-running this cell harmless.
os.makedirs(DATA_DIR_RAW, exist_ok=True)
os.makedirs(DATA_DIR_PROCESSED, exist_ok=True)

### Sample Data

In [6]:
# Small in-memory frame used to exercise both storage formats.
df = pd.DataFrame(
    [
        (1, "A", 10.5),
        (2, "B", 20.1),
        (3, "C", 30.2),
        (4, "D", 40.3),
        (5, "E", 50.4),
    ],
    columns=["id", "name", "value"],
)

# One target file per format: CSV in the raw folder, Parquet in the processed one.
raw_path, processed_path = (
    os.path.join(DATA_DIR_RAW, "sample.csv"),
    os.path.join(DATA_DIR_PROCESSED, "sample.parquet"),
)

# Persist without the index so neither file gains an extra column.
df.to_csv(raw_path, index=False)
df.to_parquet(processed_path, index=False)

# Read both files straight back for the round-trip comparison below.
df_csv = pd.read_csv(raw_path)
df_parquet = pd.read_parquet(processed_path)

### Validation Function

In [7]:
def validate(df1, df2):
    """Summarise how two DataFrames compare structurally.

    Args:
        df1: First frame; its dtypes are reported under ``"dtypes_csv"``.
        df2: Second frame; its dtypes are reported under ``"dtypes_parquet"``.

    Returns:
        dict with four entries:
            ``"shape_match"``    - whether both frames have the same shape,
            ``"columns_match"``  - whether column labels agree, order included,
            ``"dtypes_csv"``     - column -> dtype mapping of ``df1``,
            ``"dtypes_parquet"`` - column -> dtype mapping of ``df2``.
    """
    report = {}
    report["shape_match"] = df1.shape == df2.shape
    # Compare as plain lists so that ordering differences count as a mismatch.
    first_cols = list(df1.columns)
    second_cols = list(df2.columns)
    report["columns_match"] = first_cols == second_cols
    # Dtypes are reported side by side rather than compared.
    report["dtypes_csv"] = df1.dtypes.to_dict()
    report["dtypes_parquet"] = df2.dtypes.to_dict()
    return report

# Compare the CSV round-trip against the Parquet round-trip. The summary is
# only stored in `results` here; this cell neither displays nor asserts on it.
results = validate(df_csv, df_parquet)

### Utility Functions

In [8]:
def write_df(df, path):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by file extension.

    The extension is matched case-insensitively. Missing parent directories
    are created. The DataFrame index is never written.

    Args:
        df: DataFrame to persist.
        path: Destination file path ending in ``.csv`` or ``.parquet``. A bare
            filename (no directory part) is written to the current directory.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.parquet``.
            Raised before anything is created on disk.
        RuntimeError: If writing Parquet fails with ``ImportError``
            (no Parquet engine installed).
    """
    suffix = os.path.splitext(path)[1].lower()
    if suffix not in (".csv", ".parquet"):
        # Reject unknown formats before touching the filesystem, so a bad
        # path does not leave empty directories behind.
        raise ValueError(f"Unsupported file format: {suffix}")

    # os.path.dirname returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    if suffix == ".csv":
        df.to_csv(path, index=False)
        return

    try:
        df.to_parquet(path, index=False)
    except ImportError as exc:
        # Keep the original error attached as the cause for debugging.
        raise RuntimeError("Parquet support requires pyarrow or fastparquet") from exc

def read_df(path):
    """Load a DataFrame from a ``.csv`` or ``.parquet`` file.

    The reader is picked from the file extension, matched case-insensitively.

    Args:
        path: Location of the file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_parquet``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.parquet``.
        RuntimeError: If reading Parquet fails with ``ImportError``.
    """
    _, extension = os.path.splitext(path)
    suffix = extension.lower()

    # CSV needs no optional dependency, so handle it first and return.
    if suffix == ".csv":
        return pd.read_csv(path)

    # Anything that is not Parquet at this point is unsupported.
    if suffix != ".parquet":
        raise ValueError(f"Unsupported file format: {suffix}")

    try:
        frame = pd.read_parquet(path)
    except ImportError:
        raise RuntimeError("Parquet support requires pyarrow or fastparquet")
    return frame


In [9]:
# Reload both sample files through the extension-dispatching reader.
csv_path = os.path.join(DATA_DIR_RAW, "sample.csv")
parquet_path = os.path.join(DATA_DIR_PROCESSED, "sample.parquet")

df_csv = read_df(csv_path)
df_parquet = read_df(parquet_path)

# Last expression of the cell: renders the CSV-backed frame.
df_csv

Unnamed: 0,id,name,value
0,1,A,10.5
1,2,B,20.1
2,3,C,30.2
3,4,D,40.3
4,5,E,50.4


In [10]:
# Render the Parquet-backed frame for visual comparison with the CSV-backed one.
df_parquet

Unnamed: 0,id,name,value
0,1,A,10.5
1,2,B,20.1
2,3,C,30.2
3,4,D,40.3
4,5,E,50.4
