In [3]:
import os
import pandas as pd
from dotenv import load_dotenv
import pandas as pd

In [4]:
# Load variables from a local .env file (python-dotenv) so the os.getenv
# lookups below can pick up any overrides defined there.
load_dotenv()

# Storage folders: an environment variable wins, otherwise fall back to
# project-relative defaults.
DATA_DIR_RAW = os.getenv("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED", "data/processed")

# Make sure both folders exist; exist_ok=True keeps re-running this cell harmless.
for _data_dir in (DATA_DIR_RAW, DATA_DIR_PROCESSED):
    os.makedirs(_data_dir, exist_ok=True)

# Small in-memory sample: five rows, three columns (id, name, score).
data = {
    "id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "score": [85.5, 90.2, 78.9, 88.0, 92.3],
}
df = pd.DataFrame(data)

# Output targets: the CSV goes to the raw folder, the parquet file to the
# processed folder.
csv_path = os.path.join(DATA_DIR_RAW, "sample_data.csv")
parquet_path = os.path.join(DATA_DIR_PROCESSED, "sample_data.parquet")

# index=False: the default RangeIndex carries no information, so it is not
# written to either file.
df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path, index=False)

print(f"Data saved to: {csv_path} (CSV)")
print(f"Data saved to: {parquet_path} (Parquet)")

# Trailing expression so the notebook displays the first rows of the frame.
df.head()

Data saved to: data/raw\sample_data.csv (CSV)
Data saved to: data/processed\sample_data.parquet (Parquet)


Unnamed: 0,id,name,score
0,1,Alice,85.5
1,2,Bob,90.2
2,3,Charlie,78.9
3,4,David,88.0
4,5,Eva,92.3


In [5]:
# Read back both files written by the earlier cell so the CSV and parquet
# round-trips can be compared against each other.
df_csv = pd.read_csv(csv_path)
# NOTE: read_parquet needs a parquet engine (pyarrow or fastparquet) installed.
df_parquet = pd.read_parquet(parquet_path)

In [8]:
def validate_frames(a, b, expected_dtypes):
    """Compare the shapes of two DataFrames and check selected column dtypes.

    Parameters
    ----------
    a, b : pandas.DataFrame
        The two frames to compare.
    expected_dtypes : dict
        Maps a column name to the dtype string it is expected to have, in the
        form produced by ``str(series.dtype)`` (for example ``"int64"``).

    Returns
    -------
    dict
        ``"shape_match"``  - True when ``a.shape == b.shape``.
        ``"rows"``, ``"cols"`` - row and column counts, taken from ``a``.
        ``"dtype_checks"`` - ``{column: bool}``; True only when the column is
        present in *both* frames and its dtype string equals the expected one
        in both. A column missing from either frame is reported as False
        rather than raising ``KeyError``.
    """

    def _dtype_matches(frame, col, dt):
        # Membership is tested first so a missing column yields False instead
        # of a KeyError from frame[col].
        return col in frame.columns and str(frame[col].dtype) == dt

    return {
        "shape_match": a.shape == b.shape,
        "rows": a.shape[0],
        "cols": a.shape[1],
        "dtype_checks": {
            col: _dtype_matches(a, col, dt) and _dtype_matches(b, col, dt)
            for col, dt in expected_dtypes.items()
        },
    }

In [9]:
# Dtype strings each column should have after being read back from disk.
expected = {
    "id": "int64",
    "name": "object",
    "score": "float64",
}
# Compare the CSV-loaded frame against the parquet-loaded frame.
val = validate_frames(a=df_csv, b=df_parquet, expected_dtypes=expected)

In [11]:
# Summarise the validation result stored in `val`.
print("CSV shape:", df_csv.shape)
print("Parquet shape:", df_parquet.shape)
print("Shapes match:", val["shape_match"])
print("Dtype checks:")
# One line per checked column: the pass/fail flag followed by the dtype
# actually found in each of the two frames.
for k, v in val["dtype_checks"].items():
    print(f" {k}: {v} (csv={df_csv[k].dtype}, parquet={df_parquet[k].dtype})")


# Trailing expression so the notebook displays the first rows of the
# CSV-loaded frame.
df_csv.head()

CSV shape: (5, 3)
Parquet shape: (5, 3)
Shapes match: True
Dtype checks:
 id: True (csv=int64, parquet=int64)
 name: True (csv=object, parquet=object)
 score: True (csv=float64, parquet=float64)


Unnamed: 0,id,name,score
0,1,Alice,85.5
1,2,Bob,90.2
2,3,Charlie,78.9
3,4,David,88.0
4,5,Eva,92.3


In [12]:
def write_df(df, path):
    """Write a DataFrame to ``path``, choosing the format from the file extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. The index is not written.
    path : str
        Destination file. ``.csv`` and ``.parquet`` (case-insensitive) are
        supported. Missing parent directories are created.

    Returns
    -------
    None
        Problems are reported on stdout instead of being raised: an
        unsupported extension, or a missing parquet engine (``ImportError``
        from ``to_parquet``), prints a message and nothing is written.
    """
    # A bare filename such as "out.csv" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError - only create a parent
    # directory when the path actually names one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    ext = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        df.to_csv(path, index=False)
    elif ext == ".parquet":
        try:
            df.to_parquet(path, index=False)
        except ImportError:
            # Best effort: pandas needs an optional engine for parquet I/O.
            print("Parquet support is missing. Please install pyarrow or fastparquet.")
    else:
        print(f"Unsupported file extension: {ext}")

In [13]:
def read_df(path):
    """Load a DataFrame from ``path``, choosing the reader from the file extension.

    Parameters
    ----------
    path : str
        File to read. ``.csv`` and ``.parquet`` (case-insensitive) are supported.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. ``None`` is returned, after printing a message,
        when the file does not exist, the extension is unsupported, or
        reading parquet raises ``ImportError`` (no parquet engine installed).
    """
    # Guard: nothing to read.
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None

    _, ext = os.path.splitext(path)
    ext = ext.lower()

    if ext == ".csv":
        return pd.read_csv(path)

    # Guard: anything that is neither CSV nor parquet is rejected.
    if ext != ".parquet":
        print(f"Unsupported file extension: {ext}")
        return None

    # Parquet: stays None when the optional engine is unavailable.
    loaded = None
    try:
        loaded = pd.read_parquet(path)
    except ImportError:
        print("Parquet support is missing. Please install pyarrow or fastparquet.")
    return loaded

In [14]:
# Smoke test for write_df / read_df: write the sample frame in both formats,
# then read each file back and print its first rows.
write_test_csv = os.path.join(DATA_DIR_RAW, "test_out.csv")
write_test_parquet = os.path.join(DATA_DIR_PROCESSED, "test_out.parquet")

write_df(df, write_test_csv)
write_df(df, write_test_parquet)

print("Reload test CSV:")
reloaded_csv = read_df(write_test_csv)
print(reloaded_csv.head())

print("Reload test Parquet:")
reloaded_parquet = read_df(write_test_parquet)
print(reloaded_parquet.head())

Reload test CSV:
   id     name  score
0   1    Alice   85.5
1   2      Bob   90.2
2   3  Charlie   78.9
3   4    David   88.0
4   5      Eva   92.3
Reload test Parquet:
   id     name  score
0   1    Alice   85.5
1   2      Bob   90.2
2   3  Charlie   78.9
3   4    David   88.0
4   5      Eva   92.3
