# 01 — Ingest raw interactions → processed parquet

This notebook:
- Loads the raw interactions CSV
- Performs minimal cleaning / type standardization
- Writes `data/processed/interactions.parquet`

> The pipeline equivalent is: `python -m src.pipelines.01_make_processed`


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Paths are resolved relative to the repo root; this assumes the kernel's
# working directory is repo_root/notebooks.
PROJECT_ROOT = Path.cwd().parent

RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "customer_interactions_fact_2_years.csv")
OUT_PATH = PROJECT_ROOT.joinpath("data", "processed", "interactions.parquet")

# Create the output folder up front so the later parquet write finds it.
OUT_PATH.parent.mkdir(exist_ok=True, parents=True)

# Display the resolved input path as the cell output.
RAW_PATH

In [None]:
# Load raw CSV (large)
#
# The customer key is an identifier, not a number, so it is read as text
# explicitly. Left to dtype inference, a numeric-looking key column would be
# parsed as int (dropping leading zeros) or as float when any value is missing
# (so a later string cast would produce values like "123.0").
df_raw = pd.read_csv(
    RAW_PATH,
    parse_dates=["event_time"],
    dtype={"external_customerkey": "string"},
    low_memory=False,  # read the file in one pass instead of chunked dtype inference
)

# Show the frame dimensions together with a preview of the first rows.
df_raw.shape, df_raw.head()

In [None]:
# Minimal cleaning: validate the schema, standardize types, drop unusable rows.

# Required columns check. This runs before any column is touched so that a
# missing column fails with the explicit message below rather than a KeyError
# raised by one of the transformations.
required = {"external_customerkey", "event_time", "interaction_type"}
missing_cols = required - set(df_raw.columns)
assert not missing_cols, f"Missing columns: {missing_cols}"

# Work on a copy so df_raw stays untouched and this cell can be re-run safely.
df = df_raw.copy()

# Ensure datetime; values that cannot be parsed become NaT and are dropped below.
df["event_time"] = pd.to_datetime(df["event_time"], errors="coerce")

# Standardize key string cols *before* filtering, so that whitespace-only
# values are recognised as empty instead of slipping through as "".
df["interaction_type"] = df["interaction_type"].astype("string").str.strip()
df["external_customerkey"] = df["external_customerkey"].astype("string").str.strip()

# Drop rows missing critical values (null, unparseable timestamp, or blank text).
df = df.dropna(subset=["external_customerkey", "event_time", "interaction_type"])
has_key_values = (df["external_customerkey"] != "") & (df["interaction_type"] != "")
df = df[has_key_values].copy()

# Optional descriptive columns are only normalized when present in the export.
for c in ["channel", "shop", "incoming_outgoing"]:
    if c in df.columns:
        df[c] = df[c].astype("string").str.strip()

df.shape

In [None]:
# Headline numbers for the cleaned frame: size, distinct customers and event
# types, and the covered time span.
n_rows = len(df)
n_customers = df["external_customerkey"].nunique()
n_event_types = df["interaction_type"].nunique()
first_event = df["event_time"].min()
last_event = df["event_time"].max()

print(f"rows: {n_rows}")
print(f"customers: {n_customers}")
print(f"event types: {n_event_types}")
print(f"time range: {first_event} → {last_event}")

# The 15 most frequent interaction types, shown as the cell output.
type_counts = df["interaction_type"].value_counts()
type_counts.head(15)

In [None]:
# Missingness overview: fraction of null values per column, showing the 20
# columns with the highest share first.
null_share = df.isna().mean()
null_share.sort_values(ascending=False).head(20)

In [None]:
# Persist the cleaned interactions as parquet; the index is not written.
df.to_parquet(OUT_PATH, index=False)

n_written = len(df)
print(f"Wrote: {OUT_PATH} rows: {n_written}")

In [None]:
# Read the file back from disk and show its dimensions with a preview of the
# first rows, to confirm the write produced a loadable file.
df_check = pd.read_parquet(OUT_PATH)
(df_check.shape, df_check.head())