In [None]:
import subprocess
import sys

# On Google Colab the runtime starts without the project's dependencies, so
# install them up front. Local environments are expected to be provisioned
# already, in which case this cell is a no-op.
if "google.colab" in sys.modules:
    # Run pip through the kernel's own interpreter so packages land in the
    # environment this notebook imports from, and fail loudly (check=True)
    # instead of letting a broken install surface later as an ImportError.
    subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ],
        check=True,
    )


# Feature Engineering

Goal: derive simple features from experiment records, including normalization and categorical indicators.

Why it matters: quick engineered features speed up baseline models and exploratory plots.

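How to run and adapt: run the cells top to bottom after the sample data has been generated (`scripts/generate_synthetic_data.py`); the notebook reads `data/sample_tabular/experiments_sample.csv`. Extend the feature list to include ratios, rolling means, or other research-specific fields.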

In [None]:
from pathlib import Path


def find_data_dir(marker: str = "sample_texts/articles_sample.csv") -> Path:
    """Locate the project's ``data`` directory relative to the working directory.

    Looks for a ``data`` folder next to the current working directory, its
    parent, and its grandparent (in that order) and returns the first one that
    contains ``marker``.

    Args:
        marker: Path, relative to the ``data`` directory, of a file that must
            exist for a candidate to be accepted. Defaults to the sample
            articles CSV; pass a different relative path to require the file a
            particular notebook actually reads.

    Returns:
        The first candidate ``data`` directory containing ``marker``.

    Raises:
        FileNotFoundError: If none of the three candidates contains ``marker``.
    """
    here = Path.cwd()
    # Notebooks may be launched from the repo root or from one or two levels
    # below it, so probe each of those locations, nearest first.
    for base in (here, here.parent, here.parent.parent):
        candidate = base / "data"
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once for the rest of the notebook. This raises
# FileNotFoundError immediately if no candidate directory holds the expected
# sample file, so later cells never run against a missing dataset.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Load the experiment records and parse timestamps so the `.dt` accessor
# below works on real datetimes rather than strings.
experiments = pd.read_csv(DATA_DIR / "sample_tabular" / "experiments_sample.csv")
experiments["timestamp"] = pd.to_datetime(experiments["timestamp"])

# Z-score the metric. `Series.std()` is 0 for a constant column and NaN for a
# single row; dividing by either would turn the whole feature into NaN, so
# fall back to a unit scale (the centered values are then simply 0.0).
metric_mean = experiments["metric_value"].mean()
metric_std = experiments["metric_value"].std()
metric_scale = metric_std if metric_std > 0 else 1.0
experiments["metric_normalized"] = (experiments["metric_value"] - metric_mean) / metric_scale

experiments["day_of_week"] = experiments["timestamp"].dt.day_name()

# Plain substring match (no regex interpretation); rows with a missing
# condition count as non-treatment so the indicator stays strictly boolean.
experiments["is_treatment"] = experiments["condition"].str.contains("treatment", regex=False, na=False)

# Preview the original fields next to the engineered ones.
preview_columns = ["experiment_id", "condition", "metric_value", "metric_normalized", "day_of_week", "is_treatment"]
experiments[preview_columns].head()
