In [None]:
import sys, subprocess, os
from pathlib import Path

# Colab Setup
# Outside Google Colab this cell does nothing: dependencies and data are
# expected to be provided by the local environment.
if "google.colab" in sys.modules:
    import shutil

    print("Running in Google Colab. Installing dependencies...")
    # Invoke pip through the running interpreter so packages are installed into
    # the kernel's environment rather than whichever `pip` is first on PATH.
    pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic",
            "jsonschema", "plotly", "tqdm",
        ]
    )
    if pip_result.returncode != 0:
        # Best effort: report the failure but let the notebook continue.
        print(f"Warning: pip install exited with code {pip_result.returncode}.")

    # Check for data
    if not (Path.cwd() / "data").exists():
        print("Data directory not found. Cloning repository...")
        clone_result = subprocess.run(
            ["git", "clone", "https://github.com/aire-program/aire-researcher-sandbox.git", "_repo"]
        )
        if clone_result.returncode != 0:
            print(f"Warning: git clone exited with code {clone_result.returncode}.")

        repo_dir = Path("_repo")
        # Move data and scripts to current directory
        if (repo_dir / "data").exists():
            print("Moving data and scripts...")
            for folder in ("data", "scripts"):
                source = repo_dir / folder
                # Skip folders missing from the clone, and never move into an
                # existing local folder (that would nest it, e.g. scripts/scripts).
                if source.exists() and not Path(folder).exists():
                    shutil.move(str(source), folder)
            shutil.rmtree(repo_dir, ignore_errors=True)
        else:
            print("Warning: Data not found in cloned repo.")
    else:
        print("Data directory found.")


# Engineer Features from Experiments

**What**: Derive new features from existing experimental data to enhance analytical power.

**Why**: Raw data often lacks the specific signals needed for modeling. Feature engineering transforms raw variables into more meaningful representations.

**How**:
1. **Normalize metrics** (Z-score standardization).
2. **Extract temporal features** (e.g., day of week).
3. **Create binary indicators** for categorical variables.

**Key Concept**: **Feature Engineering** is the process of using domain knowledge to extract features from raw data that make machine learning algorithms work better.

By the end of this notebook, you will have an experiments table enriched with a normalized metric, a day-of-week column, and a treatment indicator, as listed in the success criteria below.

### Success criteria
- You computed normalized metrics.
- You derived categorical/time features.
- You prepared a feature-rich table for downstream use.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory near the current working directory.

    The working directory, its parent, and its grandparent are checked in that
    order. The first ``data`` folder that contains
    ``sample_texts/articles_sample.csv`` is returned.

    Returns:
        Path to the matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the three locations holds the sample file.
    """
    here = Path.cwd()
    for base in (here, here.parent, here.parent.parent):
        data_dir = base / "data"
        marker = data_dir / "sample_texts" / "articles_sample.csv"
        if marker.exists():
            return data_dir
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once when this cell runs; find_data_dir() raises
# FileNotFoundError if no candidate location contains the sample file.
DATA_DIR = find_data_dir()


In [None]:
import pandas as pd

# Load the experiments table and parse timestamps so the .dt accessor works below.
experiments = pd.read_csv(DATA_DIR / "sample_tabular" / "experiments_sample.csv")
experiments["timestamp"] = pd.to_datetime(experiments["timestamp"])

# Z-score standardization: centre on the mean and scale by the standard
# deviation (pandas' Series.std defaults to the sample estimate, ddof=1).
metric = experiments["metric_value"]
experiments["metric_normalized"] = (metric - metric.mean()) / metric.std()

# Temporal feature: weekday name of each timestamp.
experiments["day_of_week"] = experiments["timestamp"].dt.day_name()

# Binary indicator: True where the condition label contains "treatment".
# na=False maps missing conditions to False so the column stays strictly
# boolean instead of becoming an object column holding NaN; regex=False makes
# the match a plain substring test.
experiments["is_treatment"] = experiments["condition"].str.contains("treatment", regex=False, na=False)

# Keep this expression last so the notebook renders the preview.
experiments[["experiment_id", "condition", "metric_value", "metric_normalized", "day_of_week", "is_treatment"]].head()


### If you get stuck / What to try next

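**If you get stuck:** verify that the cleaning notebook ran successfully and that the dependencies are installed. If the data files are missing, run `scripts/generate_synthetic_data.py`.

**What to try next:** prototype simple models on the engineered features, or export the feature table to a file so you can inspect it quickly.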