In [None]:
import subprocess
import sys

# Only install packages when running inside Google Colab, detected by the
# ``google.colab`` module already being loaded in this session.
if "google.colab" in sys.modules:
    # Invoke pip through the running interpreter so packages are installed into
    # the environment this kernel imports from, not whichever ``pip`` happens
    # to be first on PATH.
    install = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "pandas", "numpy", "scikit-learn", "requests", "pydantic", "jsonschema",
        ]
    )
    # Surface a failed install instead of silently continuing; do not raise, so
    # the remaining cells can still be attempted.
    if install.returncode != 0:
        print(f"pip install exited with code {install.returncode}; later imports may fail.")


# Clean Synthetic Experiment Records

**What:** Load experiment data, parse timestamps, and check for missing or out-of-range values.

**Why:** Basic validation catches issues before modeling and keeps results trustworthy.

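**How:** In Colab, run the first cell to install dependencies. Confirm that the synthetic data has been generated (run `scripts/generate_synthetic_data.py` if the data directory is not found), then run the cells in order. Sanity checks here mean looking for nulls, bad ranges, or malformed timestamps.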

**You will learn:** How to inspect tabular research data quickly and decide whether more cleaning is needed.

### Success criteria
- You loaded experiment records.
- You parsed timestamps and checked ranges.
- You flagged or confirmed data quality.

In [None]:
from pathlib import Path


def find_data_dir() -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    Looks for a ``data`` folder in the working directory, its parent, and its
    grandparent, in that order. A candidate is accepted only when it contains
    ``sample_tabular/experiments_sample.csv``.

    Returns:
        The first candidate directory that contains the experiments CSV.

    Raises:
        FileNotFoundError: If no candidate contains the experiments CSV.
    """
    # Probe for the tabular experiments file rather than an unrelated sample,
    # so a partially generated data directory fails here, with the
    # regeneration hint, instead of later when the CSV is read.
    required_file = Path("sample_tabular") / "experiments_sample.csv"
    cwd = Path.cwd()
    for base in (cwd, cwd.parent, cwd.parent.parent):
        candidate = base / "data"
        if (candidate / required_file).exists():
            return candidate
    raise FileNotFoundError("data directory not found. Run scripts/generate_synthetic_data.py.")

# Resolve the data directory once; later cells build file paths from it.
DATA_DIR: Path = find_data_dir()


In [None]:
import pandas as pd

# Load the experiment records CSV from the data directory.
experiments = pd.read_csv(DATA_DIR / "sample_tabular" / "experiments_sample.csv")

# Coerce unparseable timestamps to NaT instead of raising, so malformed values
# are reported as a data-quality finding rather than aborting the cell.
parsed_timestamps = pd.to_datetime(experiments["timestamp"], errors="coerce")
# A value is malformed when it was present in the file but did not parse;
# values that were already missing are covered by the missing-value counts.
malformed_timestamps = (parsed_timestamps.isna() & experiments["timestamp"].notna()).sum()
experiments["timestamp"] = parsed_timestamps

print("Dataset shape", experiments.shape)
print("Malformed timestamps", malformed_timestamps)
print("Missing values", experiments.isna().sum())
# Last expression in the cell: the notebook displays the first rows.
experiments.head()


## Basic range checks

In [None]:
# Count records whose metric lies outside the expected 0-100 range (bounds
# inclusive). ``Series.between`` yields False for NaN, so negating it alone
# would count missing metrics as out of range; exclude them explicitly so this
# number reflects range violations only.
metric = experiments["metric_value"]
metric_out_of_range = (metric.notna() & ~metric.between(0, 100)).sum()
print(f"Records outside expected metric range: {metric_out_of_range}")
# Last expression in the cell: the notebook displays summary statistics.
experiments.describe()


### If you get stuck / What to try next

**If you get stuck:** rerun the data-loading cell and check that the `timestamp` column parses; if the data directory is not found, run `scripts/generate_synthetic_data.py` to generate the data. **What to try next:** create features in `pipelines/tabular/feature_engineering.ipynb` and visualize them in the Streamlit Tabular Workflows page.