In [None]:
import sys
import subprocess

# When running on Google Colab, install the notebook's dependencies into the
# interpreter that backs the kernel (sys.executable). Outside Colab this cell
# does nothing.
if "google.colab" in sys.modules:
    print("Detected Google Colab runtime. Installing dependencies...")
    packages = [
        "streamlit",
        "pandas",
        "numpy",
        "scikit-learn",
        "requests",
    ]
    pip_install = [sys.executable, "-m", "pip", "install"]
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call(pip_install + packages)


# Tabular Cleaning

Goal: load synthetic experiment results, parse timestamps, and surface potential quality issues.

Why it matters: sanity checks protect downstream analyses from malformed or out-of-range values.

How to run and adapt: run this notebook after generating the data with `scripts/generate_synthetic_data.py`; add domain-specific validation rules that reflect your own experiments.

In [None]:
from pathlib import Path


def find_data_dir(
    required: Path = Path("sample_tabular") / "experiments_sample.csv",
) -> Path:
    """Locate the ``data`` directory relative to the current working directory.

    Looks for ``data/`` in the working directory, its parent, and its
    grandparent (in that order) and returns the first one that contains
    ``required``.

    Args:
        required: Path, relative to the data directory, of a file that must
            exist for the directory to be accepted. Defaults to the
            experiments CSV that this notebook loads, so a missing file is
            reported here rather than as a later read error.

    Returns:
        The first matching ``data`` directory.

    Raises:
        FileNotFoundError: If none of the candidate directories contains
            ``required``.
    """
    cwd = Path.cwd()
    for base in (cwd, cwd.parent, cwd.parent.parent):
        candidate = base / "data"
        if (candidate / required).exists():
            return candidate
    raise FileNotFoundError(
        f"data directory containing {Path(required).as_posix()} not found. "
        "Run scripts/generate_synthetic_data.py."
    )

# Resolve the data directory once when this cell runs; later cells build file paths from it.
DATA_DIR: Path = find_data_dir()


In [None]:
import pandas as pd

# Read the experiment records and convert the timestamp column to datetimes.
experiments_path = DATA_DIR / "sample_tabular" / "experiments_sample.csv"
experiments = pd.read_csv(experiments_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"])
)

# Size and per-column missing-value counts, followed by a preview of the first rows.
print("Dataset shape", experiments.shape)
null_counts = experiments.isna().sum()
print("Missing values", null_counts)
experiments.head()


## Basic range checks

In [None]:
# Expected inclusive bounds for metric_value; adjust to match your own experiments.
METRIC_MIN, METRIC_MAX = 0, 100

metric_values = experiments["metric_value"]
# Series.between() evaluates to False for missing values, so negating it alone
# would count NaNs as range violations. Missing values are counted separately
# so the two data-quality problems are not conflated.
metric_missing = metric_values.isna().sum()
metric_out_of_range = (
    metric_values.notna() & ~metric_values.between(METRIC_MIN, METRIC_MAX)
).sum()
print(f"Records outside expected metric range: {metric_out_of_range}")
print(f"Records with missing metric value: {metric_missing}")
experiments.describe()
