# Project Structure Setup
Create a classic ML skeleton with standard directories and placeholders.

In [None]:
from pathlib import Path

# The project root is the parent of the current working directory, made absolute.
PROJECT_ROOT = Path("..").resolve()

# Skeleton directories, given relative to PROJECT_ROOT.
folders = [
    "data/raw", "data/processed",
    "models",
    "reports/figures",
    "src", "tests", "configs", "scripts", "notebooks",
    ".vscode",
]
for folder in folders:
    path = PROJECT_ROOT.joinpath(folder)
    # parents/exist_ok make the cell safe to re-run on an existing tree.
    path.mkdir(parents=True, exist_ok=True)
    print(f"created {path}")

# Drop an empty .gitkeep into each of these directories.
for kept_dir in ("data/raw", "data/processed", "models", "reports"):
    keep = f"{kept_dir}/.gitkeep"
    keep_path = PROJECT_ROOT.joinpath(keep)
    keep_path.parent.mkdir(parents=True, exist_ok=True)
    keep_path.touch(exist_ok=True)
    print(f"touched {keep_path}")


# Create requirements.txt
Write common ML dependencies to requirements.txt.

In [None]:
# Dependencies for the generated project. pytest is listed because the
# generated Makefile `test` target, the VS Code test task and the files under
# tests/ all need it; without it `make install && make test` cannot run.
requirements = """\
numpy
pandas
scikit-learn
matplotlib
seaborn
pyyaml
joblib
click
pytest
"""
req_path = PROJECT_ROOT / "requirements.txt"
# Normalise to exactly one trailing newline.
req_path.write_text(requirements.strip() + "\n", encoding="utf-8")
print(f"wrote {req_path}")


# Data Directory and Sample Dataset
Generate a toy dataset and store raw/processed splits.

In [None]:
import pandas as pd
import numpy as np

# Use a seeded generator so the toy dataset (and therefore the CSVs written
# below) is identical on every run; the split below is already seeded the
# same way.
rng = np.random.default_rng(42)
raw_df = pd.DataFrame(
    {
        "feature1": rng.standard_normal(200),     # standard-normal feature
        "feature2": rng.random(200) * 10,         # uniform in [0, 10)
        "target": rng.integers(0, 2, size=200),   # binary label: 0 or 1
    }
)
raw_path = PROJECT_ROOT / "data/raw/sample.csv"
# Do not rely on an earlier cell having created the directory.
raw_path.parent.mkdir(parents=True, exist_ok=True)
raw_df.to_csv(raw_path, index=False)
print(f"wrote raw sample to {raw_path}")

# simple train/val split: 80% sampled for training, the remaining rows for validation
train_df = raw_df.sample(frac=0.8, random_state=42)
val_df = raw_df.drop(train_df.index)
train_path = PROJECT_ROOT / "data/processed/train.csv"
val_path = PROJECT_ROOT / "data/processed/val.csv"
train_path.parent.mkdir(parents=True, exist_ok=True)
train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
print(f"processed splits saved to {train_path} and {val_path}")


# Source Package Initialization
Create src/ package and ensure __init__.py exists.

In [None]:
# Make sure src/ exists and contains an __init__.py; an existing file is kept
# (touch does not truncate it).
src_init = PROJECT_ROOT.joinpath("src", "__init__.py")
package_dir = src_init.parent
package_dir.mkdir(parents=True, exist_ok=True)
src_init.touch(exist_ok=True)
print(f"ensured package at {src_init}")


# Data Loading Module (`src/data/load_data.py`)
Implement helpers to read raw CSVs and write processed train/validation splits.

In [None]:
# Text of the generated src/data/load_data.py. The generated module locates
# data/raw and data/processed relative to its own file and offers a CSV
# reader plus a writer for the train/val splits.
_load_data_source = """
from pathlib import Path
from typing import Tuple
import pandas as pd

RAW_DIR = Path(__file__).resolve().parents[2] / "data" / "raw"
PROCESSED_DIR = Path(__file__).resolve().parents[2] / "data" / "processed"


def load_raw(name: str) -> pd.DataFrame:
    path = RAW_DIR / name
    return pd.read_csv(path)


def save_processed(train: pd.DataFrame, val: pd.DataFrame, train_name: str = "train.csv", val_name: str = "val.csv") -> Tuple[Path, Path]:
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    train_path = PROCESSED_DIR / train_name
    val_path = PROCESSED_DIR / val_name
    train.to_csv(train_path, index=False)
    val.to_csv(val_path, index=False)
    return train_path, val_path
"""

data_dir = PROJECT_ROOT / "src/data"
data_dir.mkdir(parents=True, exist_ok=True)
load_data_py = data_dir / "load_data.py"
# Strip the literal's surrounding blank lines and end the file with one newline.
load_data_py.write_text(_load_data_source.strip() + "\n", encoding="utf-8")
print(f"wrote {load_data_py}")


# Feature Engineering Module (`src/features/build_features.py`)
Add basic preprocessing and pipeline builder.

In [None]:
# Text of the generated src/features/build_features.py: a ColumnTransformer
# factory that standard-scales the numeric columns and drops the rest, plus a
# helper that splits a frame into features and target.
_build_features_source = """
from typing import List
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


NUMERIC = ["feature1", "feature2"]


def build_preprocessor(numeric_features: List[str] = None) -> ColumnTransformer:
    numeric_features = numeric_features or NUMERIC
    numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[("num", numeric_transformer, numeric_features)], remainder="drop"
    )
    return preprocessor


def separate_features_targets(df: pd.DataFrame, target_col: str = "target"):
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y
"""

features_dir = PROJECT_ROOT / "src/features"
features_dir.mkdir(parents=True, exist_ok=True)
build_features_py = features_dir / "build_features.py"
# Strip the literal's surrounding blank lines and end the file with one newline.
build_features_py.write_text(_build_features_source.strip() + "\n", encoding="utf-8")
print(f"wrote {build_features_py}")


# Model Definition (`src/models/model.py`)
Provide a simple model factory.

In [None]:
# Text of the generated src/models/model.py: a factory that returns a random
# forest for "rf" and a logistic regression for any other name.
_model_source = """
from typing import Literal
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


ModelName = Literal["logreg", "rf"]


def make_model(name: ModelName = "logreg"):
    if name == "rf":
        return RandomForestClassifier(n_estimators=200, random_state=42)
    return LogisticRegression(max_iter=500)
"""

models_dir = PROJECT_ROOT / "src/models"
models_dir.mkdir(parents=True, exist_ok=True)
model_py = models_dir / "model.py"
# Strip the literal's surrounding blank lines and end the file with one newline.
model_py.write_text(_model_source.strip() + "\n", encoding="utf-8")
print(f"wrote {model_py}")


# Training Script (`src/train.py`)
Train a model, persist it, and return validation metrics.

In [None]:
# Text of the generated src/train.py. Its train() loads the raw CSV, makes a
# seeded 80/20 split, fits preprocessor + model as one pipeline, scores the
# validation rows, and dumps the fitted pipeline with joblib.
_train_source = """
from pathlib import Path
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

from src.data.load_data import load_raw, save_processed
from src.features.build_features import build_preprocessor, separate_features_targets
from src.models.model import make_model


def train(raw_name: str = "sample.csv", model_name: str = "logreg", model_dir: Path = Path("models")) -> dict:
    df = load_raw(raw_name)
    train_df = df.sample(frac=0.8, random_state=42)
    val_df = df.drop(train_df.index)
    save_processed(train_df, val_df)

    X_train, y_train = separate_features_targets(train_df)
    X_val, y_val = separate_features_targets(val_df)

    preprocessor = build_preprocessor()
    model = make_model(model_name)
    clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)
    metrics = {
        "accuracy": float(accuracy_score(y_val, y_pred)),
        "f1": float(f1_score(y_val, y_pred)),
    }

    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    artifact = model_dir / f"model_{model_name}.joblib"
    joblib.dump(clf, artifact)
    return {"metrics": metrics, "artifact": artifact}


if __name__ == "__main__":
    result = train()
    print(result)
"""

train_py = PROJECT_ROOT / "src/train.py"
# Strip the literal's surrounding blank lines and end the file with one newline.
train_py.write_text(_train_source.strip() + "\n", encoding="utf-8")
print(f"wrote {train_py}")


# Evaluation Script (`src/evaluate.py`)
Load the saved model, score on validation data, and save reports.

In [None]:
# Text of the generated src/evaluate.py. Its evaluate() rebuilds the same
# seeded split as training, loads the saved pipeline with joblib, scores the
# validation rows, and writes a JSON classification report.
_evaluate_source = """
from pathlib import Path
import json
import joblib
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score

from src.data.load_data import load_raw, save_processed
from src.features.build_features import separate_features_targets


def evaluate(raw_name: str = "sample.csv", model_path: Path = Path("models/model_logreg.joblib"), report_dir: Path = Path("reports")) -> dict:
    df = load_raw(raw_name)
    train_df = df.sample(frac=0.8, random_state=42)
    val_df = df.drop(train_df.index)
    save_processed(train_df, val_df)

    _, y_train = separate_features_targets(train_df)
    X_val, y_val = separate_features_targets(val_df)

    clf = joblib.load(model_path)
    y_pred = clf.predict(X_val)

    metrics = {
        "accuracy": float(accuracy_score(y_val, y_pred)),
        "f1": float(f1_score(y_val, y_pred)),
    }
    report = classification_report(y_val, y_pred, output_dict=True)

    report_dir = Path(report_dir)
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / "classification_report.json"
    with report_path.open("w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    return {"metrics": metrics, "report_path": report_path}


if __name__ == "__main__":
    result = evaluate()
    print(result)
"""

evaluate_py = PROJECT_ROOT / "src/evaluate.py"
# Strip the literal's surrounding blank lines and end the file with one newline.
evaluate_py.write_text(_evaluate_source.strip() + "\n", encoding="utf-8")
print(f"wrote {evaluate_py}")


# Configuration File (`config.yaml`)
Create a YAML config for paths and hyperparameters.

In [None]:
# YAML written to configs/config.yaml: project-relative paths, the model
# choice, and the split settings.
_config_yaml = """
paths:
  raw_data: data/raw
  processed_data: data/processed
  models: models
  reports: reports
model:
  name: logreg
  random_state: 42
train:
  test_size: 0.2
  random_state: 42
"""

config_path = PROJECT_ROOT / "configs/config.yaml"
# Strip the literal's surrounding blank lines and end the file with one newline.
config_path.write_text(_config_yaml.strip() + "\n", encoding="utf-8")
print(f"wrote {config_path}")


# Unit Tests (`tests/`)
Add pytest stubs for data loading and model training.

In [None]:
tests_dir = PROJECT_ROOT / "tests"
tests_dir.mkdir(parents=True, exist_ok=True)

# File name -> source text for each generated test stub, in write order.
_test_stubs = {
    "test_data.py": """
from src.data.load_data import load_raw

def test_load_raw_smoke():
    df = load_raw("sample.csv")
    assert not df.empty
""",
    "test_model.py": """
from src.train import train

def test_train_returns_metrics(tmp_path):
    result = train(model_dir=tmp_path)
    assert "metrics" in result
    assert result["artifact"].exists()
""",
}
for _stub_name, _stub_source in _test_stubs.items():
    # Strip the literal's surrounding blank lines and end the file with one newline.
    (tests_dir / _stub_name).write_text(_stub_source.strip() + "\n", encoding="utf-8")
print("wrote test stubs")


# CLI Entrypoint (`main.py`)
Expose train and evaluate commands.

In [None]:
main_py = PROJECT_ROOT / "main.py"
# The generated module contains a """docstring""", so the enclosing literal
# uses ''' quotes; with """ on the outside the literal ends at that docstring
# and this cell fails with a SyntaxError.
#
# The command names are passed to @cli.command explicitly. click derives a
# name from the function name when none is given, and click 8.2+ drops a
# trailing "_cmd" while doing so, which would turn "train-cmd" into "train".
# The Makefile, VS Code configs, Dockerfile and the final notebook cell all
# call "train-cmd" / "evaluate-cmd".
main_py.write_text(
    '''
import click
from src.train import train
from src.evaluate import evaluate


@click.group()
def cli():
    """CLI for classic ML pipeline."""


@cli.command("train-cmd")
@click.option("--raw-name", default="sample.csv")
@click.option("--model-name", default="logreg")
def train_cmd(raw_name: str, model_name: str):
    """Train a model and print its metrics and artifact path."""
    result = train(raw_name=raw_name, model_name=model_name)
    click.echo(result)


@cli.command("evaluate-cmd")
@click.option("--raw-name", default="sample.csv")
@click.option("--model-path", default="models/model_logreg.joblib")
def evaluate_cmd(raw_name: str, model_path: str):
    """Evaluate a saved model and print its metrics and report path."""
    result = evaluate(raw_name=raw_name, model_path=model_path)
    click.echo(result)


if __name__ == "__main__":
    cli()
'''.strip()
    + "\n",
    encoding="utf-8",
)
print(f"wrote {main_py}")


# Makefile Tasks
Add common automation targets.

In [None]:
makefile_path = PROJECT_ROOT / "Makefile"
# Notes on the generated Makefile:
# - "\t" in this (non-raw) literal becomes the real tab that make requires
#   in front of recipe lines.
# - All targets are declared .PHONY so a file or directory that happens to
#   share a target's name can never make the target look up to date.
# - Tests run as `python -m pytest`: that form adds the current directory to
#   sys.path, which the tests need for `from src...` imports. The bare
#   `pytest` script does not add it.
makefile_path.write_text(
    """
.PHONY: install lint test train evaluate

install:
\tpip install -r requirements.txt

lint:
\tpython -m compileall src

test:
\tpython -m pytest -q

train:
\tpython main.py train-cmd

evaluate:
\tpython main.py evaluate-cmd
""".strip()
    + "\n",
    encoding="utf-8",
)
print(f"wrote {makefile_path}")


# VS Code Tasks and Launch Configurations
Generate tasks and launch configs for tests and scripts.

In [None]:
import json

vscode_dir = PROJECT_ROOT / ".vscode"
vscode_dir.mkdir(parents=True, exist_ok=True)

# Task definitions for .vscode/tasks.json. All three tasks start `python`
# with arguments. pytest runs as `python -m pytest` because that form adds
# the current directory to sys.path, which the tests need for their
# `from src...` imports; the bare `pytest` script does not add it.
tasks = {
    "version": "2.0.0",
    "tasks": [
        {
            "label": "pytest",
            "type": "shell",
            "command": "python",
            "args": ["-m", "pytest"],
            "group": "test",
            "problemMatcher": "$pytest",
        },
        {
            "label": "train",
            "type": "shell",
            "command": "python",
            "args": ["main.py", "train-cmd"],
        },
        {
            "label": "evaluate",
            "type": "shell",
            "command": "python",
            "args": ["main.py", "evaluate-cmd"],
        },
    ],
}
# End with a newline, like every other file this notebook generates.
(vscode_dir / "tasks.json").write_text(json.dumps(tasks, indent=2) + "\n", encoding="utf-8")

# Debug configurations for .vscode/launch.json: one per CLI command.
launch = {
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Train",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "args": ["train-cmd"],
        },
        {
            "name": "Python: Evaluate",
            "type": "python",
            "request": "launch",
            "program": "${workspaceFolder}/main.py",
            "args": ["evaluate-cmd"],
        },
    ],
}
(vscode_dir / "launch.json").write_text(json.dumps(launch, indent=2) + "\n", encoding="utf-8")
print("wrote .vscode/tasks.json and launch.json")


# Dockerfile and `.dockerignore`
Containerize the project and trim build context.

In [None]:
# Dockerfile text: install requirements first, then copy the project, and run
# the training command by default.
_dockerfile_source = """
FROM python:3.10-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["python", "main.py", "train-cmd"]
"""

# Patterns listed in .dockerignore.
_dockerignore_source = """
__pycache__
*.pyc
.env
.vscode
.venv
.data
reports
models
"""

dockerfile_path = PROJECT_ROOT / "Dockerfile"
# Strip each literal's surrounding blank lines and end the file with one newline.
dockerfile_path.write_text(_dockerfile_source.strip() + "\n", encoding="utf-8")

ignore_path = PROJECT_ROOT / ".dockerignore"
ignore_path.write_text(_dockerignore_source.strip() + "\n", encoding="utf-8")
print("wrote Dockerfile and .dockerignore")


# Run End-to-End Pipeline from Notebook
Call the CLI commands to train and evaluate.

In [None]:
import subprocess
import sys

# Run the generated CLI with the interpreter that runs this notebook, from
# the project root so main.py and its relative paths resolve.
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# training run stops the cell instead of being ignored and followed by an
# evaluation of a missing or stale model.
print("running training via CLI...")
subprocess.run([sys.executable, "main.py", "train-cmd"], cwd=PROJECT_ROOT, check=True)
print("running evaluation via CLI...")
subprocess.run([sys.executable, "main.py", "evaluate-cmd"], cwd=PROJECT_ROOT, check=True)
