# Defining success metrics and evaluation criteria for the project

## 📚 Learning Objectives

By completing this notebook, you will:
- Train a baseline classifier with scikit-learn
- Evaluate it using accuracy and a confusion matrix
- Encode categorical features with one-hot encoding

## 🔗 Prerequisites

- ✅ Python basics
- ✅ Jupyter Notebook basics

---

## Official Structure Reference

This notebook covers practical activities from **Course 12, Unit 1**:
- defining success metrics and evaluation criteria for the project
- **Source:** `DETAILED_UNIT_DESCRIPTIONS.md`

---


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build a small synthetic dataset: two numeric features and one categorical
# feature. The generator is seeded so every run produces the same data.
rng = np.random.default_rng(123)
n = 600
x1 = rng.normal(size=n)
x2 = rng.normal(size=n)
color = rng.choice(['red', 'green', 'blue'], size=n)

# Binary target: a noisy linear score thresholded at 0.2. Rows whose color is
# 'red' get a +0.6 bonus, so the categorical feature carries real signal.
y = (
    (x1 + 0.8*x2 + (color == 'red')*0.6 + rng.normal(scale=0.5, size=n)) > 0.2
).astype(int)

df = pd.DataFrame({'x1': x1, 'x2': x2, 'color': color, 'y': y})
X = df.drop(columns=['y'])
y = df['y']  # from here on `y` is the label column of `df`

# Hold out 25% of the rows for evaluation; stratifying on the label keeps the
# class balance the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# One-hot encode 'color'; the remaining columns (x1, x2) pass through
# unchanged. handle_unknown='ignore' means a category that was not seen during
# fit is encoded as all zeros instead of raising at predict time.
pre = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['color']),
], remainder='passthrough')

# Preprocessing and model live in one pipeline so the encoder is fitted on the
# training split only and the same transformation is applied at predict time.
clf = Pipeline([
    ('pre', pre),
    ('model', LogisticRegression(max_iter=1000)),
])

clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print('accuracy:', accuracy_score(y_test, y_pred))

# Multi-line outputs are printed on the line after their label; otherwise the
# label pushes the first row to the right and the columns no longer line up.
# Confusion matrix layout: rows are true classes, columns are predicted classes.
print('confusion matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('\nreport:', classification_report(y_test, y_pred), sep='\n')
