<a href="https://colab.research.google.com/github/awsdevguru/PearsonMLFoundations/blob/dev/2_4_02_Hands_on_Lab_Simple_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hands-On Lab: Simple Classifier (Titanic) — with Code

**Goal:** Build a reproducible scikit-learn pipeline that predicts the `survived` target on the Titanic dataset.

## 0) Setup & Data Load

In [None]:
# Colab/Notebook setup
import numpy as np
import pandas as pd

# Viz
import matplotlib.pyplot as plt

# Persistence
import joblib

# Data & model utils
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score, RocCurveDisplay
)

# Load Titanic from seaborn (reliable in Colab) or fall back to a public CSV mirror.
# Column names are lower-cased in section 1 so either source works downstream.
try:
    import seaborn as sns
    df = sns.load_dataset("titanic")
except Exception as load_err:
    # Deliberately broad: seaborn may be missing or its download may fail.
    # Report the cause instead of swallowing it, so the reader knows which
    # source actually produced `df`.
    print(f"seaborn load failed ({load_err!r}); falling back to CSV URL")
    df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

df.head()

In [None]:
# Non-null value count per column (columns with fewer entries contain missing data)
df.notna().sum()

## 1) Target & Feature Selection

In [None]:
# Normalize headers (trim whitespace, lower-case) so both data sources share one schema
df.columns = df.columns.str.strip().str.lower()

# Keep a compact, informative subset; the filter tolerates columns that are
# absent, so it works for both the seaborn and the Kaggle-like layouts
wanted = {"survived", "pclass", "sex", "age", "sibsp", "parch", "fare", "embarked"}
candidate_cols = [name for name in df.columns if name in wanted]

# Work on an independent frame and discard rows that have no target value
data = df.loc[:, candidate_cols].dropna(subset=["survived"])

# Separate the integer target from the predictors
y = data["survived"].astype(int)
X = data.drop(columns=["survived"])

X.head(), y.head()

## 2) Train/Test Split (stratify to keep class balance)

In [None]:
# Hold out 20% for testing; stratifying on y keeps the class ratio equal in both parts
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=73)
X_train, X_test, y_train, y_test = split

# Class proportions in each part (should be nearly identical)
tuple(part.value_counts(normalize=True) for part in (y_train, y_test))

## 3) Preprocessing: Numeric vs Categorical

In [None]:
# Split the columns that are actually present into numeric and categorical groups
numeric_candidates = ("age", "sibsp", "parch", "fare")
categorical_candidates = ("pclass", "sex", "embarked")
num_features = [col for col in numeric_candidates if col in X.columns]
cat_features = [col for col in categorical_candidates if col in X.columns]

# Numeric branch: fill gaps with the median, then standardize
numeric_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# (handle_unknown="ignore" avoids errors on categories not seen during fit)
categorical_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch
preprocess = ColumnTransformer([
    ("num", numeric_pipe, num_features),
    ("cat", categorical_pipe, cat_features),
])
preprocess

## 4) Model: Logistic Regression in a Pipeline

In [None]:
# Chain preprocessing and the classifier so both are always fitted and applied together
logreg = LogisticRegression(C=1.0, max_iter=1000)
clf = Pipeline([
    ("prep", preprocess),
    ("model", logreg),
])

clf

## 5) Train

In [None]:
# Fit preprocessing and model on the training split only (the test split stays unseen)
clf.fit(X=X_train, y=y_train)

## 6) Evaluate (Accuracy, Precision, Recall, F1)

In [None]:
# Hard class predictions on the held-out split
y_pred = clf.predict(X_test)

# Headline metrics, printed one per line with aligned labels
headline_metrics = [
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1       :", f1_score),
]
for label, metric in headline_metrics:
    print(label, metric(y_test, y_pred))

# Per-class breakdown
print("\nClassification Report:\n", classification_report(y_test, y_pred))

## 7) Confusion Matrix

In [None]:
# Fix the label order explicitly so rows/columns always read 0 then 1
class_labels = ["Died (0)", "Survived (1)"]
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Render the counts as integers
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(values_format="d")
plt.title("Confusion Matrix — Logistic Regression")
plt.show()

## 8) ROC-AUC & Probability Thresholds

In [None]:
# Predicted probability of the positive class (second column of predict_proba)
class_probs = clf.predict_proba(X_test)
y_proba = class_probs[:, 1]

# Threshold-independent ranking quality, plus the ROC curve itself
auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", auc)
RocCurveDisplay.from_predictions(y_test, y_proba)
plt.title("ROC Curve — Logistic Regression")
plt.show()

# Re-derive hard predictions with a custom cut-off (e.g., 0.4) and re-score them
thr = 0.40
y_pred_custom = (y_proba >= thr).astype(int)
acc_custom = accuracy_score(y_test, y_pred_custom)
prec_custom = precision_score(y_test, y_pred_custom)
rec_custom = recall_score(y_test, y_pred_custom)
f1_custom = f1_score(y_test, y_pred_custom)
print(f"Threshold={thr:.2f}  Acc={acc_custom:.3f}  "
      f"Prec={prec_custom:.3f}  Rec={rec_custom:.3f}  "
      f"F1={f1_custom:.3f}")

## 9) Quick Cross-Validation on Train Set

In [None]:
# 5-fold F1 on the training split only; the test split is not touched here
cv_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, scoring="f1", cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print("CV F1 mean ± std:", f"{cv_mean:.3f} ± {cv_std:.3f}")

## 10) Inspect “Feature Effects” (approximate)

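Because the pipeline one-hot encodes the categorical features, the model's coefficients correspond to the *transformed* columns (the scaled numeric features plus one indicator column per category), not to the original columns. We'll recover the transformed feature names and pair each one with its weight.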

In [None]:
# Refit on the training split so every pipeline step exposes its fitted attributes
clf.fit(X_train, y_train)

# Rebuild the transformed column names in the order the ColumnTransformer emits
# them: the "num" columns first (names unchanged), then the one-hot columns.
ohe = clf.named_steps["prep"].named_transformers_["cat"].named_steps["onehot"]
# dtype=object on the empty fallback keeps it compatible with the name arrays
cat_out = ohe.get_feature_names_out(cat_features) if len(cat_features) > 0 else np.array([], dtype=object)
num_out = np.array(num_features)
all_feats = np.concatenate([num_out, cat_out])

# One weight per transformed column, flattened to 1-D
coefs = clf.named_steps["model"].coef_.ravel()

# Rank by magnitude rather than signed value: a large negative weight is as
# influential as a large positive one, and a signed sort followed by head(10)
# would hide the most negative effects.
feat_importance = pd.DataFrame({"feature": all_feats, "coef": coefs})
ranked = feat_importance["coef"].abs().sort_values(ascending=False).index
feat_importance = feat_importance.loc[ranked]
feat_importance.head(10)

## 11) Minimal Hyperparameter Tuning (Grid Search)

In [None]:
# GridSearchCV is imported in the setup cell at the top of the notebook.

# Small grid over the regularization strength. Keys follow the
# "<step name>__<parameter>" convention, so they target the pipeline's "model" step.
param_grid = {
    "model__C": [0.1, 1.0, 10.0],
    "model__penalty": ["l2"]
}

# 3-fold CV on the training split only, optimizing F1; n_jobs=-1 uses all cores
grid = GridSearchCV(clf, param_grid, cv=3, scoring="f1", n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)

# Score the winning pipeline once on the held-out test split
best = grid.best_estimator_
print("Test F1:", f1_score(y_test, best.predict(X_test)))

## 12) Save & Load the Trained Pipeline

In [None]:
# joblib is imported in the setup cell at the top of the notebook.

# Persist the tuned pipeline; preprocessing and model are stored together in one file
MODEL_PATH = "titanic_logreg_pipeline.joblib"
joblib.dump(best, MODEL_PATH)

# NOTE: joblib files are pickle-based, so only load artifacts you created or trust.
loaded = joblib.load(MODEL_PATH)

# Sanity check: the reloaded pipeline should score the same as `best`
print("Reloaded model test accuracy:", accuracy_score(y_test, loaded.predict(X_test)))


## 13) "What-If" Prediction (single row)

In [None]:
# Describe one hypothetical passenger (adjust the keys to the columns you have)
passenger = {
    "pclass": 2,
    "sex": "female",
    "age": 28,
    "sibsp": 0,
    "parch": 0,
    "fare": 20.0,
    "embarked": "S",
}
one = pd.DataFrame([passenger])

# Probability of the positive class and the resulting hard prediction
survival_prob = loaded.predict_proba(one)[0, 1]
predicted_class = int(loaded.predict(one)[0])
print("Survival probability:", survival_prob)
print("Predicted class     :", predicted_class)

## 14) EDA

In [None]:
# Side-by-side horizontal bars: mean survival rate per group, one panel per column
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for panel, column in zip(ax, ("sex", "pclass")):
    # Leave the panel empty when the column is not among the features
    if column not in X.columns:
        continue
    rates = data.groupby(column)["survived"].mean().sort_values()
    rates.plot(kind="barh", ax=panel, title=f"Survival rate by {column}")
plt.tight_layout()
plt.show()

You now have a clean, end-to-end classifier with:

* Train/test split
* Robust preprocessing (impute, scale, encode)
* Interpretable metrics (Accuracy, P/R/F1, ROC-AUC)
* Threshold tuning, CV, light hyperparam search
* Save/load + single-row inference