# In [None]:  (Jupyter cell prompt left over from the notebook export)
# end_to_end_ml.py
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1) Build a synthetic tabular dataset
# NOTE: every draw below consumes the same seeded generator, so the order of
# the rng calls determines the sampled values; keep it fixed.
rng = np.random.default_rng(0)
n = 500  # number of rows to simulate

age = rng.integers(18, 80, size=n)                        # numeric, upper bound exclusive
income = rng.normal(loc=50_000, scale=20_000, size=n)     # numeric
gender = rng.choice(['M', 'F'], size=n, p=[0.55, 0.45])   # categorical
bp = rng.normal(loc=120, scale=15, size=n)                # numeric: blood pressure
smoker = rng.choice(['yes', 'no'], size=n, p=[0.2, 0.8])  # categorical

# Target = Bernoulli draw from a logistic model over a linear risk score.
# The per-feature contributions are named individually and summed in a fixed
# left-to-right order so the floating-point result stays reproducible.
age_term = 0.03 * (age - 40)
income_term = 0.00002 * (income - 50_000)
bp_term = 0.02 * (bp - 120)
gender_term = (gender == 'M') * 0.1
smoker_term = (smoker == 'yes') * 0.3
risk_score = age_term + income_term + bp_term + gender_term + smoker_term

prob = 1 / (1 + np.exp(-risk_score))
y = (rng.random(size=n) < prob).astype(int)  # binary label: 1 = high risk, 0 = low risk

df = pd.DataFrame(dict(
    age=age,
    income=income,
    gender=gender,
    bp=bp,
    smoker=smoker,
    label=y,
))

# 2) Blank out some values so the imputers have something to do:
#    each row loses `income` with probability 0.05 and `bp` with probability 0.03.
mask = rng.random(size=n) < 0.05
df.loc[mask, 'income'] = np.nan

mask2 = rng.random(size=n) < 0.03
df.loc[mask2, 'bp'] = np.nan

# 3) Separate features from the target and hold out 25% of the rows for testing
label_column = 'label'
X = df.drop(columns=[label_column])
y = df[label_column].to_numpy()

# Stratify on the label so both partitions keep the same class balance;
# the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# 4) Preprocessing pipeline
# Numeric columns: fill missing values with the column median, then standardize.
numeric_features = ['age', 'income', 'bp']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),   # fill missing numeric values
    ('scaler', StandardScaler())                     # standardize
])

# Categorical columns: replace missing entries with the literal 'missing'
# category, then one-hot encode, dropping the first level of each feature.
categorical_features = ['gender', 'smoker']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name entirely, so passing
# `sparse=False` raises TypeError on current releases. Prefer the new name and
# fall back to the old one only when the installed version does not know it.
try:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop='first', sparse=False)

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', onehot_encoder)
])

# Route each column group through its transformer; any column not listed
# above is discarded (remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

# 5) Candidate classifiers, each placed behind the shared preprocessing step
classifiers = {
    'decision_tree': DecisionTreeClassifier(random_state=0),
    'random_forest': RandomForestClassifier(random_state=0),
    'svm': SVC(probability=False, random_state=0),
    'knn': KNeighborsClassifier(),
}

# Every pipeline has the same two steps: 'pre' (preprocessing) and 'clf'
# (the estimator). The step names matter: later parameter grids address the
# estimator through the 'clf__' prefix.
models = {
    model_name: Pipeline([('pre', preprocessor), ('clf', classifier)])
    for model_name, classifier in classifiers.items()
}

# 6) Baseline: 3-fold cross-validated accuracy for every candidate pipeline,
#    computed on the training partition only
print("Cross-validated accuracy (3-fold) for each model:")
for name, pipe in models.items():
    scores = cross_val_score(
        pipe,
        X_train,
        y_train,
        cv=3,
        scoring='accuracy',
    )
    mean_accuracy = scores.mean()
    std_accuracy = scores.std()
    print(f"  {name}: mean={mean_accuracy:.3f} std={std_accuracy:.3f}")

# 7) Hyper-parameter search (small example grids) for Random Forest and SVM
# Both searches share the same CV setup: 3 folds, accuracy, all CPU cores.
search_options = dict(cv=3, scoring='accuracy', n_jobs=-1)

# --- Random Forest: number of trees x maximum depth ---
param_grid_rf = dict(
    clf__n_estimators=[50, 100],
    clf__max_depth=[None, 5, 10],
)
grid_rf = GridSearchCV(models['random_forest'], param_grid_rf, **search_options)
grid_rf.fit(X_train, y_train)
print("Random Forest best params:", grid_rf.best_params_)
print("Random Forest best CV score:", grid_rf.best_score_)

# --- SVM: regularization strength x kernel ---
param_grid_svm = dict(
    clf__C=[0.1, 1.0, 10.0],
    clf__kernel=['rbf', 'linear'],
)
grid_svm = GridSearchCV(models['svm'], param_grid_svm, **search_options)
grid_svm.fit(X_train, y_train)
print("SVM best params:", grid_svm.best_params_)
print("SVM best CV score:", grid_svm.best_score_)

# 8) Score the tuned pipelines on the held-out test partition
best_rf = grid_rf.best_estimator_
best_svm = grid_svm.best_estimator_

tuned_estimators = {'RandomForest': best_rf, 'SVM': best_svm}
for name, estimator in tuned_estimators.items():
    y_pred = estimator.predict(X_test)

    # Compute all three summaries first, then report them together.
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)

    print(f"\n{name} Test Accuracy: {acc:.3f}")
    print(report)
    print("Confusion matrix:\n", matrix)

# 9) Quick check of Decision Tree importance (train a tree for interpretability)
tree_pipeline = models['decision_tree']
tree_pipeline.fit(X_train, y_train)
dt_clf = tree_pipeline.named_steps['clf']
print("\nDecision Tree feature importances (after preprocessing):")
print(dt_clf.feature_importances_)

# The bare array above is hard to read: the ColumnTransformer emits the
# numeric columns first and then the one-hot-expanded categorical columns, so
# positions no longer match the original DataFrame. Pair each importance with
# the name of the transformed feature it belongs to. Fitting the pipeline
# fitted its 'pre' step in place, so the names can be read from it directly.
try:
    feature_names = tree_pipeline.named_steps['pre'].get_feature_names_out()
except AttributeError:
    # Older scikit-learn releases do not provide get_feature_names_out on
    # every transformer; in that case only the raw array above is shown.
    feature_names = None

if feature_names is not None:
    for feature_name, importance in zip(feature_names, dt_clf.feature_importances_):
        print(f"  {feature_name}: {importance:.4f}")
