# Week 5 — Support Vector Machines (CKD Capstone)
**Dataset:** `/mnt/data/Chronic_Kidney_Dsease_data.csv`  
**Target:** `Diagnosis` (binary)  
**Notes:** Drops identifier columns `['PatientID', 'DoctorInCharge']` if present; imputes missing values; standardizes numeric features; one-hot encodes categorical features; uses class weighting to handle class imbalance.


## 0) Setup & imports

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC, SVR
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, confusion_matrix, 
                             classification_report)

# Notebook-wide defaults: seed NumPy's global RNG and set the default
# figure size used by every plot below.
np.random.seed(42)
plt.rcParams.update({'figure.figsize': (7, 4)})
print('Libraries imported OK')


## 1) Load CKD data

In [None]:
# Where the CSV lives, which column is the label, and which columns are
# treated as identifiers.
DATA_PATH = "/mnt/data/Chronic_Kidney_Dsease_data.csv"
TARGET_COL = "Diagnosis"
ID_COLS = ['PatientID', 'DoctorInCharge']

# Read the raw table and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print('Shape:', df.shape)
# Final expression of the cell: the notebook renders the first five rows.
df.head(n=5)

## 2) Train/test split and preprocessing

In [None]:
# Remove identifier columns; errors='ignore' skips any that are absent.
df = df.drop(columns=ID_COLS, errors='ignore')

# Separate the label from the predictors.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Partition the predictors by dtype: numeric columns vs. everything else.
numeric_mask = X.dtypes.map(pd.api.types.is_numeric_dtype).to_numpy(dtype=bool)
num_cols = X.columns[numeric_mask].tolist()
cat_cols = X.columns[~numeric_mask].tolist()
print(f'Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}')

# Hold out 20% for testing, preserving the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Numeric branch: median imputation followed by standardization.
numeric_tf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding; categories
# unseen during fit are ignored at transform time instead of raising.
categorical_tf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oh', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer(transformers=[
    ('num', numeric_tf, num_cols),
    ('cat', categorical_tf, cat_cols),
])

## 3) Baselines: Linear vs RBF vs Polynomial SVM

In [None]:
from sklearn.base import clone

# Kernel-specific SVC arguments; every other setting is shared by all baselines.
kernel_settings = {
    "linear": {'kernel': 'linear'},
    "rbf":    {'kernel': 'rbf'},
    "poly":   {'kernel': 'poly', 'degree': 3},
}

# Pipeline does not clone its steps, so handing the same `preprocess` object
# to several pipelines would make them share (and overwrite) one fitted
# transformer. Each baseline therefore gets its own unfitted clone.
# probability=True is required here because predict_proba is called below.
models = {
    kernel: Pipeline([
        ('prep', clone(preprocess)),
        ('svm', SVC(class_weight='balanced', probability=True,
                    random_state=42, **settings)),
    ])
    for kernel, settings in kernel_settings.items()
}

# Fit each baseline on the training split and score it on the held-out split.
rows = []
for name, m in models.items():
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    # Column 1 is the probability of the second entry of `classes_`;
    # assumes Diagnosis is encoded 0/1 with 1 as the positive class — TODO confirm.
    y_prob = m.predict_proba(X_test)[:, 1]
    rows.append({
        'kernel': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_prob)
    })
# Final expression of the cell: one row of test metrics per kernel.
pd.DataFrame(rows)

## 4) 5-fold CV comparison

In [None]:
# Shared splitter: 5 shuffled, class-stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def cv_kernel(name, est):
    """Cross-validate one estimator behind the shared preprocessing.

    Parameters
    ----------
    name : label stored under the 'kernel' key of the result.
    est : estimator placed as the 'svm' step after `preprocess`.

    Returns
    -------
    dict with 'kernel' plus the fold-averaged accuracy, F1 and ROC AUC
    under 'acc_cv', 'f1_cv' and 'auc_cv'.
    """
    # NOTE(review): scoring uses the full X, y (held-out test rows included),
    # not X_train / y_train — confirm this is intended.
    model = Pipeline(steps=[('prep', preprocess), ('svm', est)])
    results = cross_validate(model, X, y, cv=cv,
                             scoring=['accuracy', 'f1', 'roc_auc'])

    # Map each output column to the cross_validate key it is averaged from.
    sources = {'acc_cv': 'test_accuracy',
               'f1_cv': 'test_f1',
               'auc_cv': 'test_roc_auc'}
    summary = {'kernel': name}
    for column, key in sources.items():
        summary[column] = results[key].mean()
    return summary

# Estimators to compare under cross-validation.
# probability=True is deliberately omitted: the accuracy/f1 scorers call
# predict and the roc_auc scorer uses SVC.decision_function, so probability
# calibration (an extra internal 5-fold CV on every fit) would only add
# run time without changing any reported score.
cv_estimators = {
    'linear': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'rbf':    SVC(kernel='rbf', class_weight='balanced', random_state=42),
    'poly':   SVC(kernel='poly', degree=3, class_weight='balanced', random_state=42),
}

# One summary row (mean accuracy / F1 / ROC AUC) per kernel.
cv_rows = [cv_kernel(name, est) for name, est in cv_estimators.items()]
cv_df = pd.DataFrame(cv_rows)
# Final expression of the cell: the notebook renders the comparison table.
cv_df

## 5) Regularization sweeps: C and gamma (RBF)

In [None]:
def sweep_param(param, values):
    """Cross-validated ROC AUC of an RBF SVM for each value of one parameter.

    Parameters
    ----------
    param : name of the SVC parameter to vary (e.g. 'C' or 'gamma').
    values : iterable of settings to try for `param`.

    Returns
    -------
    list with the mean test ROC AUC across the folds of `cv`, one entry per
    element of `values`, in the same order.
    """
    # NOTE(review): scoring uses the full X, y (held-out test rows included),
    # not X_train / y_train — confirm this is intended.
    means = []
    for v in values:
        # probability=True is not needed: the 'roc_auc' scorer ranks samples
        # with SVC.decision_function, and enabling calibration would add an
        # internal 5-fold CV to every fit without changing the score.
        est = SVC(kernel='rbf', class_weight='balanced', random_state=42)
        est.set_params(**{param: v})
        pipe = Pipeline([('prep', preprocess), ('svm', est)])
        fold_scores = cross_validate(pipe, X, y, cv=cv, scoring='roc_auc')['test_score']
        means.append(fold_scores.mean())
    return means

import numpy as np, matplotlib.pyplot as plt
# Log-spaced candidate values: 8 for C in [1e-2, 1e2], 9 for gamma in [1e-3, 1e1].
C_vals = np.logspace(-2, 2, 8)
gamma_vals = np.logspace(-3, 1, 9)

# Sweep C (all other SVC settings as fixed in sweep_param) and plot on a log x-axis.
auc_C = sweep_param('C', C_vals)
plt.figure()
plt.plot(C_vals, auc_C, marker='o')
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('CV ROC AUC')
plt.title('RBF: C sweep')
plt.show()

# Sweep gamma the same way.
auc_g = sweep_param('gamma', gamma_vals)
plt.figure()
plt.plot(gamma_vals, auc_g, marker='o')
plt.xscale('log')
plt.xlabel('gamma')
plt.ylabel('CV ROC AUC')
plt.title('RBF: gamma sweep')
plt.show()

## 6) GridSearchCV on RBF/Poly

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Settings searched for both kernels.
shared_grid = {
    'svm__C': np.logspace(-2, 2, 6),
    'svm__gamma': ['scale'] + list(np.logspace(-3, 0, 4)),
}
# One sub-grid per kernel: `degree` only affects the polynomial kernel, so
# crossing it with 'rbf' would fit every RBF candidate twice for identical
# scores. The list form searches 30 RBF + 60 poly candidates instead of 120.
param_grid = [
    {'svm__kernel': ['rbf'], **shared_grid},
    {'svm__kernel': ['poly'], 'svm__degree': [2, 3], **shared_grid},
]

# Tune on the training split only; GridSearchCV refits the best candidate on
# all of X_train, y_train when the search finishes.
pipe = Pipeline([('prep', preprocess), ('svm', SVC(class_weight='balanced', probability=True, random_state=42))])
gs = GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=cv)
gs.fit(X_train, y_train)
print('Best params:', gs.best_params_)
print('Best CV ROC AUC:', gs.best_score_)

# Evaluate the refit winner on the held-out test split; predict once and
# reuse the labels for both the report and the confusion matrix.
best = gs.best_estimator_
y_test_pred = best.predict(X_test)
print(classification_report(y_test, y_test_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred)
plt.title('Confusion Matrix (Test)')
plt.show()

## 7) Notes: kernel trick & regularization

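SVMs achieve non-linear decision boundaries via a kernel function that computes pairwise similarities between samples as if they had been mapped into a higher-dimensional feature space, without ever computing that mapping explicitly.  
`C` is the inverse regularization strength: a larger `C` penalizes misclassified training points more heavily and can overfit, while a smaller `C` tolerates more margin violations and gives a smoother boundary. `gamma` sets the kernel scale for the RBF and polynomial kernels (a larger `gamma` yields more local, more complex boundaries), and `degree` applies only to the polynomial kernel.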


## 8) Conclusions (fill in)

- Which kernel worked best on CKD?  
- How did `C` and `gamma` affect performance?  
- Class imbalance observations and next steps for Milestone One.
