# Capstone Analysis Notebook

This notebook contains the step-by-step pipeline for your capstone:

- Phase 1: Descriptive analytics (EDA)
- Phase 2: Unsupervised ML (clustering)
- Phase 2: Feature selection
- Phase 3: Supervised ML comparison

**Dataset path:** `/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv`

Run cells sequentially.

In [8]:
# Cell 1: Imports and config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import silhouette_score, adjusted_rand_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['figure.figsize'] = (10,4)
sns.set(style='whitegrid')

DATA_PATH = r"/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv"
SUMMARY_OUT = r"/mnt/data/capstone_analysis_summary.csv"

print('Ready. Dataset path:', DATA_PATH)

Ready. Dataset path: /mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv


In [11]:
# Cell 2: Load dataset & preview
df = pd.read_csv ('Data_08_Simulated Loan Risk Assessment Data.csv')
print('Shape:', df.shape)
display(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'Data_08_Simulated Loan Risk Assessment Data.csv'

In [None]:
# Cell 3: dtypes & missing values
display(df.dtypes)
display(df.isnull().sum().sort_values(ascending=False).head(30))

In [None]:
# Cell 4: Descriptive statistics & central tendency
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
display(df[num_cols].describe().T)

ct = df[num_cols].agg(['mean','median','std','min','max']).T
ct['mode'] = df[num_cols].mode().iloc[0]
ct['skew'] = df[num_cols].skew()
ct['kurtosis'] = df[num_cols].kurtosis()
display(ct)

In [None]:
# Cell 5: Histograms and boxplots for numeric columns
for col in num_cols:
    fig, axes = plt.subplots(1,2, figsize=(12,3))
    sns.histplot(df[col].dropna(), kde=True, ax=axes[0])
    axes[0].set_title(f'Histogram: {col}')
    sns.boxplot(x=df[col], ax=axes[1])
    axes[1].set_title(f'Boxplot: {col}')
    plt.tight_layout()
    plt.show()

In [None]:
# Cell 6: Candidate target detection
candidate_targets = [c for c in df.columns if df[c].nunique() <= 10]
print('Candidate targets (<=10 unique):', candidate_targets)
target = 'loan_status' if 'loan_status' in df.columns else (candidate_targets[0] if candidate_targets else None)
print('Using target column:', target)
if target:
    display(df[target].value_counts())

In [None]:
# Cell 7: Impute and scale numeric features
imp = SimpleImputer(strategy='median')
X_num = pd.DataFrame(imp.fit_transform(df[num_cols]), columns=num_cols)

scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_num), columns=num_cols)
print('Prepared numeric feature matrix:', X_scaled.shape)

In [None]:
# Cell 8: PCA(2) for visualization
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print('Explained variance ratio (PC1,PC2):', pca.explained_variance_ratio_)
plt.scatter(X_pca[:,0], X_pca[:,1], s=10)
plt.title('PCA(2) projection of numeric features')
plt.xlabel('PC1'); plt.ylabel('PC2')
plt.show()

In [None]:
# Cell 9: KMeans silhouette search (k=2..6)
best_k, best_score = None, -1
for k in range(2,7):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labs = kmeans.fit_predict(X_scaled)
    try:
        s = silhouette_score(X_scaled, labs)
    except:
        s = -1
    print(f'k={k} silhouette={s:.4f}')
    if s > best_score:
        best_score = s; best_k = k; best_labels = labs; best_kmeans = kmeans

print('Best k:', best_k, 'score:', best_score)
plt.scatter(X_pca[:,0], X_pca[:,1], c=best_labels, s=10)
plt.title(f'KMeans clusters (k={best_k}) projected to PCA(2)')
plt.show()

In [None]:
# Cell 10: Compare clusters to actual target
if target and target in df.columns:
    print('Crosstab of KMeans clusters vs target')
    display(pd.crosstab(best_labels, df[target].fillna('MISSING')))
    try:
        ari = adjusted_rand_score(pd.factorize(df[target].fillna(df[target].mode()[0]))[0], best_labels)
        print('Adjusted Rand Index:', ari)
    except Exception as e:
        print('Could not compute ARI:', e)
else:
    print('No suitable target column found.')

In [None]:
# Cell 11: Correlation matrix & high-correlation pairs
corr = X_num.corr().abs()
plt.figure(figsize=(12,10))
sns.heatmap(corr, cmap='viridis', vmax=1, vmin=0)
plt.title('Absolute correlation matrix (numeric features)')
plt.show()

upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
high_corr = [(i,j,upper.loc[i,j]) for i in upper.index for j in upper.columns if (not pd.isna(upper.loc[i,j])) and upper.loc[i,j] > 0.85]
print('Highly correlated pairs (corr>0.85):', high_corr)

In [None]:
# Cell 12: RandomForest feature importances and SelectFromModel
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
if 'target' not in globals():
    target = 'loan_status' if 'loan_status' in df.columns else (candidate_targets[0] if candidate_targets else None)
if target and target in df.columns:
    y = pd.factorize(df[target].fillna(df[target].mode()[0]))[0]
else:
    y = np.zeros(X_num.shape[0], dtype=int)

rf.fit(X_num, y)
importances = pd.Series(rf.feature_importances_, index=X_num.columns).sort_values(ascending=False)
display(importances.head(20))

sel = SelectFromModel(rf, threshold='median', prefit=True)
selected_features = X_num.columns[sel.get_support()].tolist()
print('RF-selected features (threshold=median):', selected_features)

In [None]:
# Cell 13: Optional LassoCV (if target suitable)
try:
    if target and target in df.columns and df[target].nunique() <= 10:
        lasso = LassoCV(cv=5, random_state=42, max_iter=5000).fit(X_scaled, y)
        coef = pd.Series(lasso.coef_, index=X_scaled.columns).sort_values(key=lambda x: np.abs(x), ascending=False)
        display(coef.head(20))
    else:
        print('Skipping LassoCV: no suitable target or too many unique classes.')
except Exception as e:
    print('LassoCV error:', e)

In [None]:
# Cell 14: Prepare classifiers and parameter grids
features = selected_features if len(selected_features)>0 else X_num.columns.tolist()
X = X_num[features]
print('Feature matrix for modeling shape:', X.shape)

classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=2000),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'KNeighbors': KNeighborsClassifier()
}
param_grids = {
    'LogisticRegression': {'C':[0.01,0.1,1]},
    'RandomForest': {'n_estimators':[100,200], 'max_depth':[5,10,None]},
    'GradientBoosting': {'n_estimators':[100,200], 'learning_rate':[0.01,0.1]},
    'SVC': {'C':[0.1,1], 'kernel':['rbf','linear']},
    'KNeighbors': {'n_neighbors':[3,5]}
}

In [None]:
# Cell 15: GridSearchCV and evaluation across sample fractions
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []
sample_fracs = [0.1, 0.3, 0.5, 1.0]

for frac in sample_fracs:
    print('\nTraining fraction:', frac)
    if frac < 1.0:
        X_sub, _, y_sub, _ = train_test_split(X, y, train_size=frac, stratify=y, random_state=42)
    else:
        X_sub, y_sub = X, y

    for name, clf in classifiers.items():
        try:
            grid = GridSearchCV(clf, param_grids[name], cv=skf, scoring='accuracy', n_jobs=-1)
            grid.fit(X_sub, y_sub)
            best = grid.best_estimator_
            cv_score = cross_val_score(best, X_sub, y_sub, cv=skf, scoring='accuracy', n_jobs=-1).mean()
            Xtr, Xte, ytr, yte = train_test_split(X_sub, y_sub, test_size=0.2, stratify=y_sub, random_state=42)
            best.fit(Xtr, ytr)
            ypred = best.predict(Xte)
            proba = best.predict_proba(Xte)[:,1] if hasattr(best, 'predict_proba') and len(np.unique(y))==2 else None
            results.append({
                'frac': frac, 'classifier': name, 'best_params': grid.best_params_, 'cv_acc_mean': cv_score,
                'test_acc': accuracy_score(yte, ypred),
                'precision': precision_score(yte, ypred, average='binary' if len(np.unique(y))==2 else 'macro', zero_division=0),
                'recall': recall_score(yte, ypred, average='binary' if len(np.unique(y))==2 else 'macro', zero_division=0),
                'f1': f1_score(yte, ypred, average='binary' if len(np.unique(y))==2 else 'macro', zero_division=0),
                'roc_auc': roc_auc_score(yte, proba) if proba is not None else np.nan
            })
            print(f'{name} done. cv_acc={cv_score:.3f}')
        except Exception as e:
            print(f'Error with {name}:', e)

results_df = pd.DataFrame(results).sort_values(['frac','cv_acc_mean'], ascending=[True, False])
display(results_df.head(50))
results_df.to_csv(SUMMARY_OUT, index=False)
print('Saved summary to:', SUMMARY_OUT)

## Next steps

- Encode categorical features and re-run the pipeline
- Try LightGBM / XGBoost and larger hyperparameter searches
- Add SHAP explainability for the best model
- Tune thresholds and calibrate probabilities for production use

You can run the notebook top-to-bottom. If you want, I can also run any of these follow-ups for you now.