# Minimal Modeling Notebook — Loan Risk (GitHub-ready)

This concise, documented notebook walks through a loan-risk modeling pipeline: loading the data, simple preprocessing, training two models, evaluating them, and saving the resulting artifacts. It is intended to be suitable for publishing on GitHub.

**Dataset path:** `/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv`

Run cells sequentially.

In [None]:
# Cell 1: Imports and config
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import joblib

# Plot defaults shared by every figure in the notebook.
plt.rcParams.update({'figure.figsize': (8, 4)})
sns.set(style='whitegrid')

# Location of the input CSV; edit this one constant to point at another copy.
DATA_PATH = r"/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv"
print('Dataset path:', DATA_PATH)

In [None]:
# Cell 2: Load data and quick inspect
# Read the CSV once; later cells only read from `df`.
df = pd.read_csv(DATA_PATH)

# Dimensions first, then a small preview of the rows.
n_rows_cols = df.shape
print('Shape:', n_rows_cols)
preview = df.head()
display(preview)

# Per-column dtypes as inferred by pandas.
print('\nData types:')
column_types = df.dtypes
display(column_types)

In [None]:
# Cell 3: Choose target and brief check
# Use 'loan_status' if present; otherwise fall back to the first column (in
# file order) whose number of distinct non-null values is between 2 and 10.
# The lower bound excludes constant and all-missing columns, which cannot be
# used as a classification target.
if 'loan_status' in df.columns:
    target = 'loan_status'
else:
    distinct_counts = df.nunique()  # computed once; NaN is not counted
    candidates = distinct_counts[(distinct_counts >= 2) & (distinct_counts <= 10)].index
    target = candidates[0] if len(candidates) > 0 else None

# Fail before printing so the output never reports "Target chosen: None".
if target is None:
    raise ValueError('No suitable target found. Set target manually.')
print('Target chosen:', target)
display(df[target].value_counts())

In [None]:
# Cell 4: Split features and target, identify column types
# Rows whose target is missing are dropped instead of being filled with the
# most frequent class: imputing labels invents ground truth that would be used
# for both training and evaluation.
has_label = df[target].notna()
n_unlabeled = int((~has_label).sum())
if n_unlabeled:
    print('Dropped rows with missing target:', n_unlabeled)

X = df.loc[has_label].drop(columns=[target]).copy()

# factorize assigns integer codes in order of first appearance;
# target_classes[i] is the original label behind code i.
y, target_classes = pd.factorize(df.loc[has_label, target])
print('Label encoding:', dict(enumerate(target_classes)))

# Columns are routed to the numeric or categorical pipeline by dtype.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()
print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)

In [None]:
# Cell 5: Preprocessing pipelines (simple & reproducible)
# Numeric columns: median imputation followed by standardization.
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases removed the old name. Try the new spelling first and fall
# back to the old one so the notebook runs on both old and new versions.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are encoded as all zeros.
cat_pipe = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', onehot)])
preprocessor = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)])

# Quick transform to show resultant shape. This fit on the full feature table
# is only a shape check; the model pipelines below fit the preprocessor again
# on the training split.
X_trans = preprocessor.fit_transform(X)
print('Transformed feature matrix shape:', X_trans.shape)

In [None]:
# Cell 6: Train-test split
# Hold out 20% of the rows, keeping class proportions equal in both parts.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_tr, X_te, y_tr, y_te = split_parts

n_train = X_tr.shape[0]
n_test = X_te.shape[0]
print('Train size:', n_train, 'Test size:', n_test)

In [None]:
# Cell 7: Train Logistic Regression (pipeline)
# Preprocessing and the classifier are chained so both are fit on the
# training split only.
lr_clf = LogisticRegression(max_iter=1000)
pipe_lr = Pipeline(steps=[('pre', preprocessor), ('clf', lr_clf)])
pipe_lr.fit(X_tr, y_tr)
pred_lr = pipe_lr.predict(X_te)

# Class-1 probabilities are kept only for a two-class target.
lr_two_class = len(np.unique(y)) == 2
if hasattr(pipe_lr.named_steps['clf'], 'predict_proba') and lr_two_class:
    proba_lr = pipe_lr.predict_proba(X_te)[:, 1]
else:
    proba_lr = None

# Binary F1 for two classes, macro-averaged F1 otherwise.
if lr_two_class:
    lr_average = 'binary'
else:
    lr_average = 'macro'

print('Logistic Regression metrics:')
print('Accuracy:', accuracy_score(y_te, pred_lr))
print('F1:', f1_score(y_te, pred_lr, average=lr_average))

In [None]:
# Cell 8: Train RandomForest (pipeline)
# Same preprocessing as the logistic model, followed by a 200-tree forest
# with a fixed seed.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
pipe_rf = Pipeline(steps=[('pre', preprocessor), ('clf', rf_clf)])
pipe_rf.fit(X_tr, y_tr)
pred_rf = pipe_rf.predict(X_te)

# Class-1 probabilities are kept only for a two-class target.
rf_two_class = len(np.unique(y)) == 2
if hasattr(pipe_rf.named_steps['clf'], 'predict_proba') and rf_two_class:
    proba_rf = pipe_rf.predict_proba(X_te)[:, 1]
else:
    proba_rf = None

# Binary F1 for two classes, macro-averaged F1 otherwise.
if rf_two_class:
    rf_average = 'binary'
else:
    rf_average = 'macro'

print('RandomForest metrics:')
print('Accuracy:', accuracy_score(y_te, pred_rf))
print('F1:', f1_score(y_te, pred_rf, average=rf_average))

In [None]:
# Cell 9: Compare models (detailed metrics and confusion matrices)
def eval_model(y_true, y_pred, y_proba=None):
    """Compute a small dictionary of classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth integer class codes.
    y_pred : array-like
        Predicted class codes, same length as ``y_true``.
    y_proba : array-like or None, optional
        Scores passed to ``roc_auc_score`` together with ``y_true``. When
        None, ``roc_auc`` is reported as NaN.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``roc_auc``.

    Notes
    -----
    The averaging mode is derived from the labels passed in, not from any
    notebook-level variable, so the function gives the same answer wherever it
    is called. ``average='binary'`` is used only when every label seen in
    ``y_true`` and ``y_pred`` is 0 or 1; any other label set falls back to
    macro averaging. Precision, recall and F1 use ``zero_division=0``.
    """
    seen_labels = np.union1d(y_true, y_pred)
    average = 'binary' if np.isin(seen_labels, (0, 1)).all() else 'macro'

    res = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average=average, zero_division=0),
        'recall': recall_score(y_true, y_pred, average=average, zero_division=0),
        'f1': f1_score(y_true, y_pred, average=average, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan,
    }
    return res

# Score both fitted models on the held-out split.
res_lr = eval_model(y_te, pred_lr, proba_lr)
res_rf = eval_model(y_te, pred_rf, proba_rf)

# One row per model, one column per metric.
model_names = ['LogisticRegression', 'RandomForest']
results_df = pd.DataFrame(data=[res_lr, res_rf], index=model_names)
display(results_df)

# Confusion matrix for the random forest only.
rf_confusion = confusion_matrix(y_te, pred_rf)
print('\nConfusion matrix (RandomForest):')
print(rf_confusion)

In [None]:
# Cell 10: Save best model and results
# Choose RandomForest as an example "best" model (you can change this logic)
# Each output path is defined once so the file written and the message
# printed cannot drift apart.
MODEL_PATH = '/mnt/data/best_model.pkl'
RESULTS_PATH = '/mnt/data/model_results_summary.csv'

# NOTE: joblib files are pickles; only load them back from sources you trust.
joblib.dump(pipe_rf, MODEL_PATH)
results_df.to_csv(RESULTS_PATH, index=True)
print('Saved model to', MODEL_PATH)
print('Saved results to', RESULTS_PATH)

## How to use this notebook on GitHub

- Include `requirements.txt` listing the libraries (scikit-learn, pandas, numpy, matplotlib, seaborn, joblib).
- Add a short `README.md` explaining how to run the notebook and what each cell does.
- Use Git LFS for storing any large model artifacts (if needed).