# Predicting Loan Approval (Kaggle-ready)

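This notebook is prepared to run on Kaggle. It covers preprocessing, baseline models, GridSearchCV experiments, evaluation metrics and plots. Attach a dataset containing `loan_data.csv` through Kaggle's "Add Data" panel (or, for local runs, place the file next to the notebook); the loading cell searches both locations automatically. If your file lives elsewhere, update the path in the loading cell.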

## Requirements

This notebook uses: `pandas`, `numpy`, `scikit-learn`, `matplotlib`, `seaborn`, `joblib`.

Kaggle environment usually has these preinstalled. For local runs, use the included `requirements.txt`.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import joblib
sns.set(style='whitegrid')


In [None]:
# Load dataset (auto-detect for local or Kaggle)
#
# Builds `csv_candidates`, a priority-ordered list of CSV paths, and loads the
# first one into `df`. When nothing is found, `df` is set to None and a hint is
# printed instead of raising, so the notebook can still be opened and edited.
import os
from glob import glob

csv_candidates = []

# 1) Local file in working dir
local_path = "loan_data.csv"
if os.path.exists(local_path):
    csv_candidates.append(local_path)

# 2) Look for common Kaggle dataset path(s)
kaggle_base = "/kaggle/input"
if os.path.exists(kaggle_base):
    # glob() yields paths in arbitrary filesystem order; sort so the file that
    # ends up first (and therefore gets loaded) is the same on every run.
    kaggle_csvs = sorted(glob(os.path.join(kaggle_base, "**", "*.csv"), recursive=True))
    # heuristic: 'loan' appears anywhere in the path (file name or a parent folder)
    loan_csvs = [p for p in kaggle_csvs if 'loan' in p.lower()]
    # Stable sort: files named exactly like `local_path` outrank looser matches.
    loan_csvs.sort(key=lambda p: os.path.basename(p).lower() != local_path)
    csv_candidates.extend(loan_csvs)

# 3) As a fallback, add any csv in working dir
for p in sorted(glob("*.csv")):
    if p not in csv_candidates:
        csv_candidates.append(p)

if not csv_candidates:
    print("No CSV files found. Please upload 'loan_data.csv' to the notebook directory or add the Kaggle dataset via Add Data.")
    # Downstream cells must treat None as "not loaded".
    df = None
else:
    # pick the first candidate (most likely match); show all found for transparency
    print("Found CSV candidates (in order):")
    for i, p in enumerate(csv_candidates, start=1):
        print(f"{i}. {p}")
    csv_path = csv_candidates[0]
    print("\nUsing:", csv_path)
    df = pd.read_csv(csv_path)
    print("Loaded dataframe shape:", df.shape)
    display(df.head())

In [None]:
# Quick EDA
#
# Prints column dtypes, per-column missing-value counts and summary statistics.
# The loading cell leaves `df` as None when no CSV was found, and `df` is
# undefined if that cell never ran; both cases are reported the same way
# instead of failing with AttributeError / NameError.
if globals().get('df') is None:
    print("Dataframe 'df' not loaded. Run the previous cell to load the CSV.")
else:
    df.info()
    display(df.isna().sum())
    display(df.describe(include='all').T)

In [None]:
# Preprocessing & Feature Engineering
def preprocess_loans(frame):
    """Return a cleaned copy of the raw loan table with engineered features.

    Steps applied (the input frame is never modified):
      * drop the `Loan_ID` column when present;
      * encode `Loan_Status` as 1 ('Y') / 0 ('N');
      * rewrite the `Dependents` value '3+' as '3';
      * add `TotalIncome` = ApplicantIncome + CoapplicantIncome (a missing
        column counts as 0);
      * add `LoanAmount_log` and `TotalIncome_log` (log1p, with missing
        values replaced by the column median first).

    The function is idempotent, so re-running the notebook cell on an already
    processed frame does not corrupt the target column.

    Raises:
        KeyError: if `Loan_Status` or `LoanAmount` is missing.
    """
    out = frame.copy()
    if 'Loan_ID' in out.columns:
        out = out.drop(columns='Loan_ID')
    # target: only map while it still holds the raw 'Y'/'N' labels; mapping an
    # already-encoded 0/1 column would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(out['Loan_Status']):
        out['Loan_Status'] = out['Loan_Status'].map({'Y': 1, 'N': 0})
    # Dependents: convert '3+' to '3'
    if 'Dependents' in out.columns:
        out['Dependents'] = out['Dependents'].replace('3+', '3')
    # TotalIncome
    out['TotalIncome'] = out.get('ApplicantIncome', 0) + out.get('CoapplicantIncome', 0)
    # log transforms (handle missing safely)
    # NOTE(review): these medians are computed on the full table, i.e. before
    # the train/test split, so a little test-set information reaches training.
    out['LoanAmount_log'] = np.log1p(out['LoanAmount'].fillna(out['LoanAmount'].median()))
    out['TotalIncome_log'] = np.log1p(out['TotalIncome'].fillna(out['TotalIncome'].median()))
    return out


# `df` is None when the loading cell found no CSV and undefined when that cell
# never ran; treat both as "not loaded".
if globals().get('df') is not None:
    df = preprocess_loans(df)
    display(df.head())
else:
    print('Load the data first.')

In [None]:
# Train-test split (70/30)
#
# Separates the target column from the features and creates a stratified
# 70/30 split with a fixed seed. `df` is None when the loading cell found no
# CSV and undefined when that cell never ran; both count as "not loaded".
if globals().get('df') is not None:
    target = 'Loan_Status'
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=42, stratify=y
    )
    print("Train:", X_train.shape, "Test:", X_test.shape)
else:
    print('Data not loaded')

In [None]:
# Preprocessing pipeline
#
# Columns from the fixed list below that exist in X are treated as numeric;
# every other column of X is treated as categorical.
numeric_features = [c for c in ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','TotalIncome','LoanAmount_log','TotalIncome_log'] if c in X.columns]
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the current name first and fall
# back to the old one so the notebook runs on both old and new installs.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [None]:
# Evaluation helper
def evaluate_model(model, X_train, X_test, y_train, y_test, name="model"):
    """Score an already fitted model on the train and test splits.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train, X_test: feature sets for the two splits.
        y_train, y_test: true labels for the two splits.
        name: label stored under the ``'model'`` key.

    Returns:
        dict with the model label plus accuracy, precision, recall and F1 for
        each split. Precision, recall and F1 use ``zero_division=0``.
    """
    # Predict once per split; every metric below reuses these predictions.
    labelled = {
        'train': (y_train, model.predict(X_train)),
        'test': (y_test, model.predict(X_test)),
    }

    def score(metric, split, **options):
        truth, predicted = labelled[split]
        return metric(truth, predicted, **options)

    return {
        'model': name,
        'train_accuracy': score(accuracy_score, 'train'),
        'test_accuracy': score(accuracy_score, 'test'),
        'train_precision': score(precision_score, 'train', zero_division=0),
        'test_precision': score(precision_score, 'test', zero_division=0),
        'train_recall': score(recall_score, 'train', zero_division=0),
        'test_recall': score(recall_score, 'test', zero_division=0),
        'train_f1': score(f1_score, 'train', zero_division=0),
        'test_f1': score(f1_score, 'test', zero_division=0),
    }

In [None]:
# Baseline models (fit and evaluate)
#
# Each classifier is wrapped in a pipeline that starts with the shared
# `preprocessor`, fitted on the training split and scored on both splits.
baseline_classifiers = {
    'Logistic_L2': LogisticRegression(penalty='l2', solver='liblinear', max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
}
models = {
    label: Pipeline(steps=[('pre', preprocessor), ('clf', classifier)])
    for label, classifier in baseline_classifiers.items()
}

results = []
for name, pipe in models.items():
    # A failure in one model is reported and must not stop the remaining ones.
    try:
        pipe.fit(X_train, y_train)
        res = evaluate_model(pipe, X_train, X_test, y_train, y_test, name=name)
    except Exception as err:
        print("Error for", name, ":", err)
    else:
        results.append(res)
        print(name, "done.")

results_df = pd.DataFrame(results)
display(results_df)

In [None]:
# Confusion matrices and classification reports
#
# For every baseline pipeline: print the per-class report on the test split and
# draw its confusion matrix as an annotated heatmap.
for name, pipe in models.items():
    try:
        print('\nModel:', name)
        test_predictions = pipe.predict(X_test)
        print(classification_report(y_test, test_predictions, zero_division=0))
        sns.heatmap(confusion_matrix(y_test, test_predictions), annot=True, fmt='d')
        plt.title(f'{name} - Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()
    except Exception as err:
        # e.g. a pipeline whose fit failed earlier cannot predict
        print('Error plotting for', name, err)

In [None]:
# Decision Tree visualization (shallow)
#
# Plots the top three levels of the baseline decision tree, labelling nodes
# with the column names produced by the fitted preprocessor.
try:
    dt_pipe = models['DecisionTree']
    dt = dt_pipe.named_steps['clf']
    fitted_pre = dt_pipe.named_steps['pre']
    # feature names after preprocessing: numeric columns keep their names,
    # one-hot columns are expanded from the categorical input columns
    onehot_step = fitted_pre.named_transformers_['cat'].named_steps['onehot']
    cat_input_cols = fitted_pre.transformers_[1][2]  # columns of the 'cat' entry
    feature_names = [*numeric_features, *onehot_step.get_feature_names_out(cat_input_cols)]
    plt.figure(figsize=(18, 10))
    plot_tree(
        dt,
        feature_names=feature_names,
        class_names=['N', 'Y'],
        filled=True,
        max_depth=3,
    )
    plt.show()
except Exception as err:
    print('Decision tree plot error:', err)

In [None]:
# GridSearchCV - Logistic Regression
#
# `l1_ratio` only has an effect when penalty='elasticnet'. Crossing it with
# 'l1'/'l2' repeats identical fits (and triggers warnings), while elasticnet
# with l1_ratio 0.0 / 1.0 is the same model as 'l2' / 'l1'. The search space is
# therefore split into two sub-grids that cover the same distinct models
# without the duplicates.
param_grid_lr = [
    {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [0.01, 0.1, 1, 10],
        'clf__solver': ['saga'],
    },
    {
        'clf__penalty': ['elasticnet'],
        'clf__C': [0.01, 0.1, 1, 10],
        'clf__solver': ['saga'],
        'clf__l1_ratio': [0.5],
    },
]
pipe_lr = Pipeline(steps=[('pre', preprocessor), ('clf', LogisticRegression(max_iter=5000, random_state=42))])
# 5-fold CV on the training split, optimising F1 of the positive class.
grid_lr = GridSearchCV(pipe_lr, param_grid_lr, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_lr.fit(X_train, y_train)
print('Best LR params:', grid_lr.best_params_)
best_lr = grid_lr.best_estimator_
res_lr = evaluate_model(best_lr, X_train, X_test, y_train, y_test, name='LogisticGrid')
res_lr

In [None]:
# GridSearchCV - Decision Tree
#
# Searches split criterion, depth and minimum split/leaf sizes with 5-fold CV
# on the training split, ranking candidates by F1.
pipe_dt = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
param_grid_dt = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [3, 5, 7, 10, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
}
grid_dt = GridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid_dt,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_dt.fit(X_train, y_train)
print('Best DT params:', grid_dt.best_params_)
# best_estimator_ is the winning pipeline as returned by the finished search
best_dt = grid_dt.best_estimator_
res_dt = evaluate_model(best_dt, X_train, X_test, y_train, y_test, name='DecisionTreeGrid')
res_dt

In [None]:
# GridSearchCV - Random Forest (smaller grid to save time)
#
# Searches forest size, depth and minimum split size with 4-fold CV on the
# training split, ranking candidates by F1.
pipe_rf = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])
param_grid_rf = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 5, 10],
    'clf__min_samples_split': [2, 5],
}
grid_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train, y_train)
print('Best RF params:', grid_rf.best_params_)
best_rf = grid_rf.best_estimator_
res_rf = evaluate_model(best_rf, X_train, X_test, y_train, y_test, name='RandomForestGrid')
res_rf

In [None]:
# GridSearchCV - AdaBoost
#
# Searches the number of estimators and the learning rate with 4-fold CV on
# the training split, ranking candidates by F1.
pipe_ab = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', AdaBoostClassifier(random_state=42)),
])
param_grid_ab = {
    'clf__n_estimators': [50, 100, 200],
    'clf__learning_rate': [0.5, 1.0, 1.5],
}
grid_ab = GridSearchCV(
    estimator=pipe_ab,
    param_grid=param_grid_ab,
    scoring='f1',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_ab.fit(X_train, y_train)
print('Best AB params:', grid_ab.best_params_)
best_ab = grid_ab.best_estimator_
res_ab = evaluate_model(best_ab, X_train, X_test, y_train, y_test, name='AdaBoostGrid')
res_ab

In [None]:
# SVM (no GridSearch)
#
# A single RBF-kernel SVM behind the shared preprocessor, fitted on the
# training split and scored on both splits.
svm_rbf = SVC(kernel='rbf', probability=True, random_state=42)
pipe_svm = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', svm_rbf),
])
pipe_svm.fit(X_train, y_train)
res_svm = evaluate_model(
    pipe_svm,
    X_train,
    X_test,
    y_train,
    y_test,
    name='SVM_RBF',
)
res_svm

In [None]:
# Combine all results (baseline + tuned)
def combine_results(baseline_rows, extra_rows):
    """Merge metric dicts into one table ranked by test F1 (best first).

    Args:
        baseline_rows: iterable of metric dicts from the baseline models.
        extra_rows: iterable of metric dicts from the tuned models.

    Returns:
        A DataFrame with one row per dict, sorted by ``'test_f1'`` in
        descending order with a fresh 0..n-1 index, or an empty DataFrame
        when both inputs are empty.
    """
    rows = [*baseline_rows, *extra_rows]
    if not rows:
        return pd.DataFrame()
    # Build the frame in one go: DataFrame.append was removed in pandas 2.0.
    table = pd.DataFrame(rows)
    return table.sort_values(by='test_f1', ascending=False).reset_index(drop=True)


# Pick up only the result variables whose cells have actually been run.
others = [
    globals()[var]
    for var in ['res_lr', 'res_dt', 'res_rf', 'res_ab', 'res_svm']
    if var in globals()
]
all_results = combine_results(globals().get('results', []), others)
if not all_results.empty:
    display(all_results)
else:
    print("No results to show yet.")

In [None]:
# Save the best model (example)
#
# Takes the top row of `all_results`, looks up the matching estimator by its
# label and writes it to 'best_model.joblib'.
if 'all_results' not in globals() or all_results.empty:
    print('No results dataframe available.')
else:
    best_name = all_results.loc[0, 'model']
    print("Best model according to test_f1:", best_name)
    # Map names to estimators: baseline labels come from `models`, tuned labels
    # from notebook globals that exist only if their cell has been run.
    baseline_labels = ('Logistic_L2', 'DecisionTree', 'RandomForest', 'AdaBoost', 'SVM')
    tuned_variables = {
        'LogisticGrid': 'best_lr',
        'DecisionTreeGrid': 'best_dt',
        'RandomForestGrid': 'best_rf',
        'AdaBoostGrid': 'best_ab',
        'SVM_RBF': 'pipe_svm',
    }
    names_map = {label: models.get(label) for label in baseline_labels}
    names_map.update(
        {label: globals().get(variable) for label, variable in tuned_variables.items()}
    )
    best_model = names_map.get(best_name)
    if best_model is None:
        print('Best model not found in map; save your preferred estimator manually.')
    else:
        joblib.dump(best_model, 'best_model.joblib')
        print('Saved best_model.joblib')

## Next steps

- Verify dataset upload on Kaggle (upload `loan_data.csv`).
- Run cells sequentially.
- Export results, download `best_model.joblib` if needed.

---

### To upload this notebook to Kaggle:
1. Create a new Kaggle notebook, choose 'Upload' and upload this `.ipynb` file.
2. Upload dataset file `loan_data.csv` to the notebook's dataset files or attach a dataset.

Good luck!