<a href="https://colab.research.google.com/github/ajvilleg/teach-zindi-loan-default-prediction-challenge/blob/main/loan_default_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [84]:
# New as of 2025-09-03: Display the full output for print/display code, not just the last result.
# With ast_node_interactivity set to "all", IPython shows the value of every top-level
# expression in a cell, so cells further down may render more than one output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [85]:
# Read the training loans (with the good_bad_flag target) and the test loans
# (without it) directly from the project's GitHub repository; needs network access.
train_df, test_df = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/ajvilleg/teach-zindi-loan-default-prediction-challenge/refs/heads/main/raw/trainperf.csv',
        'https://raw.githubusercontent.com/ajvilleg/teach-zindi-loan-default-prediction-challenge/refs/heads/main/raw/testperf.csv',
    )
)


In [86]:
# Size check: (rows, columns) of the training data
train_df.shape

(4368, 10)

In [87]:
# Display the training frame; pandas abbreviates the middle rows with a "..." row
train_df

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,15000.0,17250.0,30,,Good
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,20000.0,22250.0,15,,Good
3,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,10000.0,11500.0,15,,Good
4,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,40000.0,44000.0,30,,Good
...,...,...,...,...,...,...,...,...,...,...
4363,8a858e6d58b0cc520158beeb14b22a5a,302003163,2,2017-07-30 09:19:42.000000,2017-07-30 08:18:30.000000,10000.0,13000.0,30,,Bad
4364,8a858ee85cf400f5015cf44ab1c42d5c,301998967,2,2017-07-27 15:35:47.000000,2017-07-27 14:35:40.000000,10000.0,13000.0,30,,Bad
4365,8a858f365b2547f3015b284597147c94,301995576,3,2017-07-25 16:25:57.000000,2017-07-25 15:24:47.000000,10000.0,11500.0,15,,Bad
4366,8a858f935ca09667015ca0ee3bc63f51,301977679,2,2017-07-14 13:50:27.000000,2017-07-14 12:50:21.000000,10000.0,13000.0,30,8a858eda5c8863ff015c9dead65807bb,Bad


In [88]:
# First rows of the raw training data
train_df.head()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56.000000,2017-07-25 07:22:47.000000,30000.0,34500.0,30,,Good
1,8a85886e54beabf90154c0a29ae757c0,301965204,2,2017-07-05 17:04:41.000000,2017-07-05 16:04:18.000000,15000.0,17250.0,30,,Good
2,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57.000000,2017-07-06 13:52:51.000000,20000.0,22250.0,15,,Good
3,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41.000000,2017-07-27 18:00:35.000000,10000.0,11500.0,15,,Good
4,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45.000000,2017-07-03 22:42:39.000000,40000.0,44000.0,30,,Good


In [89]:
# Last rows of the raw training data
train_df.tail()

Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby,good_bad_flag
4363,8a858e6d58b0cc520158beeb14b22a5a,302003163,2,2017-07-30 09:19:42.000000,2017-07-30 08:18:30.000000,10000.0,13000.0,30,,Bad
4364,8a858ee85cf400f5015cf44ab1c42d5c,301998967,2,2017-07-27 15:35:47.000000,2017-07-27 14:35:40.000000,10000.0,13000.0,30,,Bad
4365,8a858f365b2547f3015b284597147c94,301995576,3,2017-07-25 16:25:57.000000,2017-07-25 15:24:47.000000,10000.0,11500.0,15,,Bad
4366,8a858f935ca09667015ca0ee3bc63f51,301977679,2,2017-07-14 13:50:27.000000,2017-07-14 12:50:21.000000,10000.0,13000.0,30,8a858eda5c8863ff015c9dead65807bb,Bad
4367,8a858fd458639fcc015868eb14b542ad,301967124,8,2017-07-06 21:01:06.000000,2017-07-06 20:01:01.000000,30000.0,34500.0,30,,Bad


In [90]:
# Column dtypes and non-null counts.
# In the recorded output only `referredby` has missing values (587 of 4368 rows populated).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4368 entries, 0 to 4367
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   customerid     4368 non-null   object 
 1   systemloanid   4368 non-null   int64  
 2   loannumber     4368 non-null   int64  
 3   approveddate   4368 non-null   object 
 4   creationdate   4368 non-null   object 
 5   loanamount     4368 non-null   float64
 6   totaldue       4368 non-null   float64
 7   termdays       4368 non-null   int64  
 8   referredby     587 non-null    object 
 9   good_bad_flag  4368 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 341.4+ KB


In [91]:
# Size check for the test data; it has one column fewer than train (no good_bad_flag target)
test_df.shape

(1450, 9)

In [92]:
# First rows of the raw test data.
# NOTE(review): approveddate/creationdate in this file show values like "40:48.0" rather than
# the full timestamps seen in the training file; both columns are dropped later, so this does
# not affect the current model, but it would matter if date features were added.
test_df.head()


Unnamed: 0,customerid,systemloanid,loannumber,approveddate,creationdate,loanamount,totaldue,termdays,referredby
0,8a858899538ddb8e015390510b321f08,301998974,4,40:48.0,39:35.0,10000,12250.0,30,
1,8a858959537a097401537a4e316e25f7,301963615,10,43:40.0,42:34.0,40000,44000.0,30,
2,8a8589c253ace09b0153af6ba58f1f31,301982236,6,15:11.0,15:04.0,20000,24500.0,30,
3,8a858e095aae82b7015aae86ca1e030b,301971730,8,00:54.0,00:49.0,30000,34500.0,30,
4,8a858e225a28c713015a30db5c48383d,301959177,4,04:33.0,04:27.0,20000,24500.0,30,


In [93]:
# Column dtypes and non-null counts for the test data.
# In the recorded output `loanamount` is int64 here but float64 in the training frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customerid    1450 non-null   object 
 1   systemloanid  1450 non-null   int64  
 2   loannumber    1450 non-null   int64  
 3   approveddate  1450 non-null   object 
 4   creationdate  1450 non-null   object 
 5   loanamount    1450 non-null   int64  
 6   totaldue      1450 non-null   float64
 7   termdays      1450 non-null   int64  
 8   referredby    184 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 102.1+ KB


In [94]:
# New as of 2025-09-03: Preserve customerid for final output
# This must run before the cell that drops 'customerid' from test_df; the ids are
# paired back with the predictions when the CSV is written at the end.
test_customer_ids = test_df['customerid']

In [95]:
# Remove the identifier, timestamp and referral columns that this notebook
# does not use for prediction, applying the same drop to both frames.
drop_columns = ['customerid', 'systemloanid', 'approveddate', 'creationdate', 'referredby']
train_df, test_df = (
    frame.drop(columns=drop_columns)
    for frame in (train_df, test_df)
)

In [96]:
# Encode target variable
# LabelEncoder assigns integer codes in sorted label order, so 'Bad' -> 0 and 'Good' -> 1
# (matches the encoded head/tail shown in the following cells).
# NOTE(review): the column is overwritten in place, so re-running this cell refits the
# encoder on the already-encoded integers and label_encoder.classes_ loses the string labels.
label_encoder = LabelEncoder()
train_df['good_bad_flag'] = label_encoder.fit_transform(train_df['good_bad_flag'])

In [97]:
# Check the reduced, encoded training frame (first rows)
train_df.head()


Unnamed: 0,loannumber,loanamount,totaldue,termdays,good_bad_flag
0,12,30000.0,34500.0,30,1
1,2,15000.0,17250.0,30,1
2,7,20000.0,22250.0,15,1
3,3,10000.0,11500.0,15,1
4,9,40000.0,44000.0,30,1


In [98]:
# Check the reduced, encoded training frame (last rows)
train_df.tail()

Unnamed: 0,loannumber,loanamount,totaldue,termdays,good_bad_flag
4363,2,10000.0,13000.0,30,0
4364,2,10000.0,13000.0,30,0
4365,3,10000.0,11500.0,15,0
4366,2,10000.0,13000.0,30,0
4367,8,30000.0,34500.0,30,0


In [99]:

# Separate features and target
# X holds every remaining column except the encoded label; y is the 0/1 label itself.
X = train_df.drop(columns=['good_bad_flag'])
y = train_df['good_bad_flag']


In [100]:
# Display the feature matrix (abbreviated by pandas)
X

Unnamed: 0,loannumber,loanamount,totaldue,termdays
0,12,30000.0,34500.0,30
1,2,15000.0,17250.0,30
2,7,20000.0,22250.0,15
3,3,10000.0,11500.0,15
4,9,40000.0,44000.0,30
...,...,...,...,...
4363,2,10000.0,13000.0,30
4364,2,10000.0,13000.0,30
4365,3,10000.0,11500.0,15
4366,2,10000.0,13000.0,30


In [101]:
# Display the encoded target (abbreviated by pandas)
y

Unnamed: 0,good_bad_flag
0,1
1,1
2,1
3,1
4,1
...,...
4363,0
4364,0
4365,0
4366,0


In [102]:
# New as of 2025-09-03: Identify numeric and categorical columns
# Columns are selected by dtype. After the earlier column drop, X contains only
# int64/float64 columns, so categorical_features is an empty list at this point.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()



In [103]:
# Confirm which columns will be routed to the numeric transformer
numeric_features

['loannumber', 'loanamount', 'totaldue', 'termdays']

In [104]:
# New as of 2025-09-03: Create preprocessing pipeline

# Numeric columns: replace missing values with the column mean.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: replace missing values with the most frequent value,
# then one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [105]:
# New as of 2025-09-03: Full pipeline with RandomForest
# Preprocessing feeds a random forest; random_state is fixed so repeated runs
# build the same forest.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])


In [106]:
# Train model
# `model` is the preprocessing + RandomForest pipeline built in the previous cell
# (the verbatim copy of that definition that used to live here was redundant).
# Fit it on the full labelled training set.
model.fit(X, y)

Model Evaluation

In [107]:

# ------------------------------------------------------------
# New as of 2025-09-03: Stratified train/validation split for evaluation
# ------------------------------------------------------------
# Hold out 20% of the rows, preserving the Good/Bad ratio in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Evaluate on an unfitted copy of the pipeline. Calling model.fit(X_train, y_train)
# here would refit `model` itself and discard the fit on the full training set, so
# the later test-set predictions would silently come from a model trained on only
# 80% of the data. clone() copies the hyper-parameters (including random_state),
# not the fitted state.
eval_model = clone(model)
eval_model.fit(X_train, y_train)

# Predict on validation split
y_val_pred = eval_model.predict(X_val)


In [112]:

# ------------------------------------------------------------
# New as of 2025-09-03: Confusion Matrix (print + plot)
# ------------------------------------------------------------
# NOTE(review): despite the header, this cell only prints the matrix; no plot is drawn.
# labels=[0, 1] fixes the row/column order to (Bad, Good), so "positive" below means
# class 1 (Good) and "negative" means class 0 (Bad).
cm = confusion_matrix(y_val, y_val_pred, labels=[0, 1])
print("Confusion Matrix (rows = Actual, cols = Predicted):\n", cm)

# Access individual elements of the confusion matrix
# ravel() flattens the 2x2 matrix row by row: [actual 0 -> pred 0, actual 0 -> pred 1,
# actual 1 -> pred 0, actual 1 -> pred 1].
tn, fp, fn, tp = cm.ravel()

# Print the instances
# A "False Positive" here is a Bad loan that was predicted as Good.
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")


Confusion Matrix (rows = Actual, cols = Predicted):
 [[  5 185]
 [  8 676]]
True Negatives: 5
False Positives: 185
False Negatives: 8
True Positives: 676


In [113]:
# ------------------------------------------------------------
# New as of 2025-09-03: Classification Report
# ------------------------------------------------------------
# target_names are applied in sorted label order, so 'Bad' names class 0 and 'Good'
# names class 1, matching the LabelEncoder mapping used for the target.
# In the recorded output, recall on 'Bad' is 0.03: nearly every validation loan is predicted 'Good'.
print("\nClassification Report (Validation Set):")
print(classification_report(y_val, y_val_pred, target_names=['Bad', 'Good']))



Classification Report (Validation Set):
              precision    recall  f1-score   support

         Bad       0.38      0.03      0.05       190
        Good       0.79      0.99      0.88       684

    accuracy                           0.78       874
   macro avg       0.58      0.51      0.46       874
weighted avg       0.70      0.78      0.70       874



In [None]:
# New as of 2025-09-03: Predict on the test set using pipeline
# test_df has the same four feature columns as X after the earlier column drop.
# The result holds the encoded labels (0 = Bad, 1 = Good) and reflects whichever
# fit of `model` ran most recently.
predictions = model.predict(test_df)

In [114]:
# Inspect the predicted class codes (NumPy abbreviates long arrays)
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [None]:
# Save predictions to a CSV file
# Pairs the customer ids captured before the column drop with the 0/1 predictions;
# the test rows are not reordered in between, so the two line up row by row.
# index=False keeps the pandas row index out of the file.
output_df = pd.DataFrame({'customerid': test_customer_ids, 'Good_Bad_flag': predictions})
output_df.to_csv("loan_default_predictions.csv", index=False)


In [None]:
# Confirmation message once the CSV has been written by the previous cell
print("Loan default predictions saved to loan_default_predictions.csv.")
