In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.metrics import classification_report
import kagglehub
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Fetch the Kaggle loan-default dataset; `path` is the local directory that holds its files.
path = kagglehub.dataset_download("kmldas/loan-default-prediction")
print("Path to dataset files:", path)

# Read the dataset's CSV file from that directory into a DataFrame.
csv_path = f"{path}/Default_Fin.csv"
df = pd.read_csv(csv_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/kmldas/loan-default-prediction?dataset_version_number=2...


100%|██████████████████████████████| 109k/109k [00:00<00:00, 402kB/s]

Extracting files...
Path to dataset files: /home/ivan/.cache/kagglehub/datasets/kmldas/loan-default-prediction/versions/2





In [4]:
# Dataset dimensions as (rows, columns)
print(df.shape)

# Preview of the first five records
display(df.head())

# Non-null counts and dtypes per column, followed by the plain dtype listing
df.info()
print(df.dtypes)

# Missing values per column: absolute count and fraction of rows.
# The NaN mask is computed once and reduced twice.
na_mask = df.isna()
missing_count, missing_share = na_mask.sum(), na_mask.mean()
display(
    pd.DataFrame(
        {
            "missing_count": missing_count,
            "missing_share": missing_share,
        }
    )
)

# Summary statistics for every column, regardless of dtype
display(df.describe(include="all"))

(10000, 5)


Unnamed: 0,Index,Employed,Bank Balance,Annual Salary,Defaulted?
0,1,1,8754.36,532339.56,0
1,2,0,9806.16,145273.56,0
2,3,1,12882.6,381205.68,0
3,4,1,6351.0,428453.88,0
4,5,1,9427.92,461562.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Index          10000 non-null  int64  
 1   Employed       10000 non-null  int64  
 2   Bank Balance   10000 non-null  float64
 3   Annual Salary  10000 non-null  float64
 4   Defaulted?     10000 non-null  int64  
dtypes: float64(2), int64(3)
memory usage: 390.8 KB
Index              int64
Employed           int64
Bank Balance     float64
Annual Salary    float64
Defaulted?         int64
dtype: object


Unnamed: 0,missing_count,missing_share
Index,0,0.0
Employed,0,0.0
Bank Balance,0,0.0
Annual Salary,0,0.0
Defaulted?,0,0.0


Unnamed: 0,Index,Employed,Bank Balance,Annual Salary,Defaulted?
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,0.7056,10024.498524,402203.782224,0.0333
std,2886.89568,0.455795,5804.579486,160039.674988,0.179428
min,1.0,0.0,0.0,9263.64,0.0
25%,2500.75,0.0,5780.79,256085.52,0.0
50%,5000.5,1.0,9883.62,414631.74,0.0
75%,7500.25,1.0,13995.66,525692.76,0.0
max,10000.0,1.0,31851.84,882650.76,1.0


In [5]:
# Fit a logistic-regression default classifier: split the data, grid-search the
# regularisation settings with 5-fold CV, then report metrics on the held-out test set.

RANDOM_STATE = 42  # one seed shared by the split and the stochastic solvers

# 'Index' is only a 1..N row identifier (see df.describe() above), not a borrower
# attribute, so it is removed together with the target instead of being used as a feature.
X = df.drop(columns=['Index', 'Defaulted?'])
y = df['Defaulted?']

# Only ~3.3% of rows are defaults; stratifying keeps that ratio the same in the
# train and test splits, so the test metrics are not distorted by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

# liblinear and saga shuffle the data internally; a fixed random_state makes the
# grid-search result reproducible across reruns.
# NOTE(review): the features are on very different scales (Bank Balance ~1e4,
# Annual Salary ~1e5). scikit-learn recommends standardising features for saga;
# consider wrapping the model in a StandardScaler pipeline.
log_reg = LogisticRegression(max_iter=10000, random_state=RANDOM_STATE)

# Two sub-grids because not every solver supports every penalty:
# lbfgs handles only l2 here, saga is paired with l1, liblinear works with both.
param_grid = [
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    {
        'penalty': ['l1'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    }
]

# Macro-averaged F1 weights both classes equally, so the rare "default" class
# counts as much as the majority class when picking hyper-parameters.
grid = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1
)

grid.fit(X_train, y_train)
print(grid.best_params_)

# Best configuration found by the search, used for the final evaluation.
best_model = grid.best_estimator_

# Evaluate once on the untouched test split.
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))