In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("Credit (1).csv")
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,duration,campaign,approval
0,30.0,unemployed,married,primary,1787,no,79.0,1,no
1,33.0,services,married,secondary,4789,yes,220.0,1,no
2,35.0,management,single,tertiary,1350,yes,185.0,1,no
3,30.0,management,married,tertiary,1476,yes,199.0,4,no
4,59.0,blue-collar,married,secondary,0,yes,226.0,1,no


In [4]:
print("Dataset shape:", df.shape)
print(df.info())
print(df.isnull().sum())  # Check for nulls

Dataset shape: (4521, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        4519 non-null   float64
 1   job        4515 non-null   object 
 2   marital    4518 non-null   object 
 3   education  4517 non-null   object 
 4   balance    4521 non-null   int64  
 5   housing    4518 non-null   object 
 6   duration   4519 non-null   float64
 7   campaign   4521 non-null   int64  
 8   approval   4521 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 318.0+ KB
None
age          2
job          6
marital      3
education    4
balance      0
housing      3
duration     2
campaign     0
approval     0
dtype: int64


In [5]:
df['housing'].fillna(df['housing'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['housing'].fillna(df['housing'].mode()[0], inplace=True)


In [6]:
df['housing'].isnull().sum()

np.int64(0)

In [7]:
# Step 4: Handle missing/null values safely
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])  # use assignment
    else:
        df[col] = df[col].fillna(df[col].median())  # use assignment


In [8]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
duration     0
campaign     0
approval     0
dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder

# Step 6: Encode categorical (object) columns using LabelEncoder
le = LabelEncoder()

encoded_columns = []  # to keep track of which columns were encoded and what values they had

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])
        encoded_columns.append((col, dict(zip(le.classes_, le.transform(le.classes_)))))

# Display encoded values for each column
for col, mapping in encoded_columns:
    print(f"Column '{col}':")
    for original, encoded in mapping.items():
        print(f"  {original} -> {encoded}")
    print()


Column 'job':
  admin. -> 0
  blue-collar -> 1
  entrepreneur -> 2
  housemaid -> 3
  management -> 4
  retired -> 5
  self-employed -> 6
  services -> 7
  student -> 8
  technician -> 9
  unemployed -> 10
  unknown -> 11

Column 'marital':
  divorced -> 0
  married -> 1
  single -> 2

Column 'education':
  primary -> 0
  secondary -> 1
  tertiary -> 2
  unknown -> 3

Column 'housing':
  no -> 0
  yes -> 1

Column 'approval':
  no -> 0
  yes -> 1



In [10]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,duration,campaign,approval
0,30.0,10,1,0,1787,0,79.0,1,0
1,33.0,7,1,1,4789,1,220.0,1,0
2,35.0,4,2,2,1350,1,185.0,1,0
3,30.0,4,1,2,1476,1,199.0,4,0
4,59.0,1,1,1,0,1,226.0,1,0


In [11]:
df.to_csv("cleaned_credit.csv")

In [12]:
X = df.drop('approval', axis=1)  # Replace 'default' with your actual target column
y = df['approval']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
y_pred = model.predict(X_test)

In [16]:
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc*100:.2f}%")

Accuracy: 88.73%


In [17]:
import joblib
joblib.dump(model, "loan_approval_model.pkl")

['loan_approval_model.pkl']