In [336]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split , cross_val_score , RandomizedSearchCV 
from sklearn.linear_model import LogisticRegression 
from sklearn import svm 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score 
import joblib

In [337]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
3,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
4,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y


In [338]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            381 non-null    object 
 1   Gender             376 non-null    object 
 2   Married            381 non-null    object 
 3   Dependents         373 non-null    object 
 4   Education          381 non-null    object 
 5   Self_Employed      360 non-null    object 
 6   ApplicantIncome    381 non-null    int64  
 7   CoapplicantIncome  381 non-null    float64
 8   LoanAmount         381 non-null    float64
 9   Loan_Amount_Term   370 non-null    float64
 10  Credit_History     351 non-null    float64
 11  Property_Area      381 non-null    object 
 12  Loan_Status        381 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 38.8+ KB


In [339]:
# Handling Missing Value 
df.isnull().sum() 


Loan_ID               0
Gender                5
Married               0
Dependents            8
Education             0
Self_Employed        21
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     11
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [340]:
df.isnull().mean()*100 


Loan_ID              0.000000
Gender               1.312336
Married              0.000000
Dependents           2.099738
Education            0.000000
Self_Employed        5.511811
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     2.887139
Credit_History       7.874016
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [341]:
df = df.dropna(subset= ['Gender', 'Dependents','Loan_Amount_Term'])


In [342]:
df.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        20
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       30
Property_Area         0
Loan_Status           0
dtype: int64

In [343]:
df['Self_Employed'].unique()

array(['No', 'Yes', nan], dtype=object)

In [344]:
df['Self_Employed'].mode()

0    No
Name: Self_Employed, dtype: object

In [345]:
df['Credit_History'].unique()

array([ 1., nan,  0.])

In [346]:
df['Credit_History'].mean()

np.float64(0.8597560975609756)

In [347]:
df['Self_Employed'].fillna(df['Self_Employed'].mode(0)[0],inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Self_Employed'].fillna(df['Self_Employed'].mode(0)[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)


In [348]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [349]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [350]:
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [351]:
df['Dependents'].unique()

array(['1', '0', '2', '3+'], dtype=object)

In [352]:
df['Dependents'].replace('3+' , '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+' , '4', inplace=True)


In [353]:
df['Married'].unique()

array(['Yes', 'No'], dtype=object)

In [354]:
encoding = { 
    'Gender' : {'Male': 1, 'Female': 0}, 
    'Married' : {'Yes': 1 , 'No': 0}, 
    'Dependents' : {'0': 0, '1': 1, '2':2, '4':4},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed' : {'Yes': 1 , 'No':0},
    'Property_Area' : {'Rural': 0 , 'Semiurban': 2 , 'Urban': 1},
    'Loan_Status' : {'Y': 1, 'N': 0}
 }

In [355]:
df.replace(encoding , inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358 entries, 0 to 380
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            358 non-null    object 
 1   Gender             358 non-null    int64  
 2   Married            358 non-null    int64  
 3   Dependents         358 non-null    int64  
 4   Education          358 non-null    int64  
 5   Self_Employed      358 non-null    int64  
 6   ApplicantIncome    358 non-null    int64  
 7   CoapplicantIncome  358 non-null    float64
 8   LoanAmount         358 non-null    float64
 9   Loan_Amount_Term   358 non-null    float64
 10  Credit_History     358 non-null    float64
 11  Property_Area      358 non-null    int64  
 12  Loan_Status        358 non-null    int64  
dtypes: float64(4), int64(8), object(1)
memory usage: 39.2+ KB


  df.replace(encoding , inplace=True)


In [356]:
print(df.columns.tolist())


['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']


In [357]:
df.columns = df.columns.str.strip()  # removes extra spaces in column names


In [358]:
print(X.dtypes)


Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object


In [359]:
# Clean column names
df.columns = df.columns.str.strip()

# Drop Loan_ID safely
if 'Loan_ID' in df.columns:
    df = df.drop('Loan_ID', axis=1)

# Recreate X and y to make sure they reflect the clean data
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

# Confirm all numeric
print(X.dtypes)


Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
dtype: object


In [360]:
X =df.drop('Loan_Status', axis = 1)
y = df['Loan_Status']

In [361]:
num_cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [362]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,1,1,0,0.71163,0.092069,0.80598,0.285826,1.0,0
1,1,1,0,1,1,-0.398856,-0.539332,-1.350425,0.285826,1.0,1
2,1,1,0,0,0,-0.691384,0.447965,0.527735,0.285826,1.0,1
3,1,0,0,1,0,1.705666,-0.539332,1.25813,0.285826,1.0,1
4,1,1,0,0,0,-0.866761,0.095418,-0.341784,0.285826,1.0,1


In [363]:
def evaluate_model(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)   # âœ… consistent variable name
    print(f"{model.__class__.__name__} - Accuracy: {accuracy:.2f}, Cross-val-score: {avg_cross_val:.2f}")
    return avg_cross_val


In [364]:
model = {
    LogisticRegression(),
    svm.SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
}

In [365]:
model_score = {model.__class__.__name__: evaluate_model(model) for model in models}


LogisticRegression - Accuracy: 0.85, Cross-val-score: 0.84
SVC - Accuracy: 0.85, Cross-val-score: 0.83
DecisionTreeClassifier - Accuracy: 0.86, Cross-val-score: 0.78
RandomForestClassifier - Accuracy: 0.83, Cross-val-score: 0.83
GradientBoostingClassifier - Accuracy: 0.86, Cross-val-score: 0.82


In [366]:
def tune_model(model, param_grid):
    tuner = RandomizedSearchCV(model, param_grid, cv=5, n_iter=20, verbose= True, random_state= 42)
    tuner.fit(X, y)
    print(f"Best Score for {model.__class__.__name__}: {tuner.best_score_: .2f}")
    print(f"Best Parameter for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

In [367]:
log_reg_grid = {'C': np.logspace(-4,4,20), "solver": ["liblinear"]}
svc_grid = {'C':[0.25,0.50,0.75,1], "kernel": ['linear']}

rf_grid = {
    'n_estimators': np.arange(10,1000,10),
    'max_features': ['log2','sqrt'],
    'max_depth':  [None , 3,5,10,20,30],
    'min_samples_split': [2,5,20,50,100],
    'min_samples_leaf': [1,2,5,1]

}

In [368]:
best_log_reg = tune_model(LogisticRegression(), log_reg_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for LogisticRegression:  0.84
Best Parameter for LogisticRegression: {'solver': 'liblinear', 'C': np.float64(1.623776739188721)}


In [369]:
best_svc_reg = tune_model(svm.SVC(), svc_grid)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Score for SVC:  0.84
Best Parameter for SVC: {'kernel': 'linear', 'C': 0.25}




In [372]:
best_rf = tune_model(RandomForestClassifier(), rf_grid)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Score for RandomForestClassifier:  0.84
Best Parameter for RandomForestClassifier: {'n_estimators': np.int64(550), 'min_samples_split': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10}


In [373]:
final_model = best_rf

In [374]:
joblib.dump(final_model , 'loan_status_predictor.pkl')

['loan_status_predictor.pkl']

In [376]:
# Prediction System
 
sample_data = pd.DataFrame({
    'Gender': [1],
    'Married': [1],
    'Dependents': [2],
    'Education': [0],
    'Self_Employed': [0],
    'ApplicantIncome': [2889],
    'CoapplicantIncome': [0.0],
    'LoanAmount': [45],
    'Loan_Amount_Term': [180],
    'Credit_History': [1],
    'Property_Area': [1]

})

sample_data[num_cols] = scaler.transform(sample_data[num_cols])
loaded_model = joblib.load('loan_status_predictor.pkl')
prediction = loaded_model.predict(sample_data)

result = "Loan Approved" if prediction[0] == 1 else "Loan Not Approved"
print(f"\nPrediction Result: {result}")


Prediction Result: Loan Approved
