In [49]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Load the data
data = pd.read_csv("Loans_datasets.csv")

# Show first few rows
display(data.head())

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [50]:
label_enc = LabelEncoder()

In [51]:
data['Gender'] = label_enc.fit_transform(data['Gender'])
data['Married'] = label_enc.fit_transform(data['Married'])
data['Education'] = label_enc.fit_transform(data['Education'])
data['Self_Employed'] = label_enc.fit_transform(data['Self_Employed'])
data['Property_Area'] = label_enc.fit_transform(data['Property_Area'])

In [52]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,,360.0,1.0,2,Y
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,Y
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,Y
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,Y


In [53]:
# Fill missing values (updated for warning message)
data.ffill(inplace=True)

# Convert target variable to binary
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})

# Drop Loan_ID (not useful for prediction)
data.drop('Loan_ID', axis=1, inplace=True)

# One-hot encode categorical variables
data = pd.get_dummies(data)

In [54]:
from sklearn.model_selection import train_test_split

X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [55]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize and train the model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

import warnings
warnings.filterwarnings('ignore')

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [60]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Splitting dataset (assumes you already have X and y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating the model
model = xgb.XGBClassifier(eval_metric='logloss')  # or add use_label_encoder=False if needed

# Fit (train) the model
model.fit(X_train, y_train)

# Now you can predict
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7723577235772358


In [61]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ROC-AUC score (if you care about how well probabilities separate the classes)
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")

Confusion Matrix:
 [[21 22]
 [ 6 74]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.49      0.60        43
           1       0.77      0.93      0.84        80

    accuracy                           0.77       123
   macro avg       0.77      0.71      0.72       123
weighted avg       0.77      0.77      0.76       123

ROC-AUC Score: 0.74


In [62]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}

grid = GridSearchCV(xgb.XGBClassifier(eval_metric='logloss'),
                    param_grid=params, cv=3, scoring='accuracy')

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

# Train model with best parameters
best_model = grid.best_estimator_

Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best cross-validation accuracy: 0.7963115367349993


In [63]:
import matplotlib.pyplot as plt

xgb.plot_importance(model)
plt.title("Feature Importance")
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-validated accuracy scores:", scores)
print("Mean cross-validated accuracy: {:.2f}".format(scores.mean()))

In [None]:
#Integrating Loan Prediction Model to Web App | Diango

In [None]:
import joblib

joblib.dump(model, 'loan_model.pkl')

**Confusion Matrix**

***This tells you how many predictions were:***

In [None]:
df['Loan_Status'].value_counts()

In [None]:
from xgboost import XGBClassifier

# Set scale_pos_weight to balance classes (negatives/positives)
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

model = XGBClassifier(scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)