In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    StandardScaler
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

In [15]:
df = pd.read_excel('C:/Users/Huawei/Downloads/loan_prediction (1).xlsx')

In [16]:
print("First rows of dataset:")
print(df.head())

First rows of dataset:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N

In [17]:
print("\nData info and missing values:")
print(df.info())
print(df.isnull().sum())


Data info and missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
Loan_ID               0
Gender               13
Married               3
Depe

In [18]:
target_col = 'Loan_Status'
X = df.drop(columns=[target_col])
y = df[target_col].map({'Y': 1, 'N': 0})

In [19]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

In [20]:
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # Cast everything to uniform strings to avoid int/str mix:
    ('to_str',   FunctionTransformer(lambda X: X.astype(str), validate=False)),
    ('onehot',   OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

### Modellər

In [21]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Random Forest (Bagging)': RandomForestClassifier(random_state=42, n_estimators=100),
    'AdaBoost (Boosting)': AdaBoostClassifier(
        random_state=42,
        n_estimators=50,
        algorithm='SAMME'
    ),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'Linear Regression': LinearRegression()
}

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier',    model)
    ])

    pipe.fit(X_train, y_train)

    raw_pred = pipe.predict(X_test)
    if raw_pred.dtype.kind == 'f':
        y_pred = (raw_pred >= 0.5).astype(int)
    else:
        y_pred = raw_pred

    if hasattr(model, 'predict_proba'):
        y_prob = pipe.predict_proba(X_test)[:, 1]
    else:
        y_prob = np.zeros_like(y_pred, dtype=float)

    results[name] = {
        'Accuracy' : str(accuracy_score(y_test,  y_pred)),
        'Precision': str(precision_score(y_test, y_pred)),
        'Recall'   : str(recall_score(y_test,    y_pred)),
        'F1 Score' : str(f1_score(y_test,        y_pred)),
        'ROC AUC'  : str(roc_auc_score(y_test,   y_prob))
    }

In [25]:
print("\nModel Evaluation Results:")
for name, metrics in results.items():
    print(f"\n{name}:")
    for metric_name, value in metrics.items():
        if isinstance(value, (int, float, np.number)):
            print(f"  {metric_name}: {value:.4f}")
        else:
            print(f"  {metric_name}: {value}")


Model Evaluation Results:

Logistic Regression:
  Accuracy: 0.7886178861788617
  Precision: 0.7596153846153846
  Recall: 0.9875
  F1 Score: 0.8586956521739131
  ROC AUC: 0.7494186046511628

KNN:
  Accuracy: 0.7642276422764228
  Precision: 0.7524752475247525
  Recall: 0.95
  F1 Score: 0.8397790055248618
  ROC AUC: 0.6866279069767441

Random Forest (Bagging):
  Accuracy: 0.7886178861788617
  Precision: 0.7596153846153846
  Recall: 0.9875
  F1 Score: 0.8586956521739131
  ROC AUC: 0.7441860465116279

AdaBoost (Boosting):
  Accuracy: 0.7886178861788617
  Precision: 0.7596153846153846
  Recall: 0.9875
  F1 Score: 0.8586956521739131
  ROC AUC: 0.6992732558139535

Gradient Boosting:
  Accuracy: 0.7886178861788617
  Precision: 0.7596153846153846
  Recall: 0.9875
  F1 Score: 0.8586956521739131
  ROC AUC: 0.6947674418604651

Linear Regression:
  Accuracy: 0.7886178861788617
  Precision: 0.7596153846153846
  Recall: 0.9875
  F1 Score: 0.8586956521739131
  ROC AUC: 0.5


In [27]:
print("\nDetailed Classification Reports:")
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)

    if name == 'Linear Regression':
        y_pred = (pipeline.predict(X_test) >= 0.5).astype(int)
    else:
        y_pred = pipeline.predict(X_test)

    print(f"\n{name} classification report:")
    print(classification_report(y_test, y_pred, target_names=['N', 'Y']))


Detailed Classification Reports:

Logistic Regression classification report:
              precision    recall  f1-score   support

           N       0.95      0.42      0.58        43
           Y       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123


KNN classification report:
              precision    recall  f1-score   support

           N       0.82      0.42      0.55        43
           Y       0.75      0.95      0.84        80

    accuracy                           0.76       123
   macro avg       0.79      0.68      0.70       123
weighted avg       0.78      0.76      0.74       123


Random Forest (Bagging) classification report:
              precision    recall  f1-score   support

           N       0.95      0.42      0.58        43
           Y       0.76      0.99      0.86        80

    accuracy                