In [86]:
# Import libraries
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    accuracy_score,roc_auc_score
)
import warnings
warnings.filterwarnings('ignore')

In [87]:
def load_and_explore_dataset(filepath):
    """Read the loan CSV and print a quick exploratory summary.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as loaded; nothing is modified here.

    Raises
    ------
    KeyError
        If the loaded file has no ``loan_status`` column.
    """
    print('=' * 60)
    print("LOAD AND EXPLORE DATASET")
    print('=' * 60)

    df = pd.read_csv(filepath)

    print('shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing values:')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive Stats:')
    print(df.describe())
    print('\nDataset info:')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    # "\L" and "\l" are not escape sequences: Python keeps the backslash and
    # warns about it. A leading newline ("\n") is what the other headings use.
    print('\nLoan Status Distribution:')
    print(df['loan_status'].value_counts())
    print('\nloan Status Percentage Distribution:')
    print(df['loan_status'].value_counts(normalize=True) * 100)

    return df

In [88]:
def identify_features(df):
    """Return the column groups used to build the loan-status model.

    Parameters
    ----------
    df : pandas.DataFrame
        Accepted for call-site symmetry with the other steps; it is not read.

    Returns
    -------
    tuple of (list, list, list)
        ``(numerical_features, categorical_features, features_to_drop)``.
    """
    print('=' * 60)
    print("IDETIFYING  OF FEATURES ")
    print('=' * 60)
    # Columns removed from the frame before modelling (judged not useful).
    features_to_drop = [
        'person_age',
        'person_gender',
        'person_education',
    ]

    # 'person_income' is listed explicitly: it is not dropped above, and any
    # column left out of both model lists would be discarded silently by a
    # ColumnTransformer using its default remainder='drop', even though the
    # prediction step still demands it as an input.
    numerical_features = [
        'person_income',
        'person_emp_exp',
        'loan_amnt',
        'loan_int_rate',
        'loan_percent_income',
        'cb_person_cred_hist_length',
        'credit_score',
    ]

    categorical_features = [
        'person_home_ownership',
        'loan_intent',
        'previous_loan_defaults_on_file',
    ]
    print('\nFeatueres to drop', features_to_drop)
    print('\nNumerical Features', numerical_features)
    print('\nCategorical Features', categorical_features)
    return numerical_features, categorical_features, features_to_drop

In [89]:
def prepare_data(df, numerical_features, categorical_features, features_to_drop):
    """Separate the modelling frame into a feature matrix and a target.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    numerical_features, categorical_features : list
        Accepted but not read by this function.
    features_to_drop : list
        Columns removed before the feature/target split.

    Returns
    -------
    tuple
        ``(X, y, columns)`` where ``X`` is every remaining column except
        ``loan_status``, ``y`` is the ``loan_status`` column and ``columns``
        is the list of column names of ``X``.
    """
    rule = '=' * 60
    print(rule)
    print("PREPARING  DATASET ")
    print(rule)

    target_name = 'loan_status'

    # Discard the unwanted columns first, then peel the target off.
    trimmed = df.drop(columns=features_to_drop)
    y = trimmed[target_name]
    X = trimmed.drop(columns=[target_name])
    column_names = list(X.columns)

    print("\nFeatures Shape", X.shape)
    print("\nTarget Shape", y.shape)
    print("\nFeature columns list", column_names)

    return X, y, column_names

In [90]:
def create_preprocessing_pipeline(numerical_features, categorical_features):
    """Build the column-wise preprocessing step for the loan model.

    Parameters
    ----------
    numerical_features : list
        Columns standardised with ``StandardScaler``.
    categorical_features : list
        Columns expanded with ``OneHotEncoder``.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer applying both sub-pipelines.
    """
    print('=' * 60)
    print("CREATING PREPROCESSING  PIPELINE ")
    print('=' * 60)

    # numerical features pipeline
    numerical_pipeline = Pipeline([
        ('scaler', StandardScaler())
    ])

    # categorical features pipeline
    # No category is dropped: with handle_unknown='ignore' an unseen value is
    # encoded as all zeros, and combining that with drop='first' would make it
    # indistinguishable from the dropped baseline category (scikit-learn only
    # signals this with a warning).
    categorical_pipeline = Pipeline([
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ])

    # column preprocessing
    preprocessor = ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

    return preprocessor

In [91]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into stratified train and test partitions.

    Parameters
    ----------
    X, y
        Feature matrix and target, forwarded to ``train_test_split``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    rule = '=' * 60
    for heading_line in (rule, "SPLITTING DATA INTO TRAIN/TEST", rule):
        print(heading_line)

    # Stratifying on y keeps the class proportions equal in both partitions.
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = partitions

    summary = (
        ('\nTrain set size', X_train.shape[0]),
        ('\nTest set size', X_test.shape[0]),
        ('\nTrain Loan Status Distribution', y_train.value_counts()),
        ('\nTest Loan Status Distribution', y_test.value_counts()),
    )
    for label, value in summary:
        print(label, value)

    return X_train, X_test, y_train, y_test

In [92]:
def create_model_pipeline(preprocessor):
    """Chain the preprocessing step with a logistic-regression classifier.

    Parameters
    ----------
    preprocessor
        Transformer placed as the first pipeline step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessor'`` and ``'classifier'``.
    """
    rule = '=' * 60
    for heading_line in (rule, "CREATING MODEL PIPELINE", rule):
        print(heading_line)

    # class_weight='balanced' reweights classes inversely to their frequency.
    classifier = LogisticRegression(
        solver='lbfgs',
        class_weight='balanced',
        max_iter=1000,
        random_state=42,
    )
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    model_pipeline = Pipeline(steps)

    print('\nModel pipeline created succesfully')

    return model_pipeline

In [93]:
def train_model(model_pipeline, X_train, y_train):
    """Fit the pipeline and print its ten largest coefficients.

    Parameters
    ----------
    model_pipeline
        Pipeline exposing ``fit`` and ``named_steps`` with the entries
        ``'preprocessor'`` and ``'classifier'``.
    X_train, y_train
        Training data handed to ``fit``.

    Returns
    -------
    The same ``model_pipeline`` object, after fitting.

    Notes
    -----
    The coefficient report is best effort: any exception raised while
    assembling it is printed and does not interrupt training.
    """
    rule = '=' * 60
    print(rule)
    print("TRAIN MODEL")
    print(rule)

    model_pipeline.fit(X_train, y_train)
    print('\nModel trained successfully')

    try:
        steps = model_pipeline.named_steps
        column_transformer = steps['preprocessor']

        # Each fitted transformer entry is (name, transformer, columns); the
        # numeric entry is expected first and the categorical one second.
        fitted_entries = column_transformer.transformers_
        numeric_columns = fitted_entries[0][2]
        categorical_columns = fitted_entries[1][2]

        # Expanded column names produced by the one-hot encoder.
        encoder = column_transformer.named_transformers_['cat'].named_steps['onehot']
        encoded_columns = encoder.get_feature_names_out(categorical_columns)

        feature_names = [*numeric_columns, *encoded_columns]

        weights = steps['classifier'].coef_[0]
        print('\nTop  most importance fatures(by coefficent magnitude)')
        table = pd.DataFrame({
            'feature_columns': feature_names,
            'coefficient': weights,
        })
        ranked = table.sort_values('coefficient', key=abs, ascending=False)
        print(ranked.head(10).to_string(index=False))
    except Exception as e:
        print(f"Can not extract features name {e}")
    return model_pipeline

In [94]:
def evaluate_model(model_pipeline, X_train, X_test, y_train, y_test):
    """Score a fitted pipeline on the train and test partitions.

    Prints accuracy, ROC AUC and classification reports for both partitions,
    then a 5-fold cross-validated ROC AUC computed on the training data.

    Parameters
    ----------
    model_pipeline
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_train, X_test, y_train, y_test
        The partitions to evaluate on.

    Returns
    -------
    dict
        Accuracy and ROC AUC for both partitions, the predicted labels and
        positive-class probabilities, and the per-fold cross-validation
        scores. Keys: ``y_train_acc_score``, ``y_test_acc_Score``,
        ``y_train_roc_score``, ``y_test_roc_score``, ``y_train_pred``,
        ``y_test_pred``, ``y_train_proba``, ``y_test_proba``, ``cv_score``.
    """
    print('=' * 60)
    print("EVALUATING MODEL")
    print('=' * 60)

    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # Column 1 of predict_proba is the probability of the second class.
    y_train_proba = model_pipeline.predict_proba(X_train)[:, 1]
    y_test_proba = model_pipeline.predict_proba(X_test)[:, 1]

    # metrics -- always (y_true, y_pred), matching the scikit-learn signature
    y_train_acc_score = accuracy_score(y_train, y_train_pred)
    y_test_acc_Score = accuracy_score(y_test, y_test_pred)

    y_train_roc_score = roc_auc_score(y_train, y_train_proba)
    y_test_roc_score = roc_auc_score(y_test, y_test_proba)

    print("\nMETRICS EVALUATION")
    print(f"\nTrain accuracy score:, {float(y_train_acc_score):.4f}")
    print(f"\nTest accuracy score:, {float(y_test_acc_Score):.4f}")
    print(f"\nTrain ROC Score:, {float(y_train_roc_score):.4f}")
    print(f"\nTest ROC Score:, {float(y_test_roc_score):.4f}")

    # target_names are applied in sorted label order.
    # NOTE(review): assumes loan_status 0 = rejected and 1 = approved -- confirm.
    print("\n CLASSIFICATION OF TRAIN SIZE")
    print("\n Classification report\n", classification_report(y_train, y_train_pred, target_names=['Rejected', 'Approved']))
    print("\n CLASSIFICATION OF TEST SIZE")
    print("\n Classification report\n", classification_report(y_test, y_test_pred, target_names=['Rejected', 'Approved']))

    # cross validation score
    cv_score = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
    print('\nCross Validation (5-folds)')
    # The scorer is 'roc_auc', so the values are ROC AUC, not R2.
    print(f'ROC AUC scores: {cv_score}')
    print(f'Cross Validation mean: {cv_score.mean():.4f}')
    print(f"Cross Validation STD : {cv_score.std():.4f}")

    metrics = {
        "y_train_acc_score": y_train_acc_score,
        "y_test_acc_Score": y_test_acc_Score,
        "y_train_roc_score": y_train_roc_score,
        "y_test_roc_score": y_test_roc_score,
        "y_train_pred": y_train_pred,
        "y_test_pred": y_test_pred,
        'y_train_proba': y_train_proba,
        "y_test_proba": y_test_proba,
        "cv_score": cv_score,
    }
    return metrics

In [95]:
def save_model_artifact(model_pipeline, feature_columns):
    """Write the pipeline and its feature-column list to disk with joblib.

    Parameters
    ----------
    model_pipeline
        Object dumped to ``loan_status_model_pipeline.pkl``.
    feature_columns
        Object dumped to ``loan_feature_columns.pkl``.

    Both files are written relative to the current working directory.
    """
    rule = '=' * 60
    print(rule)
    print("SAVING MODEL ARTIFACT")
    print(rule)

    # (object to persist, destination file, confirmation message)
    artifacts = (
        (
            model_pipeline,
            'loan_status_model_pipeline.pkl',
            'Loan Status model pipeline saved successfully',
        ),
        (
            feature_columns,
            'loan_feature_columns.pkl',
            'Loan features columns saved successfully',
        ),
    )
    for payload, destination, confirmation in artifacts:
        joblib.dump(payload, destination)
        print(confirmation)

    print(rule)
    print(" MODEL ARTIFACT SAVED SUCCESSFULLY")
    print(rule)

In [96]:
def predict_loan_status(customer_data):
    """Predict the loan status for one applicant using the saved artifacts.

    Parameters
    ----------
    customer_data : dict
        Mapping of feature name to value for a single applicant. Every
        column in the saved feature list must be present; extra keys are
        ignored.

    Returns
    -------
    dict
        ``prediction`` ('Approved' or 'Rejected'), ``rejected_probability``,
        ``approved_probability`` and ``approval_rate`` ('High', 'Medium' or
        'Low').

    Raises
    ------
    ValueError
        If one or more required features are absent; the message lists all
        of them.
    """
    # NOTE: joblib.load unpickles arbitrary objects. Only load artifact files
    # that this workflow wrote itself, never files from an untrusted source.
    model_pipeline = joblib.load("loan_status_model_pipeline.pkl")
    feature_columns = joblib.load("loan_feature_columns.pkl")

    customer_df = pd.DataFrame([customer_data])

    # Report every missing column at once rather than one per attempt.
    missing = [feature for feature in feature_columns if feature not in customer_df.columns]
    if missing:
        raise ValueError(
            "Missing required feature(s): " + ", ".join(str(name) for name in missing)
        )

    # Select and order the columns exactly as they were at training time.
    customer_df = customer_df[feature_columns]

    prediction = model_pipeline.predict(customer_df)[0]
    # NOTE(review): indexes 0/1 assume the classifier's classes are [0, 1]
    # with 1 meaning approved -- confirm against the training labels.
    probability = model_pipeline.predict_proba(customer_df)[0]
    approved_probability = probability[1]

    if approved_probability > 0.6:
        approval_rate = 'High'
    elif approved_probability > 0.3:
        approval_rate = 'Medium'
    else:
        approval_rate = 'Low'

    result = {
        'prediction': 'Approved' if prediction == 1 else "Rejected",
        'rejected_probability': probability[0],
        'approved_probability': approved_probability,
        'approval_rate': approval_rate,
    }

    return result

In [97]:
def test_prediction():
    """Smoke-test the saved model with three hand-written applicants.

    Each applicant is passed to ``predict_loan_status`` and the resulting
    label, class probabilities and approval band are printed.
    """
    # Category values must match the training data's spelling and case
    # exactly; a misspelled value is treated as an unknown category by the
    # encoder instead of raising. The originals 'MORTGAGET',
    # 'DEBTCONSULTATION' and 'NO' are corrected below.
    # NOTE(review): corrected spellings are presumed from the dataset --
    # verify with df[column].unique().
    sample_customers = [
        {
            'person_income': 95000,
            'person_emp_exp': 10,
            'person_home_ownership': 'MORTGAGE',
            'loan_amnt': 5000,
            'loan_intent': 'VENTURE',
            'loan_int_rate': 7.5,
            'loan_percent_income': 0.05,
            'cb_person_cred_hist_length': 12,
            'credit_score': 800,
            'previous_loan_defaults_on_file': 'No',
        },
        {
            'person_income': 25000,
            'person_emp_exp': 1,
            'person_home_ownership': 'RENT',
            'loan_amnt': 18000,
            'loan_intent': 'MEDICAL',
            'loan_int_rate': 16.0,
            'loan_percent_income': 0.72,
            'cb_person_cred_hist_length': 2,
            'credit_score': 520,
            'previous_loan_defaults_on_file': 'Yes',
        },
        {
            'person_income': 50000,
            'person_emp_exp': 4,
            'person_home_ownership': 'OWN',
            'loan_amnt': 12000,
            'loan_intent': 'DEBTCONSOLIDATION',
            'loan_int_rate': 11.5,
            'loan_percent_income': 0.24,
            'cb_person_cred_hist_length': 6,
            'credit_score': 670,
            'previous_loan_defaults_on_file': 'No',
        },
    ]

    for customer_data in sample_customers:
        result = predict_loan_status(customer_data)
        print("Loan Status:", result['prediction'])
        print("Rejection  Probability:", result['rejected_probability'])
        print("Approved Probability:", result['approved_probability'])
        print("Approval rate:", result['approval_rate'])


In [98]:
def main(filepath='loan_data.csv'):
    """Run the whole workflow: load, prepare, train, evaluate, save, predict.

    Parameters
    ----------
    filepath : str or path-like, default 'loan_data.csv'
        CSV file to train on. The default keeps ``main()`` behaving as before.

    Returns
    -------
    dict
        The metrics dictionary produced by ``evaluate_model``, so a caller
        can inspect the scores instead of reading them from printed output.
    """
    df = load_and_explore_dataset(filepath)
    numerical_features, categorical_features, features_to_drop = identify_features(df)
    X, y, feature_columns = prepare_data(df, numerical_features, categorical_features, features_to_drop)
    preprocessor = create_preprocessing_pipeline(numerical_features, categorical_features)
    X_train, X_test, y_train, y_test = split_data(X, y)
    model_pipeline = create_model_pipeline(preprocessor)
    model_pipeline = train_model(model_pipeline, X_train, y_train)
    metrics = evaluate_model(model_pipeline, X_train, X_test, y_train, y_test)
    save_model_artifact(model_pipeline, feature_columns)
    # Runs after saving because the prediction helper reloads the artifacts
    # from disk.
    test_prediction()
    return metrics

In [99]:
# Run the full workflow only when this code is executed directly rather than
# imported as a module.
if __name__ == "__main__" :
    main()

LOAD AND EXPLORE DATASET
shape of the dataset:
(45000, 14)

Check for missing values:
person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

First five rows:
   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0    