In [1]:
import flash as fz
import joblib
import klib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns
import toml
import yaml
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler,
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'toml'

# Feature construction

- It looks like people with a co-applicant income of 0 don't have a co-applicant. So, we should create a new feature called 'has_coapplicant'. For this feature, set the value to 'no' for individuals with a co-applicant income of 0, and 'yes' for those with a non-zero co-applicant income.

In [None]:
df['has_coapplicant'] = np.where(df['coapplicant_income'] == 0, 'no', 'yes')

# Test
df['has_coapplicant']

In [None]:
# Register the newly created feature under its feature type (categorical).
# The membership check keeps this cell idempotent: re-running it must not add a
# second 'has_coapplicant' entry, which would duplicate the column wherever
# df[cat_cols] is selected.
if 'has_coapplicant' not in cat_cols:
    cat_cols.append('has_coapplicant')

# Test
cat_cols

## EDA

### Univariate analysis

In [None]:
# Statistical measures
df[['has_coapplicant']].describe().T

In [None]:
# Countplot
sns.countplot(x=df['has_coapplicant'])
plt.show()

### Bivariate analysis

#### Features

##### Categorical - Categorical

In [None]:
# Heatmap
fz.crosstab_heatmap_viz(df, cat_cols, ['has_coapplicant'], 'both')

##### Numerical - Categorical

In [None]:
# Point plot
fig, axs = fz.num_cat_viz(df, num_cols, 'has_coapplicant', kind='point')
fig

#### Target

In [None]:
# Heatmap
fz.crosstab_heatmap_viz(df, ['loan_status'], ['has_coapplicant'], 'both')

# Feature transformation

In [None]:
transformed_data = fz.feature_transform(df[num_cols])

## applicant_income

In [None]:
# Histogram
fig, axs = fz.feature_transform_viz(df['applicant_income'], transformed_data)
fig

In [None]:
# QQ Plot
fig, axs = fz.feature_transform_viz(df['applicant_income'], transformed_data, kind='qq')
fig

## coapplicant_income

In [None]:
# Histogram
fig, axs = fz.feature_transform_viz(df['coapplicant_income'], transformed_data)
fig

In [None]:
# QQ Plot
fig, axs = fz.feature_transform_viz(df['coapplicant_income'], transformed_data, kind='qq')
fig

## loan_amount

In [None]:
# Histogram
fig, axs = fz.feature_transform_viz(df['loan_amount'], transformed_data)
fig

In [None]:
# QQ Plot
fig, axs = fz.feature_transform_viz(df['loan_amount'], transformed_data, kind='qq')
fig

## Conclusions

- **applicant_income & loan_amount:** Quantile Transform normalizes the data effectively.  
- **coapplicant_income:** Reciprocal Transform transforms coapplicant_income to follow a bimodal distribution.

In [None]:
df['applicant_income'] = transformed_data['Quantile']['applicant_income']
df['coapplicant_income'] = transformed_data['Reciprocal']['coapplicant_income']
df['loan_amount'] = transformed_data['Quantile']['loan_amount']

In [None]:
# Test
fig, axs = fz.hist_box_viz(df[num_cols])
fig

In [27]:
# Load the dataset
df = pd.read_csv("../data/raw/loan_sanction_train.csv")

In [None]:
    # Understand structure of the dataset
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
# Drop useless features
# Rebinding instead of inplace=True, together with errors="ignore", keeps this
# cell re-runnable once the column has already been removed.
df = df.drop(columns="Loan_ID", errors="ignore")

# Test
print(df.columns)

In [None]:
# Clean column names
df = klib.clean_column_names(df)

# Test
column_names = df.columns.tolist()
print(column_names)

In [None]:
# Check for duplicate data points
def check_duplicates(df):
    """Print all rows that occur more than once, or a notice if there are none.

    Every member of a duplicate group is shown (not only the repeats), so the
    matching rows can be compared side by side.
    """
    duplicate_mask = df.duplicated(keep=False)
    if not duplicate_mask.any():
        print("There are no duplicate data points in the dataframe")
        return
    print(df[duplicate_mask])


check_duplicates(df)

In [None]:
# Get some information about the dataset
df.info()

Useful information that we can get from df.info():

- Feature names
- Number of data points
- Number of features
- Data type of features
- Memory usage

In [None]:
# Extract numerical, categorical, and other features from the dataset
num_cols, cat_cols, other_cols = fz.extract_features(
    df, "all", ignore_cols=["loan_status"]
)

In [None]:
# Print numerical features of dataset
df[num_cols]

In [None]:
# Print categorical features of dataset
df[cat_cols]

In [None]:
# Reorder columns
target_col = ["loan_status"]
df = df[num_cols + cat_cols + target_col]

# Test
print(df.columns)

In [None]:
# Statistical measures
df[num_cols].describe().T

In [None]:
# Histogram & Box-plot
fig, axs = fz.hist_box_viz(df[num_cols])
fig

In [None]:
# Numerical features
num_nan_pct = fz.calc_nan_values(df[num_cols])
num_cols_with_nan = num_nan_pct.index.tolist()

print(num_nan_pct)  # Percentage of missing values in numerical features
print(num_cols_with_nan)  # Numerical features with missing values

In [None]:
# Categorical features
cat_nan_pct = fz.calc_nan_values(df[cat_cols])
cat_cols_with_nan = cat_nan_pct.index.tolist()

print(cat_nan_pct)  # Percentage of missing values in categorical features
print(cat_cols_with_nan)  # Categorical features with missing values

In [None]:
# Check whether the target column contains any missing values
df["loan_status"].isna().sum()

In [None]:
# Visualize the distribution of missing values to determine the type of missing values
fig, axs = fz.nan_value_viz(df[num_cols_with_nan + cat_cols_with_nan])
fig

In [None]:
# Split the dataset into features and target
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [30]:
# Normalise column dtypes on df.
# NOTE(review): the KeyError recorded below suggests this cell was last run on a
# freshly loaded frame whose column names had not yet been cleaned — confirm the
# intended execution order.
df['applicant_income'] = df['applicant_income'].astype(float)
# Cast to int first, then to object (presumably so these columns are treated as
# categorical — TODO confirm).
# NOTE(review): astype(int) raises if the column still contains NaN at this
# point — confirm missing values are handled before this cell runs.
df['loan_amount_term'] = df['loan_amount_term'].astype(int).astype(object)
df['credit_history'] = df['credit_history'].astype(int).astype(object)

# Test
print(df.dtypes)

KeyError: 'applicant_income'

In [None]:
# Export config data

config_data = {
    "column_names": column_names,
    "num": {"cols": num_cols, "nan": num_cols_with_nan},
    "cat": {"cols": cat_cols, "nan": cat_cols_with_nan},
    "target_col": target_col,
}

with open("../src/loan_sanction_prediction/config.yaml", "w") as file:
    yaml.dump(config_data, file, default_flow_style=False)

In [29]:
# Export dataset
fz.export(df, "../data/interim/cleaned_train_data_v1.csv", force_overwrite=True)

Data exported to ../data/interim/cleaned_train_data_v1.csv


## Handle Missing Values

### Numerical

In [None]:
# 1. Handle missing values in categorical columns
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

In [None]:
# OneHotEncoder expects datatypes of every value in a column to be the same
X[cat_cols] = X[cat_cols].astype(str)

In [None]:
# For every categorical column, record the "<column>_<value>" names built from
# its current unique values, captured before X is one-hot encoded.
unique_values_in_cols = {
    col: [f"{col}_{value}" for value in X[col].unique()]
    for col in cat_cols
}

In [None]:
# 2. One-Hot Encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_X_data = encoder.fit_transform(X[cat_cols])
# Reuse X's index: without it the encoded frame gets a fresh RangeIndex and the
# column-wise concat below would misalign rows whenever X's index is not 0..n-1.
encoded_X_df = pd.DataFrame(
    encoded_X_data, columns=encoder.get_feature_names_out(), index=X.index
)

# Concatenating encoded categorical features with the rest of the X
X = pd.concat([X.drop(columns=cat_cols), encoded_X_df], axis=1)

In [None]:
# 3. Impute missing values in numerical features using KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)
# fit_transform returns a bare array; restore both the column names and the
# original index so later index-aligned assignments back to df stay row-correct.
X = pd.DataFrame(knn_imputer.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# 4. Assign the missing value imputed numerical features back to df
df[num_cols_with_nan] = X[num_cols_with_nan]

### Categorical

In [None]:
def advanced_categorical_imputer(X, y, clf_model):
    """Fill the missing entries of a categorical column using a classifier.

    Rows where ``y`` is present are used to train ``clf_model`` on
    label-encoded targets; the fitted model then predicts the rows where ``y``
    is missing and the predictions are decoded back to the original labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor features; must be row-aligned with ``y``.
    y : pandas.Series
        Categorical column that may contain missing values.
    clf_model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple
        ``(y_imputed, clf_model, le)``: a copy of ``y`` with the gaps filled,
        the fitted classifier, and the fitted ``LabelEncoder``.
    """
    X, y = X.copy(), y.copy()  # Avoid modifying the caller's X and y

    observed = y.notna()  # True where y has a value to learn from

    # Training rows have a known label; the remaining rows are to be imputed
    X_train, X_missing = X[observed], X[~observed]

    # Label encoding the target feature
    le = LabelEncoder()
    clf_model.fit(X_train, le.fit_transform(y[observed]))

    # Nothing to impute: return early rather than calling predict() on a
    # zero-row input, which scikit-learn estimators reject.
    if observed.all():
        return y, clf_model, le

    y_pred = clf_model.predict(X_missing)

    # Write back through the boolean mask so the assignment is positional with
    # respect to y and does not depend on index labels being unique.
    y.loc[~observed] = le.inverse_transform(y_pred)

    return y, clf_model, le

In [None]:
# Per-column fitted classifiers and label encoders, keyed by column name.
clf_models = {}
label_encoders = {}
# Impute every categorical column that has missing values.
for col in cat_cols_with_nan:
    # The one-hot columns derived from `col` itself are dropped from the
    # predictors so the column being imputed is not used to predict itself;
    # errors='ignore' skips any of those names that are not present in X.
    df[col], clf_models[col], label_encoders[col] = advanced_categorical_imputer(
        X.drop(columns=unique_values_in_cols[col], errors='ignore'), df[col],
        ExtraTreesClassifier(random_state=42))

In [None]:
def check_missing_values(df):
    """Print whether any cell of ``df`` is still missing (NaN)."""
    message = (
        "There are still missing values in the DataFrame."
        if df.isna().values.any()
        else "There are no missing values left in the DataFrame."
    )
    print(message)

# Test
check_missing_values(df)

In [None]:
# Load the configurations
# NOTE(review): the import cell records "ModuleNotFoundError: No module named
# 'toml'", so this cell cannot run until that package is available. Another
# cell in this notebook writes the config as YAML (config.yaml) — confirm which
# file and format is the intended source here.
with open("config/config_v2.toml", "r") as file:
    config_data = toml.load(file)

# Numerical and categorical feature name lists used by the cells that follow.
num_cols, cat_cols = config_data['num']['cols'], config_data['cat']['cols']

# Model Building

In [None]:
# Load the dataset
df = pd.read_csv('data/interim/feature_engineered_train_data_v1.csv')

# Split the data into features and target
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [None]:
# Label encode target
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Transformer for preprocessing data:
#   - num_cols are standardized with StandardScaler
#   - cat_cols are one-hot encoded, dropping the first category of each feature
#   - any other column is passed through unchanged
transformer = make_column_transformer(
    (StandardScaler(), num_cols),
    (OneHotEncoder(drop='first', sparse_output=False), cat_cols),
    remainder='passthrough'
)

In [None]:
X_transformed = transformer.fit_transform(X)

# Test
X_transformed.shape

## Handling imbalanced dataset

In [None]:
# Oversampling the dataset using SMOTE
# X_transformed is rebound to the resampled features so that it stays the same
# length as y_resampled in the cells below.
# NOTE(review): resampling happens before cross-validation, so the folds used by
# the model-selection cells contain synthetic samples and the scores may be
# optimistic — confirm whether SMOTE should run inside each training fold.
smote = SMOTE(random_state=42)
X_transformed, y_resampled = smote.fit_resample(X_transformed, y)

# Test
print(X_transformed.shape, y_resampled.shape)

In [None]:
# Test
unique_values, counts = np.unique(y_resampled, return_counts=True)

# Print the counts of each class
for value, count in zip(unique_values, counts):
    print(f"Class {value}: {count}")

## Model selection (Before hyperparameter tuning)

In [None]:
# Define models
# The stochastic estimators get a fixed random_state (same seed as the SMOTE
# step) so the comparison is reproducible after a kernel restart.
# NOTE(review): XGBClassifier is only imported in a commented-out line of the
# import cell; that import must be enabled for this cell to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(),
    'KNN': KNeighborsClassifier(),
    'Decision Trees': DecisionTreeClassifier(random_state=42),
    'Xgboost': XGBClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42)
}

In [None]:
# Define metric functions
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score
}

In [None]:
def eval_models_across_metrics(models, metrics, X, y, cv=5):
    """Cross-validate each model once and report its mean score per metric.

    All metrics of a model are computed from the same cross-validation run, so
    the model is fitted ``cv`` times in total rather than ``cv`` times per
    metric, and every metric describes the same fitted estimators.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    metrics : iterable
        Scoring specifications accepted by scikit-learn's ``scoring`` argument
        (e.g. ``'accuracy'``); each one becomes a column of the result.
    X, y
        Features and target, passed to cross-validation unchanged.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model and one column per metric, holding the mean test
        score rounded to 3 decimals.
    """
    metric_list = list(metrics)
    models_across_metrics = {metric: {} for metric in metric_list}
    if not metric_list:
        # No metric requested: nothing to evaluate.
        return pd.DataFrame(models_across_metrics)

    # Key the scorers by position so every scoring spec gets a unique,
    # string-valued name in the cross_validate result ("test_<name>").
    scoring = {f"metric_{pos}": metric for pos, metric in enumerate(metric_list)}

    for model_name, model in models.items():
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)
        for pos, metric in enumerate(metric_list):
            mean_score = cv_results[f"test_metric_{pos}"].mean()
            models_across_metrics[metric][model_name] = round(mean_score, 3)
    return pd.DataFrame(models_across_metrics)

In [None]:
models_across_metrics = eval_models_across_metrics(models, metrics.keys(), X_transformed, y_resampled)

In [None]:
models_across_metrics

Conclusions:

- After evaluating the metrics, I have decided to focus on the top 3 models (in terms of accuracy_score): Random Forest Classifier, Extra Trees Classifier, Xgboost Classifier.

## Hyperparameter tuning

In [None]:
# Define top models for further hyperparameter tuning
# A fixed random_state keeps the grid-search results reproducible across runs.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Xgboost': XGBClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42)
}

In [None]:
# Define parameter grids
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 200],
        'max_depth': [None, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 4]
    },
    'Xgboost': {
        'n_estimators': [50, 200],
        'max_depth': [3, 10],
        'learning_rate': [0.01, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.2]
    },
    'Extra Trees': {
        'n_estimators': [50, 200],
        'max_depth': [None, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 4],
        'bootstrap': [True, False]
    }
}

In [None]:
def perform_grid_search(models, param_grids, X, y):
    """Run a 5-fold accuracy grid search for every model.

    Parameters
    ----------
    models : dict
        Maps a model name to the estimator to tune.
    param_grids : dict
        Maps the same model names to their parameter grids.
    X, y
        Training features and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        For each model name, a dict holding the best parameter combination and
        its mean cross-validated accuracy rounded to 3 decimals.
    """
    def _tune(model_name, model):
        # Announce progress, then search this model's grid exhaustively.
        print(f"Processing {model_name}...")
        search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            verbose=1,
        )
        search.fit(X, y)
        return {
            'Best Parameters': search.best_params_,
            'Average accuracy score on the best parameters': round(search.best_score_, 3),
        }

    return {name: _tune(name, estimator) for name, estimator in models.items()}

In [None]:
# Finding best hyperparameters on top models using GridSearchCV
best_params = perform_grid_search(models, param_grids, X_transformed, y_resampled)

In [None]:
pd.DataFrame(best_params)

In [None]:
# best_params

# {'Random Forest': {'Best Parameters': {'max_depth': None,
#    'min_samples_leaf': 1,
#    'min_samples_split': 2,
#    'n_estimators': 200},
#   'Average accuracy score on the best parameters': 0.838},
#  'Xgboost': {'Best Parameters': {'colsample_bytree': 0.8,
#    'gamma': 0.2,
#    'learning_rate': 0.2,
#    'max_depth': 10,
#    'n_estimators': 50,
#    'subsample': 1.0},
#   'Average accuracy score on the best parameters': 0.826},
#  'Extra Trees': {'Best Parameters': {'bootstrap': True,
#    'max_depth': 30,
#    'min_samples_leaf': 1,
#    'min_samples_split': 2,
#    'n_estimators': 200},
#   'Average accuracy score on the best parameters': 0.84}}

In [None]:
# Define top models with best hyperparameters
# random_state is not part of the tuned grids, so it is set explicitly here to
# make the final comparison and the trained ensemble reproducible.
models = {
    'Random Forest': RandomForestClassifier(
        random_state=42, **best_params['Random Forest']['Best Parameters']),
    'Xgboost': XGBClassifier(
        random_state=42, **best_params['Xgboost']['Best Parameters']),
    'Extra Trees': ExtraTreesClassifier(
        random_state=42, **best_params['Extra Trees']['Best Parameters'])
}

In [None]:
# Comparing top models across metrics after hyperparameter tuning
models_across_metrics = eval_models_across_metrics(models, metrics.keys(), X_transformed, y_resampled)

In [None]:
models_across_metrics


## Model training


In [None]:
estimators = []
for model_name, model in models.items():
    estimators.append((model_name, model))

In [None]:
estimators

In [None]:
def eval_voting_clf(estimators, X, y, cv = 5):
    """Compare hard and soft voting ensembles by cross-validated accuracy.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Base models handed to ``VotingClassifier``.
    X, y
        Features and target used for cross-validation.
    cv : int or cross-validation splitter, default 5

    Returns
    -------
    dict
        Mean accuracy (rounded to 3 decimals) under the keys ``'Hard Margin'``
        (hard voting) and ``'Soft Margin'`` (soft voting).
    """
    def _mean_cv_accuracy(voting):
        # Build the ensemble for this voting mode and average its fold scores.
        ensemble = VotingClassifier(estimators=estimators, voting=voting)
        fold_scores = cross_val_score(ensemble, X, y, cv=cv, scoring='accuracy')
        return round(fold_scores.mean(), 3)

    return {
        'Hard Margin': _mean_cv_accuracy('hard'),
        'Soft Margin': _mean_cv_accuracy('soft'),
    }

In [None]:
# Accuracy on hard and soft margin voting classifiers
accuracy = eval_voting_clf(estimators, X_transformed, y_resampled)
accuracy

In [None]:
# Fit the best model
voting_clf = VotingClassifier(estimators, voting='hard')
voting_clf.fit(X_transformed, y_resampled)

In [None]:
# Create a pipeline
# NOTE(review): make_pipeline and PCA are not among the names imported in the
# import cell of this notebook, and `pipeline` is not referenced by any other
# visible cell — this looks like a leftover example; confirm before relying on it.
pipeline = make_pipeline(
    StandardScaler(),     # Step 1: Standardize the data
    PCA(n_components=2),  # Step 2: Apply PCA
    LogisticRegression()  # Step 3: Train a logistic regression model
)


In [None]:
# Pipeline
# Bundle the already-fitted preprocessing transformer and voting classifier into
# a single estimator, so raw feature frames can go straight to pipe.predict().
# Pipeline takes (name, estimator) pairs; make_pipeline() expects bare
# estimators and would reject the ('model', voting_clf) tuple.
# The pipeline is deliberately not re-fitted: fitting on (X, y) would retrain
# the classifier on the un-resampled data and discard the SMOTE-balanced fit.
pipe = Pipeline([
    ('transformer', transformer),
    ('model', voting_clf)
])

pipe

## Saving

In [None]:
# Save the Machine Learning model
# NOTE(review): only the voting classifier is persisted here, not the fitted
# `transformer`; code that loads model.joblib has to apply the same
# preprocessing before calling predict — confirm this is intended.
joblib.dump(voting_clf, 'model.joblib')

In [None]:
# Convert categorical features' data type to category
# This will be helpful while doing analysis
df[cat_cols] = df[cat_cols].astype('category')

# Test
df.dtypes

# EDA on features

## Univariate analysis

### Numerical

In [None]:
# Statistical measures
df[num_cols].describe().T

In [None]:
# Statistical moments
fz.stats_moments(df[num_cols])

In [None]:
# Plotting histogram & boxplot
fig, axs = fz.hist_box_viz(df[num_cols])
fig

#### Conclusions:

- There are many outliers on the upper side of all numerical features, while none are present on the lower side.
- Since the outliers appear to be valid and are not due to data entry issues, we don't have to drop them.
- None of the numerical features follow a normal distribution.
- The distributions of applicant income and loan amount are right-skewed (positively skewed).
- Feature transformation is required for all numerical features to address this skewness.
- It looks like people with a co-applicant income of 0 don't have a co-applicant. So, we should create a new feature called 'has_coapplicant'. For this feature, set the value to 'no' for individuals with a co-applicant income of 0, and 'yes' for those with a non-zero co-applicant income.

### Categorical

In [None]:
# Statistical measures
df[cat_cols].describe().T

In [None]:
# Countplots
fig, axs = fz.count_viz(df[cat_cols])
fig

#### Conclusions:

- A higher number of males apply for loans compared to females.
- Married individuals are more likely to apply for loans than unmarried individuals, with approximately twice as many married applicants.
- Individuals without dependents apply for loans more frequently than those with dependents.
- Graduates are more likely to apply for loans than non-graduates.
- Non-self-employed individuals apply for loans more than self-employed individuals.
- People whose property is located in semi-urban areas tend to apply for loans more than those with properties in rural or urban areas. Those with property in rural areas apply for the fewest loans, although these trends are not very strong.
- The majority of loan applicants prefer a loan term of 360 months (30 years), followed by 180 months (15 years). Other loan term durations are relatively rare.
- Individuals with a credit history of 1 are more likely to apply for loans compared to those with a credit history of 0.

## Bivariate analysis

### Numerical - Numerical

In [None]:
# Pairplot
grid = fz.pair_viz(df[num_cols])
plt.show()

In [None]:
# Correlation heatmap
methods=['pearson', 'spearman', 'kendall']
for method in methods:
    fz.corr_heatmap_viz(df[num_cols], method=method)
    plt.show()
    print("-"*150)

Conclusions:

- None of the features show a strong linear relationship with each other. However, there is a moderate relationship between applicant income and loan amount. This makes sense because individuals with higher incomes often need larger loan amounts.

- Pearson, Spearman, and Kendall's Tau correlations show similar patterns, but their values are slightly different. Since the heatmaps from all of these are similar, the exact values are less important. In this case, Spearman's correlation is more suitable because the data isn't normally distributed, doesn't have a linear relationship between features, and has outliers.

### Categorical - Categorical

In [None]:
# Crosstab Heatmap
fz.crosstab_heatmap_viz(df, cat_cols, normalize='both')

### Numerical - Categorical

#### Box-plot

In [None]:
# applicant_income
fig, axs = fz.num_cat_viz(df, 'applicant_income', cat_cols)
fig

In [None]:
# coapplicant_income
fig, axs = fz.num_cat_viz(df, 'coapplicant_income', cat_cols)
fig

In [None]:
# loan_amount
fig, axs = fz.num_cat_viz(df, 'loan_amount', cat_cols)
fig

#### KDE-plot

In [None]:
# applicant_income
fig, axs = fz.num_cat_viz(df, 'applicant_income', cat_cols, kind='kde')
fig

In [None]:
# coapplicant_income
fig, axs = fz.num_cat_viz(df, 'coapplicant_income', cat_cols, kind='kde')
fig

In [None]:
# loan_amount
fig, axs = fz.num_cat_viz(df, 'loan_amount', cat_cols, kind='kde')
fig

#### Point-plot

In [None]:
# applicant_income
fig, axs = fz.num_cat_viz(df, 'applicant_income', cat_cols, kind='point')
fig

In [None]:
# coapplicant_income
fig, axs = fz.num_cat_viz(df, 'coapplicant_income', cat_cols, kind='point')
fig

In [None]:
# loan_amount
fig, axs = fz.num_cat_viz(df, 'loan_amount', cat_cols, kind='point')
fig

## Multivariate analysis

### Numerical - Numerical - Numerical

In [None]:
trace = go.Scatter3d(
    x=df['applicant_income'],
    y=df['coapplicant_income'],
    z=df['loan_amount'],
    mode='markers',
    marker=dict(size=5)
)

layout = go.Layout(
    scene=dict(
        xaxis_title='Applicant Income',
        yaxis_title='Coapplicant Income',
        zaxis_title='Loan Amount'
    )
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

### Numerical - Numerical - Categorical

In [None]:
# Pairplot
for feature in cat_cols:
    sns.pairplot(df, vars=num_cols, hue=feature)
    plt.show()
    print("-"*105)

In [None]:
def num_num_cat_viz(x, y, categorical_features, data=None):
    """Draw one faceted scatter plot of ``x`` against ``y`` per categorical feature.

    Parameters
    ----------
    x, y : str
        Names of the columns plotted on the horizontal and vertical axes.
    categorical_features : iterable of str
        Each feature is used in turn as the ``col`` facet of ``sns.relplot``.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``df``, which keeps
        existing three-argument calls working unchanged.
    """
    if data is None:
        data = df  # fall back to the notebook's global frame
    for feature in categorical_features:
        sns.relplot(data, x=x, y=y, col=feature)
        plt.show()
        print("-"*118)  # visual separator between consecutive figures

In [None]:
# applicant_income & coapplicant_income
num_num_cat_viz('applicant_income', 'coapplicant_income', cat_cols)

In [None]:
# applicant_income & loan_amount
num_num_cat_viz('applicant_income', 'loan_amount', cat_cols)

In [None]:
# coapplicant_income & loan_amount
num_num_cat_viz('coapplicant_income', 'loan_amount', cat_cols)

### Numerical - Categorical - Categorical

# EDA on target

## Univariate analysis

In [None]:
# Take the labels from the same value_counts() result as the wedge sizes.
# unique() returns values in order of appearance, which need not match the
# frequency-sorted order of value_counts() and could swap the labels.
status_counts = df['loan_status'].value_counts()
plt.pie(status_counts, labels=status_counts.index, autopct='%0.2f%%',
        shadow=True, explode=(0, 0.1), counterclock=False, colors=['lime', 'cyan'])
plt.show()

#### Conclusions:

- The classes in the target are moderately imbalanced, so the class imbalance needs to be handled using SMOTE.

## Bivariate analysis


### Categorical - Categorical


In [None]:
# Crosstab Heatmap
fz.crosstab_heatmap_viz(df, cat_cols, ['loan_status'], 'both')


### Numerical - Categorical


In [None]:
# Box-plot
fig, axs = fz.num_cat_viz(df, num_cols, 'loan_status')
fig

In [None]:
# KDE-plot
fig, axs = fz.num_cat_viz(df, num_cols, 'loan_status', kind='kde')
fig

In [None]:
# Point-plot
fig, axs = fz.num_cat_viz(df, num_cols, 'loan_status', kind='point')
fig

## Multivariate analysis


### Numerical - Numerical - Categorical

In [None]:
sns.pairplot(df, vars = num_cols, hue='loan_status')

In [None]:
def relplot(df, numerical_features, categorical_feature):
    """Plot every unordered pair of numerical features, faceted by a category.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    numerical_features : sequence of str
        Columns to pair up; each pair is drawn once (first on x, second on y).
    categorical_feature : str
        Column used as the ``col`` facet of ``sns.relplot``.
    """
    feature_pairs = [
        (first, second)
        for position, first in enumerate(numerical_features)
        for second in numerical_features[position + 1:]
    ]
    for x_feature, y_feature in feature_pairs:
        sns.relplot(df, x=x_feature, y=y_feature, col=categorical_feature)
        plt.show()
        # Separator whose width scales with the number of facet columns
        print("-" * (df[categorical_feature].nunique() * 59))

In [None]:
relplot(df, num_cols, 'loan_status')


### Numerical - Categorical - Categorical