In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

%matplotlib inline

# Description of our Dataset

The file contains 14 columns and 5000 rows. The columns are described as follows:

* ID: Customer ID
* Age : Customer Age
* Experience : Customer Experience
* Income : Income of the Customer
* ZipCode: Customer's residence zipcode
* Family : No of Family members of the customer
* CCAvg: Credit Card Average Score
* Education: Education of the customer
* Mortgage: Mortgage taken or not taken by the customer
* Personal Loan: 0 = No personal loan given , 1 = personal loan given
* Securities Account : Having or not having a Securities Account
* CD Account : Having or not having a CD Account
* Online : Having or not having online banking
* Credit Card : Having or not having a credit card

In [None]:
# Read the bank-loan CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/bank-loan-approval-lr-dt-rf-and-auc/bankloan.csv'
)

# Preview the first ten rows to sanity-check the load.
df.head(n=10)

# Exploratory Data Analysis

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns).
df.shape

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count rows that are exact repeats of an earlier row (all columns compared,
# the first occurrence is not counted as a duplicate).
df.duplicated(subset=None, keep='first').sum()

Let's check for outliers in our columns.

In [None]:
# Keep only the numeric columns of the dataset.
numerical_columns = df.select_dtypes(include=['number'])

# Draw one panel per column instead of a single shared axis: on a shared axis
# the columns with large values (e.g. ZIP codes) compress 0/1 flags and other
# small-valued columns into flat lines, which hides their outliers.
n_panels = len(numerical_columns.columns)
n_cols = 4
n_rows = max(1, (n_panels + n_cols - 1) // n_cols)  # ceiling division, at least one row

fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows), squeeze=False
)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, numerical_columns.columns):
    sns.boxplot(x=numerical_columns[column], ax=ax)
    ax.set_title(column)
    ax.set_xlabel("")  # the panel title already names the column

# Hide the unused panels left over when the grid has more cells than columns.
for ax in flat_axes[n_panels:]:
    ax.set_visible(False)

fig.suptitle("Box Plot for Numerical Columns")
fig.tight_layout()

# Show the plot
plt.show()

Looking at the box plots, the `ZIP_Code` column contains outliers. We will therefore drop it and not use it in predictive modeling.

# Data Preprocessing

In [None]:
# Normalise the column labels: every literal '.' becomes '_'
# (regex=False so the dot is not treated as a wildcard).
df.columns = df.columns.str.replace('.', '_', regex=False)

# Baseline Model

In [None]:
# Target: the personal-loan indicator column.
y = df['Personal_Loan']

# Features: every remaining column except the target itself, the ZIP code and
# the customer ID.
X = df.drop(columns=['ZIP_Code', 'Personal_Loan', 'ID'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split identical
# on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Each baseline is a StandardScaler followed by one classifier with its default
# hyperparameters.
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    # Seed the forest: without random_state the bootstrap samples and feature
    # subsets differ between runs, so the reported accuracy would not be reproducible.
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', SVC())
])

pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# Fit the pipelines on the training split only
pipeline_rf.fit(X_train, y_train)
pipeline_svm.fit(X_train, y_train)
pipeline_lr.fit(X_train, y_train)

# Predict the held-out test rows
y_pred_rf = pipeline_rf.predict(X_test)
y_pred_svm = pipeline_svm.predict(X_test)
y_pred_lr = pipeline_lr.predict(X_test)

# Score each baseline by test-set accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_lr = accuracy_score(y_test, y_pred_lr)

print("Random Forest Accuracy:", accuracy_rf)
print("SVM Accuracy:", accuracy_svm)
print("Logistic Regression Accuracy:", accuracy_lr)

# Hyperparameter Tuning

In this section we use `GridSearchCV` to search for the hyperparameters that give the best cross-validated performance, and then print the best parameters found.

In [None]:
# Candidate hyperparameters for the random forest (3 x 3 x 3 = 27 combinations).
param_grid_rf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# Scale the features, then run a 5-fold grid search over the forest settings.
# The forest is seeded so the selected hyperparameters and the resulting scores
# are the same on every run.
# NOTE(review): because GridSearchCV is the final pipeline step, the scaler is
# fitted on all of X_train before the folds are made; wrapping the whole
# pipeline in GridSearchCV instead would fit the scaler inside each fold.
pipeline_rf_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5))
])

# Fit the pipeline with cross-validation and hyperparameter tuning
pipeline_rf_cv.fit(X_train, y_train)

# Predict the test set with the tuned pipeline
y_pred_rf_cv = pipeline_rf_cv.predict(X_test)

# Evaluate the performance
accuracy_rf_cv = accuracy_score(y_test, y_pred_rf_cv)
print("Random Forest Accuracy (with CV):", accuracy_rf_cv)

# Read the winning hyperparameter combination from the fitted grid search
best_params_rf = pipeline_rf_cv.named_steps['classifier'].best_params_
print("\nBest Hyperparameters for RandomForestClassifier:")
print(best_params_rf)

# Model Evaluation

When using pipelines, model evaluation remains consistent with traditional workflows. The key advantage of pipelines is the seamless integration of preprocessing and modeling steps, making the overall process cleaner and more modular.

In [None]:
# Build the per-class metrics report for the tuned random forest's test-set
# predictions, then print it under a heading.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred_rf_cv)
print("Classification Report:\n", classification_rep)

# Compare Scores and Values

In [None]:
# pipeline_rf_cv was already fitted in the hyperparameter-tuning cell, so it is
# reused here as-is. Fitting it again would repeat the entire grid search and
# could produce a different model from the one the earlier predictions and
# classification report were computed with.

# Make predictions on the training set
y_pred_train = pipeline_rf_cv.predict(X_train)

# Make predictions on the test set
y_pred_test = pipeline_rf_cv.predict(X_test)

# A training accuracy far above the test accuracy would indicate overfitting.
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Training Set Accuracy:", accuracy_train)

accuracy_test = accuracy_score(y_test, y_pred_test)
print("Test Set Accuracy:", accuracy_test)

In [None]:
# Pair every true test label with the tuned model's prediction. Only a short
# preview is printed: one line per test row would flood the notebook output.
prediction_preview = pd.DataFrame(
    {'Actual': y_test.to_numpy(), 'Predicted': y_pred_rf_cv},
    index=y_test.index,
)
print(prediction_preview.head(10))

# Counts of each (actual, predicted) combination summarise all test rows at once.
print(pd.crosstab(prediction_preview['Actual'], prediction_preview['Predicted']))

# Evaluate the performance.
# NOTE: this rebinds accuracy_rf, which previously held the baseline random
# forest's accuracy, to the tuned model's accuracy.
accuracy_rf = accuracy_score(y_test, y_pred_rf_cv)
print("Random Forest Accuracy:", accuracy_rf)
