In [None]:

import kagglehub
# Download the Lending Club dataset through kagglehub and keep the value that
# dataset_download returns (presumably the local dataset directory — TODO confirm).
wordsforthewise_lending_club_path = kagglehub.dataset_download('wordsforthewise/lending-club')

print('Data source import complete.')


# Project: Build a machine learning model to predict the likelihood of loan default based on applicant details and credit history.

In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))


# Import Libraries

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Exploring Accepted Dataset

In [None]:
import os

# Prefer the location returned by kagglehub.dataset_download (first cell) so the
# notebook is not tied to the Kaggle mount point; fall back to the fixed Kaggle
# path when the file is not found under that location.
ACCEPTED_CSV_PARTS = ('accepted_2007_to_2018q4.csv', 'accepted_2007_to_2018Q4.csv')
accepted_csv_path = os.path.join(wordsforthewise_lending_club_path, *ACCEPTED_CSV_PARTS)
if not os.path.exists(accepted_csv_path):
    accepted_csv_path = '/kaggle/input/lending-club/accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv'

accepted=pd.read_csv(accepted_csv_path)
accepted

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
accepted.info()

In [None]:
# Print every column name of the loaded frame, one per line.
for column_name in accepted.columns:
    print(column_name)

In [None]:
# Count missing values per column, then list the columns from most to least missing.
miss=accepted.isna().sum()
missing_entries_sorted = miss.sort_values(ascending=False)
print(missing_entries_sorted)

In [None]:
# Drop every column in which at least half of the rows are missing.
# The bounds are derived from the current number of rows instead of fixed row
# counts, so the cell keeps working when the dataset size changes. For a frame
# of 2,260,701 rows this selects exactly the same columns as the previous
# constants (1130351 and 2260701).
MIN_MISSING_FRACTION = 0.5

lower=MIN_MISSING_FRACTION*len(accepted)
high=len(accepted)

missing_num=accepted.isnull().sum()

features_to_drop=missing_num[(missing_num>=lower)& (missing_num<=high)].index
accepted=accepted.drop(columns=features_to_drop)

print("Dropped Features:" , features_to_drop)

In [None]:
# Display the frame after the sparse columns were removed.
accepted

In [None]:
# dropna(thresh=k) keeps a row only when it has at least k non-missing values.
# Requiring 70% of the columns to be present therefore removes the rows that
# are more than 30% empty. The threshold is rounded up to an integer, which
# selects the same rows as the fractional value.
MIN_NON_NULL_FRACTION = 0.7
threshold=int(np.ceil(accepted.shape[1]*MIN_NON_NULL_FRACTION))

accepted=accepted.dropna(thresh=threshold)

print("Data after dropping rows with more than 30% missing values:")
accepted

In [None]:
# Print the remaining missing-value count next to the column it belongs to
# (iterating the Series directly would yield the counts only).
for feature, missing_count in accepted.isna().sum().items():
    print(feature, missing_count)

In [None]:
lower=100000
high=800000

missing_num1=accepted.isnull().sum()

# Select the columns whose missing-value count lies in [lower, high], both ends
# included, and remove them from the frame.
# NOTE(review): columns with more than `high` missing values are not selected
# by this filter and therefore stay in the frame — confirm that is intended.
in_drop_range=missing_num1.between(lower, high)
features_to_drop=missing_num1[in_drop_range].index
accepted=accepted.drop(columns=features_to_drop)

print("Dropped Features:" , features_to_drop)

In [None]:
# Display the frame after the second round of column removal.
accepted

In [None]:
# List the columns that remain after cleaning.
accepted.columns

In [None]:
# Show the distinct values of the column that is later encoded into 'Label'.
accepted['debt_settlement_flag'].unique()

In [None]:
# Number of distinct non-missing values in the 'title' column.
accepted['title'].nunique()

In [None]:
# Build the integer target column 'Label' from 'debt_settlement_flag'.
# LabelEncoder numbers the distinct values in sorted order; intended mapping: NO=0, YES=1
# (presumably the column holds 'N'/'Y' — TODO confirm against the unique values shown above).
# NOTE(review): the project statement talks about predicting loan default, while the
# target here is the debt-settlement flag — confirm this is the intended label.
from sklearn.preprocessing import LabelEncoder

label_encoder=LabelEncoder()
accepted['Label']=label_encoder.fit_transform(accepted['debt_settlement_flag'])
accepted

In [None]:
# List the columns again; 'Label' is now part of the frame.
accepted.columns

In [None]:
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# `df` is an alias of `accepted` (no copy), so the integer encoding applied
# below is also visible through `accepted` in the cells that follow.
df = accepted

TARGET_COLUMN = 'Label'
# 'Label' was produced by encoding 'debt_settlement_flag', so that column is the
# target under another name. Leaving it in X lets the model read the answer
# directly and makes both the scores and the feature ranking meaningless.
LEAKY_COLUMNS = ['debt_settlement_flag']

# Identify categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# Integer-encode each categorical column and keep the fitted encoder per column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str first so columns with mixed types (including NaN) encode cleanly
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Split into features and target, excluding the column the target was derived from
X = df.drop(columns=[TARGET_COLUMN] + LEAKY_COLUMNS, errors='ignore')
y = df[TARGET_COLUMN]

# Hold out 20% for testing; stratify so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize LGBMClassifier with parameters
model = lgb.LGBMClassifier(
    objective='binary',  # Change to 'multiclass' if you have multiple classes
    metric='binary_logloss',
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature importance, most important first
importance = model.feature_importances_
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importance}).sort_values(by='Importance', ascending=False)

In [None]:
# Plot feature importance as a horizontal bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(16, 20))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance from LGBMClassifier')
# Flip the y-axis so the first row of importance_df is drawn at the top.
ax.invert_yaxis()
plt.show()

In [None]:
# Remove every feature whose importance is 5 or less and show what is left.
low_importance_mask = importance_df['Importance'] <= 5
drop_yes = importance_df.index[low_importance_mask.to_numpy()]
importance_df = importance_df.loc[~low_importance_mask]
print(importance_df)

In [None]:
# Names of the features that passed the importance filter.
importance_df['Feature'].tolist()

In [None]:
# Columns to carry forward: the features that passed the importance filter,
# plus the target column if it is not already among them.
features_to_keep = importance_df['Feature'].tolist()
additional_feature = 'Label'

if additional_feature not in features_to_keep:
    features_to_keep.append(additional_feature)

# Take an explicit copy so that later cells, which modify filtered_df, work on
# an independent frame instead of a selection derived from `accepted`.
filtered_df = accepted[features_to_keep].copy()
filtered_df

In [None]:
# Missing-value count for each column of the reduced frame.
filtered_df.isna().sum()

In [None]:
columns_to_fill = ['dti', 'mths_since_recent_bc', 'mths_since_rcnt_il', 'il_util']
# The columns of filtered_df come out of the importance filter, so only impute
# the ones that are actually present.
columns_to_fill = [name for name in columns_to_fill if name in filtered_df.columns]

if columns_to_fill:
    # Fill each listed column with its own mean. The result is assigned back
    # instead of calling fillna(inplace=True) on a column selection, which
    # pandas does not guarantee to write through to the parent frame.
    column_means = filtered_df[columns_to_fill].mean()
    filtered_df = filtered_df.fillna(column_means)

# Verify that missing values have been filled
print(filtered_df[columns_to_fill].isnull().sum())

In [None]:
# Drop the rows that still contain a missing value. Rebinding the name avoids
# an in-place modification of a frame that was derived from `accepted`.
filtered_df = filtered_df.dropna()

In [None]:
# Class distribution of the target before balancing.
filtered_df['Label'].value_counts()

# Correlations

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = filtered_df  # Replace with your actual DataFrame
# Pairwise correlation between all columns of the reduced frame.
correlation_matrix = df.corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    linewidths=.5,
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Remove 'debt_settlement_flag' (the column 'Label' was encoded from) and 'loan_amnt'.
# errors='ignore' keeps the cell from failing when a column is already absent.
filtered_df = filtered_df.drop(columns=['debt_settlement_flag', 'loan_amnt'], errors='ignore')
filtered_df

# Balance Dataset

In [None]:
from sklearn.utils import resample

# Separate majority and minority classes
# (rows with Label 0 are treated as the majority, rows with Label 1 as the minority).
majority_class = filtered_df[filtered_df['Label'] == 0]
minority_class = filtered_df[filtered_df['Label'] == 1]

In [None]:
# Upsample the minority class until it has as many rows as the majority class.
# Sampling is done with replacement, so the result contains repeated copies of
# the same minority rows; any split made afterwards has to keep identical rows
# together (or be made before this step) to avoid leaking rows across splits.
minority_upsampled = resample(minority_class,
                              replace=True,       # Sample with replacement
                              n_samples=len(majority_class),  # Match majority class count
                              random_state=42)   # For reproducibility

# Combine majority class with upsampled minority class
balanced_df = pd.concat([majority_class, minority_upsampled])

# Shuffle the new balanced dataset and renumber the index from 0
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Class distribution after upsampling.
balanced_df['Label'].value_counts()

In [None]:
# From here on `filtered_df` refers to the balanced (upsampled) frame.
filtered_df=balanced_df

In [None]:
# Class distribution of the frame used for modelling.
filtered_df['Label'].value_counts()

In [None]:
# Features are every column except 'Label'; 'Label' is the target.
X =filtered_df.drop('Label', axis=1)
y =filtered_df['Label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

# The minority class was upsampled with replacement before this point, so the
# data contains many identical rows. A plain random split would place copies of
# the same row in the training, validation and test sets and inflate every
# score. Grouping rows by a hash of their feature values keeps all copies of a
# row inside a single split.
row_groups = pd.util.hash_pandas_object(X, index=False).to_numpy()

# First cut: 70% of the groups for training, 30% held out.
outer_split = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, temp_idx = next(outer_split.split(X, y, groups=row_groups))
X_train, X_temp = X.iloc[train_idx], X.iloc[temp_idx]
y_train, y_temp = y.iloc[train_idx], y.iloc[temp_idx]

# Second cut: divide the held-out part evenly into validation and test.
inner_split = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(inner_split.split(X_temp, y_temp, groups=row_groups[temp_idx]))
X_val, X_test = X_temp.iloc[val_idx], X_temp.iloc[test_idx]
y_val, y_test = y_temp.iloc[val_idx], y_temp.iloc[test_idx]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test and validation splits. The three names are
# rebound to the scaled results.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val=scaler.transform(X_val)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic regression (C=0.01) on the scaled training split and predict
# the test split.
lg = LogisticRegression(C=0.01, random_state=42)
lg.fit(X_train, y_train)
lg_predictions = lg.predict(X_test)

# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_test, lg_predictions))
print(classification_report(y_test, lg_predictions))

In [None]:
# Score the fitted logistic regression on the validation split.
y_val_pred = lg.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(classification_report(y_val, y_val_pred))

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, mean_squared_error

# With hard class predictions on 0/1 labels, the mean squared error equals the
# share of misclassified rows (assuming 'Label' only takes the values 0 and 1).

# Calculate training error
y_train_pred = lg.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
print("Training MSE (Bias):", train_mse)

# Calculate validation error on the validation split, so the printed label
# matches the data it was computed from.
y_val_pred = lg.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
print("Validation MSE (Variance):", val_mse)

# Report the test split separately.
y_test_pred =lg.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print("Test MSE:", test_mse)

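A training MSE of 0.0 means the model classifies every training example correctly (for 0/1 labels, the MSE is the misclassification rate), so there is no sign of bias (underfitting). However, an MSE of 0.0 on held-out data as well means the model is also perfect on data it was not trained on, which is unusual: perfect predictions are rare in real-world scenarios, so this more likely indicates data leakage (for example, a feature that encodes the target, or duplicated rows shared between the splits) or an overly simplistic dataset than a genuinely perfect model.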

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the logistic regression predictions on the test split.
cm_lg = confusion_matrix(y_test, lg_predictions)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_lg, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

# Cross Validation

In [None]:
# Set up k-fold cross-validation with 10 folds
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize the model
model = LogisticRegression(random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

# Output the results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))
print("Standard Deviation of Cross-Validation Accuracy:", np.std(cv_scores))

The cross-validation accuracy scores are very high, with a mean of approximately 98.45%, indicating that the model performs consistently well across the folds. The low standard deviation of around 0.00033 shows that the scores barely change from fold to fold, meaning the result is stable and not sensitive to how the data is split. Note that this stability alone does not rule out overfitting or data leakage, because any leakage present in the full dataset affects every fold in the same way.

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Fit a 100-tree random forest on the scaled training split and report
# accuracy and the per-class report on the test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

In [None]:
# Confusion matrix of the random forest predictions on the test split.
cm_rf = confusion_matrix(y_test, rf_predictions)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

# XGBoost

In [None]:
import xgboost as xgb

# The target is binary, so the evaluation metric is binary log loss
# ('mlogloss' is the multiclass variant). `use_label_encoder` is left out: it
# is deprecated in recent xgboost releases and y is already integer-encoded.
# random_state is set to the same seed as the other models.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print(classification_report(y_test, xgb_predictions))

In [None]:
# Confusion matrix of the XGBoost predictions on the test split.
cm_xgb = confusion_matrix(y_test, xgb_predictions)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_xgb, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()