In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# data loading
# Read the credit-risk CSV into `df`, the raw frame the later cells build on.

file = "/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv"
df = pd.read_csv(file)
df.head()  # preview the first rows

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Summary statistics produced by pandas' default describe()
df.describe()

In [None]:
# Column names of the raw frame
df.columns

In [None]:
# Distinct values present in the target column
df['loan_status'].unique()

In [None]:
# Number of missing values per column, before any imputation
df.isnull().sum()

Method: Impute with the median, or use model-based imputation (e.g., KNN imputation or a regression model).
Reason: Employment length is a numerical feature with a fairly high number of missing values (895), so median imputation is a common choice. Alternatively, if other features correlate strongly with it, a machine learning model could predict the missing values from those features.

In [None]:
# Fill in missing values for person_emp_length with the column median
emp_length = df['person_emp_length']
df['person_emp_length'] = emp_length.fillna(emp_length.median())
df.tail()

In [None]:
# Fill in missing values for loan_int_rate with the column median
df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].median())

# Only the last expression of a cell is rendered, so a bare df.head() followed
# by df.tail() shows the tail alone. Stack both previews into one frame so the
# first and last rows are displayed together.
pd.concat([df.head(), df.tail()])

In [None]:
# Re-check missing values per column after the median imputations
df.isnull().sum()

In [None]:
# Column names after imputation
df.columns

In [None]:
# (rows, columns) after imputation
df.shape

In [None]:
# Target encoding used in this notebook:
# 1 → Loan defaulted (High Risk)
# 0 → Loan fully paid (Low Risk)
# Count how many rows fall into each loan_status class (target variable)
print(df["loan_status"].value_counts())

In [None]:
# drop unwanted feature
# df = df.drop("student_id", axis=1)

In [None]:
# df.head()

In [None]:
# Check the distinct categories in person_home_ownership before encoding it
person_home_ownership = df['person_home_ownership'].unique()
print(person_home_ownership)

In [None]:
# One-Hot Encoding for person_home_ownership
# Starts the encoded frame `df_encoded` from `df`; `df` itself is left unchanged.
df_encoded = pd.get_dummies(df, columns=["person_home_ownership"])
df_encoded.tail()

In [None]:
# Check the distinct categories in loan_intent before encoding it
loan_intent = df['loan_intent'].unique()
print(loan_intent)

In [None]:
# One-Hot Encoding for loan_intent (applied on top of the already-encoded frame)
df_encoded = pd.get_dummies(df_encoded, columns=["loan_intent"])
df_encoded.tail()

In [None]:
# Check the distinct categories in cb_person_default_on_file before encoding it
cb_person_default_on_file = df['cb_person_default_on_file'].unique()
print(cb_person_default_on_file)

In [None]:
# LABEL ENCODING FOR cb_person_default_on_file
from sklearn.preprocessing import LabelEncoder

# Replace the column's category labels with integer codes, overwriting the
# column inside df_encoded. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
df_encoded['cb_person_default_on_file'] = label_encoder.fit_transform(df_encoded['cb_person_default_on_file'])
# Display the transformed column
print(df_encoded['cb_person_default_on_file'].head())  # Show the first 5 rows of the transformed column
df_encoded.tail()



In [None]:
# Check the distinct categories in loan_grade before encoding it
loan_grade = df['loan_grade'].unique()
print(loan_grade)

In [None]:
# One-Hot Encoding for loan_grade (applied on top of the already-encoded frame)
df_encoded = pd.get_dummies(df_encoded, columns=["loan_grade"])
df_encoded.tail()

In [None]:
# (rows, columns) after all categorical encodings
df_encoded.shape

In [None]:
# Column names after all categorical encodings
df_encoded.columns

In [None]:
# correlations
# Pairwise correlations between every column of the encoded frame
correlation_matrix = df_encoded.corr()

# Wide canvas so the per-cell annotations stay readable (adjust as needed)
fig, ax = plt.subplots(figsize=(20, 8))

# Annotated heatmap drawn on the explicit axes
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)

plt.show()


In [None]:
# Columns removed after inspecting the correlation heatmap
columns_to_drop = [
    "cb_person_cred_hist_length",      # Highly correlated with person_age
    "loan_int_rate",                   # Highly correlated with loan_amnt
    "person_home_ownership_MORTGAGE",  # Strong correlation with RENT and OWN (keep only one)
]
# Considered but kept: "person_home_ownership_OTHER", "loan_intent_VENTURE" (low impact)

# `columns=` already selects the column axis, so no separate axis argument is needed
df_encoded = df_encoded.drop(columns=columns_to_drop)

# Preview the reduced frame
df_encoded.head()

In [None]:
# Recompute the correlation matrix now that the selected columns are gone
correlation_matrix = df_encoded.corr()

# Same wide canvas as the earlier heatmap, using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)

plt.show()

In [None]:
# from sklearn.preprocessing import LabelEncoder

# # Initialize the label encoder
# le = LabelEncoder()

# # Apply label encoding
# df['gender'] = le.fit_transform(df['gender'])


In [None]:
# Columns remaining after the drop; these (minus loan_status) become the features
df_encoded.columns

In [None]:
# feature splitting
# X holds every encoded column except the target, loan_status.

X = df_encoded.drop("loan_status", axis=1)
X.head()

In [None]:
# A bare expression is only rendered when it is the last line of a cell, so the
# column index is printed explicitly; the shape is then shown as the cell output.
print(X.columns)
X.shape

In [None]:
# y is the target column (loan_status) taken from the encoded frame
y = df_encoded['loan_status']
y.head()

In [None]:
# scale data
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on every row of X, before the train/test
# split performed further down, so test-row statistics influence the scaling
# (data leakage). Fitting on the training rows only would avoid that.
scaler = StandardScaler()

# Fit and transform the data
X_scaled = scaler.fit_transform(X)

# Convert scaled data back to a DataFrame, keeping the original column labels
# and row index so X stays aligned with y.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
X.head()

In [None]:
# Scatter plot of the (scaled) loan_percent_income feature against the loan_status target
fig, ax = plt.subplots()
ax.scatter(X['loan_percent_income'], y, marker='+', color='red')
ax.set_xlabel('loan_percent_income')  # Label for X-axis
ax.set_ylabel('Loan Status')  # Label for Y-axis
ax.set_title('Prediction Based on loan_percent_income')
plt.show()


In [None]:
# Train and Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. Seeded so that a fresh Restart & Run All reproduces
# the same trees (and therefore the same scores) every time.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

<h1 style="color:blue">CHECK OVERFITTING</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# Fit a seeded forest with default depth settings on the training split
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Hard labels feed the accuracy scores
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Class probabilities feed the log-loss scores
y_train_proba, y_test_proba = model.predict_proba(X_train), model.predict_proba(X_test)
train_loss = log_loss(y_train, y_train_proba)
test_loss = log_loss(y_test, y_test_proba)

# Train and test metrics side by side; a large gap between them points to overfitting
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Train Loss (Log Loss): {train_loss:.4f}")
print(f"Test Loss (Log Loss): {test_loss:.4f}")


In [None]:
# Measure accuracy of the fitted model on the held-out test split
print(model.score(X_test,y_test))

In [None]:
# prediction
# Predicted class labels for every row of the test split
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but a confusion matrix built with the arguments swapped comes out
# transposed (rows would be predictions instead of true labels).
print(accuracy_score(y_test, y_pred))
print("-----" * 10)
print(confusion_matrix(y_test, y_pred))

In [None]:
# Extract feature importance
import seaborn as sns

# Pair each importance score with its feature name, most important first
importances = model.feature_importances_
feature_importance = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)

# Horizontal bar chart on an explicit axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=feature_importance, y=feature_importance.index, ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Recompute the test predictions and build the matrix in (y_true, y_pred) order
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Plot Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')
plt.show()


In [None]:
# Feature names, in the order the model was trained on
X_train.columns

In [None]:
# Show the first training row (scaled features) and its matching target value
print(f"First Independent Record: \n {X_train.iloc[0]}")  # If X_train is a Pandas DataFrame
print(f"First Dependant (loan status) Record: \n {y_train.iloc[0]}")  # y_train is indexed by position with .iloc


In [None]:
# Predict the loan status for one hand-written applicant.
# The values are raw (unscaled) and are assumed to follow the order of
# X_train.columns — confirm against the column listing before editing them.
input_sample = [[64, 46000, 2.0, 4800, 0.1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,0]]

# The model was trained on StandardScaler output, so the raw values must go
# through the same fitted scaler first; feeding them in unscaled puts the
# sample far outside the range the trees were split on. Column names are
# attached so both the scaler and the model see the features they were fitted with.
input_frame = pd.DataFrame(input_sample, columns=X_train.columns)
input_scaled = pd.DataFrame(scaler.transform(input_frame), columns=X_train.columns)

# Predict using the trained model
prediction = model.predict(input_scaled)

# Display the result
# 1 → Loan defaulted (High Risk)
# 0 → Loan fully paid (Low Risk)
if prediction[0] == 0:
    print("Loan fully paid (Low Risk): 0")
else:
    print("Loan defaulted (High Risk): 1")




In [None]:
# Preview of the scaled feature matrix
X.head()

<h1>END </h1>

<h1>REMOVE OVERFITTING and Retrain Different Models</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import GridSearchCV

# Random forest with depth/leaf-size limits to curb overfitting
model_Random_Forest = RandomForestClassifier(
    n_estimators=100,        # number of trees
    max_depth=10,            # cap on tree depth
    min_samples_split=10,    # minimum samples needed to split a node
    min_samples_leaf=5,      # minimum samples allowed in a leaf
    max_features="sqrt",     # features considered at each split
    random_state=42,
    n_jobs=-1,               # use all CPU cores
)
model_Random_Forest.fit(X_train, y_train)

# Class probabilities → log loss (cross-entropy)
train_probs = model_Random_Forest.predict_proba(X_train)
test_probs = model_Random_Forest.predict_proba(X_test)
train_loss, test_loss = log_loss(y_train, train_probs), log_loss(y_test, test_probs)

# Hard labels → accuracy
train_labels = model_Random_Forest.predict(X_train)
test_labels = model_Random_Forest.predict(X_test)
train_accuracy = accuracy_score(y_train, train_labels)
test_accuracy = accuracy_score(y_test, test_labels)

# Print results
print(f"Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
print(f"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Measure accuracy of the regularised forest on the held-out test split
print(model_Random_Forest.score(X_test,y_test))

In [None]:
# prediction
# Predicted class labels from the regularised forest for every test row
y_pred = model_Random_Forest.predict(X_test)
y_pred

In [None]:
# Predict the loan status for one hand-written applicant with the regularised forest.
# The values are raw (unscaled) and are assumed to follow the order of
# X_train.columns — confirm against the column listing before editing them.
input_sample = [[64, 46000, 2.0, 4800, 0.1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,0]]

# The model was trained on StandardScaler output, so the raw values must go
# through the same fitted scaler first; feeding them in unscaled puts the
# sample far outside the range the trees were split on. Column names are
# attached so both the scaler and the model see the features they were fitted with.
input_frame = pd.DataFrame(input_sample, columns=X_train.columns)
input_scaled = pd.DataFrame(scaler.transform(input_frame), columns=X_train.columns)

# Predict using the trained model
prediction = model_Random_Forest.predict(input_scaled)

# Display the result
# 1 → Loan defaulted (High Risk)
# 0 → Loan fully paid (Low Risk)
if prediction[0] == 0:
    print("Loan fully paid (Low Risk): 0")
else:
    print("Loan defaulted (High Risk): 1")

In [None]:
import pickle

# Saving the model to a file
# NOTE(review): only the classifier is written here. The fitted `scaler` that
# produced its training features is not saved in this cell, so anything that
# loads this file needs the same scaling from somewhere else.
with open('credit_random_model.pkl', 'wb') as model_file:
    pickle.dump(model_Random_Forest, model_file)
print("Model saved successfully!")

In [None]:
# Feature names expected by the saved model
X_train.columns

In [None]:
# Preview of the (scaled) test features
X_test.head()

<p style="color:blue">Below is the Python code to train and evaluate multiple models (Logistic Regression, Random Forest, Naive Bayes, SVM, XGBoost, LightGBM, and CatBoost) in a loop. The code calculates and prints the accuracy and log loss for both the training and test sets for each model.</p>

<h1 style="color:blue">Different Model Training: Logistic Regression, Random Forest, Naive Bayes, SVM,XGBClassifier,
LGBMClassifier, CatBoostClassifier</h1>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, accuracy_score, classification_report
import pandas as pd

# Define models
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features="sqrt",
        random_state=42,
        n_jobs=-1
    ),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42),  # Enable probability for log loss
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1),
    "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)  # Disable training logs
}

# Train and evaluate models.
# The loop variable is named `clf` rather than `model` so this cell does not
# overwrite the notebook-level `model` trained in an earlier cell.
results = {}
for name, clf in models.items():
    print(f"\n Training {name}...")

    # Train the model
    clf.fit(X_train, y_train)

    # Predict probabilities for log loss
    train_probs = clf.predict_proba(X_train)
    test_probs = clf.predict_proba(X_test)

    # Compute log loss
    train_loss = log_loss(y_train, train_probs)
    test_loss = log_loss(y_test, test_probs)

    # Predict hard labels once per split and reuse them for every metric below
    # (prediction is costly for some of these models, e.g. the SVM)
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    # Compute accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    # Compute classification report (test split only)
    class_report = classification_report(y_test, test_pred, output_dict=True)

    # Store results; precision/recall/F1 are the support-weighted averages
    results[name] = {
        "Train Loss": train_loss,
        "Test Loss": test_loss,
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Precision": class_report["weighted avg"]["precision"],
        "Recall": class_report["weighted avg"]["recall"],
        "F1-Score": class_report["weighted avg"]["f1-score"]
    }

    # Print results
    print(f"  {name} Results:")
    print(f"  Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
    print(f"  Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"  Precision: {results[name]['Precision']:.4f}, Recall: {results[name]['Recall']:.4f}, F1-Score: {results[name]['F1-Score']:.4f}")
    print("-" * 50)

# Display results in a table (one row per model)
results_df = pd.DataFrame(results).T
print("\n Summary of Results:")
print(results_df)


<h1 style="color:blue">Prediction for each Model</h1>

In [None]:
# prediction
# Collect each fitted model's test-set predictions, keyed by model name
y_preds = {name: model.predict(X_test) for name, model in models.items()}

# Print the predictions for each model
for name, y_pred in y_preds.items():
    print(f"\nPredictions for {name}:")
    print(y_pred)


<h1 style="color:blue">Confusion Matrix for each Model</h1>

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Grid with three panels per row; adding 2 before the floor division rounds up
n_rows = (len(models) + 2) // 3
fig, axes = plt.subplots(n_rows, 3, figsize=(15, n_rows * 5))

# Work with a flat sequence of axes regardless of the grid shape
axes = axes.ravel()

# One confusion matrix per model, each drawn on its own panel
for i, (name, model) in enumerate(models.items()):
    panel = axes[i]
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(cmap='Blues', ax=panel)
    panel.set_title(f"Confusion Matrix - {name}")

# Blank out any panels left over in the last row
for spare in axes[len(models):]:
    spare.axis('off')

# Adjust layout
plt.tight_layout()
plt.show()


In [None]:
import joblib

# Pick the entry with the highest test accuracy (the first one wins on ties)
best_model_name, best_metrics = max(
    results.items(), key=lambda item: item[1]["Test Accuracy"]
)
best_model = models[best_model_name]

# File name is derived from the model's display name
model_filename = f"best_model_{best_model_name.lower().replace(' ', '_')}.pkl"
joblib.dump(best_model, model_filename)

print(f"Best model ({best_model_name}) saved successfully as {model_filename}!")
