In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load dataset
data = pd.read_csv("credit_risk_dataset.csv")  # Replace with your dataset path

# Handle missing values
numerical_cols = data.select_dtypes(include=[np.number]).columns
categorical_cols = data.select_dtypes(exclude=[np.number]).columns

# Fill numeric columns with their mean
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Fill categorical columns with their mode
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Encode categorical variables
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
label_encoder = LabelEncoder()
for col in categorical_features:
    data[col] = label_encoder.fit_transform(data[col])

# Feature scaling for numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[['person_age', 'person_income', 'person_emp_length',
                                             'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                                             'cb_person_cred_hist_length']])
data[['person_age', 'person_income', 'person_emp_length',
      'loan_amnt', 'loan_int_rate', 'loan_percent_income',
      'cb_person_cred_hist_length']] = scaled_features

# Split dataset into features and target
X = data.drop(columns=['loan_status'])  # Keep only features
y = data['loan_status']  # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using Random Forest
selector = RandomForestClassifier(random_state=42)
selector.fit(X_train, y_train)

# Get feature importances
importances = selector.feature_importances_
feature_names = X.columns
important_features = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Select top features
threshold = 0.02  # Define a threshold for feature selection
selected_features = important_features[important_features['Importance'] > threshold]['Feature'].values

# Filter the dataset with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Display selected features
print("Selected Features:")
print(selected_features)

# (Optional) Display feature importance
print("\nFeature Importances:")
print(important_features)

Selected Features:
['loan_percent_income' 'person_income' 'loan_int_rate' 'loan_grade'
 'person_home_ownership' 'loan_amnt' 'loan_intent' 'person_emp_length'
 'person_age' 'cb_person_cred_hist_length']

Feature Importances:
                       Feature  Importance
8          loan_percent_income    0.224182
1                person_income    0.150195
7                loan_int_rate    0.115602
5                   loan_grade    0.107670
2        person_home_ownership    0.100832
6                    loan_amnt    0.075462
4                  loan_intent    0.072895
3            person_emp_length    0.064132
0                   person_age    0.043297
10  cb_person_cred_hist_length    0.035042
9    cb_person_default_on_file    0.010690


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
data = pd.read_csv("credit_risk_dataset.csv")  # Replace with your dataset path

# Handle missing values
numerical_cols = data.select_dtypes(include=[np.number]).columns
categorical_cols = data.select_dtypes(exclude=[np.number]).columns

# Fill numeric columns with their mean
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())

# Fill categorical columns with their mode
for col in categorical_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Encode categorical variables
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
label_encoder = LabelEncoder()
for col in categorical_features:
    data[col] = label_encoder.fit_transform(data[col])

# Feature scaling for numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[['person_age', 'person_income', 'person_emp_length',
                                             'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                                             'cb_person_cred_hist_length']])
data[['person_age', 'person_income', 'person_emp_length',
      'loan_amnt', 'loan_int_rate', 'loan_percent_income',
      'cb_person_cred_hist_length']] = scaled_features

# Split dataset into features and target
X = data.drop(columns=['loan_status'])  # Keep only features
y = data['loan_status']  # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using Random Forest
selector = RandomForestClassifier(random_state=42)
selector.fit(X_train, y_train)

# Get feature importances
importances = selector.feature_importances_
feature_names = X.columns
important_features = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)

# Select top features
threshold = 0.02  # Define a threshold for feature selection
selected_features = important_features[important_features['Importance'] > threshold]['Feature'].values

# Filter the dataset with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Display selected features
print("Selected Features:")
print(selected_features)

# (Optional) Display feature importance
print("\nFeature Importances:")
print(important_features)

# ============= Model Training and Evaluation Section ============= #

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_selected, y_train)

# Predictions
y_pred = model.predict(X_test_selected)

# Evaluate the model
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Selected Features:
['loan_percent_income' 'person_income' 'loan_int_rate' 'loan_grade'
 'person_home_ownership' 'loan_amnt' 'loan_intent' 'person_emp_length'
 'person_age' 'cb_person_cred_hist_length']

Feature Importances:
                       Feature  Importance
8          loan_percent_income    0.224182
1                person_income    0.150195
7                loan_int_rate    0.115602
5                   loan_grade    0.107670
2        person_home_ownership    0.100832
6                    loan_amnt    0.075462
4                  loan_intent    0.072895
3            person_emp_length    0.064132
0                   person_age    0.043297
10  cb_person_cred_hist_length    0.035042
9    cb_person_default_on_file    0.010690

Model Evaluation:
Accuracy: 0.9286481509897192

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96      5072
           1       0.96      0.71      0.81      1445

    accuracy           