# In [1]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
# The whole CSV is read into a single DataFrame that every later step works on.
# NOTE(review): the path looks like a placeholder -- point it at the real file.
dataset_path = '/path/credit_risk_dataset.csv'
data = pd.read_csv(dataset_path)

# Data Cleaning and Preprocessing
# NOTE(review): the imputer and scaler below are fitted on the full dataset,
# i.e. before the train/test split performed later in this script, so held-out
# rows contribute to the medians and quantiles. Confirm this is acceptable, or
# move the fitting after the split and fit on the training rows only.

# Impute missing values
# SimpleImputer computes its statistic per column, so a single call over both
# columns fills each one with its own median.
columns_to_impute = ['person_emp_length', 'loan_int_rate']
imputer = SimpleImputer(strategy='median')
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

# Log transformation for skewed features
# log1p (log(1 + x)) is used so that zero values stay finite.
data['log_person_income'] = np.log1p(data['person_income'])
data['log_loan_percent_income'] = np.log1p(data['loan_percent_income'])

# Scaling numerical features
scaler = RobustScaler()
features_to_scale = [
    'person_age',
    'log_person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'log_loan_percent_income',
    'cb_person_cred_hist_length',
]
# fit_transform returns a bare array; rebuild the frame with the ORIGINAL row
# index so the concat below (which aligns on index) pairs every scaled row with
# its own unscaled columns even if `data` does not carry a default RangeIndex.
data_scaled = pd.DataFrame(
    scaler.fit_transform(data[features_to_scale]),
    columns=features_to_scale,
    index=data.index,
)

# Combine scaled data with the columns that were not scaled
# ('loan_status' is kept here because it is split off as the target later).
unscaled_columns = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'loan_status',
    'cb_person_default_on_file',
]
data_cleaned = pd.concat([data_scaled, data[unscaled_columns]], axis=1)

# Exploratory Data Analysis (Visualizations are described but not executed here)
# For visualization, you would normally use sns.histplot or sns.boxplot here

# Model Training
# Separate the predictors from the 'loan_status' target and hold out 20% of the
# rows for testing; the fixed random_state makes the split reproducible.
X = data_cleaned.drop('loan_status', axis=1)
y = data_cleaned['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handling categorical variables (one-hot encoding)
# Each split is encoded on its own, so a category that is absent from one split
# would otherwise yield a different set of dummy columns there. Reindexing the
# test frame to the training columns guarantees the same features in the same
# order at fit and predict time: dummies missing from the test split are added
# as all-zero columns, and dummies never seen in training are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Random Forest Classifier
# Baseline model: default hyperparameters apart from the tree count and seed.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Model Evaluation
# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Confusion Matrix
# Draw the counts as an annotated heatmap and label it through the Axes that
# seaborn returns.
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(cm, annot=True, fmt="d")
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_ylabel('Actual label')
heatmap_ax.set_xlabel('Predicted label')
plt.show()

# Hyperparameter Tuning (Example with RandomForestClassifier)
# Exhaustive grid of 2 * 3 * 2 * 2 = 24 candidate settings, each scored by
# 5-fold cross-validated accuracy on the training split, using all CPU cores.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and model performance
# best_score_ is the mean cross-validated score of the winning combination.
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print("Best Parameters:", best_params)
print(f"Best Score: {best_cv_score:.4f}")

# NOTE: the plotting code in this script is illustrative; running it requires a Python environment with matplotlib and seaborn installed.

# Recorded output: FileNotFoundError: [Errno 2] File /path/to/your/credit_risk_dataset.csv does not exist: '/path/to/your/credit_risk_dataset.csv'