# In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Read the credit-risk CSV into a DataFrame
file_path = '/data/credit_risk_dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Quick structural overview: first rows, column info, summary statistics.
# Of the three calls only data.info() prints by itself; the tuple is not stored.
(
    data.head(),
    data.info(),
    data.describe(),
)

# Replace missing values in each listed column with that column's median.
# The imputer is re-fitted on every iteration, so each column gets its own median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in this script.
imputer = SimpleImputer(strategy='median')
for impute_col in ('person_emp_length', 'loan_int_rate'):
    data[impute_col] = imputer.fit_transform(data[[impute_col]])

# Cap age and employment length on both sides: values below the 1st percentile
# are raised to it, values above the 99th percentile are lowered to it.
for col in ('person_age', 'person_emp_length'):
    lower_cap, upper_cap = data[col].quantile([0.01, 0.99]).to_numpy()
    data[col] = data[col].clip(lower=lower_cap, upper=upper_cap)

# Add log(1 + x) versions of the two columns below as new 'log_<name>' columns
# (the originals are kept) to reduce skewness.
for raw_col in ('person_income', 'loan_percent_income'):
    data['log_' + raw_col] = np.log1p(data[raw_col])

# Normalizing numerical features using RobustScaler to reduce the influence of outliers
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in this script.
scaler = RobustScaler()
columns_to_scale = [
    'person_age',
    'log_person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'log_loan_percent_income',
    'cb_person_cred_hist_length',
]
scaled_features = scaler.fit_transform(data[columns_to_scale])

# Output names are derived from the input names so the two lists cannot drift apart.
# Reusing data's index matters: pd.concat(axis=1) aligns rows by index label, so a
# fresh RangeIndex would misalign rows if data ever carried a non-default index.
scaled_df = pd.DataFrame(
    scaled_features,
    columns=['scaled_' + name for name in columns_to_scale],
    index=data.index,
)

# Merging scaled features back into the original dataframe
data_cleaned = pd.concat([data, scaled_df], axis=1)

# Displaying the first few rows of the cleaned and scaled dataframe
data_cleaned.head()

# Setting the aesthetic style of the plots
sns.set_style("whitegrid")

# Define a list of numerical columns for distribution plots
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Two plots per row; the row count follows from the number of columns to plot
# (ceiling division), which yields the same 4x2 grid for the seven columns above.
n_plot_cols = 2
n_plot_rows = (len(numerical_cols) + n_plot_cols - 1) // n_plot_cols
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols, figsize=(20, 20))
axes = axes.flatten()  # Flatten the axes array so it can be paired with the column list

# One histogram (with KDE overlay) per numerical column
for ax, col in zip(axes, numerical_cols):
    sns.histplot(data[col], bins=30, ax=ax, kde=True)
    ax.set_title(f'Distribution of {col}', fontsize=14)
    ax.set_xlabel('')
    ax.set_ylabel('')

# Drop every grid cell that received no plot *before* computing the layout,
# so tight_layout() only has to account for axes that are actually shown.
for unused_ax in axes[len(numerical_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Boxplot for loan interest rates to visualize outliers
plt.figure(figsize=(10, 6))
# Pass the series as x= explicitly: in current seaborn the first positional
# parameter is `data`, which would draw a vertical box and leave the x-axis
# label below describing the wrong axis.
sns.boxplot(x=data['loan_int_rate'])
plt.title('Boxplot of Loan Interest Rates')
plt.xlabel('Loan Interest Rate')
plt.show()

# Correlation analysis
# Restrict to numeric/bool columns first: since pandas 2.0, DataFrame.corr() raises
# on non-numeric columns instead of silently dropping them.
# NOTE(review): presumably the CSV contains categorical string columns - confirm.
numeric_data = data_cleaned.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_data.corr()

# Plotting the correlation matrix
plt.figure(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix of Features')
plt.show()

# Focusing on correlations with the target variable 'loan_status',
# strongest positive correlation first
correlation_with_target = correlation_matrix['loan_status'].sort_values(ascending=False)
correlation_with_target

# Model inputs are the scaled columns; the label column is 'loan_status'
features = [
    'scaled_person_age',
    'scaled_log_person_income',
    'scaled_person_emp_length',
    'scaled_loan_amnt',
    'scaled_loan_int_rate',
    'scaled_log_loan_percent_income',
    'scaled_cb_person_cred_hist_length',
]
target = 'loan_status'

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
feature_frame = data_cleaned[features]
target_series = data_cleaned[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces, as a sanity check
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Three candidate classifiers, all with default hyperparameters and a fixed seed
log_reg = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Display name -> estimator
models = {
    'Logistic Regression': log_reg,
    'Random Forest': rf_clf,
    'Gradient Boosting': gb_clf,
}

# Display name -> scoring function, each called as func(y_true, y_pred)
metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Fit every model on the training split and score its predictions on the test split
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_scores = {}
    for metric_name, metric_func in metrics.items():
        model_scores[metric_name] = metric_func(y_test, y_pred)
    results[model_name] = model_scores

# One row per model, one column per metric
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df

# Random Forest hyperparameter candidates (2 * 3 * 2 * 2 = 24 combinations).
# NOTE(review): a later search in this script covers a superset of this grid.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Exhaustive search with 5-fold cross-validation, ranked by accuracy, using all cores
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

best_params, best_score

plt.figure(figsize=(10, 8))
# The original 'person_income' and 'loan_amnt' are used directly (no log/scaling) for readability
ax = sns.scatterplot(data=data_cleaned, x='person_income', y='loan_amnt', hue='loan_status', style='loan_status', alpha=0.6)
plt.title('Loan Amount vs. Person Income by Loan Status')
plt.xlabel('Person Income')
plt.ylabel('Loan Amount')

# Relabel the legend entries seaborn already built instead of calling
# plt.legend(labels=[...]): without explicit handles matplotlib pairs the labels
# with the artists on the axes by order, which drops seaborn's hue/style handles.
# NOTE(review): assumes loan_status is coded 0 = not defaulted, 1 = defaulted
# (the order implied by the original label list) - confirm.
status_names = {'0': 'Not Defaulted', '1': 'Defaulted'}
handles, labels = ax.get_legend_handles_labels()
ax.legend(
    handles=handles,
    labels=[status_names.get(label, label) for label in labels],  # unknown labels stay as-is
    title='Loan Status',
)
plt.show()

# Hyperparameter tuning for the Random Forest
# Candidate values (3 * 3 * 2 * 2 = 36 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Base estimator with a fixed seed
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated exhaustive search, ranked by accuracy, on all cores;
# verbose=2 makes scikit-learn log progress while fitting
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score:.4f}")

# Per-combination cross-validation results of the grid search.
# Bound to its own name so the earlier model-comparison `results` dict is not overwritten.
cv_results = grid_search.cv_results_
mean_test_scores = cv_results['mean_test_score']
n_estimators = [param['n_estimators'] for param in cv_results['params']]

# Every n_estimators value occurs once per combination of the other hyperparameters,
# so plotting the raw pairs as one line zig-zags between repeated x values.
# Reduce to a single point per tree count: the best mean CV accuracy reached with it.
best_score_by_trees = (
    pd.DataFrame({'n_estimators': n_estimators, 'mean_test_score': mean_test_scores})
    .groupby('n_estimators')['mean_test_score']
    .max()
)

plt.figure(figsize=(8, 6))
plt.plot(best_score_by_trees.index, best_score_by_trees.values, marker='o')
plt.xlabel('Number of Trees (n_estimators)')
plt.ylabel('Accuracy')
plt.title('Model Performance by Number of Trees')
plt.show()


# Notebook output: FileNotFoundError: [Errno 2] File /path/to/your/credit_risk_dataset.csv does not exist: '/path/to/your/credit_risk_dataset.csv'