<a href="https://colab.research.google.com/github/btabi/btabi.com/blob/main/Loan_Borrower_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.model_selection import train_test_split, cross_val_score

# Read the borrower CSV (path is relative to the notebook's working directory)
# into `data` and print a plain-text preview of its first five rows.
data = pd.read_csv('loan_borowwer_data.csv')
print(data.head(n=5))

In [None]:
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so `df_info = data.info()` always left `df_info` as None.
# Route the report into an in-memory buffer so `df_info` holds the summary
# text, then echo it once so the cell still shows the report.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
df_info = _info_buffer.getvalue()
print(df_info, end='')  # the captured text already ends with a newline

In [None]:
# Column dtypes / non-null counts, followed by summary statistics.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
data.info()
print(data.describe())

In [None]:
# Count missing values per column (isna is the same check as isnull).
print(data.isna().sum())

In [None]:
# Bar chart of the number of rows in each credit.policy class.
policy_ax = sns.countplot(data=data, x='credit.policy')
policy_ax.set_title('Distribution of credit.policy')
plt.show()


In [None]:
# One-hot encode the categorical `purpose` column, dropping its first level.
# get_dummies consumes `purpose`, so a second run of this cell used to raise a
# KeyError; guarding on the column's presence makes the cell safe to re-run.
if 'purpose' in data.columns:
    data = pd.get_dummies(data, columns=['purpose'], drop_first=True)

In [None]:
# Show the first five rows of `data` again to inspect its current columns.
print(data.head(n=5))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV a second time into `df`, a frame separate from `data`.
df = pd.read_csv('loan_borowwer_data.csv')

# Columns whose distributions are inspected in this section.
important_variables = ['fico', 'log.annual.inc', 'int.rate', 'dti', 'revol.util']

# One 30-bin histogram with a KDE overlay per column, placed on a 3x2 grid.
plt.figure(figsize=(15, 10))
for slot, column in enumerate(important_variables, start=1):
    panel = plt.subplot(3, 2, slot)
    sns.histplot(df[column], kde=True, color='blue', bins=30, ax=panel)
    panel.set_title(f'Distribution of {column}')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# For each key variable, overlay the histogram of rows with credit.policy == 1
# on the histogram of rows with credit.policy == 0.
# NOTE(review): the legend and title treat credit.policy as the loan outcome
# (repaid vs. defaulted); nothing in this notebook establishes that meaning —
# confirm against the dataset's data dictionary before relying on the labels.
policy_one_rows = df[df['credit.policy'] == 1]
policy_zero_rows = df[df['credit.policy'] == 0]

plt.figure(figsize=(15, 10))
for slot, column in enumerate(important_variables, start=1):
    panel = plt.subplot(3, 2, slot)
    sns.histplot(policy_one_rows[column], kde=True, color='green', label='Repaid', bins=30, ax=panel)
    sns.histplot(policy_zero_rows[column], kde=True, color='red', label='Defaulted', bins=30, ax=panel)
    panel.set_title(f'Distribution of {column} by Loan Outcome')
    panel.set_xlabel(column)
    panel.set_ylabel('Frequency')
    panel.legend()
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) restricted to the
# columns listed in `important_variables`.
print(df.loc[:, important_variables].describe())

In [None]:
# Target is `credit.policy`; every other column of `data` is used as a feature.
y = data['credit.policy']
X = data.drop(columns='credit.policy')

# 70/30 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Baseline logistic regression with the iteration cap raised to 1000.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
# Hard class predictions of the baseline model on the held-out rows.
y_pred = logreg.predict(X_test)

# Print each headline metric through one loop; labels and order are unchanged.
for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 table for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
df = pd.read_csv('loan_borowwer_data.csv')
from sklearn.preprocessing import StandardScaler

# Columns to standardise, listed once and reused below.
columns_to_scale = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util']

# Fit the scaler on the selected columns and transform them in one step.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[columns_to_scale])

# Wrap the returned array in a frame aligned to df's index, then write the
# scaled values back over the original columns.
# NOTE(review): X_train / X_test are built from `data`, not `df`, so this
# scaling does not reach the models fitted in this notebook — confirm intent.
scaled_df = pd.DataFrame(scaled_data, columns=columns_to_scale, index=df.index)
df[columns_to_scale] = scaled_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric columns to draw; one box plot each on a 3x3 grid.
numerical_features = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util']

plt.figure(figsize=(15, 10))
for slot, column in enumerate(numerical_features, start=1):
    panel = plt.subplot(3, 3, slot)
    sns.boxplot(y=df[column], ax=panel)
    panel.set_title(f'Box Plot of {column}')
    panel.set_ylabel(column)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Logistic Regression
# max_iter=1000 matches the earlier `logreg` fit; with the default cap the
# solver may stop before converging on these unscaled features.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# Probability of the positive class (second column), needed for ROC-AUC.
y_score_lr = lr.predict_proba(X_test)[:, 1]

# Random Forest
# Seeded like the train/test split so metrics and importances are repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_score_rf = rf.predict_proba(X_test)[:, 1]

# Evaluate models
def evaluate_model(y_true, y_pred, y_score=None):
    """Print accuracy, precision, recall, F1 and ROC-AUC for one classifier.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    y_score : array-like, optional
        Continuous scores for the positive class (e.g. the second column of
        ``predict_proba``). ROC-AUC ranks samples by score, so it should be
        computed from these. When omitted, ``y_pred`` is used instead, which
        reproduces the previous label-based behaviour.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    print(f'Precision: {precision_score(y_true, y_pred)}')
    print(f'Recall: {recall_score(y_true, y_pred)}')
    print(f'F1-Score: {f1_score(y_true, y_pred)}')
    # Hard labels collapse the ROC curve to a single operating point; prefer
    # the continuous scores whenever the caller supplies them.
    auc_input = y_pred if y_score is None else y_score
    print(f'ROC-AUC: {roc_auc_score(y_true, auc_input)}')

print("Logistic Regression:")
evaluate_model(y_test, y_pred_lr, y_score=y_score_lr)

print("Random Forest:")
evaluate_model(y_test, y_pred_rf, y_score=y_score_rf)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC-AUC of the random forest over the full X / y.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, scoring='roc_auc', cv=5)
print(f'Cross-Validation ROC-AUC Scores: {cv_scores}')
print(f'Mean ROC-AUC: {cv_scores.mean()}')


In [None]:
# Pair each feature column with the fitted forest's importance score and print
# the table ordered from most to least important.
feature_names = X.columns
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
ranked_importances = feature_importance_df.sort_values(by='Importance', ascending=False)
print(ranked_importances)

In [None]:
# Derive the headline predictors from the importance table instead of printing
# a fixed sentence: the models in this notebook are trained on `credit.policy`,
# and the previous hard-coded text was not tied to any computed result.
top_predictors = (
    feature_importance_df
    .sort_values(by='Importance', ascending=False)
    .head(3)['Feature']
    .tolist()
)
print(
    "Key predictors of credit.policy (by random forest importance): "
    + ", ".join(map(str, top_predictors))
    + "."
)

## Ethical Considerations

In [None]:
from sklearn.metrics import confusion_matrix

# Raw confusion-matrix counts for the random forest's held-out predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
print(conf_matrix)

## Scalability and Future Work

In [None]:
from sklearn.pipeline import Pipeline

# Chain feature scaling and the classifier so both are fitted on the training
# split only. The forest is seeded with the same value as the train/test split;
# an unseeded forest yields a different model on every run.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

In [None]:
# Closing caveat, printed verbatim.
limitations_note = "Limitations include potential overfitting and the need for more diverse data."
print(limitations_note)