In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Seed NumPy's global RNG so every run draws the same synthetic portfolio.
np.random.seed(42)

# Number of simulated businesses (one row each).
num_samples = 1_000

# Explanatory variables. The draws below consume the global RNG stream in
# this exact order, so reordering them would change the generated data.
revenue = np.random.randint(100_000, 10_000_000, size=num_samples)  # Annual revenue
debt_to_equity = np.random.uniform(0.0, 3.0, size=num_samples)  # Debt-to-equity ratio
current_ratio = np.random.uniform(0.5, 3.0, size=num_samples)  # Current ratio
years_in_business = np.random.randint(1, 50, size=num_samples)  # Years in business

# Default probability is a logistic function of the features:
#     prob_default = 1 / (1 + exp(neg_log_odds))
# so a *smaller* neg_log_odds means a *higher* default probability.
# As written, higher debt-to-equity raises the probability, while a higher
# current ratio and more years in business lower it.
# NOTE(review): the revenue term is subtracted here, which makes higher
# revenue *raise* the default probability -- confirm this sign is intended.
neg_log_odds = (
    -0.5 * (debt_to_equity - 1.5)
    + 0.5 * (current_ratio - 1.5)
    - 5e-8 * revenue
    + 0.05 * (years_in_business - 10)
)
prob_default = 1 / (1 + np.exp(neg_log_odds))

# One Bernoulli draw per business using its own default probability.
default_status = np.random.binomial(1, prob_default, size=num_samples)

# Assemble features and target into a single table (column order preserved).
simulated_data = pd.DataFrame(
    dict(
        revenue=revenue,
        debt_to_equity=debt_to_equity,
        current_ratio=current_ratio,
        years_in_business=years_in_business,
        default_status=default_status,
    )
)

# Separate the binary target from the predictor columns.
y = simulated_data['default_status']
X = simulated_data.drop(columns=['default_status'])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test rows do not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training rows; random_state is
# fixed so the fitted forest is repeatable. (fit returns the estimator itself.)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predicted class labels for the held-out rows.
y_pred = rf_model.predict(X_test_scaled)

# Score the held-out predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Pair each predictor name with the forest's importance score and list the
# most important feature first.
feature_importance = (
    pd.DataFrame(
        {
            'feature': X.columns,
            'importance': rf_model.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

# Report: headline accuracy first, then each titled section in turn.
print("Model Accuracy:", accuracy)
for heading, payload in (
    ("\nConfusion Matrix:", conf_matrix),
    ("\nClassification Report:", class_report),
    ("\nFeature Importance:", feature_importance),
):
    print(heading)
    print(payload)

# Calculate probability of default for a new business.
# Values are listed in the same order as the training columns:
# Revenue, Debt-to-Equity, Current Ratio, Years in Business.
new_business = np.array([[5000000, 1.2, 1.8, 15]])

# The scaler was fitted on a DataFrame, so it remembers the training column
# names. Passing a bare ndarray makes scikit-learn (>= 1.0) warn that
# "X does not have valid feature names" and skips the column-name check.
# Wrapping the row in a DataFrame with the training columns avoids the warning
# and lets scikit-learn verify the feature alignment.
new_business_frame = pd.DataFrame(new_business, columns=X.columns)
new_business_scaled = scaler.transform(new_business_frame)

# Row 0 is the single business being scored. Column 1 of predict_proba is
# the second entry of rf_model.classes_, i.e. default_status == 1 provided
# both classes were present in y_train.
pd_new_business = rf_model.predict_proba(new_business_scaled)[0, 1]

print("\nProbability of Default for New Business:", pd_new_business)

Model Accuracy: 0.675

Confusion Matrix:
[[102  26]
 [ 39  33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.80      0.76       128
           1       0.56      0.46      0.50        72

    accuracy                           0.68       200
   macro avg       0.64      0.63      0.63       200
weighted avg       0.66      0.68      0.67       200


Feature Importance:
             feature  importance
3  years_in_business    0.262947
2      current_ratio    0.260584
1     debt_to_equity    0.253976
0            revenue    0.222494

Probability of Default for New Business: 0.48


