# Bankruptcy Prediction Project
## 1. Data loading and initial exploration

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
df = pd.read_csv("./Bankruptcy_data_Final.csv", delimiter=';')

In [None]:
print(f"number of row: {df.shape[0]}")
print(f"number of columns: {df.shape[1]}")

In [None]:
df.head(10)

In [None]:
df.describe()

In [None]:
missing_values = df.isnull().sum().sort_values(ascending=False)

print("number of missing values per column")
missing_values

In [None]:
missing_percentage = (missing_values / len(df)) * 100
print("percentage of missing values per column")
print(missing_percentage.round(2))

In [None]:
print("BK distribution - counts")
df["BK"].value_counts()

In [None]:
print("BK distribution - percentages")
(df["BK"].value_counts(normalize=True) * 100).round(4)

In [None]:
plt.figure(figsize=(6, 4))
sns.countplot(x="BK", data=df)
plt.title("Distribution of the target variable (BK)")
plt.xlabel("0 = Non-bankrupt (0) | Bankrupt (1)")
plt.ylabel("Number of companies")
plt.show()

In [None]:
print("Data types before conversion:")
df.dtypes

# 2. Data Preprocessing

### Data types

In [None]:
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].str.replace(",", ".", regex=False)

        try:
            df[col] = df[col].astype(float)
        except ValueError:
            print(f"Column '{col}' could not be converted to float and was kept as object.")

In [None]:
df.dtypes

### Duplicates

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
# Columns stored as 64-bit floats or integers; reused by the plotting cells below.
numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns


def _count_iqr_outliers(series):
    """Count values lying more than 1.5 * IQR below Q1 or above Q3 of *series*."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = q3 - q1
    too_low = series < q1 - 1.5 * spread
    too_high = series > q3 + 1.5 * spread
    return (too_low | too_high).sum()


# Map each numeric column name to its number of IQR outliers.
outlier_summary = {name: _count_iqr_outliers(df[name]) for name in numeric_cols}

print("number of detected outliers per numerical variable")
outlier_summary

### Missing values

In [None]:
df = df.fillna(df.median(numeric_only=True))

In [None]:
print("remaining missing values after imputation")
df.isnull().sum().sort_values(ascending=False).head(10)

In [None]:
# Set the seaborn style before the figure exists: style settings are picked up
# when axes are created, so setting them after drawing does not restyle this plot.
sns.set_style("white")
plt.figure(figsize=(7, 6))
# Correlate numeric columns only. The conversion step may leave text columns
# in df, and recent pandas versions raise on them instead of skipping them.
corr_matrix = df.select_dtypes(include="number").corr(method="pearson")
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap of Numerical Features")
plt.show()

### Distribution of Numerical Variables

In [None]:
# Shared x-axis window for every histogram; the title is built from the same
# values so the two cannot drift apart.
x_min, x_max = -10, 10

# One density-normalised histogram per numeric column.
axes = df[numeric_cols].hist(
    figsize=(16, 14),
    bins=50,
    density=True,
    edgecolor="black"
)

# Restrict each panel to the shared window so extreme values do not flatten the bulk.
for ax in axes.flatten():
    ax.set_xlim(x_min, x_max)

plt.suptitle(f"Distribution of Numerical Features (Density, clipped to [{x_min}, {x_max}])")
plt.show()

### Box plots to visualise outliers

In [None]:
# Horizontal box plot with one box per numeric column, all on a single axes.
fig, box_ax = plt.subplots(figsize=(16, 10))
sns.boxplot(data=df[numeric_cols], orient="h", ax=box_ax)
box_ax.set_title("Boxplot of Numerical Features")
plt.show()

### Relationship Between Features and BK

In [None]:
# numeric_cols can contain the target itself; drop it so that no panel plots
# BK against BK, then keep at most six features for the 2 x 3 grid.
feature_cols = [name for name in numeric_cols if name != "BK"][:6]

plt.figure(figsize=(14, 8))
for position, col in enumerate(feature_cols, start=1):
    plt.subplot(2, 3, position)
    # One box per target value, showing how the feature is distributed in each class.
    sns.boxplot(x=df["BK"], y=df[col])
    plt.title(f"{col} vs BK")

plt.tight_layout()
plt.show()

# 4. Train & Test Split, Scaling and Class Imbalance Handling

In [None]:
X = df.drop(columns=["BK"])
y = df["BK"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32, stratify=y
)

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Oversample the scaled training set; the test set is not resampled.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train_scaled, y_train)
X_train_res, y_train_res = resampled

# Class counts before and after resampling.
for label, target in (("before SMOTE :", y_train), ("after SMOTE :", y_train_res)):
    print(label, target.value_counts())

# 5. Model Training

### Logistic Regression

In [None]:
log_reg = LogisticRegression(
    max_iter=500,
    class_weight="balanced",
    random_state=42
)
log_reg.fit(X_train_res, y_train_res)

In [None]:
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_lr))

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_prob_lr)

In [None]:
confusion_matrix(y_test, y_pred_lr)

### Random Forest

In [None]:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42
)
rf.fit(X_train_res, y_train_res)


In [None]:
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]

In [None]:
print(classification_report(y_test, y_pred_rf))

In [None]:
roc_auc_score(y_test, y_prob_rf)

### SVM

In [None]:

from sklearn.svm import SVC

svm_model = SVC(
    kernel="rbf",
    probability=True,
    class_weight="balanced",
    C=1,
    gamma="scale",
    random_state=42
)
svm_model.fit(X_train_res, y_train_res)


In [None]:
y_pred_svm = svm_model.predict(X_test_scaled)
y_prob_svm = svm_model.predict_proba(X_test_scaled)[:, 1]

In [None]:
print(classification_report(y_test, y_pred_svm))

In [None]:
roc_auc_score(y_test, y_prob_svm)

In [None]:
confusion_matrix(y_test, y_pred_svm)

# 6. Model Evaluation, ROC Curves & Hyperparameter Tuning

### ROC Curves

In [None]:
# Create one axes and draw every curve on it. Without an explicit `ax`,
# RocCurveDisplay.from_predictions opens a new figure on each call, so the
# curves would land in separate plots and the sized figure would stay empty.
fig, roc_ax = plt.subplots(figsize=(10, 8))

# Display name -> (true labels, predicted second-class probabilities).
models = {
    "Logistic Regression": (y_test, y_prob_lr),
    "Random Forest": (y_test, y_prob_rf),
    "SVM": (y_test, y_prob_svm)
}

for name, (yt, yp) in models.items():
    RocCurveDisplay.from_predictions(yt, yp, name=name, ax=roc_ax)

# Diagonal reference line (true-positive rate equal to false-positive rate).
roc_ax.plot([0, 1], [0, 1], "k--")
plt.show()

### Extended Metrics Table

In [None]:
def compute_metrics(y_true, y_pred, y_prob):
    """Return the headline classification scores for one model as a dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard labels, compared against ``y_true``.
        y_prob: Predicted scores/probabilities, used only for the ROC AUC.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall", "F1-score"
        and "ROC AUC", in that order.
    """
    scores = {"Accuracy": accuracy_score(y_true, y_pred)}

    # These three share the same call shape; zero_division=0 is passed to each.
    label_based = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
    for label, metric in label_based:
        scores[label] = metric(y_true, y_pred, zero_division=0)

    scores["ROC AUC"] = roc_auc_score(y_true, y_prob)
    return scores

In [None]:
# Row label -> (hard predictions, second-class probabilities) of each baseline model.
baseline_predictions = {
    "Logistic Regression": (y_pred_lr, y_prob_lr),
    "Random Forest": (y_pred_rf, y_prob_rf),
    "SVM": (y_pred_svm, y_prob_svm),
}

# One row of scores per model, in the order listed above.
metrics_table = pd.DataFrame(
    [compute_metrics(y_test, pred, prob) for pred, prob in baseline_predictions.values()],
    index=list(baseline_predictions),
)

metrics_table

### Hyperparameter Tuning – Random Forest

In [None]:
N_ITER = 5  # nombre de param test
N_CV = 3  # nombre de folds
N_JOB = 4  # nombre de cœurs que le pc va utiliser

In [None]:
rf_params = {
    "n_estimators": [200, 300, 500, 800],
    "max_depth": [None, 10, 20, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None]
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

rf_random = RandomizedSearchCV(
    estimator=RandomForestClassifier(
        class_weight="balanced",
        random_state=42
    ),
    param_distributions=rf_params,
    n_iter=N_ITER,
    cv=N_CV,
    scoring="roc_auc",
    random_state=42,
    n_jobs=N_JOB
)

rf_random.fit(X_train_res, y_train_res)
rf_best_params = rf_random.best_params_

In [None]:
print("best parameters for Random Forest = ", rf_best_params)

### Hyperparameter Tuning – SVM

In [None]:
# Candidate values for the SVM search, keyed by constructor argument.
svm_params = dict(
    C=[0.1, 1, 10, 50],
    gamma=["scale", 0.1, 0.01, 0.001],
    kernel=["rbf"],
)

In [None]:
svm_random = RandomizedSearchCV(
    estimator=SVC(
        class_weight="balanced",
        probability=True,
        random_state=42
    ),
    param_distributions=svm_params,
    n_iter=3,
    cv=2,
    scoring="roc_auc",
    random_state=42,
    n_jobs=N_JOB
)

svm_random.fit(X_train_res, y_train_res)

svm_best_params = svm_random.best_params_

In [None]:
print("best parameters for SVM = ", svm_best_params)

### Retraining Models with Optimal Hyperparameters

In [None]:
rf_optimized = RandomForestClassifier(
    **rf_best_params,
    class_weight="balanced",
    random_state=42
)

rf_optimized.fit(X_train_res, y_train_res)

In [None]:
y_pred_rf_opt = rf_optimized.predict(X_test_scaled)
y_prob_rf_opt = rf_optimized.predict_proba(X_test_scaled)[:, 1]

In [None]:
print("Optimized Random Forest")
print(classification_report(y_test, y_pred_rf_opt))
print("ROC AUC:", roc_auc_score(y_test, y_prob_rf_opt))

In [None]:
svm_optimized = SVC(
    **svm_best_params,
    probability=True,
    class_weight="balanced",
    random_state=42
)

svm_optimized.fit(X_train_res, y_train_res)

In [None]:
y_pred_svm_opt = svm_optimized.predict(X_test_scaled)
y_prob_svm_opt = svm_optimized.predict_proba(X_test_scaled)[:, 1]

In [None]:
print("Optimized SVM")
print(classification_report(y_test, y_pred_svm_opt))
print("ROC AUC:", roc_auc_score(y_test, y_prob_svm_opt))

# 7. Final Comparison & Conclusion

In [None]:
optimized_metrics_table = pd.DataFrame([
    compute_metrics(y_test, y_pred_rf_opt, y_prob_rf_opt),
    compute_metrics(y_test, y_pred_svm_opt, y_prob_svm_opt),
], index=[
    "Random Forest (Optimized)",
    "SVM (Optimized)"
])

In [None]:
optimized_metrics_table

In [None]:
best_model_name = optimized_metrics_table["ROC AUC"].idxmax()
best_model_score = optimized_metrics_table.loc[best_model_name, "ROC AUC"]
print(f"Best optimized model: {best_model_name}")
print(f"ROC AUC score: {best_model_score:.4f}")