In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [24]:
df = pd.read_csv(r"C:\Users\ADITYA\Downloads\Surgical_deepnet.csv")
df.head()

Unnamed: 0,bmi,Age,asa_status,baseline_cancer,baseline_charlson,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,...,complication_rsi,dow,gender,hour,month,moonphase,mort30,mortality_rsi,race,complication
0,19.31,59.2,1,1,0,0,0,0,0,0,...,-0.57,3,0,7.63,6,1,0,-0.43,1,0
1,18.73,59.1,0,0,0,0,0,0,0,0,...,0.21,0,0,12.93,0,1,0,-0.41,1,0
2,21.85,59.0,0,0,0,0,0,0,0,0,...,0.0,2,0,7.68,5,3,0,0.08,1,0
3,18.49,59.0,1,0,1,0,0,1,1,0,...,-0.65,2,1,7.58,4,3,0,-0.32,1,0
4,19.7,59.0,1,0,0,0,0,0,0,0,...,0.0,0,0,7.88,11,0,0,0.0,1,0


In [25]:
df.shape

(14635, 25)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14635 entries, 0 to 14634
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   bmi                  14635 non-null  float64
 1   Age                  14635 non-null  float64
 2   asa_status           14635 non-null  int64  
 3   baseline_cancer      14635 non-null  int64  
 4   baseline_charlson    14635 non-null  int64  
 5   baseline_cvd         14635 non-null  int64  
 6   baseline_dementia    14635 non-null  int64  
 7   baseline_diabetes    14635 non-null  int64  
 8   baseline_digestive   14635 non-null  int64  
 9   baseline_osteoart    14635 non-null  int64  
 10  baseline_psych       14635 non-null  int64  
 11  baseline_pulmonary   14635 non-null  int64  
 12  ahrq_ccs             14635 non-null  int64  
 13  ccsComplicationRate  14635 non-null  float64
 14  ccsMort30Rate        14635 non-null  float64
 15  complication_rsi     14635 non-null 

In [51]:
df.describe()

Unnamed: 0,bmi,Age,asa_status,baseline_cancer,baseline_charlson,baseline_cvd,baseline_dementia,baseline_diabetes,baseline_digestive,baseline_osteoart,...,complication_rsi,dow,gender,hour,month,moonphase,mort30,mortality_rsi,race,complication
count,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,...,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0,14635.0
mean,31.295642,63.205268,0.63232,0.262316,0.97752,0.620294,0.004851,0.120875,0.189546,0.34274,...,-0.699044,1.60697,0.54889,10.171613,5.915408,1.187086,0.003963,-0.836712,0.91944,0.252135
std,8.152709,18.088191,0.539952,0.439909,1.758355,0.48533,0.069485,0.325993,0.391955,0.474642,...,1.339394,1.497738,0.497621,2.659881,3.239825,1.158357,0.06283,1.194111,0.364663,0.434253
min,2.15,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-4.72,0.0,0.0,6.07,0.0,0.0,0.0,-3.82,0.0,0.0
25%,26.51,51.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.97,0.0,0.0,7.82,3.0,0.0,0.0,-2.25,1.0,0.0
50%,28.98,59.7,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.58,1.0,1.0,9.12,7.0,1.0,0.0,-0.64,1.0,0.0
75%,35.295,74.7,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,...,0.0,3.0,1.0,12.05,8.0,2.0,0.0,0.0,1.0,1.0
max,92.59,90.0,2.0,1.0,13.0,1.0,1.0,1.0,1.0,1.0,...,12.56,4.0,1.0,18.92,11.0,3.0,1.0,4.4,2.0,1.0


In [52]:
df.isnull().sum()

bmi                    0
Age                    0
asa_status             0
baseline_cancer        0
baseline_charlson      0
baseline_cvd           0
baseline_dementia      0
baseline_diabetes      0
baseline_digestive     0
baseline_osteoart      0
baseline_psych         0
baseline_pulmonary     0
ahrq_ccs               0
ccsComplicationRate    0
ccsMort30Rate          0
complication_rsi       0
dow                    0
gender                 0
hour                   0
month                  0
moonphase              0
mort30                 0
mortality_rsi          0
race                   0
complication           0
dtype: int64

In [53]:
df.duplicated().sum()

2902

In [54]:
df.fillna(df.median(numeric_only=True), inplace=True)

In [55]:
df_encoded = pd.get_dummies(df, drop_first=True)



In [56]:
X = df_encoded.drop("mort30", axis=1)
y = df_encoded["mort30"]


In [57]:

y.value_counts()
y.value_counts(normalize=True) * 100

mort30
0    99.60369
1     0.39631
Name: proportion, dtype: float64

In [68]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(
    sampling_strategy='auto',
    random_state=42
)

X_train_smote, y_train_smote = smote.fit_resample(
    X_train, y_train
)

y_train_smote.value_counts()


mort30
0    11656
1    11656
Name: count, dtype: int64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
)


In [73]:
scaler = StandardScaler()

X_train_smote_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)



In [81]:
# Logistic Regression

lr_smote = LogisticRegression(max_iter=1000)
lr_smote.fit(X_train_smote_scaled, y_train_smote)

y_pred_lr = lr_smote.predict(X_test_scaled)

print(confusion_matrix(y_test , y_pred_lr))
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))



[[2776  145]
 [   0    6]]
Logistic Regression
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      2921
           1       0.04      1.00      0.08         6

    accuracy                           0.95      2927
   macro avg       0.52      0.98      0.53      2927
weighted avg       1.00      0.95      0.97      2927



In [85]:
#Decision Tree

dt_smote = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=15,
    random_state=42
)

dt_smote.fit(X_train_smote, y_train_smote)
y_pred_dt = dt_smote.predict(X_test)

print(confusion_matrix(y_test , y_pred_lr))
print("Decision Tree")
print(classification_report(y_test, y_pred_dt))



[[2776  145]
 [   0    6]]
Decision Tree
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      2921
           1       0.02      0.67      0.04         6

    accuracy                           0.94      2927
   macro avg       0.51      0.80      0.50      2927
weighted avg       1.00      0.94      0.97      2927



In [87]:
rf_smote = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42
)

rf_smote.fit(X_train_smote, y_train_smote)
y_pred_rf = rf_smote.predict(X_test)

print(confusion_matrix(y_test , y_pred_lr))
print("Random Forest")
print(classification_report(y_test, y_pred_rf))



[[2776  145]
 [   0    6]]
Random Forest + SMOTE
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      2921
           1       0.10      0.67      0.17         6

    accuracy                           0.99      2927
   macro avg       0.55      0.83      0.58      2927
weighted avg       1.00      0.99      0.99      2927



In [90]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

knn_smote = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski'
)

knn_smote.fit(X_train_smote_scaled, y_train_smote)

y_pred_knn = knn_smote.predict(X_test_scaled)

print(confusion_matrix(y_test , y_pred_lr))
print("KNN")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))



[[2776  145]
 [   0    6]]
KNN
Accuracy: 0.9678852066962761
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      2921
           1       0.03      0.50      0.06         6

    accuracy                           0.97      2927
   macro avg       0.52      0.73      0.52      2927
weighted avg       1.00      0.97      0.98      2927



In [91]:
k_values = range(3, 21)
knn_results = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_smote_scaled, y_train_smote)
    y_pred = knn.predict(X_test_scaled)
    
    knn_results.append({
        "k": k,
        "accuracy": accuracy_score(y_test, y_pred)
    })

pd.DataFrame(knn_results)


Unnamed: 0,k,accuracy
0,3,0.977451
1,4,0.977451
2,5,0.967885
3,6,0.967885
4,7,0.961394
5,8,0.962077
6,9,0.956269
7,10,0.957636
8,11,0.950803
9,12,0.952169


In [88]:
svm_smote = SVC(kernel='rbf')
svm_smote.fit(X_train_smote_scaled, y_train_smote)

y_pred_svm = svm_smote.predict(X_test_scaled)

print(confusion_matrix(y_test , y_pred_lr))
print("SVM")
print(classification_report(y_test, y_pred_svm))



[[2776  145]
 [   0    6]]
SVM
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2921
           1       0.04      0.17      0.06         6

    accuracy                           0.99      2927
   macro avg       0.52      0.58      0.53      2927
weighted avg       1.00      0.99      0.99      2927



In [94]:
results = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Decision Tree",
        "Random Forest",
        "KNN",
        "SVM"
    ],
    "Accuracy": [
        accuracy_score(y_test, y_pred_lr),
        accuracy_score(y_test, y_pred_dt),
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_knn),
        accuracy_score(y_test, y_pred_svm)
    ]
})

results.sort_values(by="Accuracy", ascending=False)


Unnamed: 0,Model,Accuracy
4,SVM,0.990092
2,Random Forest,0.987017
3,KNN,0.967885
0,Logistic Regression,0.950461
1,Decision Tree,0.936795


In [95]:
from sklearn.metrics import roc_auc_score

y_prob_rf = rf_smote.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob_rf)

print("Random Forest + SMOTE ROC-AUC:", roc_auc)


Random Forest + SMOTE ROC-AUC: 0.9858495948875956
