In [5]:

import pandas as pd
import numpy as np


df = pd.read_csv("diabetic_data.csv")
print("Initial Shape:", df.shape)


#  Drop Unnecessary ID Columns

df.drop(['encounter_id', 'patient_nbr'], axis=1, inplace=True)


# Replace Missing Value Symbol '?' with NaN

df.replace('?', np.nan, inplace=True)


# Step 5: Encode Target Variable
# '<30' = 1 (Readmitted within 30 days)
# '>30' or 'NO' = 0

df['readmitted'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)


# Handle Missing Values
# Drop columns with >50% missing values
# Fill remaining NaN with Mode (most frequent value)

df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)
df.fillna(df.mode().iloc[0], inplace=True)


# Encode Categorical Variables

categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


#  Display Final Dataset Info

print("Final Shape after preprocessing:", df.shape)
print(df.head())


# Save Preprocessed Data

df.to_csv("preprocessed_diabetic_data.csv", index=False)
print("✅ Preprocessed dataset saved as 'preprocessed_diabetic_data.csv'")


Initial Shape: (77599, 50)
Final Shape after preprocessing: (77599, 2296)
   admission_type_id  discharge_disposition_id  admission_source_id  \
0                6.0                      25.0                  1.0   
1                1.0                       1.0                  7.0   
2                1.0                       1.0                  7.0   
3                1.0                       1.0                  7.0   
4                1.0                       1.0                  7.0   

   time_in_hospital  num_lab_procedures  num_procedures  num_medications  \
0               1.0                41.0             0.0              1.0   
1               3.0                59.0             0.0             18.0   
2               2.0                11.0             5.0             13.0   
3               2.0                44.0             1.0             16.0   
4               1.0                51.0             0.0              8.0   

   number_outpatient  number_emergency  nu

In [8]:

# Train-Test Split

from sklearn.model_selection import train_test_split

X = df.drop("readmitted", axis=1)
y = df["readmitted"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


# Feature Selection

from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer

# Keep numeric columns only
X_train_num = X_train.select_dtypes(include=[np.number])
X_test_num = X_test[X_train_num.columns]

# Impute missing numeric values with median
imp = SimpleImputer(strategy='median')
X_train_imp = pd.DataFrame(imp.fit_transform(X_train_num), columns=X_train_num.columns, index=X_train_num.index)
X_test_imp = pd.DataFrame(imp.transform(X_test_num), columns=X_train_num.columns, index=X_test.index)

# Variance Threshold
vt = VarianceThreshold(threshold=0.0)
X_train_vt = pd.DataFrame(vt.fit_transform(X_train_imp), columns=X_train_imp.columns[vt.get_support()], index=X_train_imp.index)
X_test_vt = pd.DataFrame(vt.transform(X_test_imp), columns=X_train_imp.columns[vt.get_support()], index=X_test_imp.index)

# Select top K features
K_TOP_FEATURES = 30
k = min(K_TOP_FEATURES, X_train_vt.shape[1])
skb = SelectKBest(score_func=mutual_info_classif, k=k)
X_train_sel = pd.DataFrame(skb.fit_transform(X_train_vt, y_train),
                           columns=X_train_vt.columns[skb.get_support()],
                           index=X_train_vt.index)
X_test_sel = pd.DataFrame(skb.transform(X_test_vt),
                          columns=X_train_vt.columns[skb.get_support()],
                          index=X_test_vt.index)

print(f"✅ Selected Top {k} Features")



from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42)
}

results = []
for name, model in models.items():
    print(f" Training {name}...")
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1
    })

    print(f"{name} Results:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


# Results Summary

results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False)
print("\n✅ Model Performance Summary:")
print(results_df)



Train shape: (62079, 2295)
Test shape: (15520, 2295)
✅ Selected Top 11 Features
 Training Logistic Regression...
Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     13765
           1       0.46      0.01      0.03      1755

    accuracy                           0.89     15520
   macro avg       0.67      0.51      0.48     15520
weighted avg       0.84      0.89      0.84     15520

--------------------------------------------------
 Training Decision Tree...
Decision Tree Results:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88     13765
           1       0.15      0.18      0.16      1755

    accuracy                           0.79     15520
   macro avg       0.52      0.52      0.52     15520
weighted avg       0.81      0.79      0.80     15520

--------------------------------------------------
 Training Random Forest...
Random Forest Results:
  