In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the cleaned readmission table; the path is kept in one named constant
# so it is easy to repoint when running outside this environment.
DATA_PATH = '/content/cleaned_hospital_readmission.parquet'
df = pd.read_parquet(DATA_PATH, engine='pyarrow')


In [None]:
# Feature columns handed to the models; every other column of `df` is ignored.
col_to_use = [
    # demographics
    'race', 'gender', 'age',
    # encounter counts / utilisation
    'time_in_hospital', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient',
    'number_diagnoses',
    # medication columns
    'metformin', 'repaglinide', 'nateglinide',
    'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin',
    'glyburide-metformin', 'change', 'diabetesMed',
    # admission / discharge / diagnosis columns
    'admission_type', 'admission_sources', 'discharge_dispositions',
    'primary_diagnosis_1', 'primary_diagnosis_2', 'primary_diagnosis_3',
]

# Copies, so later cells cannot modify `df` through X or y.
X = df.loc[:, col_to_use].copy()
y = df['readmitted'].copy()

In [None]:
# Partition the feature columns by dtype. The dtypes are read from X itself
# (the frame the ColumnTransformer will actually see), and float64 columns are
# treated as numerical too, matching the column typing used for the sampled
# data further down the notebook.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Binary target: 1 for a readmission within 30 days ('<30'), 0 for everything else.
# It is derived from df['readmitted'] rather than from the current `y`, so
# re-running this cell cannot relabel an already-binarised target to all zeros.
y = (df['readmitted'] == '<30').astype(int)

# Class balance and target length, shown as the cell output.
y.value_counts(normalize=True), y.shape

(readmitted
 0    0.884783
 1    0.115217
 Name: proportion, dtype: float64,
 (95672,))

In [None]:
# One-hot encode categoricals; categories unseen at fit time are ignored
# (encoded as all zeros) instead of raising at transform time.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
)
# Standardise numerical features to zero mean / unit variance.
numerical_transformer = StandardScaler()

In [None]:
# Route each column group to its transformer. Columns in neither list are
# dropped, which is ColumnTransformer's default `remainder` behaviour.
column_steps = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [None]:
# Baseline RBF-kernel SVM; C and gamma here are only starting values that the
# grid search overrides through the 'classifier__*' parameter names.
rbf_svc = SVC(kernel='rbf', C=1, gamma='scale', probability=True)

# Preprocessing and classifier chained so the transformers are fit only on
# whatever data the pipeline is fit on.
svm_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', rbf_svc),
    ]
)


In [None]:
# Take a stratified random 10% sample of the entire dataset:
# test_size=0.90 discards 90% of the rows and keeps the remaining 10%.
X_sample, _, y_sample, _ = train_test_split(X, y, test_size=0.90, random_state=42, stratify=y)

# Now, split this small dataset into train (80%) and test (20%) sets,
# again stratified so both parts keep the class ratio of the sample.
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the SVM step of the pipeline. The 'classifier__' prefix
# addresses the step named 'classifier' inside svm_pipeline.
param_grid = dict(
    classifier__C=[0.1, 1, 10, 100],                     # regularisation strength
    classifier__gamma=['scale', 'auto', 0.01, 0.1, 1],   # RBF kernel coefficient
    classifier__kernel=['rbf'],                          # RBF kernel only
)

# 3-fold cross-validated grid search, scored by F1 on the positive class,
# using all available cores.
grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

# Fit on the reduced training split only.
grid_search.fit(X_train_small, y_train_small)

# Report the winning parameter combination.
print("Best parameters:", grid_search.best_params_)


Best parameters: {'classifier__C': 100, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf'}


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [None]:
# Pipeline with the best parameter combination found by the grid search
# (GridSearchCV refits it on the whole training data passed to fit(), since
# refit is left at its default).
best_model = grid_search.best_estimator_
# Hard class predictions for the held-out small test split.
y_pred = best_model.predict(X_test_small)



In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, confusion_matrix, classification_report

# Hard class predictions of the tuned pipeline on the small test split.
y_pred = best_model.predict(X_test_small)

# Headline metrics; precision/recall/F-scores refer to the positive class (1).
accuracy = accuracy_score(y_test_small, y_pred)
precision = precision_score(y_test_small, y_pred)
recall = recall_score(y_test_small, y_pred)
f1 = f1_score(y_test_small, y_pred)
f2 = fbeta_score(y_test_small, y_pred, beta=2)  # beta=2 weights recall above precision

# One line per metric, four decimals each.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("F2 Score", f2),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test_small, y_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

# Per-class precision / recall / F1 table.
print("\nClassification Report:", classification_report(y_test_small, y_pred), sep="\n")


Accuracy: 0.8312
Precision: 0.1634
Recall: 0.1136
F1 Score: 0.1340
F2 Score: 0.1210

Confusion Matrix:
[[1566  128]
 [ 195   25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      1694
           1       0.16      0.11      0.13       220

    accuracy                           0.83      1914
   macro avg       0.53      0.52      0.52      1914
weighted avg       0.81      0.83      0.82      1914



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, classification_report, confusion_matrix

# Work on a reproducible random 10% of the rows.
df_sampled = df.sample(frac=0.1, random_state=42)

# Feature columns used by the model.
col_to_use = [
    'race', 'gender', 'age',
    'time_in_hospital', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient',
    'number_diagnoses',
    'metformin', 'repaglinide', 'nateglinide',
    'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin',
    'glyburide-metformin', 'change', 'diabetesMed',
    'admission_type', 'admission_sources', 'discharge_dispositions',
    'primary_diagnosis_1', 'primary_diagnosis_2', 'primary_diagnosis_3',
]


def _readmitted_within_30_days(label):
    """Return 1 when the raw label is '<30', otherwise 0."""
    return 1 if label == '<30' else 0


# Features and binary target taken from the sampled frame.
X = df_sampled[col_to_use].copy()
y = df_sampled['readmitted'].copy().apply(_readmitted_within_30_days)

# Split the feature columns by dtype: object/category -> categorical,
# int64/float64 -> numerical; any other dtype ends up in neither list.
categorical_cols = [
    name for name in X.columns
    if X[name].dtype in ['object', 'category']
]
numerical_cols = [
    name for name in X.columns
    if name not in categorical_cols
    and (X[name].dtype == 'int64' or X[name].dtype == 'float64')
]

print(f"Categorical Columns: {categorical_cols}")
print(f"Numerical Columns: {numerical_cols}")

# Class proportions before any resampling.
print(y.value_counts(normalize=True))


Categorical Columns: ['race', 'gender', 'age', 'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'insulin', 'glyburide-metformin', 'change', 'diabetesMed', 'admission_type', 'admission_sources', 'discharge_dispositions', 'primary_diagnosis_1', 'primary_diagnosis_2', 'primary_diagnosis_3']
Numerical Columns: ['time_in_hospital', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
readmitted
0    0.886067
1    0.113933
Name: proportion, dtype: float64


In [None]:
# Stratified 80/20 train/test split of the sampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Scale numerical columns, one-hot encode categorical ones.
scaler_step = ('num', StandardScaler(), numerical_cols)
encoder_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[scaler_step, encoder_step])

# Fit the preprocessing on the training split only, then reuse it for the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Oversample the minority class of the training data only; a
# sampling_strategy of 0.5 targets a minority:majority ratio of 1:2.
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_transformed, y_train)

# Class proportions after resampling.
class_share_after = pd.Series(y_train_balanced).value_counts(normalize=True)
print(f"After SMOTE: {class_share_after}")


After SMOTE: readmitted
0    0.666699
1    0.333301
Name: proportion, dtype: float64


In [None]:
# RBF SVM with balanced class weights, trained on the SMOTE-resampled
# training data (fit() returns the estimator itself, so it can be chained).
svm_model = SVC(
    C=100,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    probability=True,
).fit(X_train_balanced, y_train_balanced)

# Hard class predictions for the untouched (non-resampled) test split.
y_pred = svm_model.predict(X_test_transformed)


In [None]:
# Headline metrics on the test split; precision/recall/F-scores refer to
# the positive class (1).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2)  # beta=2 weights recall above precision

# One line per metric, four decimals each.
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("F2 Score", f2),
):
    print(f"{metric_label}: {metric_value:.4f}")

# Confusion matrix (rows: true class, columns: predicted class).
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision / recall / F1 table.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.8088
Precision: 0.1408
Recall: 0.1330
F1 Score: 0.1368
F2 Score: 0.1345

Confusion Matrix:
[[1519  177]
 [ 189   29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1696
           1       0.14      0.13      0.14       218

    accuracy                           0.81      1914
   macro avg       0.52      0.51      0.51      1914
weighted avg       0.80      0.81      0.81      1914



In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, fbeta_score

# Splitting Data (80% Train, 20% Test). The test split is used ONLY for the
# final report below, never for choosing the decision threshold.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Carve a stratified validation fold out of the training data. The threshold
# is tuned on this fold: selecting it on the test set would make the reported
# test metrics optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Fit preprocessing on the fitting fold only, then apply it everywhere else.
X_fit_transformed = preprocessor.fit_transform(X_fit)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)
# Kept row-aligned with y_train for any later cell that pairs the two.
X_train_transformed = preprocessor.transform(X_train)

# Train SVM with class weight balancing (probability=True enables predict_proba)
svm_model = SVC(kernel='rbf', class_weight='balanced', C=100, gamma='scale', probability=True)
svm_model.fit(X_fit_transformed, y_fit)

# Positive-class probabilities: validation fold for tuning, test split for reporting
y_val_probs = svm_model.predict_proba(X_val_transformed)[:, 1]
y_probs = svm_model.predict_proba(X_test_transformed)[:, 1]

# Tune the decision threshold on the validation fold by maximising F2
best_threshold = 0.5  # Default threshold, kept if no candidate reaches F2 > 0
best_f2 = 0

for threshold in np.arange(0.1, 1.0, 0.05):
    val_pred = (y_val_probs >= threshold).astype(int)
    # zero_division=0: a threshold that predicts no positives scores 0 without warning
    f2 = fbeta_score(y_val, val_pred, beta=2, zero_division=0)
    if f2 > best_f2:
        best_f2 = f2
        best_threshold = threshold

# Apply the selected threshold to the held-out test probabilities
y_pred_final = (y_probs >= best_threshold).astype(int)
# Keep y_pred aligned with y_test so later cells that report on y_pred
# describe the same predictions as the metrics printed here.
y_pred = y_pred_final

# Evaluate model on the test split
precision = precision_score(y_test, y_pred_final, zero_division=0)
recall = recall_score(y_test, y_pred_final)
f2_score = fbeta_score(y_test, y_pred_final, beta=2, zero_division=0)

print(f"Optimal Threshold: {best_threshold:.2f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F2 Score: {f2_score:.4f}")


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, fbeta_score

# Use only 5% of the whole dataset
df_sampled = df.sample(frac=0.05, random_state=42)
# All non-target columns are kept here; the ColumnTransformer below selects
# only numerical_cols and categorical_cols and drops the rest.
X_sampled = df_sampled.drop(columns=['readmitted'])
# Binary target: readmission within 30 days -> 1, everything else -> 0.
y_sampled = df_sampled['readmitted'].map({'>30': 0, 'NO': 0, '<30': 1}).astype(int)


# Splitting Data (80% Train, 20% Test). The test split is used ONLY for the
# final report below, never for choosing the decision threshold.
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)

# Stratified validation fold carved out of the training data for threshold
# tuning; selecting the threshold on the test set would bias the test metrics.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Fit preprocessing on the fitting fold only, then apply it everywhere else.
X_fit_transformed = preprocessor.fit_transform(X_fit)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)
# Kept row-aligned with y_train for any later cell that pairs the two.
X_train_transformed = preprocessor.transform(X_train)

# Train SVM with class weight balancing (probability=True enables predict_proba)
svm_model = SVC(kernel='rbf', class_weight='balanced', C=100, gamma='scale', probability=True)
svm_model.fit(X_fit_transformed, y_fit)

# Positive-class probabilities: validation fold for tuning, test split for reporting
y_val_probs = svm_model.predict_proba(X_val_transformed)[:, 1]
y_probs = svm_model.predict_proba(X_test_transformed)[:, 1]

# Tune the decision threshold on the validation fold by maximising F2
best_threshold = 0.60  # Fallback, kept only if no candidate reaches F2 > 0
best_f2 = 0

for threshold in np.linspace(0.1, 0.99, 20):  # More granular threshold tuning
    val_pred = (y_val_probs >= threshold).astype(int)
    # zero_division=0: a threshold that predicts no positives scores 0 without warning
    f2 = fbeta_score(y_val, val_pred, beta=2, zero_division=0)
    if f2 > best_f2:
        best_f2 = f2
        best_threshold = threshold

# The tuned threshold is used as-is. A hard-coded override here (e.g. 0.0001)
# would flag every sample as positive, giving recall 1.0 with precision equal
# to the base rate, and the "optimal" label on the printout would be false.

# Apply best threshold to the held-out test probabilities
y_pred_final = (y_probs >= best_threshold).astype(int)
# Keep y_pred aligned with y_test so later cells that report on y_pred
# describe the same predictions as the metrics printed here.
y_pred = y_pred_final

# Evaluate model on the test split
precision = precision_score(y_test, y_pred_final, zero_division=0)
recall = recall_score(y_test, y_pred_final)
f2_score = fbeta_score(y_test, y_pred_final, beta=2, zero_division=0)

print(f"Optimal Threshold: {best_threshold:.2f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F2 Score: {f2_score:.4f}")



Optimal Threshold: 0.00
Precision: 0.1108
Recall: 1.0000
F2 Score: 0.3838


In [None]:
# Report on y_pred_final (the predictions at the chosen threshold), not on
# y_pred: y_pred may be a leftover from the last iteration of the
# threshold-tuning loop and would then describe a different classifier than
# the metrics printed by the previous cell.
# zero_division=0 reports 0 (without warnings) for a class that is never predicted.
print(classification_report(y_test, y_pred_final, zero_division=0))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94       851
           1       0.00      0.00      0.00       106

    accuracy                           0.89       957
   macro avg       0.44      0.50      0.47       957
weighted avg       0.79      0.89      0.84       957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
