In [None]:
import pandas as pd
!pip3 install -U ucimlrepo
from ucimlrepo import fetch_ucirepo
# Fetch the "Diabetes 130-US Hospitals for Years 1999-2008" dataset (UCI id 296).
dia_130 = fetch_ucirepo(id=296)

# Dataset-level metadata
print("A metadata Description")
print(dia_130.metadata)

# Variable information
print(dia_130.variables)

# Data (as pandas dataframes)
X_130 = dia_130.data.features
y_130 = dia_130.data.targets

# Report the shape from the feature/target frames themselves; the fetched
# object is only consumed through .data, .metadata and .variables here.
print(X_130.shape, y_130.shape)

# Wrap the features in their own DataFrame object, keeping their column labels.
pandas_df = pd.DataFrame(X_130)

# Counting NaNs in each column (all features)
nan_counts = pandas_df.isnull().sum()

print("Number of NaN values per feature/column before cleaning:")
print(nan_counts)

# NaN counts for the target variable:
nan_counts_targets = pd.DataFrame(y_130).isnull().sum()
print("Number of NaN values in target column:")
print(nan_counts_targets)

# Keep only the int64 columns; the rest of the notebook models on these alone.
# NOTE(review): this also keeps the *_id columns, which look like categorical
# codes rather than quantities -- confirm they should be treated as numeric.
integer_df = pandas_df.select_dtypes(include=['int64'])

# Display the new DataFrame with only integer columns
print("DataFrame with only integer columns:")
print(integer_df.head())

print("Data types of the columns in the new DataFrame:")
print(integer_df.dtypes)

# Counting NaNs again, this time restricted to the integer-only frame.
# nan_counts is deliberately rebound so it ends up describing integer_df.
nan_counts = integer_df.isnull().sum()

print("Number of NaN values per feature/column before cleaning:")
print(nan_counts)



  df = pd.read_csv(data_url)


A metadata Description
{'uci_id': 296, 'name': 'Diabetes 130-US Hospitals for Years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control.

Normalized and scaled data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every integer column and rebuild a DataFrame with the original
# column labels (fit_transform returns a bare array).
# NOTE: the scaler is fitted on all rows, i.e. before the train/test split
# that happens in a later cell.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(integer_df)
integer_df_scaled = pd.DataFrame(scaled_values, columns=integer_df.columns)

SelectKBest and Random Forest to identify top variables

In [None]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

# Flatten the target frame into the 1-D label array scikit-learn expects for y.
y_130_1d = y_130.values.ravel()

# SelectKBest: score each scaled feature with the ANOVA F-test (f_classif)
# against the labels and keep the 10 highest-scoring columns.
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(integer_df_scaled, y_130_1d)
selected_features = selector.get_support(indices=True)
print("Top 10 features selected by SelectKBest:", integer_df_scaled.columns[selected_features])

# Random Forest: rank features by the fitted forest's feature_importances_.
# The seed is fixed so the ranking is the same on every run; without it the
# order of closely-scored features can change between executions.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(integer_df_scaled, y_130_1d)
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # descending importance
print("Top 10 features selected by Random Forest:", integer_df_scaled.columns[indices[:10]])

Top 10 features selected by SelectKBest: Index(['discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses'],
      dtype='object')
Top 10 features selected by Random Forest: Index(['num_lab_procedures', 'num_medications', 'time_in_hospital',
       'num_procedures', 'number_diagnoses', 'discharge_disposition_id',
       'number_inpatient', 'admission_type_id', 'admission_source_id',
       'number_outpatient'],
      dtype='object')


Trying different rebalancing and splitting strategies

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Stratified 80/20 split: each class keeps the same proportion in train and test.
# random_state is fixed so that the partition, and therefore every score
# computed on X_test / y_test below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    integer_df_scaled,
    y_130_1d,
    test_size=0.2,
    stratify=y_130_1d,
    random_state=42,
)

# SMOTE only in training: the test set is left at its original class balance.
# Seeded as well, because the synthetic samples are generated randomly.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

Fitting model on key features

In [None]:
from sklearn.linear_model import LogisticRegression

# Labels of the SMOTE-resampled training set, flattened once for both fits.
train_labels = y_train_resampled.ravel()

# --- Logistic regression on three features from the SelectKBest top-10 ---
key_features_kbest = ['discharge_disposition_id', 'admission_source_id', 'time_in_hospital']
X_train_key_kbest = X_train_resampled.loc[:, key_features_kbest]
X_test_key_kbest = X_test.loc[:, key_features_kbest]
model_kbest = LogisticRegression().fit(X_train_key_kbest, train_labels)

# --- Logistic regression on the three most important Random Forest features ---
key_features_rf = ['num_lab_procedures', 'num_medications', 'time_in_hospital']
X_train_key_rf = X_train_resampled.loc[:, key_features_rf]
X_test_key_rf = X_test.loc[:, key_features_rf]
model_rf = LogisticRegression().fit(X_train_key_rf, train_labels)

Tuning hyperparameters and evaluating the model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

def _tune_and_score(estimator, param_grid, X_fit, y_fit, X_eval, y_eval, cv=5):
    """Grid-search ``estimator`` and score the winning model on held-out data.

    Parameters
    ----------
    estimator : scikit-learn classifier exposing ``predict_proba``.
    param_grid : dict mapping parameter names to the candidate values to try.
    X_fit, y_fit : training features / labels used for the cross-validated search.
    X_eval, y_eval : held-out features / labels used for the final AUC.
    cv : number of cross-validation folds (default 5).

    Returns
    -------
    tuple
        ``(search, best_model, proba, auc)``: the fitted ``GridSearchCV``, its
        ``best_estimator_``, that model's class probabilities on ``X_eval`` and
        the one-vs-rest ROC AUC of those probabilities against ``y_eval``.
    """
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring='roc_auc_ovr', cv=cv)
    search.fit(X_fit, y_fit)
    best_model = search.best_estimator_
    proba = best_model.predict_proba(X_eval)
    auc = roc_auc_score(y_eval, proba, multi_class='ovr')
    return search, best_model, proba, auc


# The same three search spaces are used for both feature subsets.
# Iteration 1: L2 penalty vs. no penalty.
param_grid_1_kbest = {
    'C': [0.1, 1, 10],
    'penalty': ['l2', None]
}
# Iteration 2: L1 vs. L2 penalty with the two solvers that support both.
param_grid_2_kbest = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
# Iteration 3: regularisation strength only (default penalty and solver).
param_grid_3_kbest = {
    'C': [0.1, 1, 10]
}
param_grid_1_rf = dict(param_grid_1_kbest)
param_grid_2_rf = dict(param_grid_2_kbest)
param_grid_3_rf = dict(param_grid_3_kbest)

# Resampled training labels, flattened once for every search below.
y_train_flat = y_train_resampled.ravel()

# Iterations 1-3 - Using key features from SelectKBest
grid_search_1_kbest, best_model_1_kbest, y_pred_proba_1_kbest, auc_1_kbest = _tune_and_score(
    model_kbest, param_grid_1_kbest, X_train_key_kbest, y_train_flat, X_test_key_kbest, y_test)
grid_search_2_kbest, best_model_2_kbest, y_pred_proba_2_kbest, auc_2_kbest = _tune_and_score(
    model_kbest, param_grid_2_kbest, X_train_key_kbest, y_train_flat, X_test_key_kbest, y_test)
grid_search_3_kbest, best_model_3_kbest, y_pred_proba_3_kbest, auc_3_kbest = _tune_and_score(
    model_kbest, param_grid_3_kbest, X_train_key_kbest, y_train_flat, X_test_key_kbest, y_test)

# Iterations 1-3 - Using key features from Random Forest
grid_search_1_rf, best_model_1_rf, y_pred_proba_1_rf, auc_1_rf = _tune_and_score(
    model_rf, param_grid_1_rf, X_train_key_rf, y_train_flat, X_test_key_rf, y_test)
grid_search_2_rf, best_model_2_rf, y_pred_proba_2_rf, auc_2_rf = _tune_and_score(
    model_rf, param_grid_2_rf, X_train_key_rf, y_train_flat, X_test_key_rf, y_test)
grid_search_3_rf, best_model_3_rf, y_pred_proba_3_rf, auc_3_rf = _tune_and_score(
    model_rf, param_grid_3_rf, X_train_key_rf, y_train_flat, X_test_key_rf, y_test)

# Print AUC scores for all iterations
print("AUC Scores - SelectKBest:")
for iteration, auc in enumerate((auc_1_kbest, auc_2_kbest, auc_3_kbest), start=1):
    print("Iteration {} - AUC: {:.3f}".format(iteration, auc))
print("AUC Scores - Random Forest:")
for iteration, auc in enumerate((auc_1_rf, auc_2_rf, auc_3_rf), start=1):
    print("Iteration {} - AUC: {:.3f}".format(iteration, auc))



AUC Scores - SelectKBest:
Iteration 1 - AUC: 0.552
Iteration 2 - AUC: 0.553
Iteration 3 - AUC: 0.552
AUC Scores - Random Forest:
Iteration 1 - AUC: 0.541
Iteration 2 - AUC: 0.541
Iteration 3 - AUC: 0.541


In the code snippet above:

    Iteration 1 uses the parameter grids param_grid_1_kbest / param_grid_1_rf, which compare the 'l2' penalty against no penalty (None).
    Iteration 2 uses the parameter grids param_grid_2_kbest / param_grid_2_rf with 'l1' and 'l2' penalties and the 'liblinear' and 'saga' solvers.
    Iteration 3 uses the parameter grids param_grid_3_kbest / param_grid_3_rf, which vary only the 'C' parameter.

Each iteration performs the following steps:

    Defines the parameter grid for the specific iteration.
    Creates a GridSearchCV object with the specified parameter grid and other settings.
    Fits the grid search object on the training data.
    Retrieves the best model found by the grid search.
    Makes predictions using the best model on the test data.
    Calculates the AUC score using the roc_auc_score function with the multi_class='ovr' option.
    Stores the AUC score for the iteration; all scores are printed together at the end of the cell.


The previous AUC scores aren't great (only slightly above the 0.5 chance level), so let's make some modifications:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Resampled training labels, flattened once for both searches.
y_fit = y_train_resampled.ravel()

# Search space for the logistic regression on the SelectKBest features:
# regularisation strength, penalty, solver, and optional class re-weighting.
param_grid_kbest = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': ['balanced', None]
}
grid_search_kbest = GridSearchCV(model_kbest, param_grid_kbest, scoring='roc_auc_ovr', cv=5)
best_model_kbest = grid_search_kbest.fit(X_train_key_kbest, y_fit).best_estimator_
# Held-out one-vs-rest AUC of the winning configuration.
y_pred_proba_kbest = best_model_kbest.predict_proba(X_test_key_kbest)
auc_kbest = roc_auc_score(y_test, y_pred_proba_kbest, multi_class='ovr')

# Same search space for the logistic regression on the Random-Forest-ranked features.
param_grid_rf = {name: list(candidates) for name, candidates in param_grid_kbest.items()}
grid_search_rf = GridSearchCV(model_rf, param_grid_rf, scoring='roc_auc_ovr', cv=5)
best_model_rf = grid_search_rf.fit(X_train_key_rf, y_fit).best_estimator_
y_pred_proba_rf = best_model_rf.predict_proba(X_test_key_rf)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf, multi_class='ovr')

# Report the winning hyperparameters and the held-out AUC for each feature set.
print("Best Hyperparameters - SelectKBest:", grid_search_kbest.best_params_)
print("AUC Score - SelectKBest: {:.3f}".format(auc_kbest))
print("Best Hyperparameters - Random Forest:", grid_search_rf.best_params_)
print("AUC Score - Random Forest: {:.3f}".format(auc_rf))

Best Hyperparameters - SelectKBest: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
AUC Score - SelectKBest: 0.553
Best Hyperparameters - Random Forest: {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'}
AUC Score - Random Forest: 0.541


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Resampled training labels, flattened once and reused throughout the cell.
train_labels = y_train_resampled.ravel()
# The three test-set metrics reported for each model, in print order.
weighted_metrics = (precision_score, recall_score, f1_score)

# Univariate selection on the SelectKBest feature subset; k is capped at the
# number of columns actually present in that subset.
k_kbest = min(10, X_train_key_kbest.shape[1])
selector_kbest = SelectKBest(score_func=f_classif, k=k_kbest).fit(X_train_key_kbest, train_labels)
X_train_selected_kbest = selector_kbest.transform(X_train_key_kbest)
X_test_selected_kbest = selector_kbest.transform(X_test_key_kbest)

# Same selection step on the Random-Forest-ranked feature subset.
k_rf = min(10, X_train_key_rf.shape[1])
selector_rf = SelectKBest(score_func=f_classif, k=k_rf).fit(X_train_key_rf, train_labels)
X_train_selected_rf = selector_rf.transform(X_train_key_rf)
X_test_selected_rf = selector_rf.transform(X_test_key_rf)

# L2-only logistic regression on the SelectKBest features, tuned over C and
# three solvers with 3-fold cross-validation.
param_grid_kbest = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'newton-cg', 'sag']
}
model_kbest = LogisticRegression(random_state=42)
grid_search_kbest = GridSearchCV(model_kbest, param_grid_kbest, scoring='roc_auc_ovr', cv=3)
best_model_kbest = grid_search_kbest.fit(X_train_selected_kbest, train_labels).best_estimator_

# Identical search for the logistic regression on the Random-Forest-ranked features.
param_grid_rf = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'newton-cg', 'sag']
}
model_rf = LogisticRegression(random_state=42)
grid_search_rf = GridSearchCV(model_rf, param_grid_rf, scoring='roc_auc_ovr', cv=3)
best_model_rf = grid_search_rf.fit(X_train_selected_rf, train_labels).best_estimator_

# SelectKBest-feature model: cross-validated AUC on the resampled training
# data, then weighted precision / recall / F1 on the untouched test set.
cv_scores_kbest = cross_val_score(
    best_model_kbest, X_train_selected_kbest, train_labels, cv=3, scoring='roc_auc_ovr'
)
print("Cross-validation AUC scores - SelectKBest:", cv_scores_kbest)
print("Mean AUC score - SelectKBest:", cv_scores_kbest.mean())

y_pred_kbest = best_model_kbest.predict(X_test_selected_kbest)
precision_kbest, recall_kbest, f1_kbest = (
    metric(y_test, y_pred_kbest, average='weighted') for metric in weighted_metrics
)
print("Precision - SelectKBest:", precision_kbest)
print("Recall - SelectKBest:", recall_kbest)
print("F1-score - SelectKBest:", f1_kbest)

# Random-Forest-feature model: same evaluation.
cv_scores_rf = cross_val_score(
    best_model_rf, X_train_selected_rf, train_labels, cv=3, scoring='roc_auc_ovr'
)
print("Cross-validation AUC scores - Random Forest:", cv_scores_rf)
print("Mean AUC score - Random Forest:", cv_scores_rf.mean())

y_pred_rf = best_model_rf.predict(X_test_selected_rf)
precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred_rf, average='weighted') for metric in weighted_metrics
)
print("Precision - Random Forest:", precision_rf)
print("Recall - Random Forest:", recall_rf)
print("F1-score - Random Forest:", f1_rf)

Cross-validation AUC scores - SelectKBest: [0.54946517 0.54696903 0.54711817]
Mean AUC score - SelectKBest: 0.5478507902481184
Precision - SelectKBest: 0.46162349160835375
Recall - SelectKBest: 0.3888179227670237
F1-score - SelectKBest: 0.4077173874754784
Cross-validation AUC scores - Random Forest: [0.53848396 0.53008886 0.53664823]
Mean AUC score - Random Forest: 0.5350736854582316
Precision - Random Forest: 0.45423137704506167
Recall - Random Forest: 0.39574530804755825
F1-score - Random Forest: 0.40203181416010303
