In [35]:
import os
import pandas as pd

## download the dataset
# Directory of the raw data files
_data_root = './data/Diabetes'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'Diabetes.csv')
# Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export= \
    download&confirm={{VALUE}}&id=1k5-1caezQ3zWJbKaiMULTGq-3sz6uThC'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)

df_diabetes = pd.read_csv(_data_filepath )

In [40]:
df_diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [36]:
def preprocess_diabetes_data(df, data_filepath=None):
    """
    Preprocesses the diabetes dataset by handling categorical and numerical variables,
    creating derived features, and preparing the target variable.
    
    Parameters:
    -----------
    df : pandas.DataFrame or None
        The diabetes dataframe to preprocess. If None, data will be loaded from data_filepath.
    data_filepath : str or None
        Path to the CSV file containing the diabetes data. Used only if df is None.
        
    Returns:
    --------
    X : pandas.DataFrame
        Preprocessed features dataframe
    y : pandas.Series
        Target variable (readmitted)
    """
    import pandas as pd
    import numpy as np
    
    # Load data if dataframe not provided
    if df is None:
        if data_filepath is None:
            raise ValueError("Either df or data_filepath must be provided")
        df = pd.read_csv(data_filepath)
    
    # Define column categories
    ids_cols = ['encounter_id', 'patient_nbr']
    
    categorical_cols = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 
                       'admission_source_id', 'payer_code', 'medical_specialty',
                       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
                       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
                       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
                       'tolazamide', 'examide', 'citoglipton', 'insulin',
                       'glyburide-metformin', 'glipizide-metformin',
                       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
                       'metformin-pioglitazone', 'change', 'readmitted', 'diag_1', 'diag_2', 'diag_3',
                       'diabetesMed', 'max_glu_serum', 'A1Cresult']
    
    numerical_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
                     'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
    
    # Split dataframe into categorical and numerical parts
    df_diabetes_cat = df[categorical_cols]
    df_diabetes_num = df[numerical_cols]
    
    # Process target variable
    y = df_diabetes['readmitted'].copy()
    y = y.apply(lambda x: 'YES' if (x == '<30') else 'NO')
    
    # Process numerical variables
    df_diabetes_num['service_utilization'] = (df_diabetes_num['number_outpatient'] + 
                                             df_diabetes_num['number_emergency'] + 
                                             df_diabetes_num['number_inpatient'])
    
    # Process categorical variables
    no_representative_cat = ['repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide', 
                            'glipizide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone',
                            'tolazamide', 'glyburide-metformin', 'glipizide-metformin',
                            'glimepiride-pioglitazone', 'metformin-rosiglitazone', 
                            'metformin-pioglitazone', 'payer_code', 
                            'medical_specialty', 'diag_2', 'diag_3']
    
    df_diabetes_cat_depured = df_diabetes_cat.drop(no_representative_cat, axis=1)
    X_cat = df_diabetes_cat_depured.drop('readmitted', axis=1)
    
    # Fill missing values
    X_cat = X_cat.fillna('?')
    
    # Process diagnosis codes
    X_cat['diag_1'] = X_cat['diag_1'].apply(lambda x: level1_diag1(x))
    
    # Group admission source ID
    admission_source_mapping = {
        2: 1, 3: 1,  # Group to 1
        5: 4, 6: 4, 8: 4, 10: 4, 18: 4, 22: 4, 25: 4, 26: 4,  # Group to 4
        15: 9, 17: 9, 20: 9, 21: 9,  # Group to 9
        13: 11, 14: 11, 23: 11, 24: 11  # Group to 11
    }
    X_cat['admission_source_id'] = X_cat['admission_source_id'].replace(admission_source_mapping)
    
    # Group admission type ID
    admission_type_mapping = {
        2: 1, 7: 1, 4: 1,  # Group to Emergency (1)
        6: 5, 8: 5  # Group to Not available (5)
    }
    X_cat['admission_type_id'] = X_cat['admission_type_id'].replace(admission_type_mapping)
    
    # Group discharge disposition ID
    discharge_mapping_1 = {6: 1, 8: 1, 13: 1, 19: 1, 20: 1}  # Group to 1
    discharge_mapping_2 = {3: 2, 4: 2, 5: 2, 9: 2, 10: 2, 12: 2, 14: 2, 22: 2, 23: 2, 24: 2}  # Group to 2
    discharge_mapping_15 = {16: 15, 17: 15, 27: 15, 28: 15, 29: 15, 30: 15}  # Group to 15
    discharge_mapping_18 = {25: 18, 26: 18}  # Group to 18
    
    discharge_mapping = {**discharge_mapping_1, **discharge_mapping_2, **discharge_mapping_15, **discharge_mapping_18}
    X_cat['discharge_disposition_id'] = X_cat['discharge_disposition_id'].replace(discharge_mapping)
    
    # Combine categorical and numerical features
    X = pd.concat([X_cat, df_diabetes_num], axis=1)
    
    return X, y

def level1_diag1(x):
    """
    Maps diagnosis codes to categorical levels
    
    Parameters:
    -----------
    x : int or str
        Diagnosis code
        
    Returns:
    --------
    int
        Mapped diagnosis category (0-8)
    """
    import numpy as np
    
    if isinstance(x, (int, float)) and not np.isnan(x):
        x = int(x)
        if (x >= 390 and x < 460) or (np.floor(x) == 785):
            return 1
        elif (x >= 460 and x < 520) or (np.floor(x) == 786):
            return 2
        elif (x >= 520 and x < 580) or (np.floor(x) == 787):
            return 3
        elif (np.floor(x) == 250):
            return 4
        elif (x >= 800 and x < 1000):
            return 5
        elif (x >= 710 and x < 740):
            return 6
        elif (x >= 580 and x < 630) or (np.floor(x) == 788):
            return 7
        elif (x >= 140 and x < 240):
            return 8
        else:
            return 0
    else:
        return 0

# Transform 

def create_preprocessing_pipeline(X, cat_cols=None, num_cols=None):
    """
    Creates a preprocessing pipeline for transforming categorical and numerical variables
    using scikit-learn's ColumnTransformer.
    
    Parameters:
    -----------
    X : pandas.DataFrame
        DataFrame containing the features to transform
    cat_cols : list or None
        List of categorical column names. If None, uses default list.
    num_cols : list or None
        List of numerical column names. If None, uses default list.
    
    Returns:
    --------
    preprocessor : ColumnTransformer
        Fitted ColumnTransformer object for preprocessing the data
    X_transformed : numpy.ndarray
        Transformed feature matrix
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    
    # Define default columns if not provided
    if cat_cols is None:
        cat_cols = ['race', 'gender', 'age', 'admission_type_id',
                   'discharge_disposition_id', 'admission_source_id', 'metformin',
                   'glimepiride', 'glyburide', 'pioglitazone', 'rosiglitazone', 'examide',
                   'citoglipton', 'insulin', 'change', 'diag_1', 'diabetesMed',
                   'max_glu_serum', 'A1Cresult']
    
    if num_cols is None:
        num_cols = ['time_in_hospital', 'num_lab_procedures',
                   'num_procedures', 'num_medications', 'number_outpatient',
                   'number_emergency', 'number_inpatient', 'number_diagnoses', 
                   'service_utilization']
    
    # Validate that the columns exist in the DataFrame
    missing_cat_cols = [col for col in cat_cols if col not in X.columns]
    missing_num_cols = [col for col in num_cols if col not in X.columns]
    
    if missing_cat_cols:
        raise ValueError(f"Categorical columns not found in DataFrame: {missing_cat_cols}")
    
    if missing_num_cols:
        raise ValueError(f"Numerical columns not found in DataFrame: {missing_num_cols}")
    
    # Pipeline for numerical columns
    numeric_pipeline = Pipeline([
        ("scaler", StandardScaler())
    ])
    
    # Pipeline for categorical columns
    categorical_pipeline = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    
    # Create column transformer
    preprocessor = ColumnTransformer([
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols)
    ])
    
    # Fit and transform the data
    X_transformed = preprocessor.fit_transform(X)
    
    return preprocessor, X_transformed

def perform_undersampling_from_transformed(X_transformed, y, random_state=42, sampling_strategy=1.0):
    """
    Performs undersampling on already transformed data.
    
    Parameters:
    -----------
    X_transformed : numpy.ndarray
        Transformed feature matrix
    y : pandas.Series or numpy.ndarray
        Target variable (class labels)
    random_state : int, default=42
        Controls the randomization of the algorithm
    sampling_strategy : float or str, default=1.0
        If float, specifies the ratio of minority to majority class samples.
        If 'auto' or 'majority', the majority class is resampled to match the minority.
    
    Returns:
    --------
    X_transformed_resampled : numpy.ndarray
        Undersampled transformed feature matrix
    y_resampled : numpy.ndarray or pandas.Series
        Undersampled target variable
    """
    import numpy as np
    import pandas as pd
    
    # Check if y is a pandas Series
    is_pandas_series = isinstance(y, pd.Series)
    if is_pandas_series:
        y_name = y.name
    
    # Get array representation
    y_array = y.values if is_pandas_series else y
    
    # Find indices of each class
    unique_classes = np.unique(y_array)
    class_indices = {}
    for cls in unique_classes:
        class_indices[cls] = np.where(y_array == cls)[0]
    
    # Display class distribution before resampling
    print("Class distribution before undersampling:")
    for cls, indices in class_indices.items():
        print(f"  Class {cls}: {len(indices)} samples")
    
    # Determine target count for majority class
    if len(unique_classes) != 2:
        raise ValueError("This function only supports binary classification problems")
    
    # Find majority and minority classes
    class_counts = [len(class_indices[cls]) for cls in unique_classes]
    majority_class_idx = np.argmax(class_counts)
    minority_class_idx = 1 - majority_class_idx
    
    majority_class = unique_classes[majority_class_idx]
    minority_class = unique_classes[minority_class_idx]
    
    minority_count = len(class_indices[minority_class])
    majority_count = len(class_indices[majority_class])
    
    # Calculate target sample count for majority class
    if sampling_strategy == 'auto' or sampling_strategy == 'majority':
        target_majority_count = minority_count
    elif isinstance(sampling_strategy, (int, float)):
        # sampling_strategy is ratio of minority:majority
        target_majority_count = int(minority_count / sampling_strategy)
    else:
        raise ValueError("sampling_strategy must be 'auto', 'majority', or a number")
    
    # Undersample majority class
    np.random.seed(random_state)
    selected_majority_indices = np.random.choice(
        class_indices[majority_class], 
        size=min(target_majority_count, majority_count), 
        replace=False
    )
    
    # Combine with minority class indices
    selected_indices = np.concatenate([selected_majority_indices, class_indices[minority_class]])
    
    # Get resampled data
    X_transformed_resampled = X_transformed[selected_indices]
    y_resampled = y_array[selected_indices]
    
    # Display class distribution after resampling
    unique_resampled, counts_resampled = np.unique(y_resampled, return_counts=True)
    print("Class distribution after undersampling:")
    for cls, count in zip(unique_resampled, counts_resampled):
        print(f"  Class {cls}: {count} samples")
    
    # Preserve pandas Series type if input was a Series
    if is_pandas_series:
        y_resampled = pd.Series(y_resampled, name=y_name)
    
    return X_transformed_resampled, y_resampled

In [38]:
X, y = preprocess_diabetes_data(df_diabetes)
Preprocessor, X_transformed = create_preprocessing_pipeline(X, cat_cols=None, num_cols=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diabetes_num['service_utilization'] = (df_diabetes_num['number_outpatient'] +


In [39]:
X_resampled, y_resampled = perform_undersampling_from_transformed(X_transformed, y)

Class distribution before undersampling:
  Class NO: 90409 samples
  Class YES: 11357 samples
Class distribution after undersampling:
  Class NO: 11357 samples
  Class YES: 11357 samples


In [32]:
def train_evaluate_transformed_data(X_transformed, y, model=None, test_size=0.3, random_state=42):
    """
    Trains and evaluates a model using already transformed data.
    
    Parameters:
    -----------
    X_transformed : numpy.ndarray
        Transformed feature matrix
    y : pandas.Series or numpy.ndarray
        Target variable
    model : estimator or None
        Scikit-learn estimator to use. If None, uses RandomForestClassifier.
    test_size : float
        Proportion of the dataset to include in the test split
    random_state : int
        Random state for reproducibility
        
    Returns:
    --------
    model : estimator
        Fitted model
    results : dict
        Dictionary containing evaluation metrics
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    from sklearn.ensemble import RandomForestClassifier
    import numpy as np
    
    # Create model if not provided
    if model is None:
        model = RandomForestClassifier(random_state=random_state)
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    results = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, pos_label="YES"),
        "recall": recall_score(y_test, y_pred, pos_label="YES"),
        "f1_score": f1_score(y_test, y_pred, pos_label="YES")
    }
    
    # Calculate ROC AUC if probability estimates are available
    if y_pred_proba is not None:
        results["roc_auc"] = roc_auc_score(y_test == "YES", y_pred_proba)
    
    return model, results

# Después de obtener X_transformed_resampled y y_resampled:
model, results = train_evaluate_transformed_data(X_resampled, y_resampled)
print(results)

{'accuracy': 0.6033749082905356, 'precision': 0.6054523666866387, 'recall': 0.5931904901673026, 'f1_score': 0.5992587101556709, 'roc_auc': 0.6400755883013569}
