In [0]:
!pip install imblearn

# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split


In [0]:
# Reads a UTF-8 CSV file into a pandas DataFrame and reports its dimensions
def load_data(path):
    """Load the CSV file at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file; it is decoded as UTF-8.

    Returns:
        The parsed ``pandas.DataFrame``. Its ``(rows, columns)`` shape is
        printed as a side effect.
    """
    frame = pd.read_csv(filepath_or_buffer=path, encoding="utf-8")
    print("Shape: ", frame.shape)
    return frame

In [0]:

# Defines function to validate data by printing out general EDA
def validate_data(df, target_feature):
    """Print and plot a general exploratory report for ``df``.

    The report covers, in order: ``df.info()``, null counts per column, the
    normalized class distribution of ``target_feature``, a correlation heatmap
    of the numeric/boolean columns, a stacked per-class distribution for every
    two-valued feature, and overlaid per-class histograms for the remaining
    numeric features.

    Args:
        df: DataFrame to inspect. It is only read, never modified.
        target_feature: Name of the target column in ``df``.

    Returns:
        None. All output is printed or shown through matplotlib.

    Note:
        The legend and tick labels of the binary and numeric plots
        ('Yes'/'No', 'Non-Diabetic'/'Diabetic', 'Diabetes=0'/'Diabetes=1')
        are hard-coded. They presumably assume a 0/1-coded target and
        0/1-coded binary features -- confirm against the dataset in use.
    """
    print("Pre-processed Dataframe Validation: \n")

    # Obtaining general df info
    print("\nDataframe info:")
    # info() writes straight to stdout and returns None, so wrapping it in
    # print() would only append a stray "None" line.
    df.info()

    # Checking nulls per column
    print("\nNulls per Column:")
    print(df.isna().sum())

    # Checking distribution of target feature
    print(f"\nTarget Feature: {target_feature}")
    target_share = df[target_feature].value_counts(normalize=True)
    ax = target_share.plot(kind='barh', figsize=(10, 2))
    ax.spines[['top', 'right']].set_visible(False)
    plt.title(f'{target_feature} Distribution (%)', fontsize=18)
    # Bars are drawn in value_counts() order (most frequent class first), so
    # the tick labels must come from that same index; unique() returns classes
    # in order of appearance and could label the bars the wrong way round.
    plt.yticks(
        ticks=range(len(target_share)),
        labels=[str(label) for label in target_share.index],
    )
    plt.show()

    # Creating a correlation matrix
    print("\nCorrelation Matrix:")
    # Restrict to numeric/boolean columns explicitly: from pandas 2.0 onwards
    # DataFrame.corr() raises on non-numeric columns instead of dropping them.
    corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
    if corr_matrix.empty:
        print("No numeric columns available for a correlation matrix.")
    else:
        plt.figure(figsize=(16, 8))
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()

    # Categorizing Binary Features and plotting distribution
    print("\nBinary Feature Distribution:")
    unique_counts = df.nunique()
    # Filter the target out by comparison rather than Index.drop(), which
    # raises KeyError whenever the target itself is not two-valued.
    bool_vars = [
        col for col in unique_counts[unique_counts == 2].index
        if col != target_feature
    ]
    binary_lookup = set(bool_vars)
    # Only numeric columns can be histogrammed with numeric bins.
    num_vars = [
        col for col in df.select_dtypes(include="number").columns
        if col not in binary_lookup and col != target_feature
    ]

    for col in bool_vars:
        ax = (
            df.groupby(target_feature)[col]
            .value_counts(normalize=True)
            .unstack()
            .iloc[:, ::-1]  # reverse column order so it lines up with the 'Yes', 'No' legend
            .plot(kind='barh', stacked=True, figsize=(10, 2), alpha=1)
        )
        ax.spines[['top', 'right']].set_visible(False)
        plt.legend(['Yes', "No"], bbox_to_anchor=(1, 1, 0, 0), shadow=False, frameon=False)
        plt.yticks(ticks=[0, 1], labels=['Non-Diabetic', 'Diabetic'])
        plt.title(col, fontsize=18)
        plt.tight_layout()
        plt.show()

    # Plotting numeric feature distribution
    print("\nNumeric Feature Distribution:")
    if num_vars:
        n_cols = 2
        # Ceiling division: two plots per row, so only half as many rows as
        # features are needed (one row per feature left half the grid empty).
        n_rows = (len(num_vars) + n_cols - 1) // n_cols
        plt.figure(figsize=(20, 5 * n_rows))

        # The per-class row subsets do not depend on the plotted column.
        class_zero = df[df[target_feature] == 0]
        class_one = df[df[target_feature] == 1]

        for index, var in enumerate(num_vars):
            plt.subplot(n_rows, n_cols, index + 1)
            class_zero[var].hist(alpha=0.5, label='Diabetes=0', bins=30)
            class_one[var].hist(alpha=0.5, label='Diabetes=1', bins=30)
            plt.title(var)
            plt.xlabel(var)
            plt.ylabel('Frequency')
            plt.legend()

        plt.tight_layout()
        plt.show()

In [0]:
# Defines function to process data and returns processed dataframe
def process_data(df, target_feature, test_size, random_state):
    """Split ``df`` into train/test sets and under-sample the training set.

    The split happens before resampling, so the test set keeps its original
    class distribution and only the training rows go through ``NearMiss``
    (constructed with its default settings).

    Args:
        df: Input DataFrame containing the feature columns and the target.
        target_feature: Name of the target column in ``df``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames. Each holds the feature
        columns followed by ``target_feature`` as the last column;
        ``train_df`` is the resampled training data and ``test_df`` is the
        untouched test split.
    """
    # Separate features and target
    X = df.drop(columns=target_feature)
    y = df[target_feature]

    # Split the data into training and testing sets (before resampling)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Apply undersampling only to the training set
    nm = NearMiss()
    X_train_res, y_train_res = nm.fit_resample(X_train, y_train)

    # Log before and after resampling counts. The vectorized .sum() gives the
    # same counts as builtin sum() without iterating element by element.
    print("\nBefore Under-Sampling, Class '1' (Train): {}".format((y_train == 1).sum()))
    print("Before Under-Sampling, Class '0' (Train): {}".format((y_train == 0).sum()))
    print("After Under-Sampling, Class '1': {}".format((y_train_res == 1).sum()))
    print("After Under-Sampling, Class '0': {}".format((y_train_res == 0).sum()))

    # Convert the training splits into DataFrames for consistency
    train_df = pd.DataFrame(X_train_res, columns=X.columns)
    train_df[target_feature] = y_train_res

    # Testing set remains unchanged; copy it so that adding the target column
    # works on an independent frame rather than on the split's own data.
    test_df = X_test.copy()
    test_df[target_feature] = y_test

    print('\nResampled Training Data Shape: ', train_df.shape)
    print('Testing Data Shape (Unchanged): ', test_df.shape)

    return train_df, test_df


In [0]:
# Main execution
if __name__ == "__main__":
    # Declare one text widget per job parameter; each defaults to an empty string
    for widget_name in ("path", "target_feature", "test_size", "random_state"):
        dbutils.widgets.text(widget_name, "")

    # Read the widget values back, casting the numeric ones
    path = dbutils.widgets.get("path")
    target_feature = dbutils.widgets.get("target_feature")
    test_size = float(dbutils.widgets.get("test_size"))
    random_state = int(dbutils.widgets.get("random_state"))

    # Load the data, print the EDA report, then split and resample
    df = load_data(path)
    validate_data(df, target_feature)
    train_df, test_df = process_data(df, target_feature, test_size, random_state)

    # Write both splits to CSV without the row index
    train_df_path = "/dbfs/FileStore/tables/train_data.csv"
    test_df_path = "/dbfs/FileStore/tables/test_data.csv"
    for frame, destination in ((train_df, train_df_path), (test_df, test_df_path)):
        frame.to_csv(destination, index=False)

    # Return the output locations to the caller as a JSON string
    result = dict(train_df_path=train_df_path, test_df_path=test_df_path)
    dbutils.notebook.exit(json.dumps(result))