In [0]:
# general
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [0]:
# CSV with the BRFSS 2015 binary diabetes health indicators, resolved relative
# to the current working directory.
path = 'diabetes_binary_health_indicators_BRFSS2015.csv'

# Decode as UTF-8; bytes that cannot be decoded are substituted instead of
# raising an error.
df = pd.read_csv(
    path,
    encoding="utf-8",
    encoding_errors="replace",
)

# Report the (rows, columns) dimensions, then preview the first four rows.
print("Shape: ", df.shape)
df.head(n=4)

In [0]:
# Persist the raw frame to DBFS. index=False keeps the automatically generated
# row index out of the file, so it is not read back later as an extra unnamed
# column.
df.to_csv('/dbfs/FileStore/tables/diabetes_raw.csv', index=False)

# Check for NaNs

In [0]:
# Summary of the raw frame: column names, dtypes and non-null counts.
df.info()

In [0]:
# Number of missing values in each column.
df.isna().sum()

# Class imbalance in the target variable

In [0]:
# Absolute number of rows for each value of the Diabetes_binary column.
df['Diabetes_binary'].value_counts()

In [0]:
# Share of each class expressed as a percentage, so the values match the "(%)"
# in the title. sort_index() fixes the bar order to 0, 1 so the tick labels
# below do not depend on which class happens to be more frequent.
class_share_pct = (
    df['Diabetes_binary']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
)

ax = class_share_pct.plot(kind='barh', figsize=(10, 2))
ax.spines[['top', 'right']].set_visible(False)
ax.set_title('Diabetes_binary Distribution (%)', fontsize=18)
# Bar positions 0 and 1 correspond to target values 0 and 1 after sort_index().
ax.set_yticks([0, 1])
ax.set_yticklabels(['Non-Diabetic', 'Diabetic']);

# Correlation Matrix

In [0]:
# Pairwise correlation between all columns of the frame.
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap (two decimals per cell) on an
# explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Categorize Features

In [0]:
# Name of the label column; every other column is treated as a feature.
target = 'Diabetes_binary'

# Distinct-value count per column, computed once and reused below.
distinct_counts = df.nunique()

# Features that take exactly two distinct values. errors='ignore' keeps this
# working even if the target itself does not have exactly two values.
bool_vars = (distinct_counts[distinct_counts == 2]
                .index
                .drop(labels=target, errors='ignore'))

# Every remaining non-target column.
num_vars = [var for var in df.columns if var not in bool_vars and var != target]

# Binary Features Distribution

In [0]:
# One stacked horizontal bar chart per binary feature: within each target
# class, the share of rows taking each value of the feature.
for col in bool_vars:
    shares = (df.groupby('Diabetes_binary')[col]
              .value_counts(normalize=True)
              .unstack()
              # Reverse the column order; assuming 0/1 coding this puts value 1
              # first, matching the 'Yes', 'No' legend order below.
              .iloc[:, ::-1])

    ax = shares.plot(kind='barh', stacked=True, figsize=(10, 2), alpha=1)
    ax.spines[['top', 'right']].set_visible(False)
    ax.legend(['Yes', "No"], bbox_to_anchor=(1, 1, 0, 0), shadow=False, frameon=False)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['Non-Diabetic', 'Diabetic'])
    ax.set_title(col, fontsize=18)

    # Fit the layout only after the title exists so the title is included.
    plt.tight_layout()
    plt.show()

# Numeric Features Distribution

In [0]:
# Grid of per-feature histograms, two panels per row. The row count is the
# ceiling of len(num_vars) / 2 so the grid has no more than one spare cell
# (at least one row is kept so the figure can always be created).
num_cols = 2
num_rows = max(1, -(-len(num_vars) // num_cols))

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 20), squeeze=False)
flat_axes = axes.ravel()

# Split the frame by class once instead of re-filtering it for every feature.
non_diabetic_rows = df[df['Diabetes_binary'] == 0]
diabetic_rows = df[df['Diabetes_binary'] == 1]

for ax, var in zip(flat_axes, num_vars):
    # Overlay both classes on the same axes with partial transparency.
    non_diabetic_rows[var].hist(ax=ax, alpha=0.5, label='Diabetes=0', bins=30)
    diabetic_rows[var].hist(ax=ax, alpha=0.5, label='Diabetes=1', bins=30)
    ax.set_title(var)
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    ax.legend()

# Hide any grid cell left unused (e.g. when the number of features is odd).
for ax in flat_axes[len(num_vars):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [0]:
# Install imbalanced-learn (used by the under-sampling cell below) and scikit-learn.
# NOTE(review): versions are not pinned, so the installed releases may differ
# between runs — consider pinning and moving this cell to the top of the notebook.
!pip install imbalanced-learn
!pip install scikit-learn

In [0]:
from imblearn.under_sampling import NearMiss

# Separate the feature columns from the label column.
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

# Under-sample with NearMiss using its default settings.
nm = NearMiss()
X_res, y_res = nm.fit_resample(X, y)

# Class counts before and after resampling. The boolean masks are summed with
# the vectorised .sum() rather than the builtin sum(), which would iterate the
# elements one by one in Python.
print("Before Under-Sampling, Diabetes '1': {}".format((y == 1).sum()))
print("Before Under-Sampling, Diabetes '0': {}".format((y == 0).sum()))

print("After Under-Sampling, Diabetes '1': {}".format((y_res == 1).sum()))
print("After Under-Sampling, Diabetes '0': {}".format((y_res == 0).sum()))

In [0]:
# Save the resampled features and labels relative to the working directory.
# index=False leaves the row index out of both files so that only real data
# columns are written; the two files stay aligned row by row.
X_res.to_csv('diabetes_X.csv', index=False)
y_res.to_csv('diabetes_y.csv', index=False)

In [0]:
# Recombine the resampled features and labels into a single frame, with the
# label appended as the last column.
resampled_df = (
    pd.DataFrame(X_res, columns=X.columns)
    .assign(Diabetes_binary=y_res)
)

# Report the dimensions and preview the first two rows.
print('Shape: ', resampled_df.shape)
resampled_df.head(n=2)

In [0]:
# Write the balanced dataset twice: once relative to the working directory and
# once to DBFS (Databricks File System) for use in AutoML.
# index=False omits the row index so it is not picked up as a feature column.
# NOTE(review): the resampled rows appear to be grouped by class, in which case
# a row-number column would leak the label to any model trained on the file —
# confirm against the NearMiss output.
resampled_df.to_csv('diabetes_r.csv', index=False)
resampled_df.to_csv('/dbfs/FileStore/tables/diabetes_r.csv', index=False)