In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw BRFSS 2015 binary diabetes health-indicator table from CSV
raw_csv_path = '../data/raw/diabetes_binary_health_indicators_BRFSS2015.csv'
df_diabetes_binary = pd.read_csv(raw_csv_path)

# Preview the top of the table (5 rows, which is also head()'s default)
df_diabetes_binary.head(n=5)

In [None]:
# Report the dimensions of the dataset (shape is a (row count, column count) tuple)
rows = df_diabetes_binary.shape[0]
columns = df_diabetes_binary.shape[1]
print(f"Number of rows: {rows} | Number of columns: {columns}")

In [None]:
# Show the dtype pandas assigned to every column
feature_dtypes = df_diabetes_binary.dtypes
print(feature_dtypes)

# Task for PREPROCESSING: convert the features to appropriate data types, e.g.
#   - numerical [integer type]:                       BMI, MentHlth, PhysHlth, Age
#   - ordinal/categorical (non-binary) [integer type]: GenHlth, Education, Income
#   - binary [integer type]:                           all remaining features

In [None]:
# Basic descriptive statistics for every column, displayed as the cell output
summary_statistics = df_diabetes_binary.describe()
summary_statistics

In [None]:
# Count the missing entries in each column (isna is the alias of isnull)
missing_values = df_diabetes_binary.isna().sum(axis=0)
print(missing_values)

In [None]:
# Standard deviation of every feature (pandas default settings), shown as the cell output
feature_std = df_diabetes_binary.std()
feature_std


In [None]:
# Variance of every feature (pandas default settings), shown as the cell output
feature_var = df_diabetes_binary.var()
feature_var

In [None]:
# Boxplots of the numerical [integer type] features, grouped by the target class
boxplot_features = ['BMI', 'MentHlth', 'PhysHlth', 'Age']
for feature in boxplot_features:
    # Each call opens its own figure with one box per Diabetes_binary value
    df_diabetes_binary.boxplot(
        column=feature,
        by='Diabetes_binary',
        figsize=(5, 7),
        fontsize=10,
    )
    # Title and x-label are applied to the axes that boxplot just made current
    plt.title("{}\n".format(feature), fontsize=10)
    plt.xlabel("Diabetes_binary", fontsize=10)

In [None]:
# Kernel density estimate of each numerical [integer type] feature (Age excluded),
# computed over the whole dataset
density_plot_features = ['BMI', 'MentHlth', 'PhysHlth']

print(df_diabetes_binary[density_plot_features].dtypes)

for f in density_plot_features:
    # One 5x5 figure per feature; draw on its axes explicitly
    _, ax = plt.subplots(figsize=(5, 5))
    sns.kdeplot(df_diabetes_binary[f], fill=True, ax=ax)
    ax.set_title(f'Kernel Density Plot of {f}')
    ax.set_xlabel(f)
    ax.set_ylabel('Density')
    plt.show()

#TODO Histogram -> physical and mental health and age, and start the x-axis at 0

In [None]:
# Kernel density estimates of the numerical [integer type] features (Age excluded),
# drawn separately for the diabetes and the no-diabetes class
density_plot_features = ['BMI', 'MentHlth', 'PhysHlth']

print(df_diabetes_binary[density_plot_features].dtypes)

# Row masks for the two target classes (the same for every feature)
has_diabetes = df_diabetes_binary['Diabetes_binary'] == 1.0
no_diabetes = df_diabetes_binary['Diabetes_binary'] == 0.0

for f in density_plot_features:
    _, ax = plt.subplots(figsize=(6, 4))
    sns.kdeplot(df_diabetes_binary.loc[has_diabetes, f], color='red', fill=True, label='Diabetes', ax=ax)
    sns.kdeplot(df_diabetes_binary.loc[no_diabetes, f], color='green', fill=True, label='No Diabetes', ax=ax)
    ax.set_title(f'Kernel Density Plot of {f}')
    ax.set_xlabel(f)
    ax.set_ylabel('Density')
    ax.legend()
    plt.show()

#TODO Histogram -> physical and mental health and age (colored), and start the x-axis at 0

In [None]:
# Create stacked histograms (diabetes vs. no diabetes) for the ordinal features + Age
histogram_features = ['GenHlth', 'Education', 'Income', 'Age']

# Code provided by ChatGPT after prompting with simple code and specifying adjustments:

# Loop through each feature in the list
for f in histogram_features:

    # One unit-wide bin per integer level: the edges run from the minimum to
    # maximum + 1 so that the highest level gets a bin of its own
    min_val = int(df_diabetes_binary[f].min())
    max_val = int(df_diabetes_binary[f].max()) + 1
    bins = np.arange(min_val, max_val + 1, 1)

    # Create the figure BEFORE drawing. Calling plt.figure() after plt.hist()
    # would leave the histogram on a default-sized figure and additionally
    # open an empty one.
    fig, ax = plt.subplots(figsize=(6, 5))

    # Plot histograms for each Diabetes_binary class with colors and settings
    ax.hist(
        [df_diabetes_binary[df_diabetes_binary['Diabetes_binary'] == 1.0][f],
         df_diabetes_binary[df_diabetes_binary['Diabetes_binary'] == 0.0][f]],
        color=['red', 'green'], alpha=0.7, edgecolor='black', stacked=True, bins=bins,
        label=['Diabetes', 'No Diabetes'],
    )

    # Add title and labels with adjusted label position
    ax.set_title(f'Histogram of {f}', fontsize=12)
    ax.set_xlabel(f, labelpad=15)  # Increase labelpad to move x-axis label lower
    ax.set_ylabel('Count')

    # Set x-ticks at the start of each bin but without labels
    ax.set_xticks(bins[:-1])
    ax.set_xticklabels([])

    # Add labels at the center of each bin, slightly below the x-axis
    label_y = -0.02 * ax.get_ylim()[1]
    for i in range(len(bins) - 1):
        label_position = (bins[i] + bins[i + 1]) / 2  # Center of each bin
        ax.text(label_position, label_y, str(int(bins[i])), ha='center', va='top', fontsize=10)

    # Explain the colors (same labels as in the per-class density plots)
    ax.legend()

    # Show the plot
    plt.show()


In [None]:
# Create correlation matrix but without redundant values
plt.figure(figsize=(22, 22))

# Compute the pairwise correlations once and reuse them for mask and heatmap
corr_all_features = df_diabetes_binary.corr()

# Boolean mask hiding the diagonal and the upper triangle (True = cell hidden).
# A boolean mask is independent of the correlation values; a float mask built
# from the correlations themselves would fail to hide an exactly-zero value.
no_redundance = np.triu(np.ones_like(corr_all_features, dtype=bool))

sns.heatmap(corr_all_features, mask=no_redundance, annot=True);

In [None]:
# Create scatter plots for the three feature pairs with the highest (most positive)
# and the three with the lowest (most negative) correlation to display multi-collinearity

# Code provided by ChatGPT after prompting with simple code and specifying adjustments:

# Draw a 10% sample and compute its correlation matrix.
# The fixed random_state makes the sample (and thus the plots) reproducible.
sampled_data = df_diabetes_binary.sample(frac=0.1, random_state=42)
correlation_matrix = sampled_data.corr()

# Identify the highest and lowest correlation pairs.
# Take every unordered feature pair exactly once from the strict upper triangle
# (k=1 skips the diagonal, i.e. the self-correlations). De-duplicating on the
# correlation VALUE instead would also drop distinct pairs that share a value.
upper_rows, upper_cols = np.triu_indices(len(correlation_matrix.columns), k=1)
corr_pairs = pd.Series(
    correlation_matrix.to_numpy()[upper_rows, upper_cols],
    index=pd.MultiIndex.from_arrays(
        [correlation_matrix.index[upper_rows], correlation_matrix.columns[upper_cols]]
    ),
)

# Sort by signed correlation; undefined (NaN) correlations are dropped first
sorted_corr_pairs = corr_pairs.dropna().sort_values()

# Select top 3 positive and bottom 3 negative correlations
highest_corr_pairs = sorted_corr_pairs.tail(3).index
lowest_corr_pairs = sorted_corr_pairs.head(3).index

# Combine these into a list of selected pairs for plotting
selected_pairs = list(highest_corr_pairs) + list(lowest_corr_pairs)

# Initialize the plot with subplots for each pair
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
axes = axes.flatten()

# Plot each pair on its own axes, colored by the target class
for ax, (feature_x, feature_y) in zip(axes, selected_pairs):
    sns.scatterplot(
        data=sampled_data,
        x=feature_x,
        y=feature_y,
        hue="Diabetes_binary",
        alpha=0.6,
        ax=ax
    )
    ax.set_title(f'Scatter Plot of {feature_x} vs {feature_y}')

plt.tight_layout()
plt.show()

#TODO -> make the point size data-dependent (which quantity is not specified yet)!!


In [None]:
#TODO Show balance/distribution

# Code provided by ChatGPT after giving instructions and details:

# Number of rows per value of the target variable 'Diabetes_binary'
class_counts = df_diabetes_binary['Diabetes_binary'].value_counts()

# Same counts as a two-column table (Class, Frequency)
class_counts_df = class_counts.rename_axis('Class').reset_index(name='Frequency')

# Imbalance ratio: size of the largest class divided by size of the smallest
largest_class = class_counts.max()
smallest_class = class_counts.min()
imbalance_ratio = largest_class / smallest_class
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

# Bar colors in the order of the counts: green for class 0, red for any other class
colors = []
for class_label in class_counts.index:
    colors.append('green' if class_label == 0 else 'red')

# Bar chart of the class distribution on an explicit 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
class_counts.plot(kind='bar', color=colors, ax=ax)
ax.set_title("Balance/Distribution of Diabetes_binary")
ax.set_xlabel("Diabetes_binary")
ax.set_ylabel("Frequency")

# y-axis ticks every 5000, up to just above the largest class
ax.set_yticks(range(0, largest_class + 5000, 5000))

plt.show()

#TODO Code Matthias

In [None]:
#TODO Ask tutors about the 2700 rows that have a high income but could not afford a doctor

In [None]:
#TODO Correlation of each bin of General Health feature with target variable (exploration for baseline)

# Code for scatterplot and barplot by ChatGPT

# Assuming df_diabetes_binary is already loaded and has the columns 'GenHlth' and 'Diabetes_binary'

# The encoding and the correlations are computed ONCE and shared by both plots
# below; the two plots previously each repeated the identical computation.

# Step 1: One-hot encode the 'GenHlth' feature
genhlth_dummies = pd.get_dummies(df_diabetes_binary['GenHlth'], prefix='GenHlth')

# Step 2: Concatenate the one-hot encoded columns with the original DataFrame
df_encoded = pd.concat([df_diabetes_binary, genhlth_dummies], axis=1)

# Step 3: Calculate the correlation of each GenHlth bin with the target variable
correlations = df_encoded[genhlth_dummies.columns].corrwith(df_encoded['Diabetes_binary'])

# Convert the correlations to a DataFrame for easier plotting
correlations_df = pd.DataFrame(correlations, columns=['Correlation with Diabetes_binary']).reset_index()
correlations_df.columns = ['GenHlth Bin', 'Correlation with Diabetes_binary']

# Plot 1: correlation values as a scatter plot (with grid)
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 6))
ax_scatter.scatter(correlations_df['GenHlth Bin'], correlations_df['Correlation with Diabetes_binary'], color='blue')
ax_scatter.set_title("Correlation of Each GenHlth Bin with Diabetes_binary")
ax_scatter.set_xlabel("GenHlth Bin")
ax_scatter.set_ylabel("Correlation with Diabetes_binary")
ax_scatter.tick_params(axis='x', labelrotation=45)
ax_scatter.grid(True)
plt.show()

# Plot 2: the same correlation values as a bar plot
fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
ax_bar.bar(correlations_df['GenHlth Bin'], correlations_df['Correlation with Diabetes_binary'], color='blue')
ax_bar.set_title("Correlation of Each GenHlth Bin with Diabetes_binary")
ax_bar.set_xlabel("GenHlth Bin")
ax_bar.set_ylabel("Correlation with Diabetes_binary")
ax_bar.tick_params(axis='x', labelrotation=45)
plt.show()


#TODO red and green



In [None]:
# TODO: Think about possible models.
# The triple-quoted text below is a plain string expression used as a note
# (candidate models with the responsible team member, plus baseline ideas).
# NOTE(review): as the last expression of the cell it is presumably echoed as
# the cell output when run — consider moving it to a markdown cell.
""" 
LR (Benedikt)
Decision trees and ensembles -> good because binary (Paty)
SVM (Salone)
Naive Bayes (Philipp)
KNN + Nearest Centroids (Matthias)

(Deep) Neural Networks

Baseline: Majority Class, Distribution Count/Class, General Health (highest correlating feature), PCA and taking first component as base
"""