In [1]:
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the YAML configuration that lists the dataset files
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# The 'files' section holds one CSV path per dataset
data = config['files']
diabetes_data, stroke_data = (
    pd.read_csv(data[dataset_name]) for dataset_name in ('diabetes', 'stroke')
)

In [None]:
# Preview the first rows of the diabetes dataset
diabetes_data.head()

In [None]:
# Column names, dtypes and non-null counts of the diabetes dataset
diabetes_data.info()

In [None]:
# Columns to be treated as categorical
categorical_columns = ['Sex', 'Smoker', 'Stroke', 'Diabetes', 'HighBP']

# Cast all of them to the 'category' dtype in one call
diabetes_data = diabetes_data.astype(
    {name: 'category' for name in categorical_columns}
)

# Confirm the new dtypes
diabetes_data.info()

In [None]:
# Count the missing values in each diabetes column
print(f'Missing values before handling:\n{diabetes_data.isnull().sum()}')

In [None]:
# Summary statistics for the diabetes dataset
diabetes_data.describe()

In [None]:
# Filter out implausible BMI values.
# A row is kept only when its BMI lies in the closed interval [10, 60]. One mask
# drives both the reported count and the filter, so the number printed is exactly
# the number of rows removed (boundary values 10 and 60 are valid; a missing BMI
# fails both comparisons and is therefore counted and dropped).
valid_bmi = (diabetes_data['BMI'] >= 10) & (diabetes_data['BMI'] <= 60)
invalid_bmi = (~valid_bmi).sum()
print(f"Invalid BMI values: {invalid_bmi}")
diabetes_data = diabetes_data[valid_bmi]


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource

# Send Bokeh output to the notebook so show() renders inline
output_notebook()

# Function to create consistent boxplots with hover tools
def create_boxplot(data, title):
    """Build a single-box Bokeh boxplot for a numeric pandas Series.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to summarise.
    title : str
        Title shown above the figure.

    Returns
    -------
    bokeh figure
        A 300x300 figure with the IQR box, a red median line, whiskers and
        red outlier markers. Hovering shows the summary statistics.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    median = data.median()
    iqr = q3 - q1
    # Whiskers stop at the most extreme observations inside the 1.5 * IQR fences
    lower_whisker = data[data >= (q1 - 1.5 * iqr)].min()
    upper_whisker = data[data <= (q3 + 1.5 * iqr)].max()
    outliers = data[(data < lower_whisker) | (data > upper_whisker)]

    # 'hover' is intentionally not in the tools string: the customised HoverTool
    # added below is the only hover tool, so a second default tooltip
    # (index / x / y) does not appear next to it.
    p = figure(title=title, height=300, width=300, tools='pan,wheel_zoom,box_zoom,reset')
    p.vbar(x=0, width=0.5, bottom=q1, top=q3, fill_color='skyblue', line_color='black')
    p.line([-.25, .25], [median, median], line_color='red', line_width=2)
    p.segment(x0=0, x1=0, y0=q3, y1=upper_whisker, line_color='black')
    p.segment(x0=0, x1=0, y0=q1, y1=lower_whisker, line_color='black')
    # scatter() with a circle marker replaces circle(size=...), which is
    # deprecated in Bokeh 3.4+
    p.scatter(x=[0] * len(outliers), y=outliers, marker='circle', color='red', size=5)

    # The tooltip values are fixed strings computed once from `data`
    hover = HoverTool(tooltips=[
        ('Median', f'{median:.2f}'),
        ('Q1', f'{q1:.2f}'),
        ('Q3', f'{q3:.2f}'),
        ('IQR', f'{iqr:.2f}'),
        ('Lower Whisker', f'{lower_whisker:.2f}'),
        ('Upper Whisker', f'{upper_whisker:.2f}')
    ])
    p.add_tools(hover)
    return p

# Function to create bar plots with interactive hover tools
def create_barplot(data, categories, title, colors=None):
    """Build a Bokeh bar chart with one bar per category.

    Parameters
    ----------
    data : sequence of numbers
        Bar heights, in the same order as `categories`.
    categories : list of str
        Category labels; also used as the categorical x-range.
    title : str
        Title shown above the figure.
    colors : list of str, optional
        One colour per bar. When omitted, every bar uses '#1f77b4'.

    Returns
    -------
    bokeh figure
        A 300x300 figure whose hover tooltip shows category and count.
    """
    # The data source needs one colour per bar, so the None default is
    # expanded into a list instead of being stored as-is.
    bar_colors = colors if colors else ['#1f77b4'] * len(categories)
    source = ColumnDataSource(data=dict(categories=categories, counts=data, colors=bar_colors))
    # 'hover' is left out of the tools string so the HoverTool added below is
    # the only hover tool on the figure.
    p = figure(title=title, x_range=categories, height=300, width=300, tools='pan,wheel_zoom,box_zoom,reset')
    p.vbar(x='categories', top='counts', width=0.5, color='colors', source=source)
    hover = HoverTool(tooltips=[('Category', '@categories'), ('Count', '@counts')])
    p.add_tools(hover)
    return p

# BMI and Age Boxplots
bmi_plot = create_boxplot(diabetes_data['BMI'], 'BMI Distribution')
age_plot = create_boxplot(diabetes_data['Age'], 'Age Distribution')

# Interactive Categorical Bar Plots
# value_counts() orders its result by frequency, not by category, so each result
# is re-sorted by category code before being paired with the fixed labels.
# Otherwise the larger group would always be drawn under the first label.
# NOTE(review): assumes code 0 corresponds to the first label and 1 to the
# second in every list below — confirm against the dataset's codebook.
smoker_counts = diabetes_data['Smoker'].value_counts().sort_index()
smoker_plot = create_barplot(smoker_counts.values, ['Non-Smoker', 'Smoker'], 'Smokers Distribution', ['coral', 'coral'])

diabetes_counts = diabetes_data['Diabetes'].value_counts().sort_index()
diabetes_plot = create_barplot(diabetes_counts.values, ['No Diabetes', 'Diabetes'], 'Diabetes Status', ['green', 'green'])

bp_counts = diabetes_data['HighBP'].value_counts().sort_index()
bp_plot = create_barplot(bp_counts.values, ['Normal', 'High BP'], 'High Blood Pressure', ['purple', 'purple'])

sex_counts = diabetes_data['Sex'].value_counts().sort_index()
sex_plot = create_barplot(sex_counts.values, ['Female', 'Male'], 'Sex Distribution', ['pink', 'blue'])

# Arrange plots in a 2-row grid layout
grid = gridplot([
    [bmi_plot, smoker_plot, diabetes_plot],
    [age_plot, bp_plot, sex_plot]
])

show(grid)


In [None]:
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import webbrowser
from threading import Timer

diabetes_data = pd.read_csv('diabetes_data.csv')

# Relabel the 0/1 indicator columns as 'No'/'Yes'
yes_no_labels = {0: 'No', 1: 'Yes'}
for indicator in ('Diabetes', 'Smoker', 'HighBP'):
    diabetes_data[indicator] = diabetes_data[indicator].map(yes_no_labels)

# Keep only the rows whose BMI lies within [10, 60]
bmi_values = diabetes_data['BMI']
diabetes_data = diabetes_data[(bmi_values >= 10) & (bmi_values <= 60)]

# Dash application that hosts the dashboard
app = dash.Dash(__name__)
app.title = "Diabetes Risk Factors Dashboard"

# Function to generate plots
def generate_plots(data):
    """Create every dashboard figure for the diabetes data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Diabetes', 'BMI', 'Age', 'Smoker' and 'HighBP' columns.

    Returns
    -------
    dict
        Maps 'BMI' and 'Age' to boxplots split by diabetes status, and
        'Smoker' and 'HighBP' to grouped bar charts of row percentages.
    """
    palette = ['#2b908f', '#ff7c43']
    # Fixed y-axis window for each boxplot variable
    y_ranges = {'BMI': [10, 60], 'Age': [1, 13]}
    figures = {}

    # Boxplots: one per continuous variable
    for variable, y_range in y_ranges.items():
        box = px.box(
            data,
            x="Diabetes",
            y=variable,
            color="Diabetes",
            color_discrete_sequence=palette,
            labels={"Diabetes": "Diabetes Status"},
            title=f'{variable} Distribution by Diabetes Status'
        )
        box.update_layout(
            showlegend=False,
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        box.update_yaxes(range=y_range)
        figures[variable] = box

    # Grouped bars: one chart per categorical variable
    for variable in ('Smoker', 'HighBP'):
        # Percentage of each diabetes status within every level of `variable`
        shares = pd.crosstab(data[variable], data['Diabetes'], normalize='index') * 100
        shares_long = shares.reset_index().melt(
            id_vars=variable, value_name="Percentage", var_name="Diabetes"
        )

        bars = px.bar(
            shares_long,
            x=variable,
            y="Percentage",
            color="Diabetes",
            barmode="group",
            text="Percentage",
            color_discrete_sequence=palette,
            title=f'{variable} Distribution by Diabetes Status'
        )
        bars.update_traces(
            texttemplate='%{text:.2f}%',
            textposition='inside',
            insidetextanchor='middle'
        )
        bars.update_layout(
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        figures[variable] = bars

    return figures

# Build all figures once; the callback only looks them up
figures = generate_plots(diabetes_data)

# One dropdown entry per figure key
dropdown_options = [
    {"label": f"{figure_key} Distribution by Diabetes Status", "value": figure_key}
    for figure_key in ("BMI", "Age", "Smoker", "HighBP")
]

page_title = html.H1(
    "Diabetes Risk Factors Dashboard",
    style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}
)

plot_selector = html.Div(
    [
        html.Label("Select a Plot to View:", style={'fontSize': '14px'}),
        dcc.Dropdown(
            id="plot-dropdown",
            options=dropdown_options,
            value="BMI",
            clearable=False,
            style={'width': '60%', 'fontSize': '12px'}
        ),
    ],
    style={'margin': '10px 0'}
)

plot_area = dcc.Graph(id="selected-plot", style={'height': '400px'})

# Page layout: title, plot selector, then the selected plot
app.layout = html.Div([page_title, plot_selector, plot_area])

# Callback to update plot based on dropdown selection
@app.callback(
    Output("selected-plot", "figure"),
    [Input("plot-dropdown", "value")]
)
def update_plot(selected_plot):
    """Return the pre-built figure for the dropdown's current value.

    `selected_plot` is the value of the 'plot-dropdown' component and is
    used directly as the key into the module-level `figures` dict.
    """
    return figures[selected_plot]

# Function to open the app in a web browser
def open_browser():
    """Open http://127.0.0.1:8050/ in a new browser window."""
    webbrowser.open_new("http://127.0.0.1:8050/")

if __name__ == "__main__":
    # Schedule the browser to open one second from now, after the server call below has started
    Timer(1, open_browser).start()
    # `run_server` no longer exists in Dash 3; use `run` when available and
    # fall back to `run_server` on older releases that predate it.
    run_app = getattr(app, "run", None) or app.run_server
    run_app(debug=False, use_reloader=False)


In [None]:
import pandas as pd
import statsmodels.api as sm

# Reload the diabetes CSV from disk
diabetes_data = pd.read_csv('diabetes_data.csv')

# Pass the indicator columns through a 0/1 identity mapping;
# any value other than 0 or 1 becomes NaN
binary_codes = {0: 0, 1: 1}
for indicator in ('Smoker', 'HighBP', 'Diabetes'):
    diabetes_data[indicator] = diabetes_data[indicator].map(binary_codes)

# NOTE(review): the BMI range filter used in the earlier cells is not applied
# to this reloaded frame — confirm that is intended.

# Predictors (with an intercept column) and the binary outcome
predictors = ['Smoker', 'Age', 'BMI', 'HighBP']
X = sm.add_constant(diabetes_data[predictors])
y = diabetes_data['Diabetes']

# Fit the logistic regression and report it
model = sm.Logit(y, X).fit()

print(model.summary())


In [None]:
# Preview the first rows of the stroke dataset
stroke_data.head()

In [None]:
# Column names, dtypes and non-null counts of the stroke dataset
stroke_data.info()

In [None]:
# Count the missing values in each stroke column
print(f'Missing values before handling:\n{stroke_data.isnull().sum()}')

In [None]:
# Drop every row with at least one missing value, then confirm none remain
stroke_data = stroke_data.dropna()
stroke_data.isnull().sum()

In [None]:
# Summary statistics for the stroke dataset
stroke_data.describe()

In [None]:
# Keep rows with a non-negative age and a BMI within [10, 60]
age_ok = stroke_data['age'] >= 0
bmi_ok = (stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)
stroke_data = stroke_data[age_ok & bmi_ok]

stroke_data.describe()

In [None]:
# Define age bins and labels.
# With right=False every bin includes its lower edge: [0, 18), [18, 25), ...,
# [80, inf). The 14 bins are labelled 0 to 13.
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, np.inf]
labels = range(14)  # 0 to 13

# Create age buckets for the stroke data. assign() returns a new frame, so the
# column is not written into a filtered slice (which triggers pandas'
# SettingWithCopyWarning).
stroke_data = stroke_data.assign(
    age_bucket=pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)
)
stroke_data.head()

In [None]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource

# Send Bokeh output to the notebook so show() renders inline
output_notebook()

# Function to create consistent boxplots with hover tools
def create_boxplot(data, title):
    """Build a single-box Bokeh boxplot for a numeric pandas Series.

    Note: a function with this name is also defined in the earlier diabetes
    plotting cell; running this cell rebinds the name.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to summarise.
    title : str
        Title shown above the figure.

    Returns
    -------
    bokeh figure
        A 300x300 figure with the IQR box, a red median line, whiskers and
        red outlier markers. Hovering shows the summary statistics.
    """
    # Calculate statistics
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    median = data.median()
    iqr = q3 - q1
    # Whiskers stop at the most extreme observations inside the 1.5 * IQR fences
    lower_whisker = data[data >= (q1 - 1.5 * iqr)].min()
    upper_whisker = data[data <= (q3 + 1.5 * iqr)].max()
    outliers = data[(data < lower_whisker) | (data > upper_whisker)]

    # 'hover' is intentionally not in the tools string: the customised HoverTool
    # added below is the only hover tool, so a second default tooltip does not
    # appear next to it.
    p = figure(title=title, height=300, width=300, tools='pan,wheel_zoom,box_zoom,reset')

    # IQR box
    p.vbar(x=0, width=0.5, bottom=q1, top=q3, fill_color='skyblue', line_color='black')

    # Median line
    p.line([-.25, .25], [median, median], line_color='red', line_width=2)

    # Whiskers
    p.segment(x0=0, x1=0, y0=q3, y1=upper_whisker, line_color='black')
    p.segment(x0=0, x1=0, y0=q1, y1=lower_whisker, line_color='black')

    # Outliers: scatter() with a circle marker replaces circle(size=...),
    # which is deprecated in Bokeh 3.4+
    p.scatter(x=[0] * len(outliers), y=outliers, marker='circle', color='red', size=5)

    # The tooltip values are fixed strings computed once from `data`
    hover = HoverTool(tooltips=[
        ('Median', f'{median:.2f}'),
        ('Q1', f'{q1:.2f}'),
        ('Q3', f'{q3:.2f}'),
        ('IQR', f'{iqr:.2f}'),
        ('Lower Whisker', f'{lower_whisker:.2f}'),
        ('Upper Whisker', f'{upper_whisker:.2f}')
    ])
    p.add_tools(hover)

    return p

# Function to create bar plots with interactive hover tools
def create_barplot(data, categories, title, colors=None):
    """Build a Bokeh bar chart with one bar per category.

    Note: a function with this name is also defined in the earlier diabetes
    plotting cell; running this cell rebinds the name.

    Parameters
    ----------
    data : sequence of numbers
        Bar heights, in the same order as `categories`.
    categories : list of str
        Category labels; also used as the categorical x-range.
    title : str
        Title shown above the figure.
    colors : list of str, optional
        One colour per bar. When omitted, every bar uses '#1f77b4'.

    Returns
    -------
    bokeh figure
        A 300x300 figure whose hover tooltip shows category and count.
    """
    source = ColumnDataSource(data=dict(
        categories=categories,
        counts=data,
        colors=colors if colors else ['#1f77b4']*len(categories)
    ))

    # 'hover' is left out of the tools string so the HoverTool added below is
    # the only hover tool on the figure.
    p = figure(title=title, x_range=categories, height=300, width=300,
              tools='pan,wheel_zoom,box_zoom,reset')

    p.vbar(x='categories', top='counts', width=0.5, color='colors', source=source)

    hover = HoverTool(tooltips=[
        ('Category', '@categories'),
        ('Count', '@counts')
    ])
    p.add_tools(hover)

    return p

# Read and process the stroke data
stroke_data = pd.read_csv('stroke_data.csv')

# Clean the data: drop incomplete rows, keep ages >= 1 and BMI within [10, 60].
# NOTE(review): the earlier stroke cleaning cell keeps ages >= 0 rather than
# >= 1 — confirm which threshold is intended.
stroke_data = stroke_data.dropna()
stroke_data = stroke_data[stroke_data['age'] >= 1]
stroke_data = stroke_data[(stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)]

# Define age bins and labels (left-closed bins, labelled 0 to 13)
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, np.inf]
labels = range(14)  # 0 to 13

# Create age buckets. assign() returns a new frame, so the column is not
# written into a filtered slice (which triggers SettingWithCopyWarning).
stroke_data = stroke_data.assign(
    age_bucket=pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)
)

# Create the box plots
bmi_plot = create_boxplot(stroke_data['bmi'], 'BMI Distribution')
# The bucket column is categorical; cast to float so quantiles can be computed
age_bucket_plot = create_boxplot(stroke_data['age_bucket'].astype(float), 'Age Bucket Distribution')

# Create the categorical bar plots
# value_counts() orders its result by frequency, not by category, so each result
# is re-sorted by category value before being paired with the fixed labels.
# Otherwise the larger group would always be drawn under the first label.
# NOTE(review): assumes each column has exactly two values and that the smaller
# one corresponds to the first label — confirm against the dataset.

# Smoking status
smoker_counts = stroke_data['smoking_status'].value_counts().sort_index()
smoker_plot = create_barplot(
    smoker_counts.values,
    ['Non-Smoker', 'Smoker'],
    'Smokers Distribution',
    ['coral', 'coral']
)

# Stroke status
stroke_counts = stroke_data['stroke'].value_counts().sort_index()
stroke_plot = create_barplot(
    stroke_counts.values,
    ['No Stroke', 'Stroke'],
    'Stroke Status',
    ['green', 'green']
)

# Blood pressure status
bp_counts = stroke_data['hypertension'].value_counts().sort_index()
bp_plot = create_barplot(
    bp_counts.values,
    ['Normal', 'High BP'],
    'High Blood Pressure',
    ['purple', 'purple']
)

# Sex distribution
sex_counts = stroke_data['sex'].value_counts().sort_index()
sex_plot = create_barplot(
    sex_counts.values,
    ['Female', 'Male'],
    'Sex Distribution',
    ['pink', 'blue']
)

# Arrange plots in a 2-row grid layout
grid = gridplot([
    [bmi_plot, smoker_plot, stroke_plot],
    [age_bucket_plot, bp_plot, sex_plot]
])

show(grid)

In [None]:
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import webbrowser
from threading import Timer

# Load data from the CSV file
stroke_data = pd.read_csv('stroke_data.csv')

# Relabel the 0/1 indicator columns as 'No'/'Yes'
stroke_data['stroke'] = stroke_data['stroke'].map({0: 'No', 1: 'Yes'})
stroke_data['smoking_status'] = stroke_data['smoking_status'].map({0: 'No', 1: 'Yes'})
stroke_data['hypertension'] = stroke_data['hypertension'].map({0: 'No', 1: 'Yes'})

# Keep only the rows whose BMI lies within [10, 60]
stroke_data = stroke_data[(stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)]

# Create age buckets (left-closed bins, labelled 0 to 13). assign() returns a
# new frame, so the column is not written into the filtered slice above
# (which triggers SettingWithCopyWarning).
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, float('inf')]
labels = range(14)
stroke_data = stroke_data.assign(
    age_bucket=pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)
)

# Create Dash app
app = dash.Dash(__name__)
app.title = "Stroke Risk Factors Dashboard"

# Function to generate plots
def generate_plots(data):
    """Create every dashboard figure for the stroke data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'stroke', 'bmi', 'age_bucket', 'smoking_status' and
        'hypertension' columns.

    Returns
    -------
    dict
        Maps 'bmi' and 'age_bucket' to boxplots split by stroke status, and
        'smoking_status' and 'hypertension' to grouped bar charts of row
        percentages.
    """
    palette = ['#2b908f', '#ff7c43']
    # Fixed y-axis window for each boxplot variable
    y_ranges = {'bmi': [10, 60], 'age_bucket': [0, 13]}
    figures = {}

    # Boxplots: one per continuous variable
    for variable, y_range in y_ranges.items():
        box = px.box(
            data,
            x="stroke",
            y=variable,
            color="stroke",
            color_discrete_sequence=palette,
            labels={"stroke": "Stroke Status"},
            title=f'{variable.capitalize()} Distribution by Stroke Status'
        )
        box.update_layout(
            showlegend=False,
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        box.update_yaxes(range=y_range)
        figures[variable] = box

    # Grouped bars: one chart per categorical variable
    for variable in ('smoking_status', 'hypertension'):
        # Percentage of each stroke status within every level of `variable`
        shares = pd.crosstab(data[variable], data['stroke'], normalize='index') * 100
        shares_long = shares.reset_index().melt(
            id_vars=variable, value_name="Percentage", var_name="stroke"
        )

        bars = px.bar(
            shares_long,
            x=variable,
            y="Percentage",
            color="stroke",
            barmode="group",
            text="Percentage",
            color_discrete_sequence=palette,
            title=f'{variable.capitalize()} Distribution by Stroke Status'
        )
        bars.update_traces(
            texttemplate='%{text:.2f}%',
            textposition='inside',
            insidetextanchor='middle'
        )
        bars.update_layout(
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        figures[variable] = bars

    return figures

# Build all figures once; the callback only looks them up
figures = generate_plots(stroke_data)

# (label prefix, figure key) for each dropdown entry
dropdown_options = [
    {"label": f"{label_prefix} Distribution by Stroke Status", "value": figure_key}
    for label_prefix, figure_key in (
        ("BMI", "bmi"),
        ("Age", "age_bucket"),
        ("Smoker", "smoking_status"),
        ("Hypertension", "hypertension"),
    )
]

page_title = html.H1(
    "Stroke Risk Factors Dashboard",
    style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}
)

plot_selector = html.Div(
    [
        html.Label("Select a Plot to View:", style={'fontSize': '14px'}),
        dcc.Dropdown(
            id="plot-dropdown",
            options=dropdown_options,
            value="bmi",
            clearable=False,
            style={'width': '60%', 'fontSize': '12px'}
        ),
    ],
    style={'margin': '10px 0'}
)

plot_area = dcc.Graph(id="selected-plot", style={'height': '400px'})

# Page layout: title, plot selector, then the selected plot
app.layout = html.Div([page_title, plot_selector, plot_area])

@app.callback(
    Output("selected-plot", "figure"),
    [Input("plot-dropdown", "value")]
)
def update_plot(selected_plot):
    """Return the pre-built figure for the dropdown's current value.

    `selected_plot` is the value of the 'plot-dropdown' component and is
    used directly as the key into the module-level `figures` dict.
    """
    return figures[selected_plot]

def open_browser():
    """Open http://127.0.0.1:8051/ in a new browser window."""
    webbrowser.open_new("http://127.0.0.1:8051/")

if __name__ == "__main__":
    # Schedule the browser to open one second from now, after the server call below has started
    Timer(1, open_browser).start()
    # `run_server` no longer exists in Dash 3; use `run` when available and
    # fall back to `run_server` on older releases that predate it.
    run_app = getattr(app, "run", None) or app.run_server
    # Port 8051 matches the URL opened by open_browser()
    run_app(debug=False, use_reloader=False, port=8051)


In [None]:
# Rename the diabetes columns to the lower-case names the stroke data uses
column_renames = {
    'Age': 'age',
    'Sex': 'sex',
    'BMI': 'bmi',
    'Smoker': 'smoking_status',
    'Stroke': 'stroke',
    'HighBP': 'hypertension',
}
diabetes_data = diabetes_data.rename(columns=column_renames)
diabetes_data.head()

In [21]:
# Define BMI buckets
bmi_bins = [0, 18.5, 25, 30, float('inf')]
bmi_labels = ["Underweight", "Normal weight", "Overweight", "Obese"]

# right=False makes every bin include its lower edge: [0, 18.5), [18.5, 25),
# [25, 30), [30, inf). A BMI of exactly 25 is therefore "Overweight" and exactly
# 30 is "Obese", the same boundaries get_bmi_category() applies in the final
# cell. pd.cut's default right-closed bins would place those values one bucket
# too low.

# Create BMI buckets for diabetes data
diabetes_data['bmi_bucket'] = pd.cut(diabetes_data['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)

# Create BMI buckets for stroke data
stroke_data['bmi_bucket'] = pd.cut(stroke_data['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)

In [None]:
# Display the diabetes frame, now including the bmi_bucket column
diabetes_data

In [None]:
# Display the stroke frame, now including the bmi_bucket column
stroke_data

In [None]:
def _to_binary_flag(series):
    """Return `series` as integer flags.

    'Yes'/'No' labels become 1/0, values that are already numeric are kept
    as they are, and anything else (including missing values) becomes 0.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    decoded = series.map(lambda value: yes_no_codes.get(value, value))
    return pd.to_numeric(decoded, errors='coerce').fillna(0).astype(int)


# Replace the raw stroke age with its bucket code so 'age' can be used as a merge
# key. Guarded so that re-running the cell does not fail on the missing column.
# NOTE(review): presumes the diabetes 'age' column holds matching bucket codes — confirm.
if 'age_bucket' in stroke_data.columns:
    stroke_data = stroke_data.drop(columns='age').rename(columns={'age_bucket': 'age'})

# .copy() gives independent frames, so the column assignments below do not
# trigger SettingWithCopyWarning.
stroke_filtered = stroke_data[['age', 'sex', 'bmi_bucket', 'smoking_status', 'hypertension', 'stroke']].copy()
diabetes_filtered = diabetes_data[['age', 'sex', 'bmi_bucket', 'smoking_status', 'hypertension', 'Diabetes']].copy()

# Bring both flag columns to integer 0/1 in both frames. The stroke frame holds
# 'Yes'/'No' labels at this point while the diabetes frame holds numeric 0/1;
# mapping only 'Yes'/'No' would turn every diabetes flag into 0.
for frame in (diabetes_filtered, stroke_filtered):
    for flag_column in ('hypertension', 'smoking_status'):
        frame[flag_column] = _to_binary_flag(frame[flag_column])

# Inner join on the shared, bucketed attributes.
# NOTE(review): this pairs every diabetes row with every stroke row that has
# the same key values (many-to-many) — confirm that this is the intended
# way to combine the two datasets.
merged_data = pd.merge(diabetes_filtered, stroke_filtered, on=['age', 'smoking_status', 'hypertension', 'bmi_bucket'], how='inner')

In [None]:
# Display the merged frame
merged_data

In [None]:
# Display the merged frame
# NOTE(review): this repeats the display in the preceding cell
merged_data

In [27]:
# Renumber the rows 0..n-1 before exporting
merged_data = merged_data.reset_index(drop=True)
# Written relative to the working directory rather than to a user-specific
# absolute path, so the cell also runs on other machines
merged_data.to_csv('merged_data.csv')

In [None]:
# Count the missing values in each column of the merged frame
merged_data.isnull().sum()

In [None]:
# Rows flagged with diabetes that also have a recorded stroke
has_both = (merged_data['Diabetes'] == 1) & (merged_data['stroke'] == 'Yes')
merged_diabetics_stroke = merged_data[has_both]

# How many rows that is
number_of_diabetics_with_stroke = len(merged_diabetics_stroke)

print(f"Number of individuals with both diabetes and stroke: {number_of_diabetics_with_stroke}")

# The same figure as a share of all merged rows
diabetics_stroke_perc = number_of_diabetics_with_stroke / len(merged_data) * 100
print(f"percentage of individuals with both diabetes and stroke: {diabetics_stroke_perc:.2f}%")


In [None]:
# Split the merged rows by diabetes status
diabetics = merged_data[merged_data['Diabetes'] == 1]
non_diabetics = merged_data[merged_data['Diabetes'] == 0]

# Number of stroke rows within each group
stroke_in_diabetics = int((diabetics['stroke'] == 'Yes').sum())
stroke_in_non_diabetics = int((non_diabetics['stroke'] == 'Yes').sum())

# Stroke rows as a percentage of each group
stroke_in_diabetics_perc = stroke_in_diabetics / len(diabetics) * 100
stroke_in_non_diabetics_perc = stroke_in_non_diabetics / len(non_diabetics) * 100

print(f"Stroke prevalence in diabetics: {stroke_in_diabetics_perc:.2f}%")
print(f"Stroke prevalence in non-diabetics: {stroke_in_non_diabetics_perc:.2f}%")


In [None]:
import statsmodels.api as sm
import pandas as pd

# Binary outcome: 1 where 'stroke' is 'Yes', 0 for every other value
merged_data['stroke_binary'] = (merged_data['stroke'] == 'Yes').astype('int64')

# Outcome and the single predictor (plus an intercept column)
y = merged_data['stroke_binary']
X = sm.add_constant(merged_data[['Diabetes']])  # Add other variables like 'Age', 'BMI' if needed

# Fit the logistic regression and report it
logit_model = sm.Logit(y, X)
result = logit_model.fit()

print(result.summary())


In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Cross-tabulate diabetes status against stroke status
contingency_table = pd.crosstab(merged_data['Diabetes'], merged_data['stroke'])

# Chi-square test of independence on that table
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Report the test results
for label, value in (
    ("Chi-Square Statistic", chi2),
    ("P-value", p),
    ("Degrees of Freedom", dof),
):
    print(f"{label}: {value}")

# Interpret at the 5% significance level
verdict = "a significant" if p < 0.05 else "no significant"
print(f"There is {verdict} association between Diabetes and Stroke.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load both datasets fresh from disk
diabetes_data, stroke_data = (
    pd.read_csv(csv_path) for csv_path in ('diabetes_data.csv', 'stroke_data.csv')
)

# BMI categories from lowest to highest; passed to pd.Categorical below to fix their order
bmi_category_order = ['Underweight', 'Normal', 'Overweight', 'Obese']

# This function categorizes BMI values according to WHO standards
def get_bmi_category(bmi):
    """Return the BMI category label for a single BMI value.

    Parameters
    ----------
    bmi : float or None
        Body-mass index. May be missing (None or NaN).

    Returns
    -------
    str or None
        'Underweight' (< 18.5), 'Normal' (< 25), 'Overweight' (< 30) or
        'Obese' (>= 30); None when `bmi` is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing BMI
    # would fall through to the final branch and be labelled 'Obese'.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Diabetes dataset: work on a copy, add an ordered BMI category and a boolean outcome
diabetes_processed = diabetes_data.copy()
diabetes_processed['bmi_category'] = pd.Categorical(
    diabetes_processed['BMI'].apply(get_bmi_category),
    categories=bmi_category_order,
    ordered=True,
)
diabetes_processed['has_diabetes'] = diabetes_processed['Diabetes'] == 1
diabetes_processed['hypertension'] = diabetes_processed['HighBP']

# Stroke dataset: same treatment
stroke_processed = stroke_data.copy()
stroke_processed['bmi_category'] = pd.Categorical(
    stroke_processed['bmi'].apply(get_bmi_category),
    categories=bmi_category_order,
    ordered=True,
)
stroke_processed['has_stroke'] = stroke_processed['stroke'] == 1

# Disease prevalence per BMI category: the mean of the boolean outcome,
# expressed as a percentage rounded to two decimals
diabetes_distribution = (
    diabetes_processed.groupby('bmi_category')['has_diabetes'].mean().mul(100).round(2)
)
stroke_distribution = (
    stroke_processed.groupby('bmi_category')['has_stroke'].mean().mul(100).round(2)
)

# One row per BMI category, one column per disease
disease_data = pd.DataFrame({
    'Diabetes %': diabetes_distribution,
    'Stroke %': stroke_distribution,
})

# Create an enhanced visualization
# The style is set before the figure is created so it applies to this plot.
sns.set_style("whitegrid")

# Create the figure and axes explicitly and hand the axes to pandas. Without
# ax=..., DataFrame.plot opens its own figure, leaving an empty one behind and
# ignoring the requested size.
fig, ax = plt.subplots(figsize=(10, 6))

# Plot bars with custom colors and styling
disease_data.plot(kind='bar',
                  color=['#FF9999', '#66B2FF'],
                  width=0.8,
                  ax=ax)

# Enhance the visualization with proper labels and styling
ax.set_title('Disease Prevalence by BMI Category', fontsize=14, pad=20)
ax.set_xlabel('BMI Category', fontsize=12)
ax.set_ylabel('Prevalence (%)', fontsize=12)

# Place the legend outside the plotting area, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Add value labels on top of each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%.1f%%', padding=3)

fig.tight_layout()

plt.show()