# Load and inspect the data

In [1]:
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the files
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

data = config['files']
# extract each file
diabetes_data = pd.read_csv(data['diabetes'])
stroke_data = pd.read_csv(data['stroke'])

First, we will examine the impact of BMI, smoking status, and high blood pressure on diabetes separately.

In [2]:
diabetes_data.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   70692 non-null  float64
 1   Sex                   70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  GenHlth               70692 non-null  float64
 12  MentHlth              70692 non-null  float64
 13  PhysHlth              70692 non-null  float64
 14  DiffWalk              70692 non-null  float64
 15  Stroke             

In [4]:
# List of categorical columns I need
categorical_columns = ['Sex', 'Smoker', 'Stroke', 'Diabetes', 'HighBP']

# Convert these columns to 'category' type
for column in categorical_columns:
    diabetes_data[column] = diabetes_data[column].astype('category')

# Check the updated data types
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Age                   70692 non-null  float64 
 1   Sex                   70692 non-null  category
 2   HighChol              70692 non-null  float64 
 3   CholCheck             70692 non-null  float64 
 4   BMI                   70692 non-null  float64 
 5   Smoker                70692 non-null  category
 6   HeartDiseaseorAttack  70692 non-null  float64 
 7   PhysActivity          70692 non-null  float64 
 8   Fruits                70692 non-null  float64 
 9   Veggies               70692 non-null  float64 
 10  HvyAlcoholConsump     70692 non-null  float64 
 11  GenHlth               70692 non-null  float64 
 12  MentHlth              70692 non-null  float64 
 13  PhysHlth              70692 non-null  float64 
 14  DiffWalk              70692 non-null  float64 
 15  St

In [5]:
# Handling the missing values
print(f'Missing values before handling:\n{diabetes_data.isnull().sum()}')

Missing values before handling:
Age                     0
Sex                     0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Stroke                  0
HighBP                  0
Diabetes                0
dtype: int64


To plot the distribution status, we first need to take an overview of the statistical state of the data. Additionally, for further analysis, numerical data should be normalized. (This process is specific to numerical data, but I applied it to all columns, both numerical and non-numerical.)

In [6]:
diabetes_data.describe()

Unnamed: 0,Age,HighChol,CholCheck,BMI,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,8.584055,0.525703,0.975259,29.856985,0.14781,0.703036,0.611795,0.788774,0.042721,2.837082,3.752037,5.810417,0.25273
std,2.852153,0.499342,0.155336,7.113954,0.354914,0.456924,0.487345,0.408181,0.202228,1.113565,8.155627,10.062261,0.434581
min,1.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,7.0,0.0,1.0,25.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0
50%,9.0,1.0,1.0,29.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0
75%,11.0,1.0,1.0,33.0,0.0,1.0,1.0,1.0,0.0,4.0,2.0,6.0,1.0
max,13.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0


In [7]:
# Filter out irrational BMI values based on the result of describe command
invalid_bmi = ((diabetes_data['BMI'] <= 10) | (diabetes_data['BMI'] >= 60)).sum()
print(f"Invalid BMI values: {invalid_bmi}")
diabetes_data = diabetes_data[(diabetes_data['BMI'] >= 10) & (diabetes_data['BMI'] <= 60)]


Invalid BMI values: 289


## Plot the distribution

In [8]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource

output_notebook()

# Function to create consistent boxplots with hover tools
def create_boxplot(data, title):
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    median = data.median()
    iqr = q3 - q1
    lower_whisker = data[data >= (q1 - 1.5 * iqr)].min()
    upper_whisker = data[data <= (q3 + 1.5 * iqr)].max()
    outliers = data[(data < lower_whisker) | (data > upper_whisker)]

    p = figure(title=title, height=300, width=300, tools='hover,pan,wheel_zoom,box_zoom,reset')
    p.vbar(x=0, width=0.5, bottom=q1, top=q3, fill_color='skyblue', line_color='black')
    p.line([-.25, .25], [median, median], line_color='red', line_width=2)
    p.segment(x0=0, x1=0, y0=q3, y1=upper_whisker, line_color='black')
    p.segment(x0=0, x1=0, y0=q1, y1=lower_whisker, line_color='black')
    p.circle(x=[0]*len(outliers), y=outliers, color='red', size=5)

    hover = HoverTool(tooltips=[
        ('Median', f'{median:.2f}'),
        ('Q1', f'{q1:.2f}'),
        ('Q3', f'{q3:.2f}'),
        ('IQR', f'{iqr:.2f}'),
        ('Lower Whisker', f'{lower_whisker:.2f}'),
        ('Upper Whisker', f'{upper_whisker:.2f}')
    ])
    p.add_tools(hover)
    return p

# Function to create bar plots with interactive hover tools
def create_barplot(data, categories, title, colors=None):
    source = ColumnDataSource(data=dict(categories=categories, counts=data, colors=colors))
    p = figure(title=title, x_range=categories, height=300, width=300, tools='hover,pan,wheel_zoom,box_zoom,reset')
    p.vbar(x='categories', top='counts', width=0.5, color='colors', source=source)
    hover = HoverTool(tooltips=[('Category', '@categories'), ('Count', '@counts')])
    p.add_tools(hover)
    return p

# BMI and Age Boxplots
bmi_plot = create_boxplot(diabetes_data['BMI'], 'BMI Distribution')
age_plot = create_boxplot(diabetes_data['Age'], 'Age Distribution')

# Interactive Categorical Bar Plots
smoker_counts = diabetes_data['Smoker'].value_counts()
smoker_plot = create_barplot(smoker_counts.values, ['Non-Smoker', 'Smoker'], 'Smokers Distribution', ['coral', 'coral'])

diabetes_counts = diabetes_data['Diabetes'].value_counts()
diabetes_plot = create_barplot(diabetes_counts.values, ['No Diabetes', 'Diabetes'], 'Diabetes Status', ['green', 'green'])

bp_counts = diabetes_data['HighBP'].value_counts()
bp_plot = create_barplot(bp_counts.values, ['Normal', 'High BP'], 'High Blood Pressure', ['purple', 'purple'])

sex_counts = diabetes_data['Sex'].value_counts()
sex_plot = create_barplot(sex_counts.values, ['Female', 'Male'], 'Sex Distribution', ['pink', 'blue'])

# Arrange plots in a 2-row grid layout
grid = gridplot([
    [bmi_plot, smoker_plot, diabetes_plot],
    [age_plot, bp_plot, sex_plot]
])

show(grid)




As you can see, since age and BMI are numerical, I used a boxplot to display their distribution. For the categorical variables, I used a bar chart, which clearly shows the distribution of each one in the Diabetes dataset.

In this section, the impact of the three parameters I previously mentioned, along with the age parameter, on diabetes is examined.

In [32]:
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import webbrowser
from threading import Timer

# Load dataset
diabetes_data = pd.read_csv('diabetes_data.csv')

# Convert binary columns to categorical
diabetes_data['Diabetes'] = diabetes_data['Diabetes'].map({0: 'No', 1: 'Yes'})
diabetes_data['Smoker'] = diabetes_data['Smoker'].map({0: 'No', 1: 'Yes'})
diabetes_data['HighBP'] = diabetes_data['HighBP'].map({0: 'No', 1: 'Yes'})

# Filter BMI data
diabetes_data = diabetes_data[(diabetes_data['BMI'] >= 10) & (diabetes_data['BMI'] <= 60)]

# Create Dash app
app = dash.Dash(__name__)
app.title = "Diabetes Risk Factors Dashboard"

# Function to generate plots
def generate_plots(data):
    continuous_vars = ['BMI', 'Age']
    categorical_vars = ['Smoker', 'HighBP']
    colors = ['#2b908f', '#ff7c43']
    figures = {}

    # Continuous Variables (Boxplots)
    for variable in continuous_vars:
        fig = px.box(
            data,
            x="Diabetes",
            y=variable,
            color="Diabetes",
            color_discrete_sequence=colors,
            labels={"Diabetes": "Diabetes Status"},
            title=f'{variable} Distribution by Diabetes Status'
        )
        fig.update_layout(
            showlegend=False,
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        
        # Set y-axis range
        if variable == 'BMI':
            fig.update_yaxes(range=[10, 60])
        elif variable == 'Age':
            fig.update_yaxes(range=[1, 13])
        
        figures[variable] = fig

    # Categorical Variables (Bar Plots)
    for variable in categorical_vars:
        cross_tab = pd.crosstab(data[variable], data['Diabetes'], normalize='index') * 100
        cross_tab = cross_tab.reset_index().melt(id_vars=variable, value_name="Percentage", var_name="Diabetes")
        
        fig = px.bar(
            cross_tab,
            x=variable,
            y="Percentage",
            color="Diabetes",
            barmode="group",
            text="Percentage",
            color_discrete_sequence=colors,
            title=f'{variable} Distribution by Diabetes Status'
        )
        
        fig.update_traces(
            texttemplate='%{text:.2f}%',
            textposition='inside',
            insidetextanchor='middle'
        )

        fig.update_layout(
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        figures[variable] = fig

    return figures

# Generate initial figures
figures = generate_plots(diabetes_data)

# Layout of the app
app.layout = html.Div([
    html.H1(
        "Diabetes Risk Factors Dashboard", 
        style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}
    ),
    html.Div([
        html.Label("Select a Plot to View:", style={'fontSize': '14px'}),
        dcc.Dropdown(
            id="plot-dropdown",
            options=[
                {"label": "BMI Distribution by Diabetes Status", "value": "BMI"},
                {"label": "Age Distribution by Diabetes Status", "value": "Age"},
                {"label": "Smoker Distribution by Diabetes Status", "value": "Smoker"},
                {"label": "HighBP Distribution by Diabetes Status", "value": "HighBP"}
            ],
            value="BMI",
            clearable=False,
            style={'width': '60%', 'fontSize': '12px'}
        )
    ], style={'margin': '10px 0'}),
    dcc.Graph(
        id="selected-plot", 
        style={'height': '400px'}
    ),
    html.Button("Save Plot as HTML", id="save-button", n_clicks=0),
    html.Div(id="save-message", style={"marginTop": "10px", "fontSize": "14px", "color": "green"})
])

# Callback to update plot based on dropdown selection
@app.callback(
    Output("selected-plot", "figure"),
    [Input("plot-dropdown", "value")]
)
def update_plot(selected_plot):
    return figures[selected_plot]

# Callback to save the selected plot as HTML
@app.callback(
    Output("save-message", "children"),
    [Input("save-button", "n_clicks")],
    [Input("plot-dropdown", "value")]
)
def save_plot(n_clicks, selected_plot):
    if n_clicks > 0:
        filename = f"{selected_plot}_plot.html"
        figures[selected_plot].write_html(filename)
        return f"Plot saved as {filename}!"

# Function to open the app in a web browser
def open_browser():
    webbrowser.open_new("http://127.0.0.1:8052/")  # Use port 8052

if __name__ == "__main__":
    Timer(1, open_browser).start()
    app.run_server(debug=False, use_reloader=False, port=8052)  # Use port 8052 explicitly


### Conclusion
Briefly, as you can see in the plot above, individuals with diabetes tend to have a higher BMI on average. Also, the risk of developing diabetes increases with age. Moreover, according to the plots, a high percentage of people with diabetes smoke and have high blood pressure.

In [10]:
import pandas as pd
import statsmodels.api as sm

# Load the data
diabetes_data = pd.read_csv('diabetes_data.csv')

# Convert categorical columns to numeric
diabetes_data['Smoker'] = diabetes_data['Smoker'].map({0: 0, 1: 1})
diabetes_data['HighBP'] = diabetes_data['HighBP'].map({0: 0, 1: 1})
diabetes_data['Diabetes'] = diabetes_data['Diabetes'].map({0: 0, 1: 1})

# Define the independent variables (X) and dependent variable (y)
X = diabetes_data[['Smoker', 'Age', 'BMI', 'HighBP']]
y = diabetes_data['Diabetes']

# Add a constant term for the intercept
X = sm.add_constant(X)

# Fit the logistic regression model
model = sm.Logit(y, X).fit()

print(model.summary())


Optimization terminated successfully.
         Current function value: 0.567019
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Diabetes   No. Observations:                70692
Model:                          Logit   Df Residuals:                    70687
Method:                           MLE   Df Model:                            4
Date:                Sat, 01 Feb 2025   Pseudo R-squ.:                  0.1820
Time:                        19:37:19   Log-Likelihood:                -40084.
converged:                       True   LL-Null:                       -49000.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.2158      0.059    -89.073      0.000      -5.331      -5.101
Smoker         0.2122      0.

To measure correlation, Pearson correlation measures the linear relationship between two continuous numerical variables. It quantifies how changes in one variable are associated with changes in another on a continuous scale. However, in our case, most of our parameters—such as Smoker, HighBP, and Diabetes—are categorical binary variables (0 or 1). This makes Pearson correlation an unsuitable metric for analyzing relationships between these factors. So I used regression.

### Regression conclusion
According to the coefficients and measuring Odds ratio, we can find out that HBP is the strongest predictor of diabetes in this model, and age and BMI also contribute significantly, showing that older and heavier individuals have a higher risk. Furthurmore, smoking has a smaller but still significant effect on diabetes risk.

# Load and inspect the Stroke data

In [11]:
stroke_data.head()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1


In [12]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40910 entries, 0 to 40909
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                40907 non-null  float64
 1   age                40910 non-null  float64
 2   hypertension       40910 non-null  int64  
 3   heart_disease      40910 non-null  int64  
 4   ever_married       40910 non-null  int64  
 5   work_type          40910 non-null  int64  
 6   Residence_type     40910 non-null  int64  
 7   avg_glucose_level  40910 non-null  float64
 8   bmi                40910 non-null  float64
 9   smoking_status     40910 non-null  int64  
 10  stroke             40910 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 3.4 MB


In [13]:
print(f'Missing values before handling:\n{stroke_data.isnull().sum()}')

Missing values before handling:
sex                  3
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


In [14]:
# The number of missing data is too small against the dataset size, so I remove them.
stroke_data.dropna(inplace=True)
stroke_data.isnull().sum()

sex                  0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [15]:
stroke_data.describe()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0,40907.0
mean,0.555162,51.327303,0.213851,0.127729,0.821326,3.461095,0.514851,122.079679,30.406488,0.488572,0.500159
std,0.496954,21.624171,0.410028,0.333792,0.383083,0.780934,0.499786,57.561951,6.835305,0.499875,0.500006
min,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,55.12,11.5,0.0,0.0
25%,0.0,35.0,0.0,0.0,1.0,3.0,0.0,78.75,25.9,0.0,0.0
50%,1.0,52.0,0.0,0.0,1.0,4.0,1.0,97.92,29.4,0.0,1.0
75%,1.0,68.0,0.0,0.0,1.0,4.0,1.0,167.59,34.1,1.0,1.0
max,1.0,103.0,1.0,1.0,1.0,4.0,1.0,271.74,92.0,1.0,1.0


Based on the describe result, it seems that we have some invalid values in Age and BMI columns.

In [16]:
# ٍFilter age and BMI
stroke_data = stroke_data[stroke_data['age'] >= 0]
stroke_data = stroke_data[(stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)]

stroke_data.describe()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0,40794.0
mean,0.554836,51.406628,0.213291,0.127862,0.821224,3.460386,0.514978,122.087292,30.354528,0.488773,0.500123
std,0.49699,21.545876,0.409637,0.33394,0.38317,0.781351,0.499782,57.560918,6.687059,0.49988,0.500006
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.12,11.5,0.0,0.0
25%,0.0,35.0,0.0,0.0,1.0,3.0,0.0,78.7825,25.8,0.0,0.0
50%,1.0,52.0,0.0,0.0,1.0,4.0,1.0,97.95,29.4,0.0,1.0
75%,1.0,68.0,0.0,0.0,1.0,4.0,1.0,167.545,34.1,1.0,1.0
max,1.0,103.0,1.0,1.0,1.0,4.0,1.0,271.74,59.7,1.0,1.0


The age column in the diabetes dataset was already categorized beforehand; therefore, I manually categorized the age column in the stroke dataset to ensure consistency.

In [17]:
# Define age bins and labels
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, np.inf]
labels = range(14)  # 0 to 13

# Create age buckets for diabetes data
stroke_data['age_bucket'] = pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)
stroke_data.head()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_bucket
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1,9
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1,5
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1,9
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1,5
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1,13


## Plot the distribution

In [18]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource

output_notebook()

# Function to create consistent boxplots with hover tools
def create_boxplot(data, title):
    # Calculate statistics
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    median = data.median()
    iqr = q3 - q1
    lower_whisker = data[data >= (q1 - 1.5 * iqr)].min()
    upper_whisker = data[data <= (q3 + 1.5 * iqr)].max()
    outliers = data[(data < lower_whisker) | (data > upper_whisker)]

    p = figure(title=title, height=300, width=300, tools='hover,pan,wheel_zoom,box_zoom,reset')
    
    p.vbar(x=0, width=0.5, bottom=q1, top=q3, fill_color='skyblue', line_color='black')
    
    p.line([-.25, .25], [median, median], line_color='red', line_width=2)
    
    p.segment(x0=0, x1=0, y0=q3, y1=upper_whisker, line_color='black')
    p.segment(x0=0, x1=0, y0=q1, y1=lower_whisker, line_color='black')
    
    p.circle(x=[0]*len(outliers), y=outliers, color='red', size=5)
    
    hover = HoverTool(tooltips=[
        ('Median', f'{median:.2f}'),
        ('Q1', f'{q1:.2f}'),
        ('Q3', f'{q3:.2f}'),
        ('IQR', f'{iqr:.2f}'),
        ('Lower Whisker', f'{lower_whisker:.2f}'),
        ('Upper Whisker', f'{upper_whisker:.2f}')
    ])
    p.add_tools(hover)
    
    return p

# Function to create bar plots with interactive hover tools
def create_barplot(data, categories, title, colors=None):
    source = ColumnDataSource(data=dict(
        categories=categories,
        counts=data,
        colors=colors if colors else ['#1f77b4']*len(categories)
    ))
    
    p = figure(title=title, x_range=categories, height=300, width=300, 
              tools='hover,pan,wheel_zoom,box_zoom,reset')
    
    p.vbar(x='categories', top='counts', width=0.5, color='colors', source=source)
    
    hover = HoverTool(tooltips=[
        ('Category', '@categories'),
        ('Count', '@counts')
    ])
    p.add_tools(hover)
    
    return p

# Read and process the stroke data
stroke_data = pd.read_csv('stroke_data.csv')

# Clean the data
stroke_data.dropna(inplace=True)
stroke_data = stroke_data[stroke_data['age'] >= 1]
stroke_data = stroke_data[(stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)]

# Define age bins and labels
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, np.inf]
labels = range(14)  # 0 to 13

# Create age buckets
stroke_data['age_bucket'] = pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)

# Create the box plots
bmi_plot = create_boxplot(stroke_data['bmi'], 'BMI Distribution')
age_bucket_plot = create_boxplot(stroke_data['age_bucket'].astype(float), 'Age Bucket Distribution')

# Create the categorical bar plots
# Smoking status
smoker_counts = stroke_data['smoking_status'].value_counts()
smoker_plot = create_barplot(
    smoker_counts.values, 
    ['Non-Smoker', 'Smoker'], 
    'Smokers Distribution', 
    ['coral', 'coral']
)

# Stroke status
stroke_counts = stroke_data['stroke'].value_counts()
stroke_plot = create_barplot(
    stroke_counts.values,
    ['No Stroke', 'Stroke'],
    'Stroke Status',
    ['green', 'green']
)

# Blood pressure status
bp_counts = stroke_data['hypertension'].value_counts()
bp_plot = create_barplot(
    bp_counts.values,
    ['Normal', 'High BP'],
    'High Blood Pressure',
    ['purple', 'purple']
)

# Sex distribution
sex_counts = stroke_data['sex'].value_counts()
sex_plot = create_barplot(
    sex_counts.values,
    ['Female', 'Male'],
    'Sex Distribution',
    ['pink', 'blue']
)

# Arrange plots in a 2-row grid layout
grid = gridplot([
    [bmi_plot, smoker_plot, stroke_plot],
    [age_bucket_plot, bp_plot, sex_plot]
])

show(grid)


'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.


'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.



As we can see, 
- most people have a BMI between 25-30, with a median around 28. There are some extreme outliers, indicating severe obesity in a few cases.
- The dataset has a fairly even split between smokers and non-smokers, with non-smokers slightly outnumbering smokers.
- The distribution between stroke and non-stroke cases is balanced.
Similarly, the distribution status of other parameters can be examined from the plots.

In [19]:
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import webbrowser
from threading import Timer

# Load data from the CSV file
stroke_data = pd.read_csv('stroke_data.csv')

# Convert binary columns to categorical
stroke_data['stroke'] = stroke_data['stroke'].map({0: 'No', 1: 'Yes'})
stroke_data['smoking_status'] = stroke_data['smoking_status'].map({0: 'No', 1: 'Yes'})
stroke_data['hypertension'] = stroke_data['hypertension'].map({0: 'No', 1: 'Yes'})

# Filter BMI data
stroke_data = stroke_data[(stroke_data['bmi'] >= 10) & (stroke_data['bmi'] <= 60)]

# Create age buckets
bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, float('inf')]
labels = range(14)
stroke_data['age_bucket'] = pd.cut(stroke_data['age'], bins=bins, labels=labels, right=False)

# Create Dash app
app = dash.Dash(__name__)
app.title = "Stroke Risk Factors Dashboard"

# Function to generate plots
def generate_plots(data):
    continuous_vars = ['bmi', 'age_bucket']
    categorical_vars = ['smoking_status', 'hypertension']
    colors = ['#2b908f', '#ff7c43']
    figures = {}

    # Continuous Variables (Boxplots)
    for variable in continuous_vars:
        fig = px.box(
            data,
            x="stroke",
            y=variable,
            color="stroke",
            color_discrete_sequence=colors,
            labels={"stroke": "Stroke Status"},
            title=f'{variable.capitalize()} Distribution by Stroke Status'
        )
        fig.update_layout(
            showlegend=False,
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        
        if variable == 'bmi':
            fig.update_yaxes(range=[10, 60])
        elif variable == 'age_bucket':
            fig.update_yaxes(range=[0, 13])
        
        figures[variable] = fig

    # Categorical Variables (Bar Plots)
    for variable in categorical_vars:
        cross_tab = pd.crosstab(data[variable], data['stroke'], normalize='index') * 100
        cross_tab = cross_tab.reset_index().melt(id_vars=variable, value_name="Percentage", var_name="stroke")
        
        fig = px.bar(
            cross_tab,
            x=variable,
            y="Percentage",
            color="stroke",
            barmode="group",
            text="Percentage",
            color_discrete_sequence=colors,
            title=f'{variable.capitalize()} Distribution by Stroke Status'
        )
        
        fig.update_traces(
            texttemplate='%{text:.2f}%',
            textposition='inside',
            insidetextanchor='middle'
        )

        fig.update_layout(
            title_font_size=12,
            font_size=10,
            margin=dict(l=20, r=20, t=30, b=20)
        )
        figures[variable] = fig

    return figures

# Generate initial figures
figures = generate_plots(stroke_data)

# Layout of the app
app.layout = html.Div([
    html.H1(
        "Stroke Risk Factors Dashboard", 
        style={'textAlign': 'center', 'fontSize': '20px', 'marginBottom': '10px'}
    ),
    html.Div([
        html.Label("Select a Plot to View:", style={'fontSize': '14px'}),
        dcc.Dropdown(
            id="plot-dropdown",
            options=[
                {"label": "BMI Distribution by Stroke Status", "value": "bmi"},
                {"label": "Age Distribution by Stroke Status", "value": "age_bucket"},
                {"label": "Smoker Distribution by Stroke Status", "value": "smoking_status"},
                {"label": "Hypertension Distribution by Stroke Status", "value": "hypertension"}
            ],
            value="bmi",
            clearable=False,
            style={'width': '60%', 'fontSize': '12px'}
        )
    ], style={'margin': '10px 0'}),
    dcc.Graph(
        id="selected-plot", 
        style={'height': '400px'}
    )
])

@app.callback(
    Output("selected-plot", "figure"),
    [Input("plot-dropdown", "value")]
)
def update_plot(selected_plot):
    return figures[selected_plot]

def open_browser():
    webbrowser.open_new("http://127.0.0.1:8060/")

if __name__ == "__main__":
    Timer(1, open_browser).start()
    app.run_server(debug=False, use_reloader=False, port=8060)


### Conclusion
Based on the interactive plots, it is easy to see that the risk of stroke is higher in individuals who smoke and have high blood pressure.  

One important point to note is that, in reality, the risk of stroke increases with age. However, this cannot be concluded from the charts. Therefore, there is a high probability that the values in the age column are inaccurate.

In [20]:
# prepare the data to merge
# rename the Diabetes data corresponding to the columns in the stroke data
diabetes_data = diabetes_data.rename(columns={
    'Age': 'age',
    'Sex': 'sex',
    'BMI': 'bmi',
    'Smoker': 'smoking_status',
    'Stroke': 'stroke',
    'HighBP': 'hypertension'
})
diabetes_data.head()

Unnamed: 0,age,sex,HighChol,CholCheck,bmi,smoking_status,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,stroke,hypertension,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1,0
1,12.0,1.0,1.0,1.0,26.0,1,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1,0
2,13.0,1.0,0.0,1.0,26.0,0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0,0
3,11.0,1.0,1.0,1.0,28.0,1,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1,0
4,8.0,0.0,0.0,1.0,29.0,1,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0,0


Based on the interactive charts, it is easy to see that the risk of stroke is higher in individuals who smoke and have high blood pressure.  
One important point to note is that, in reality, the risk of stroke increases with age. However, this cannot be concluded from the charts. Therefore, there is a high probability that the values in the age column are inaccurate.

In [21]:
# Define BMI buckets
bmi_bins = [0, 18.5, 25, 30, float('inf')] 
bmi_labels = ["Underweight", "Normal weight", "Overweight", "Obese"]

# Create BMI buckets for diabetes data
diabetes_data['bmi_bucket'] = pd.cut(diabetes_data['bmi'], bins=bmi_bins, labels=bmi_labels)

# Create BMI buckets for stroke data
stroke_data['bmi_bucket'] = pd.cut(stroke_data['bmi'], bins=bmi_bins, labels=bmi_labels)

In [22]:
diabetes_data

Unnamed: 0,age,sex,HighChol,CholCheck,bmi,smoking_status,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,stroke,hypertension,Diabetes,bmi_bucket
0,4.0,1.0,0.0,1.0,26.0,0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1,0,Overweight
1,12.0,1.0,1.0,1.0,26.0,1,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1,0,Overweight
2,13.0,1.0,0.0,1.0,26.0,0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0,0,Overweight
3,11.0,1.0,1.0,1.0,28.0,1,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1,0,Overweight
4,8.0,0.0,0.0,1.0,29.0,1,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0,0,Overweight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,6.0,0.0,1.0,1.0,37.0,0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0,1,Obese
70688,10.0,1.0,1.0,1.0,29.0,1,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0,1,Overweight
70689,13.0,0.0,1.0,1.0,25.0,0,1.0,0.0,1.0,0.0,0.0,5.0,15.0,0.0,1.0,0.0,1,1,Normal weight
70690,11.0,0.0,1.0,1.0,18.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1,1,Underweight


In [23]:
stroke_data

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_bucket,bmi_bucket
0,1.0,63.0,No,1,1,4,1,228.69,36.6,Yes,Yes,9,Obese
1,1.0,42.0,No,1,1,4,0,105.92,32.5,No,Yes,5,Obese
2,0.0,61.0,No,0,1,4,1,171.23,34.4,Yes,Yes,9,Obese
3,1.0,41.0,Yes,0,1,3,0,174.12,24.0,No,Yes,5,Normal weight
4,1.0,85.0,No,0,1,4,1,186.21,29.0,Yes,Yes,13,Overweight
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40905,1.0,38.0,No,0,0,4,1,120.94,29.7,Yes,No,4,Overweight
40906,0.0,53.0,No,0,1,4,0,77.66,40.8,No,No,7,Obese
40907,1.0,32.0,No,0,1,2,0,231.95,33.2,No,No,3,Obese
40908,1.0,42.0,No,0,1,3,0,216.38,34.5,No,No,5,Obese


## Data wrangling

In [24]:
# remove the age column and rename the age_bucket to age to be the same
stroke_data = stroke_data.drop('age', axis=1)
stroke_data = stroke_data.rename(columns={'age_bucket': 'age'})

# filter and preprocess relevant columns from two datasets—one related to stroke and the other to diabetes—to prepare them for analysis or modeling
stroke_filtered = stroke_data[['age', 'sex', 'bmi_bucket', 'smoking_status', 'hypertension', 'stroke']]
diabetes_filtered = diabetes_data[['age', 'sex', 'bmi_bucket', 'smoking_status', 'hypertension', 'Diabetes']]

diabetes_filtered['hypertension'] = diabetes_filtered['hypertension'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
stroke_filtered['hypertension'] = stroke_filtered['hypertension'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

diabetes_filtered['smoking_status'] = diabetes_filtered['smoking_status'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
stroke_filtered['smoking_status'] = stroke_filtered['smoking_status'].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

# Ensure the column is of integer type
diabetes_filtered['hypertension'] = diabetes_filtered['hypertension'].astype(int)
stroke_filtered['hypertension'] = stroke_filtered['hypertension'].astype(int)

merged_data = pd.merge(diabetes_filtered, stroke_filtered, on=['age', 'smoking_status', 'hypertension', 'bmi_bucket'], how='inner')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [25]:
merged_data

Unnamed: 0,age,sex_x,bmi_bucket,smoking_status,hypertension,Diabetes,sex_y,stroke
0,4.0,1.0,Overweight,0,0,0,0.0,Yes
1,4.0,1.0,Overweight,0,0,0,1.0,Yes
2,4.0,1.0,Overweight,0,0,0,1.0,Yes
3,4.0,1.0,Overweight,0,0,0,1.0,Yes
4,4.0,1.0,Overweight,0,0,0,0.0,Yes
...,...,...,...,...,...,...,...,...
28544620,9.0,0.0,Normal weight,0,0,1,1.0,No
28544621,9.0,0.0,Normal weight,0,0,1,0.0,No
28544622,9.0,0.0,Normal weight,0,0,1,0.0,No
28544623,9.0,0.0,Normal weight,0,0,1,1.0,No


In [26]:
merged_data.isnull().sum()

age               0
sex_x             0
bmi_bucket        0
smoking_status    0
hypertension      0
Diabetes          0
sex_y             0
stroke            0
dtype: int64

In [27]:
# Filter for individuals with both diabetes and stroke
merged_diabetics_stroke = merged_data[(merged_data['Diabetes'] == 1) & (merged_data['stroke'] == 'Yes')]

# Count the number of individuals with both conditions
number_of_diabetics_with_stroke = len(merged_diabetics_stroke)

print(f"Number of individuals with both diabetes and stroke: {number_of_diabetics_with_stroke}")

diabetics_stroke_perc = (number_of_diabetics_with_stroke / len(merged_data)) * 100
print(f"percentage of individuals with both diabetes and stroke: {diabetics_stroke_perc:.2f}%")


Number of individuals with both diabetes and stroke: 6028905
percentage of individuals with both diabetes and stroke: 21.12%


In [28]:
diabetics = merged_data[merged_data['Diabetes'] == 1]
non_diabetics = merged_data[merged_data['Diabetes'] == 0]

stroke_in_diabetics = len(diabetics[diabetics['stroke'] == 'Yes'])
stroke_in_non_diabetics = len(non_diabetics[non_diabetics['stroke'] == 'Yes'])

stroke_in_diabetics_perc = (stroke_in_diabetics / len(diabetics)) * 100
stroke_in_non_diabetics_perc = (stroke_in_non_diabetics / len(non_diabetics)) * 100

print(f"Stroke prevalence in diabetics: {stroke_in_diabetics_perc:.2f}%")
print(f"Stroke prevalence in non-diabetics: {stroke_in_non_diabetics_perc:.2f}%")


Stroke prevalence in diabetics: 39.61%
Stroke prevalence in non-diabetics: 36.98%


In [29]:
import statsmodels.api as sm
import pandas as pd

# Encode the dependent variable ('stroke') as binary (1 = Yes, 0 = No)
merged_data['stroke_binary'] = merged_data['stroke'].apply(lambda x: 1 if x == 'Yes' else 0)

# Define independent variables (Diabetes and any confounders)
X = merged_data[['Diabetes']]  # Add other variables like 'Age', 'BMI' if needed
X = sm.add_constant(X)  # Add intercept

# Define dependent variable
y = merged_data['stroke_binary']

# Logistic regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Print summary
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.665542
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          stroke_binary   No. Observations:             28544625
Model:                          Logit   Df Residuals:                 28544623
Method:                           MLE   Df Model:                            1
Date:                Sat, 01 Feb 2025   Pseudo R-squ.:               0.0005459
Time:                        19:38:06   Log-Likelihood:            -1.8998e+07
converged:                       True   LL-Null:                   -1.9008e+07
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.5330      0.001   -939.228      0.000      -0.534      -0.532
Diabetes       0.1112      0.

## Conclusion
- The coefficient for Diabetes (0.1112) is positive, indicating that having diabetes increases the likelihood of having a stroke.
The p-value (0.000) suggests this effect is highly statistically significant.
- The negative intercept (-0.5330) represents the baseline log-odds of having a stroke when diabetes is absent.
- The pseudo R-squared (0.0005459) suggests a very low explanatory power of diabetes alone in predicting stroke.

Diabetes is a statistically significant predictor of stroke, but the low pseudo R-squared value suggests that other factors also play a crucial role in determining stroke risk.

In [30]:
import pandas as pd
from scipy.stats import chi2_contingency

# Create a contingency table
contingency_table = pd.crosstab(merged_data['Diabetes'], merged_data['stroke'])

# Perform the Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Output the results
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")

# Interpretation
if p < 0.05:
    print("There is a significant association between Diabetes and Stroke.")
else:
    print("There is no significant association between Diabetes and Stroke.")


Chi-Square Statistic: 20737.54199998552
P-value: 0.0
Degrees of Freedom: 1
There is a significant association between Diabetes and Stroke.


For further verification, I also used the Chi-Squared method, which is suitable for examining the relationship between two categorical variables. The result was similar to the regression outcome.