Importing Libraries and Designing Visualizations

In [7]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pywaffle.waffle import Waffle
%matplotlib inline
sns.set_style("darkgrid")

colors6 = sns.color_palette(['#1337f5', '#E80000', '#0f1e41', '#fd523e', '#404e5c', '#c9bbaa'], 6)
colors2 = sns.color_palette(['#1337f5', '#E80000'], 2)
colors1 = sns.color_palette(['#1337f5'], 1)

ImportError: cannot import name 'DataError' from 'pandas.core.base' (C:\Users\Administrator\anaconda3\Lib\site-packages\pandas\core\base.py)

Data Reading

In [2]:
df = pd.read_csv("data/dataset.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
ProfileReport(df)

NameError: name 'ProfileReport' is not defined

In [None]:
df.info()

Numerical

In [None]:
df.describe()

Categorical

In [None]:
df.describe(include="object")

EDA

In [None]:
def show_relation(col, according_to, type_='dis'):
    """
    Visualize relationships between variables using Seaborn.
    
    Parameters:
    col (str): The column to be plotted.
    according_to (str or None): The column to be used for grouping or hue (coloring). Can be None.
    type_ (str): Type of visualization, 'dis' for distribution or 'count' for count/percentage.
    
    Returns:
    None
    """
    
    plt.figure(figsize=(15, 7))
    
    if type_ == 'dis':
        sns.displot(data=df, x=col, hue=according_to, kind='kde', palette=colors2)
    elif type_ == 'count':
        if according_to is not None:
            perc = df.groupby(col)[according_to].value_counts(normalize=True).reset_index(name='Percentage')
            sns.barplot(data=perc, x=col, y='Percentage', hue=according_to, palette=colors6,
                        order=df[col].value_counts().index)
        else:
            sns.countplot(data=df, x=col, hue=according_to, palette=colors1, order=df[col].value_counts().index)
    
    if according_to is None:
        plt.title(f'{col}')
    else:
        plt.title(f'{col} according to {according_to}')

Percentage of people with heart disease

In [None]:
# get percentage of attrition then convert to dicrionary
disease_size = (df.groupby('HeartDisease').size()*100 / len(df)).to_dict()

# create figure
fig = plt.figure(
    FigureClass=Waffle, # type = waffle figure
    rows=5, # rows of people
    figsize = (9,3),
    values=disease_size, # data

    # legend labels
    labels=[f"{k} ({round(v / sum(disease_size.values()) * 100, 2)}%)" 
            for k, v in disease_size.items()],
    # colors for attrition and no attrition
    colors=(colors2[0], colors2[1]),
    # icons set to person for both attrition and no attriton
    icons = ['heart','heart'],
    
    # the legend at the bottom, after playing with the 
    # locations i centered it at the bottom
    legend={'loc': 'lower center',
            'bbox_to_anchor': (0.5, -0.5),
            'ncol': len(disease_size),
            'framealpha': 0,
            'fontsize': 20
          },

    # size of icons (people) 
    icon_size=20,
    # add icon to the legend at the bottom 
    icon_legend=True,

    #title of the waffle graph
    title={
        'label': 'Heart Disease Per 100 People',
        'loc': 'center',
        'fontdict': {'fontsize': 20}
          }
)

BMI and heart disease

In [None]:
show_relation(num_cols[0], 'HeartDisease');

Mental health and heart disease

In [None]:
show_relation(num_cols[2], 'HeartDisease')

Age and heart disease

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 6))

sns.histplot(data=df.loc[df.HeartDisease == 'No'].sort_values("AgeCategory"), x='AgeCategory',
                  color=colors1, ax=ax1);
ax1.set_title("Age Distribution of Poeple Without Heart Disease")

sns.histplot(data=df.loc[df.HeartDisease == 'Yes'].sort_values("AgeCategory"), x='AgeCategory',
                  color=colors2[1], ax=ax2);
ax2.set_title("Age Distribution of Heart Disease Patients")


fig.tight_layout()

Gender and Heart Disease

In [None]:
show_relation(obj_cols[4], 'HeartDisease', type_='count')

Race and heart disease

In [None]:
plt.figure(figsize=(16, 6), dpi=80)
x = df.groupby('Race').HeartDisease.value_counts(normalize=True).reset_index(name='Percentage')
x = x.loc[x.HeartDisease == 'Yes'].sort_values('Percentage', ascending=False)

sns.barplot(data=x, x='Race', y='Percentage', palette=colors6);

General health perception and heart disease

In [5]:
x = df.groupby('HeartDisease').GenHealth.value_counts(normalize=True).reset_index(name='Percentage')
x = x.sort_values(by='GenHealth', key=lambda x: x.map({'Excellent': 0, 
                                                       'Very good': 1, 
                                                       'Good': 2, 
                                                       'Fair': 3, 
                                                       'Poor': 4} ))

x.Percentage = round(x.Percentage * 100, 1) 

fig = px.bar(data_frame=x, x='Percentage', y='HeartDisease', color='GenHealth',
       text=x.Percentage.map(lambda x: str(x) + '%'), 
       color_discrete_sequence=['#1337f5', '#E80000', '#0f1e41', '#fd523e', '#404e5c', '#c9bbaa'])
fig.update_layout(title="What Individuals Say Their General Health Is", title_x=0.5)

fig.show()

NameError: name 'px' is not defined