## Importing the dataset and reading its details

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

patients_data = pd.read_csv('heart_2022_with_nans.csv')
patients_data

### Info about quantitative variables

In [18]:
patients_data.describe()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,434205.0,436065.0,439679.0,416480.0,403054.0,396326.0
mean,4.347919,4.382649,7.022983,1.702691,83.07447,28.529842
std,8.688912,8.387475,1.502425,0.107177,21.448173,6.554889
min,0.0,0.0,1.0,0.91,22.68,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.13
50%,0.0,0.0,7.0,1.7,80.74,27.44
75%,3.0,5.0,8.0,1.78,95.25,31.75
max,30.0,30.0,24.0,2.41,292.57,99.64


## Dataset treatment

### Drop irrelevant columns

In [3]:
patients_data = patients_data.drop(columns=['State'], axis=1)
patients_data.isnull().sum()

Unnamed: 0,0
Sex,0
GeneralHealth,1198
PhysicalHealthDays,10927
MentalHealthDays,9067
LastCheckupTime,8308
PhysicalActivities,1093
SleepHours,5453
RemovedTeeth,11360
HadHeartAttack,3065
HadAngina,4405


Reading the info again

In [20]:
patients_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sex                        445132 non-null  object 
 1   GeneralHealth              443934 non-null  object 
 2   PhysicalHealthDays         434205 non-null  float64
 3   MentalHealthDays           436065 non-null  float64
 4   LastCheckupTime            436824 non-null  object 
 5   PhysicalActivities         444039 non-null  object 
 6   SleepHours                 439679 non-null  float64
 7   RemovedTeeth               433772 non-null  object 
 8   HadHeartAttack             442067 non-null  object 
 9   HadAngina                  440727 non-null  object 
 10  HadStroke                  443575 non-null  object 
 11  HadAsthma                  443359 non-null  object 
 12  HadSkinCancer              441989 non-null  object 
 13  HadCOPD                    44

### Label Encoder

In [None]:
encoded_patients_data = patients_data.copy()
np.shape(encoded_patients_data)

In [5]:
label_encoder_general_health = LabelEncoder()

mean = encoded_patients_data['GeneralHealth'].dropna().mode()[0]
encoded_patients_data['GeneralHealth'] = encoded_patients_data['GeneralHealth'].fillna(mean)

encoded_patients_data['GeneralHealth'] = label_encoder_general_health.fit_transform(encoded_patients_data['GeneralHealth'])
encoded_patients_data['GeneralHealth'].unique()

array([4, 0, 1, 3, 2])

In [6]:
label_encoder_sleep_hours = LabelEncoder()

mode = encoded_patients_data['SleepHours'].dropna().mode()[0]
encoded_patients_data['SleepHours'] = encoded_patients_data['SleepHours'].fillna(mode)

encoded_patients_data['SleepHours'] = label_encoder_sleep_hours.fit_transform(encoded_patients_data['SleepHours'])
encoded_patients_data['SleepHours'].unique()

array([ 7,  5,  4,  6,  8,  3,  9,  0, 11, 17,  2,  1, 10, 15, 14, 12, 13,
       19, 22, 16, 23, 21, 18, 20])

In [7]:
encoded_patients_data = encoded_patients_data.dropna()

label_encoder_sex = LabelEncoder()
encoded_patients_data['Sex'] = label_encoder_sex.fit_transform(encoded_patients_data['Sex'])

label_encoder_checkup_time = LabelEncoder()
encoded_patients_data['LastCheckupTime'] = label_encoder_checkup_time.fit_transform(encoded_patients_data['LastCheckupTime'])

label_encoder_sleep_hours = LabelEncoder()
encoded_patients_data['SleepHours'] = label_encoder_sleep_hours.fit_transform(encoded_patients_data['SleepHours'])

label_encoder_removed_teeth = LabelEncoder()
encoded_patients_data['RemovedTeeth'] = label_encoder_removed_teeth.fit_transform(encoded_patients_data['RemovedTeeth'])

label_encoder_had_heart_attack = LabelEncoder()
encoded_patients_data['HadHeartAttack'] = label_encoder_had_heart_attack.fit_transform(encoded_patients_data['HadHeartAttack'])

label_encoder_had_angina = LabelEncoder()
encoded_patients_data['HadAngina'] = label_encoder_had_angina.fit_transform(encoded_patients_data['HadAngina'])

label_encoder_had_stroke = LabelEncoder()
encoded_patients_data['HadStroke'] = label_encoder_had_stroke.fit_transform(encoded_patients_data['HadStroke'])

label_encoder_had_asthma = LabelEncoder()
encoded_patients_data['HadAsthma'] = label_encoder_had_asthma.fit_transform(encoded_patients_data['HadAsthma'])

label_encoder_had_skin_cancer = LabelEncoder()
encoded_patients_data['HadSkinCancer'] = label_encoder_had_skin_cancer.fit_transform(encoded_patients_data['HadSkinCancer'])

label_encoder_had_copd = LabelEncoder()
encoded_patients_data['HadCOPD'] = label_encoder_had_copd.fit_transform(encoded_patients_data['HadCOPD'])

label_encoder_had_depressive_disorder = LabelEncoder()
encoded_patients_data['HadDepressiveDisorder'] = label_encoder_had_depressive_disorder.fit_transform(encoded_patients_data['HadDepressiveDisorder'])

label_encoder_had_kidney_disease = LabelEncoder()
encoded_patients_data['HadKidneyDisease'] = label_encoder_had_kidney_disease.fit_transform(encoded_patients_data['HadKidneyDisease'])

label_encoder_had_arthritis = LabelEncoder()
encoded_patients_data['HadArthritis'] = label_encoder_had_arthritis.fit_transform(encoded_patients_data['HadArthritis'])

label_encoder_had_diabetes = LabelEncoder()
encoded_patients_data['HadDiabetes'] = label_encoder_had_diabetes.fit_transform(encoded_patients_data['HadDiabetes'])

label_encoder_deaf_or_hard_of_hearing = LabelEncoder()
encoded_patients_data['DeafOrHardOfHearing'] = label_encoder_deaf_or_hard_of_hearing.fit_transform(encoded_patients_data['DeafOrHardOfHearing'])

label_encoder_blind_or_vision_difficulty = LabelEncoder()
encoded_patients_data['BlindOrVisionDifficulty'] = label_encoder_blind_or_vision_difficulty.fit_transform(encoded_patients_data['BlindOrVisionDifficulty'])

label_encoder_difficulty_concentrating = LabelEncoder()
encoded_patients_data['DifficultyConcentrating'] = label_encoder_difficulty_concentrating.fit_transform(encoded_patients_data['DifficultyConcentrating'])

label_encoder_difficulty_walking = LabelEncoder()
encoded_patients_data['DifficultyWalking'] = label_encoder_difficulty_walking.fit_transform(encoded_patients_data['DifficultyWalking'])

label_encoder_difficulty_dressing_bathing = LabelEncoder()
encoded_patients_data['DifficultyDressingBathing'] = label_encoder_difficulty_dressing_bathing.fit_transform(encoded_patients_data['DifficultyDressingBathing'])

label_encoder_difficulty_errands = LabelEncoder()
encoded_patients_data['DifficultyErrands'] = label_encoder_difficulty_errands.fit_transform(encoded_patients_data['DifficultyErrands'])

label_encoder_smoker_status = LabelEncoder()
encoded_patients_data['SmokerStatus'] = label_encoder_smoker_status.fit_transform(encoded_patients_data['SmokerStatus'])

label_encoder_ecigarette_usage = LabelEncoder()
encoded_patients_data['ECigaretteUsage'] = label_encoder_ecigarette_usage.fit_transform(encoded_patients_data['ECigaretteUsage'])

label_encoder_chest_scan = LabelEncoder()
encoded_patients_data['ChestScan'] = label_encoder_chest_scan.fit_transform(encoded_patients_data['ChestScan'])

label_encoder_race_ethnicity_category = LabelEncoder()
encoded_patients_data['RaceEthnicityCategory'] = label_encoder_race_ethnicity_category.fit_transform(encoded_patients_data['RaceEthnicityCategory'])

label_encoder_age_category = LabelEncoder()
encoded_patients_data['AgeCategory'] = label_encoder_age_category.fit_transform(encoded_patients_data['AgeCategory'])

label_encoder_alcohol_drinkers = LabelEncoder()
encoded_patients_data['AlcoholDrinkers'] = label_encoder_alcohol_drinkers.fit_transform(encoded_patients_data['AlcoholDrinkers'])

label_encoder_hiv_testing = LabelEncoder()
encoded_patients_data['HIVTesting'] = label_encoder_hiv_testing.fit_transform(encoded_patients_data['HIVTesting'])

label_encoder_flu_vax_last_12 = LabelEncoder()
mode = encoded_patients_data['FluVaxLast12'].dropna().mode()[0]
encoded_patients_data['FluVaxLast12'] = encoded_patients_data['FluVaxLast12'].fillna(mode)
encoded_patients_data['FluVaxLast12'] = label_encoder_flu_vax_last_12.fit_transform(encoded_patients_data['FluVaxLast12'])

label_encoder_pneumo_vax_ever = LabelEncoder()
encoded_patients_data['PneumoVaxEver'] = label_encoder_pneumo_vax_ever.fit_transform(encoded_patients_data['PneumoVaxEver'])

label_encoder_tetanus_last_10_tdap = LabelEncoder()
encoded_patients_data['TetanusLast10Tdap'] = label_encoder_tetanus_last_10_tdap.fit_transform(encoded_patients_data['TetanusLast10Tdap'])

label_encoder_high_risk_last_year = LabelEncoder()
encoded_patients_data['HighRiskLastYear'] = label_encoder_high_risk_last_year.fit_transform(encoded_patients_data['HighRiskLastYear'])

label_encoder_covid_pos = LabelEncoder()
encoded_patients_data['CovidPos'] = label_encoder_covid_pos.fit_transform(encoded_patients_data['CovidPos'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  encoded_patients_data['Sex'] = label_encoder_sex.fit_transform(encoded_patients_data['Sex'])


In [8]:
mean = encoded_patients_data['PhysicalHealthDays'].dropna().mean()
encoded_patients_data['PhysicalHealthDays'] = encoded_patients_data['PhysicalHealthDays'].fillna(mean.round(2))

mean = encoded_patients_data['MentalHealthDays'].dropna().mean()
encoded_patients_data['MentalHealthDays'] = encoded_patients_data['MentalHealthDays'].fillna(mean.round(2))

mean = encoded_patients_data['HeightInMeters'].dropna().mean()
encoded_patients_data['HeightInMeters'] = encoded_patients_data['HeightInMeters'].fillna(mean.round(2))

mean = encoded_patients_data['WeightInKilograms'].dropna().mean()
encoded_patients_data['WeightInKilograms'] = encoded_patients_data['WeightInKilograms'].fillna(mean.round(2))

mean = encoded_patients_data['BMI'].dropna().mean()
encoded_patients_data['BMI'] = encoded_patients_data['BMI'].fillna(mean.round(2))

In [43]:
encoded_patients_data.describe()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,SleepHours,RemovedTeeth,HadHeartAttack,HadAngina,HadStroke,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
count,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,...,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0,247536.0
mean,0.480177,2.284391,4.142904,4.182268,2.622443,6.021116,1.820838,0.054865,0.060977,0.041396,...,1.704997,83.599943,28.668122,0.548728,0.343526,0.532884,0.406191,1.129965,0.043024,0.605128
std,0.499608,1.473328,8.436551,8.125434,0.830579,1.436247,1.349914,0.227717,0.239289,0.199205,...,0.106698,21.328566,6.517459,0.497621,0.474886,0.498918,0.491122,0.969638,0.202912,0.900015
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.91,28.12,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,0.0,...,1.63,68.04,24.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,2.0,0.0,0.0,3.0,6.0,3.0,0.0,0.0,0.0,...,1.7,81.65,27.46,1.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,1.0,4.0,3.0,4.0,3.0,7.0,3.0,0.0,0.0,0.0,...,1.78,95.25,31.89,1.0,1.0,1.0,1.0,2.0,0.0,2.0
max,1.0,4.0,30.0,30.0,3.0,22.0,3.0,1.0,1.0,1.0,...,2.41,292.57,97.65,1.0,1.0,1.0,1.0,3.0,1.0,2.0


In [44]:
np.shape(encoded_patients_data)

(247536, 39)

In [45]:
encoded_patients_data.isnull().sum()

Unnamed: 0,0
Sex,0
GeneralHealth,0
PhysicalHealthDays,0
MentalHealthDays,0
LastCheckupTime,0
PhysicalActivities,0
SleepHours,0
RemovedTeeth,0
HadHeartAttack,0
HadAngina,0


In [48]:
encoded_patients_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247536 entries, 342 to 445130
Data columns (total 39 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sex                        247536 non-null  int64  
 1   GeneralHealth              247536 non-null  int64  
 2   PhysicalHealthDays         247536 non-null  float64
 3   MentalHealthDays           247536 non-null  float64
 4   LastCheckupTime            247536 non-null  int64  
 5   PhysicalActivities         247536 non-null  object 
 6   SleepHours                 247536 non-null  int64  
 7   RemovedTeeth               247536 non-null  int64  
 8   HadHeartAttack             247536 non-null  int64  
 9   HadAngina                  247536 non-null  int64  
 10  HadStroke                  247536 non-null  int64  
 11  HadAsthma                  247536 non-null  int64  
 12  HadSkinCancer              247536 non-null  int64  
 13  HadCOPD                    24753

In [None]:
data_to_build_graph = encoded_patients_data.drop(columns=['PhysicalActivities', 'DifficultyErrands', 'DifficultyDressingBathing', 'DifficultyConcentrating', 'DifficultyWalking', 'DeafOrHardOfHearing', 'RemovedTeeth', 'WeightInKilograms', 'HeightInMeters', 'HIVTesting', 'RaceEthnicityCategory', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap'], axis=1)
sns.pairplot(data_to_build_graph, hue='HadHeartAttack');

KeyboardInterrupt: 