In [2]:
# Notebook setup: render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension. Re-running this cell in a live kernel prints an
# "already loaded" notice that points to %reload_ext.
%load_ext autoreload
# Mode 2: re-import edited modules automatically before code is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os
import warnings

# Blanket-silence every warning category raised inside this kernel.
warnings.simplefilter('ignore')
# Export the same policy through the environment.
# NOTE(review): presumably intended for worker subprocesses - confirm.
os.environ.update({"PYTHONWARNINGS": "ignore"})

In [4]:
from datetime import datetime, timezone
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from virny.datasets import DiabetesDataset2019
from virny.preprocessing.basic_preprocessing import preprocess_dataset
from virny.utils.custom_initializers import create_models_config_from_tuned_params_df
from virny.utils.data_viz_utils import create_dataset_stats_bar_chart
from virny.utils.model_tuning_utils import tune_ML_models

from EDA_utils import get_correlation_with_target, get_correlation_matrix, get_features_by_target_correlation_threshold
     

In [5]:
# Experiment-wide constants: dataset label, hold-out fraction and random seeds.
DATASET_NAME = 'Diabetes_2019'
TEST_SET_FRACTION = 0.2
DATASET_SPLIT_SEED = 100
MODELS_TUNING_SEED = 100

# Sensitive attribute -> group value. The combined 'Gender & Age' entry carries None.
# NOTE(review): values presumably name the disadvantaged group - confirm.
sensitive_attributes_dct = dict([
    ('Gender', 'Female'),
    ('Age', 'less than 40'),
    ('Gender & Age', None),
])
# Only the single-column attributes: keys containing '&' are combinations and are skipped.
sensitive_attributes = list(
    filter(lambda attr_name: '&' not in attr_name, sensitive_attributes_dct)
)
     

In [6]:
# Instantiate the dataset wrapper with `with_nulls=False`.
data_loader = DiabetesDataset2019(with_nulls=False)
# Preview the first rows of the loader's full DataFrame.
data_loader.full_df.head()

Unnamed: 0,Age,Gender,Family_Diabetes,highBP,PhysicallyActive,BMI,Smoking,Alcohol,Sleep,SoundSleep,RegularMedicine,JunkFood,Stress,BPLevel,Pregnancies,Pdiabetes,UriationFreq,Diabetic
0,50-59,Male,no,yes,one hr or more,39.0,no,no,8,6,no,occasionally,sometimes,high,0.0,0,not much,0
1,50-59,Male,no,yes,less than half an hr,28.0,no,no,8,6,yes,very often,sometimes,normal,0.0,0,not much,0
2,40-49,Male,no,no,one hr or more,24.0,no,no,6,6,no,occasionally,sometimes,normal,0.0,0,not much,0
3,50-59,Male,no,no,one hr or more,23.0,no,no,8,6,no,occasionally,sometimes,normal,0.0,0,not much,0
4,40-49,Male,no,no,less than half an hr,27.0,no,no,8,8,no,occasionally,sometimes,normal,0.0,0,not much,0


In [7]:
# Short alias for the loader's full DataFrame. This is a plain assignment: no copy is made here.
# NOTE(review): later in-place edits of `data` presumably also affect `data_loader.full_df` - confirm.
data = data_loader.full_df

In [8]:
# (rows, columns) of the working frame.
data.shape

(905, 18)

In [9]:
# Missing-value count per column (`isna` is the pandas alias of `isnull`).
data_loader.full_df.isna().sum()

Age                 0
Gender              0
Family_Diabetes     0
highBP              0
PhysicallyActive    0
BMI                 0
Smoking             0
Alcohol             0
Sleep               0
SoundSleep          0
RegularMedicine     0
JunkFood            0
Stress              0
BPLevel             0
Pregnancies         0
Pdiabetes           0
UriationFreq        0
Diabetic            0
dtype: int64

In [10]:
# Number of distinct values in each column.
data_loader.full_df.nunique()

Age                  4
Gender               2
Family_Diabetes      2
highBP               2
PhysicallyActive     4
BMI                 26
Smoking              2
Alcohol              2
Sleep                8
SoundSleep          12
RegularMedicine      2
JunkFood             4
Stress               4
BPLevel              3
Pregnancies          5
Pdiabetes            2
UriationFreq         2
Diabetic             2
dtype: int64

In [11]:
# Print the distinct values of each inspected column, one array per column,
# in the order listed here.
# NOTE: 'Stress' is not in this list, so its values are not displayed.
inspected_columns = (
    'Age', 'Gender', 'Family_Diabetes', 'highBP', 'PhysicallyActive', 'BMI',
    'Smoking', 'Alcohol', 'Sleep', 'SoundSleep', 'RegularMedicine', 'JunkFood',
    'BPLevel', 'Pregnancies', 'Pdiabetes', 'UriationFreq', 'Diabetic',
)
for column_name in inspected_columns:
    print(data[column_name].unique())

['50-59' '40-49' 'less than 40' '60 or older']
['Male' 'Female']
['no' 'yes']
['yes' 'no']
['one hr or more' 'less than half an hr' 'none' 'more than half an hr']
[39. 28. 24. 23. 27. 21. 20. 26. 22. 15. 34. 30. 29. 18. 32. 31. 36. 38.
 40. 35. 19. 33. 17. 25. 42. 45.]
['no' 'yes']
['no' 'yes']
[ 8  6 10  7 11  9  4  5]
[ 6  8 10  7 11  4  9  5  3  2  1  0]
['no' 'yes']
['occasionally' 'very often' 'often' 'always']
['high' 'normal' 'low']
[0. 1. 2. 3. 4.]
['0' 'yes']
['not much' 'quite often']
[0 1]


In [12]:
# Column dtypes and non-null counts before the label-encoding cell runs.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               905 non-null    object 
 1   Gender            905 non-null    object 
 2   Family_Diabetes   905 non-null    object 
 3   highBP            905 non-null    object 
 4   PhysicallyActive  905 non-null    object 
 5   BMI               905 non-null    float64
 6   Smoking           905 non-null    object 
 7   Alcohol           905 non-null    object 
 8   Sleep             905 non-null    int64  
 9   SoundSleep        905 non-null    int64  
 10  RegularMedicine   905 non-null    object 
 11  JunkFood          905 non-null    object 
 12  Stress            905 non-null    object 
 13  BPLevel           905 non-null    object 
 14  Pregnancies       905 non-null    float64
 15  Pdiabetes         905 non-null    object 
 16  UriationFreq      905 non-null    object 
 1

In [13]:
# Integer-encode the string-valued (dtype `object`) columns of `data`.
#
# Caveats:
# * LabelEncoder numbers classes in sorted label order, so the codes carry no
#   ordinal meaning (for 'Age', 'less than 40' receives the highest code).
# * NOTE(review): `data` was bound to `data_loader.full_df` without a copy, so
#   these assignments presumably modify the loader's frame too - confirm intended.
# * NOTE(review): `sensitive_attributes_dct` holds the strings 'Female' and
#   'less than 40', while 'Gender' and 'Age' hold integer codes after this
#   cell - confirm downstream code accounts for that.
CATEGORICAL_COLUMNS = [
    "Age",
    "Gender",
    "Family_Diabetes",
    "highBP",
    "PhysicallyActive",
    "Smoking",
    "Alcohol",
    "RegularMedicine",
    "JunkFood",
    "Stress",
    "BPLevel",
    "Pdiabetes",
    "UriationFreq",
]

# Keep one fitted encoder per column so every code can be mapped back to its
# label via `label_encoders[col].inverse_transform(...)` or `.classes_`.
# A single shared encoder would only remember the last column it was fit on.
label_encoders = {}
for column_name in CATEGORICAL_COLUMNS:
    # After the loop `label_encoder` is the encoder fitted on the last column
    # ('UriationFreq'), matching what later cells would have seen before.
    label_encoder = LabelEncoder()
    data[column_name] = label_encoder.fit_transform(data[column_name])
    label_encoders[column_name] = label_encoder


In [14]:
# Re-check dtypes after label encoding: the former object columns are now integer-coded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 905 entries, 0 to 904
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               905 non-null    int64  
 1   Gender            905 non-null    int64  
 2   Family_Diabetes   905 non-null    int64  
 3   highBP            905 non-null    int64  
 4   PhysicallyActive  905 non-null    int64  
 5   BMI               905 non-null    float64
 6   Smoking           905 non-null    int64  
 7   Alcohol           905 non-null    int64  
 8   Sleep             905 non-null    int64  
 9   SoundSleep        905 non-null    int64  
 10  RegularMedicine   905 non-null    int64  
 11  JunkFood          905 non-null    int64  
 12  Stress            905 non-null    int64  
 13  BPLevel           905 non-null    int64  
 14  Pregnancies       905 non-null    float64
 15  Pdiabetes         905 non-null    int64  
 16  UriationFreq      905 non-null    int64  
 1