# Import necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

# Data loading

In [2]:
# Load the raw diabetes dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it points at a shared/relative data location.
df = pd.read_csv('D:/data science & python/37/diabetes_dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,...,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,0,58,Female,White,35.8,83.4,123.9,10.9,152,114,...,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,...,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,...,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,...,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,...,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,45,Female,Asian,39.4,114.0,76.2,13.4,109,82,...,60.3,88.7,19.1,6.1,Low,3582,,Former,1,1
9996,9996,41,Female,Asian,21.0,71.3,77.1,12.9,154,72,...,72.3,200.0,37.1,3.5,Low,2206,,Never,1,1
9997,9997,50,Female,Asian,29.0,106.3,97.5,4.9,122,61,...,69.8,156.1,85.8,4.9,High,3175,Heavy,Former,1,1
9998,9998,62,Female,White,27.3,119.9,89.0,11.5,99,115,...,74.2,110.9,25.3,5.2,High,3478,Moderate,Never,1,0


# Data Exploration 

In [3]:
# check column information such as data types and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     10000 non-null  int64  
 1   Age                            10000 non-null  int64  
 2   Sex                            10000 non-null  object 
 3   Ethnicity                      10000 non-null  object 
 4   BMI                            10000 non-null  float64
 5   Waist_Circumference            10000 non-null  float64
 6   Fasting_Blood_Glucose          10000 non-null  float64
 7   HbA1c                          10000 non-null  float64
 8   Blood_Pressure_Systolic        10000 non-null  int64  
 9   Blood_Pressure_Diastolic       10000 non-null  int64  
 10  Cholesterol_Total              10000 non-null  float64
 11  Cholesterol_HDL                10000 non-null  float64
 12  Cholesterol_LDL                10000 non-null  

In [4]:
# check summary statistics for numerical columns
# (rounded to whole numbers and transposed so each column becomes a row)
df.describe().round().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,10000.0,5000.0,2887.0,0.0,2500.0,5000.0,7499.0,9999.0
Age,10000.0,45.0,14.0,20.0,32.0,45.0,57.0,69.0
BMI,10000.0,29.0,6.0,18.0,24.0,30.0,35.0,40.0
Waist_Circumference,10000.0,95.0,14.0,70.0,82.0,95.0,107.0,120.0
Fasting_Blood_Glucose,10000.0,135.0,38.0,70.0,102.0,134.0,168.0,200.0
HbA1c,10000.0,10.0,3.0,4.0,7.0,10.0,12.0,15.0
Blood_Pressure_Systolic,10000.0,134.0,26.0,90.0,112.0,134.0,157.0,179.0
Blood_Pressure_Diastolic,10000.0,90.0,17.0,60.0,75.0,89.0,105.0,119.0
Cholesterol_Total,10000.0,225.0,43.0,150.0,188.0,226.0,262.0,300.0
Cholesterol_HDL,10000.0,55.0,15.0,30.0,42.0,55.0,68.0,80.0


In [5]:
# check summary statistics for categorical (object) columns
df.describe(include=object).round()

Unnamed: 0,Sex,Ethnicity,Physical_Activity_Level,Alcohol_Consumption,Smoking_Status
count,10000,10000,10000,6680,10000
unique,2,4,3,2,3
top,Female,Black,Low,Moderate,Current
freq,5005,2539,3372,3373,3364


In [6]:
# check duplicate rows (DataFrame.duplicated flags repeated rows, not columns)
df.duplicated().sum()

np.int64(0)

In [7]:
# number of missing values in each column
df.isnull().sum()

Unnamed: 0                          0
Age                                 0
Sex                                 0
Ethnicity                           0
BMI                                 0
Waist_Circumference                 0
Fasting_Blood_Glucose               0
HbA1c                               0
Blood_Pressure_Systolic             0
Blood_Pressure_Diastolic            0
Cholesterol_Total                   0
Cholesterol_HDL                     0
Cholesterol_LDL                     0
GGT                                 0
Serum_Urate                         0
Physical_Activity_Level             0
Dietary_Intake_Calories             0
Alcohol_Consumption              3320
Smoking_Status                      0
Family_History_of_Diabetes          0
Previous_Gestational_Diabetes       0
dtype: int64

In [8]:
# Percentage of missing values per column.
# The mean of the boolean NaN mask is the missing fraction, so this stays
# correct for any number of rows instead of relying on a hard-coded 10000.
df.isna().mean() * 100

Unnamed: 0                        0.0
Age                               0.0
Sex                               0.0
Ethnicity                         0.0
BMI                               0.0
Waist_Circumference               0.0
Fasting_Blood_Glucose             0.0
HbA1c                             0.0
Blood_Pressure_Systolic           0.0
Blood_Pressure_Diastolic          0.0
Cholesterol_Total                 0.0
Cholesterol_HDL                   0.0
Cholesterol_LDL                   0.0
GGT                               0.0
Serum_Urate                       0.0
Physical_Activity_Level           0.0
Dietary_Intake_Calories           0.0
Alcohol_Consumption              33.2
Smoking_Status                    0.0
Family_History_of_Diabetes        0.0
Previous_Gestational_Diabetes     0.0
dtype: float64

# Data Cleaning

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,...,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,0,58,Female,White,35.8,83.4,123.9,10.9,152,114,...,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,...,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,...,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,...,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,...,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [10]:
# Drop the redundant index column that was saved into the CSV.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,Female,Asian,39.4,114.0,76.2,13.4,109,82,187.7,60.3,88.7,19.1,6.1,Low,3582,,Former,1,1
9996,41,Female,Asian,21.0,71.3,77.1,12.9,154,72,234.3,72.3,200.0,37.1,3.5,Low,2206,,Never,1,1
9997,50,Female,Asian,29.0,106.3,97.5,4.9,122,61,266.0,69.8,156.1,85.8,4.9,High,3175,Heavy,Former,1,1
9998,62,Female,White,27.3,119.9,89.0,11.5,99,115,172.3,74.2,110.9,25.3,5.2,High,3478,Moderate,Never,1,0


# handling categorical columns

In [11]:
# Names of the text (object-dtype) columns.
cat_col = df.select_dtypes(include=['object']).columns
cat_col

Index(['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption',
       'Smoking_Status'],
      dtype='object')

In [12]:
# Inspect each categorical column in depth: its name, number of distinct
# values, the values themselves, then a separator line.
for col in cat_col:
    values = df[col]
    print(col, values.nunique(), values.unique(), "*" * 100, sep="\n")

Sex
2
['Female' 'Male']
****************************************************************************************************
Ethnicity
4
['White' 'Asian' 'Black' 'Hispanic']
****************************************************************************************************
Physical_Activity_Level
3
['Moderate' 'Low' 'High']
****************************************************************************************************
Alcohol_Consumption
2
['Moderate' 'Heavy' nan]
****************************************************************************************************
Smoking_Status
3
['Never' 'Current' 'Former']
****************************************************************************************************


# handling numerical columns

In [13]:
# Names of all numeric columns (integers and floats).
num_col = df.select_dtypes(include=np.number).columns
num_col

Index(['Age', 'BMI', 'Waist_Circumference', 'Fasting_Blood_Glucose', 'HbA1c',
       'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
       'Cholesterol_Total', 'Cholesterol_HDL', 'Cholesterol_LDL', 'GGT',
       'Serum_Urate', 'Dietary_Intake_Calories', 'Family_History_of_Diabetes',
       'Previous_Gestational_Diabetes'],
      dtype='object')

In [14]:
# Inspect each numerical column in depth: one histogram per column,
# titled with the column name.
for col in num_col:
    histogram = px.histogram(df, x=col, title=col)
    histogram.show()

 # Feature Engineering 
 There are no additional features to extract from this dataset.

# Data Analysis (Univariate, Bivariate, Multivariate)

 Q1 # What is the relationship between BMI and fasting blood glucose or HbA1c levels?

 Q2 # Do individuals with a family history of diabetes show higher average HbA1c or glucose levels?

 Q3 # Is there a significant difference in diabetes markers (like HbA1c) between males and females?

 Q4 # How does physical activity level impact blood sugar levels or BMI?

 Q5 # Are there differences in HBA1C levels among different ethnic groups?

 Q6 # Is smoking status linked to higher LDL or lower HDL cholesterol?

 Q7 # How does previous gestational diabetes affect current glucose control among females?

In [15]:
df

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,Female,Asian,39.4,114.0,76.2,13.4,109,82,187.7,60.3,88.7,19.1,6.1,Low,3582,,Former,1,1
9996,41,Female,Asian,21.0,71.3,77.1,12.9,154,72,234.3,72.3,200.0,37.1,3.5,Low,2206,,Never,1,1
9997,50,Female,Asian,29.0,106.3,97.5,4.9,122,61,266.0,69.8,156.1,85.8,4.9,High,3175,Heavy,Former,1,1
9998,62,Female,White,27.3,119.9,89.0,11.5,99,115,172.3,74.2,110.9,25.3,5.2,High,3478,Moderate,Never,1,0


Q1 # What is the relationship between BMI and fasting blood glucose or HbA1c levels?


In [16]:
# Q1: Pearson correlation matrix between BMI and fasting blood glucose.
relation_BMI_Glucose = df.loc[:, ['BMI', 'Fasting_Blood_Glucose']].corr(method='pearson')
relation_BMI_Glucose

Unnamed: 0,BMI,Fasting_Blood_Glucose
BMI,1.0,-0.014478
Fasting_Blood_Glucose,-0.014478,1.0


In [17]:
# Q1: Pearson correlation matrix between BMI and HbA1c.
relation_BMI_HBA1c = df.loc[:, ['BMI', 'HbA1c']].corr(method='pearson')
relation_BMI_HBA1c

Unnamed: 0,BMI,HbA1c
BMI,1.0,-0.008163
HbA1c,-0.008163,1.0


### Insight from Q1: BMI has a very weak negative relationship with both fasting blood glucose (r ≈ -0.014) and HbA1c (r ≈ -0.008), i.e. practically no linear relationship.

In [18]:
# Scatter plot of fasting blood glucose against BMI.
px.scatter(
    df,
    x='BMI',
    y='Fasting_Blood_Glucose',
    title='relationship between BMI and fasting blood glucose',
)

In [19]:
# Scatter plot of HbA1c against BMI.
px.scatter(
    df,
    x='BMI',
    y='HbA1c',
    title='relationship between BMI and HbA1c',
)

Q2 # Do individuals with a family history of diabetes show higher average HbA1c levels?


In [20]:
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [21]:
# Mean HbA1c for each value of the family-history flag, as a flat two-column frame.
family_history_HBA1c = (
    df.groupby('Family_History_of_Diabetes', as_index=False)['HbA1c']
    .mean()
    .round(3)
)
family_history_HBA1c

Unnamed: 0,Family_History_of_Diabetes,HbA1c
0,0,9.509
1,1,9.506


### Insight from Q2: after comparing the average HbA1c of individuals with a family history of diabetes (9.506) and those without (9.509), a family history does not show a higher average HbA1c; the two averages are almost identical.


In [22]:
# Bar chart of the mean HbA1c per family-history group.
px.bar(
    family_history_HBA1c,
    x='Family_History_of_Diabetes',
    y='HbA1c',
    title='Do individuals with a family history of diabetes show higher average HbA1c or glucose levels',
)

  Q3 # Is there a significant difference in diabetes markers (like HbA1c) between males and females?

In [23]:
# Mean HbA1c per sex, as a flat two-column frame.
df_sex_HbA1c = df.groupby('Sex', as_index=False)['HbA1c'].mean().round(2)
df_sex_HbA1c

Unnamed: 0,Sex,HbA1c
0,Female,9.54
1,Male,9.48


### Insight from Q3: after comparing the average HbA1c of males and females, there is no notable difference in this diabetes marker between the sexes.

In [24]:
# Bar chart of the mean HbA1c for females and males.
px.bar(
    df_sex_HbA1c,
    x='Sex',
    y='HbA1c',
    title='difference in diabetes markers (like HbA1c) between males and females',
)

Q4 # How does physical activity level impact blood sugar levels or BMI?

In [25]:
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [26]:
# Q4 (part 1): mean fasting blood glucose per physical activity level.
ph_activity_level_Glucose = (
    df.groupby('Physical_Activity_Level', as_index=False)['Fasting_Blood_Glucose']
    .mean()
    .round(2)
)
ph_activity_level_Glucose

Unnamed: 0,Physical_Activity_Level,Fasting_Blood_Glucose
0,High,134.82
1,Low,134.44
2,Moderate,135.08


### Insight from Q4: after comparing fasting blood glucose across physical activity levels, there is no noticeable effect of activity level on blood sugar.

In [27]:
# Bar chart of the mean fasting blood glucose per physical activity level.
px.bar(
    ph_activity_level_Glucose,
    x='Physical_Activity_Level',
    y='Fasting_Blood_Glucose',
    title='How does physical activity level impact blood sugar levels',
)

In [28]:
# Q4 (part 2): mean BMI per physical activity level.
ph_activity_level_BMI = (
    df.groupby('Physical_Activity_Level', as_index=False)['BMI']
    .mean()
    .round(2)
)
ph_activity_level_BMI

Unnamed: 0,Physical_Activity_Level,BMI
0,High,29.36
1,Low,29.53
2,Moderate,29.36


### Insight from Q4: after comparing BMI across physical activity levels, there is no noticeable effect of activity level on BMI.

In [29]:
# Bar chart of the mean BMI per physical activity level.
px.bar(
    ph_activity_level_BMI,
    x='Physical_Activity_Level',
    y='BMI',
    title='How does physical activity level impact BMI',
)

 Q5 # Are there differences in HBA1C levels among different ethnic groups?

In [30]:
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [31]:
# Mean HbA1c per ethnic group, as a flat two-column frame.
Ethnicity_HBA1C = df.groupby('Ethnicity', as_index=False)['HbA1c'].mean().round(2)
Ethnicity_HBA1C

Unnamed: 0,Ethnicity,HbA1c
0,Asian,9.49
1,Black,9.55
2,Hispanic,9.6
3,White,9.39


### Insight from Q5: after comparing the average HbA1c of each ethnic group, there is no noticeable difference in HbA1c between ethnicities.

In [32]:
# Bar chart of the mean HbA1c per ethnic group.
px.bar(
    Ethnicity_HBA1C,
    x='Ethnicity',
    y='HbA1c',
    title='differences in HBA1C levels among different ethnic groups?',
)

 Q6 # Is smoking status linked to higher LDL or lower HDL cholesterol?

In [33]:
# Mean LDL and HDL cholesterol per smoking status (despite the name, the
# frame holds both cholesterol columns).
smoking_LDL = (
    df.groupby('Smoking_Status', as_index=False)[['Cholesterol_LDL', 'Cholesterol_HDL']]
    .mean()
    .round(2)
)
smoking_LDL

Unnamed: 0,Smoking_Status,Cholesterol_LDL,Cholesterol_HDL
0,Current,134.25,55.01
1,Former,134.35,54.9
2,Never,134.46,55.14


### Insight from Q6: after comparing average LDL and HDL cholesterol across smoking statuses, smoking status shows no noticeable link to higher LDL or lower HDL cholesterol.

In [34]:
# Is smoking status linked to higher LDL? Bar chart of mean LDL per status,
# with the value printed on each bar.
px.bar(
    smoking_LDL,
    x='Smoking_Status',
    y='Cholesterol_LDL',
    text_auto=True,
    title=' Is smoking status linked to higher LDL ?',
)

In [35]:
# Is smoking status linked to lower HDL? Mean HDL cholesterol per smoking status.
smoking_HDL = (
    df.groupby('Smoking_Status', as_index=False)['Cholesterol_HDL']
    .mean()
    .round(2)
)
smoking_HDL

Unnamed: 0,Smoking_Status,Cholesterol_HDL
0,Current,55.01
1,Former,54.9
2,Never,55.14


In [36]:
# Bar chart of mean HDL per smoking status, with the value printed on each bar.
px.bar(
    smoking_HDL,
    x='Smoking_Status',
    y='Cholesterol_HDL',
    text_auto=True,
    title=' Is smoking status linked to Lower HDL ?',
)

 Q7 # How does previous gestational diabetes affect current glucose control among females?

In [37]:
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


In [38]:
# Q7: among female rows only, correlation between the previous-gestational-diabetes
# flag and fasting blood glucose.
female_pr_ge_di__Glucose = df.loc[
    df['Sex'] == 'Female',
    ['Previous_Gestational_Diabetes', 'Fasting_Blood_Glucose'],
].corr()
female_pr_ge_di__Glucose

Unnamed: 0,Previous_Gestational_Diabetes,Fasting_Blood_Glucose
Previous_Gestational_Diabetes,1.0,-0.042292
Fasting_Blood_Glucose,-0.042292,1.0


### Insight from Q7: among females, there is only a weak negative relationship (r ≈ -0.042) between previous gestational diabetes and fasting blood glucose.

In [39]:
# A line drawn through the 2x2 correlation matrix only connects the matrix
# cells and says nothing about glucose control. Compare the mean fasting
# glucose of females without (0) and with (1) previous gestational diabetes.
female_glucose_by_gestational = (
    df[df.Sex == 'Female']
    .groupby('Previous_Gestational_Diabetes')['Fasting_Blood_Glucose']
    .mean()
    .round(2)
    .reset_index()
)
px.bar(
    data_frame=female_glucose_by_gestational,
    x='Previous_Gestational_Diabetes',
    y='Fasting_Blood_Glucose',
    text_auto=True,
    title='How does previous gestational diabetes affect current glucose control among females?',
)

 # Deployment (Streamlit)

In [40]:
# Persist the cleaned frame for the Streamlit app. The row index is written as
# the first CSV column; the app reads it back with index_col=0.
df.to_csv('cleaned_df.csv', index=True)

In [41]:
import streamlit as st

In [76]:
%%writefile diabetes.py

import pandas as pd
import streamlit as st
import plotly.express as px
st.set_page_config(layout= 'wide', page_title= 'diabetes_dataset')
st.markdown("<h1 style='text-align: center; color: white;'>diabetes dataset with Analysis</h1>", unsafe_allow_html=True)
st.image('https://elfagr-med.com/media/rokanthemes/blog/images/f/i/file_25.jpg')
df = pd.read_csv('cleaned_df.csv',index_col=0)
page = st.sidebar.radio('Pages', ['Introduction', 'Analysis Questions', 'Reporting'])
if page == 'Introduction':

    st.dataframe(df.head())

    st.header('diabetes_dataset Description')

    st.write('''The dataset contains 10,000 entries and 21 columns. Here's a summary of the columns:

Unnamed: 0 : Index column (can likely be dropped).

Age : Age of the individual (integer).

Sex : Gender: Male or Female.

Ethnicity : Ethnic background (categorical).

BMI : Body Mass Index (float).

Waist_Circumference : Waist size in cm (float).

Fasting_Blood_Glucose : Fasting blood sugar level (float).

HbA1c : Hemoglobin A1c percentage, used to assess blood sugar control (float).

Blood_Pressure_Systolic : Systolic BP (integer).

Blood_Pressure_Diastolic : Diastolic BP (integer).

Cholesterol_Total : Total cholesterol level (float).

Cholesterol_HDL : HDL (good) cholesterol (float).

Cholesterol_LDL : LDL (bad) cholesterol (float).

GGT : Gamma-glutamyl transferase, a liver enzyme (float).

Serum_Urate : Uric acid level in blood (float).

Physical_Activity_Level : Level of physical activity: e.g., Low, Moderate, etc. (categorical).

Dietary_Intake_Calories : Daily caloric intake (integer).

Alcohol_Consumption : Alcohol consumption level (categorical).

Smoking_Status : Smoking status: Never, Former, Current (categorical).

Family_History_of_Diabetes : Binary: 1 if family history exists, 0 otherwise (integer).

Previous_Gestational_Diabetes : Binary: 1 if individual had gestational diabetes previously (integer).''')

elif page == 'Analysis Questions':

    st.header(' Q1 What is the relationship between BMI and fasting blood glucose levels?')
    relation_BMI_Glucose = df[['BMI','Fasting_Blood_Glucose']].corr()
    st.write(df[['BMI','Fasting_Blood_Glucose']].corr())
    st.write(px.scatter(data_frame = df ,x='BMI',y ='Fasting_Blood_Glucose' ,title='relationship between BMI and fasting blood glucose'))
    st.header('Q1 What is the relationship between BMI and HbA1c levels?')
    relation_BMI_HBA1c =df[['BMI','HbA1c']].corr()
    st.write(df[['BMI','HbA1c']].corr())
    st.write(px.scatter(data_frame= df ,x='BMI',y ='HbA1c' ,title='relationship between BMI and HbA1c'))
    st.write('## insight from Q1 that the relationship between BMI and fasting blood glucose or HbA1c levels is very weak negative relationship')
    st.header('Q2 Do individuals with a family history of diabetes show higher average HbA1c or glucose levels?')
    family_history_HBA1c = df.groupby('Family_History_of_Diabetes')['HbA1c'].mean().round(3).reset_index()
    st.write(df.groupby('Family_History_of_Diabetes')['HbA1c'].mean().round(3).reset_index())
    st.write(px.bar(data_frame=family_history_HBA1c, x= 'Family_History_of_Diabetes' ,y = 'HbA1c',text_auto=True ,title='Do individuals with a family history of diabetes show higher average HbA1c or glucose levels'))
    st.write('## insight from Q2 after comparing the averge HBA1C for both the individuals with a family history of diabetes and the individuals with a no family history of diabete i didnt show higher average HbA1c But the percentages are very close')
    st.header(' Q3 Is there a significant difference in diabetes markers (like HbA1c) between males and females?')
    df_sex_HbA1c = df.groupby('Sex')['HbA1c'].mean().round(2).reset_index()
    st.write(df.groupby('Sex')['HbA1c'].mean().round(2).reset_index())
    st.write(px.bar(data_frame=df_sex_HbA1c,x='Sex',y ='HbA1c' ,text_auto=True ,title='difference in diabetes markers (like HbA1c) between males and females'))
    st.write('## insight from Q3 after comparing the averge HBA1C for both males and females there is no a significant difference in diabetes markers')
    st.header('Q4 How does physical activity level impact blood sugar levels or BMI?')
    ph_activity_level_Glucose = df.groupby('Physical_Activity_Level')['Fasting_Blood_Glucose'].mean().round(2).reset_index()
    st.write(df.groupby('Physical_Activity_Level')['Fasting_Blood_Glucose'].mean().round(2).reset_index())
    st.write(px.bar(data_frame=ph_activity_level_Glucose,x='Physical_Activity_Level',y ='Fasting_Blood_Glucose',text_auto=True,title='How does physical activity level impact blood sugar levels'))
    st.write('## insight from Q4 after comparing Physical_Activity_Level impact for BMI There is no noticeable effect on BMI')
    st.header('Q5 Are there differences in HBA1C levels among different ethnic groups?')
    Ethnicity_HBA1C = df.groupby('Ethnicity')['HbA1c'].mean().round(2).reset_index()
    st.write( df.groupby('Ethnicity')['HbA1c'].mean().round(2).reset_index())
    st.plotly_chart(px.bar(data_frame=Ethnicity_HBA1C,x='Ethnicity',y ='HbA1c' ,text_auto=True ,title='differences in HBA1C levels among different ethnic groups?'))
    st.write('## insight from Q5 after comparing HbA1c impact for each Ethnicity There is no noticeable effect on HBA1C')
    st.header('Q6 Is smoking status linked to higher LDL or lower HDL cholesterol?')
    smoking_LDL = df.groupby('Smoking_Status')[['Cholesterol_LDL','Cholesterol_HDL']].mean().round(2).reset_index()
    smoking_HDL = df.groupby('Smoking_Status')['Cholesterol_HDL'].mean().round(2).reset_index()
    st.write(df.groupby('Smoking_Status')[['Cholesterol_LDL','Cholesterol_HDL']].mean().round(2).reset_index())
    st.plotly_chart(px.bar(data_frame=smoking_LDL,x='Smoking_Status',y ='Cholesterol_LDL' ,text_auto=True,title=' Is smoking status linked to higher LDL ?'))
    st.plotly_chart(px.bar(data_frame=smoking_HDL,x='Smoking_Status',y ='Cholesterol_HDL' ,text_auto=True,title=' Is smoking status linked to Lower HDL ?'))
    st.write('## insight from Q6 after comparing Smoking_Status impact for each Cholesterol_LDL and Cholesterol_HDL There is no noticeable effect on higher LDL or lower HDL cholesterol')
    st.header(' Q7 How does previous gestational diabetes affect current glucose control among females?')
    st.write(df[df.Sex =='Female'][['Previous_Gestational_Diabetes','Fasting_Blood_Glucose']].corr())
    female_pr_ge_di__Glucose = df[df.Sex =='Female'][['Previous_Gestational_Diabetes','Fasting_Blood_Glucose']].corr()
    st.plotly_chart(px.line(data_frame=female_pr_ge_di__Glucose,x='Previous_Gestational_Diabetes',y = 'Fasting_Blood_Glucose' ,title='How does previous gestational diabetes affect current glucose control among females?,text'))
    st.write('## insight from Q7 after comparing Previous_Gestational_Diabetes impact for Fasting_Blood_Glucose for females There is a weak negative relationship between Previous_Gestational_Diabetes and Fasting_Blood_Glucose')
elif page == 'Reporting':
    sex = st.sidebar.selectbox('Sex' ,df['Sex'].unique())
    ethnicity = st.sidebar.selectbox('Ethnicity', df['Ethnicity'].unique())
    physical_activity_level = st.sidebar.selectbox('physical_activity_level',df['Physical_Activity_Level'].unique())
    smoking_status = st.sidebar.selectbox('Smoking_Status', df['Smoking_Status'].unique())
    df2 = df[(df['Sex']== sex) & (df['Ethnicity'] == ethnicity) &(df['Physical_Activity_Level']== physical_activity_level) & (df['Smoking_Status']==smoking_status) ]
    st.dataframe(df2.head(50))
  

Overwriting diabetes.py


In [43]:
# Launch the Streamlit app written above.
# NOTE(review): this appears to block the cell until the process is interrupted
# (the recorded output is ^C); consider running it from a terminal instead.
!streamlit run diabetes.py


^C


In [44]:
df

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,45,Female,Asian,39.4,114.0,76.2,13.4,109,82,187.7,60.3,88.7,19.1,6.1,Low,3582,,Former,1,1
9996,41,Female,Asian,21.0,71.3,77.1,12.9,154,72,234.3,72.3,200.0,37.1,3.5,Low,2206,,Never,1,1
9997,50,Female,Asian,29.0,106.3,97.5,4.9,122,61,266.0,69.8,156.1,85.8,4.9,High,3175,Heavy,Former,1,1
9998,62,Female,White,27.3,119.9,89.0,11.5,99,115,172.3,74.2,110.9,25.3,5.2,High,3478,Moderate,Never,1,0


# Data preprocessing for machine learning

In [45]:
df.head()

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,HbA1c,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,58,Female,White,35.8,83.4,123.9,10.9,152,114,197.8,50.2,99.2,37.5,7.2,Moderate,1538,Moderate,Never,0,1
1,48,Male,Asian,24.1,71.4,183.7,12.8,103,91,261.6,62.0,146.4,88.5,6.1,Moderate,2653,Moderate,Current,0,1
2,34,Female,Black,25.0,113.8,142.0,14.5,179,104,261.0,32.1,164.1,56.2,6.9,Low,1684,Heavy,Former,1,0
3,62,Male,Asian,32.7,100.4,167.4,8.8,176,118,183.4,41.1,84.0,34.4,5.4,Low,3796,Moderate,Never,1,0
4,27,Female,Asian,33.5,110.8,146.4,7.1,122,97,203.2,53.9,92.8,81.9,7.4,Moderate,3161,Heavy,Current,0,0


 # Split Data into Input Features and Target Variable


In [46]:

 # Split Data into Input Features and Target Variable
# HbA1c is the prediction target; every other column is an input feature.
y = df['HbA1c']
x = df.drop(columns='HbA1c')

 # Split Data into Train and Test

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Renumber x_train, x_test, y_train and y_test from 0, discarding the old row labels.
# The results are rebound instead of using inplace=True: it gives no performance benefit,
# prevents chaining, and rebinding keeps the cell safe to re-run.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [49]:
# Training features after the index reset (the HbA1c target column is not included)
x_train

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,59,Female,Hispanic,32.6,93.3,113.7,110,85,171.2,61.4,103.3,62.6,3.1,Low,2177,,Never,0,1
1,67,Male,White,30.2,89.3,174.1,114,75,229.5,71.9,113.1,39.6,5.8,High,2425,Heavy,Current,0,1
2,62,Female,Black,21.5,89.4,92.0,172,112,248.8,78.1,116.0,33.7,5.1,Moderate,2137,Heavy,Former,1,1
3,38,Male,Asian,35.1,90.9,105.5,139,74,279.9,76.6,176.9,11.4,3.8,Moderate,2999,,Former,1,1
4,60,Female,Black,24.0,86.0,74.5,163,83,159.0,74.3,123.2,44.6,6.3,Low,1704,Heavy,Never,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,57,Female,Black,23.4,82.7,154.2,107,117,258.9,30.4,74.8,50.7,5.1,Low,3219,Moderate,Never,1,0
7996,27,Female,Black,30.8,102.3,106.2,134,102,294.9,69.4,90.5,29.5,6.0,Moderate,2796,Heavy,Never,1,1
7997,59,Male,Asian,26.1,87.6,75.0,176,104,264.4,46.5,185.1,91.1,7.0,Moderate,2797,Heavy,Current,0,1
7998,61,Male,Black,23.7,119.3,93.1,119,115,252.2,60.7,114.5,57.3,5.6,Moderate,2879,Heavy,Never,1,1


In [50]:
# Test features after the index reset (the HbA1c target column is not included)
x_test

Unnamed: 0,Age,Sex,Ethnicity,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,Serum_Urate,Physical_Activity_Level,Dietary_Intake_Calories,Alcohol_Consumption,Smoking_Status,Family_History_of_Diabetes,Previous_Gestational_Diabetes
0,50,Male,Black,27.7,107.9,172.6,149,117,281.8,36.2,152.1,64.9,6.8,Moderate,2466,Heavy,Former,1,1
1,25,Male,White,27.6,85.5,95.8,122,94,178.5,55.6,155.9,27.1,3.5,High,3105,Moderate,Never,1,0
2,56,Male,White,35.3,83.8,145.2,178,94,182.1,63.9,133.9,48.9,5.1,High,2434,Heavy,Never,1,0
3,54,Female,White,39.8,74.9,105.8,145,64,160.0,42.5,91.2,38.9,3.8,High,3198,,Current,0,0
4,50,Female,White,32.0,84.9,150.6,116,79,194.4,64.2,76.2,38.7,6.9,High,2205,Heavy,Never,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,36,Male,White,37.6,114.7,126.8,142,62,207.3,58.4,105.2,11.0,4.1,Low,3916,Heavy,Former,0,0
1996,38,Female,Hispanic,35.9,104.7,151.4,93,72,274.4,76.3,139.1,48.8,6.2,Low,1709,Moderate,Former,0,1
1997,41,Female,Asian,27.4,83.0,135.1,110,111,234.5,34.7,142.4,22.0,7.4,Low,1914,Moderate,Never,0,0
1998,20,Male,Hispanic,37.9,101.3,141.4,102,64,292.6,62.0,117.2,96.0,4.1,Low,3342,,Never,0,1


In [51]:
# Training target: the HbA1c values for the training rows
y_train

0       12.9
1        7.1
2       11.1
3        4.5
4       11.3
        ... 
7995     6.0
7996     6.7
7997     6.9
7998     7.0
7999     5.1
Name: HbA1c, Length: 8000, dtype: float64

In [52]:
# Test target: the HbA1c values for the held-out rows
y_test


0       14.3
1       13.3
2       14.0
3        4.3
4        6.9
        ... 
1995     8.3
1996     5.8
1997     6.8
1998     9.4
1999     8.4
Name: HbA1c, Length: 2000, dtype: float64

# Handling missing values in categorical columns

In [53]:
# Names of the object-dtype (text) feature columns in the training split
cat_col = x_train.select_dtypes(include=['object']).columns
cat_col

Index(['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption',
       'Smoking_Status'],
      dtype='object')

In [54]:
# Count the missing values in every training column
x_train.isna().sum()

Age                                 0
Sex                                 0
Ethnicity                           0
BMI                                 0
Waist_Circumference                 0
Fasting_Blood_Glucose               0
Blood_Pressure_Systolic             0
Blood_Pressure_Diastolic            0
Cholesterol_Total                   0
Cholesterol_HDL                     0
Cholesterol_LDL                     0
GGT                                 0
Serum_Urate                         0
Physical_Activity_Level             0
Dietary_Intake_Calories             0
Alcohol_Consumption              2631
Smoking_Status                      0
Family_History_of_Diabetes          0
Previous_Gestational_Diabetes       0
dtype: int64

In [55]:
# Share of missing values per training column, expressed as a percentage
x_train.isna().mean().mul(100)

Age                               0.0000
Sex                               0.0000
Ethnicity                         0.0000
BMI                               0.0000
Waist_Circumference               0.0000
Fasting_Blood_Glucose             0.0000
Blood_Pressure_Systolic           0.0000
Blood_Pressure_Diastolic          0.0000
Cholesterol_Total                 0.0000
Cholesterol_HDL                   0.0000
Cholesterol_LDL                   0.0000
GGT                               0.0000
Serum_Urate                       0.0000
Physical_Activity_Level           0.0000
Dietary_Intake_Calories           0.0000
Alcohol_Consumption              32.8875
Smoking_Status                    0.0000
Family_History_of_Diabetes        0.0000
Previous_Gestational_Diabetes     0.0000
dtype: float64

In [56]:
# Category counts for Alcohol_Consumption before the missing values are filled
# (value_counts leaves NaN out by default, so only the observed levels are listed)
x_train['Alcohol_Consumption'].value_counts()

Alcohol_Consumption
Moderate    2690
Heavy       2679
Name: count, dtype: int64

In [57]:
# Treat a missing Alcohol_Consumption as its own category by filling it with "unknown".
# The imputer is fitted on the training split only and then reused for the test split.
# NOTE(review): the filled group is about as large as the Moderate and Heavy groups, which may
# mean the raw CSV holds a literal "None" level that read_csv parsed as NaN — confirm against
# the raw file.
alcohol_col = ['Alcohol_Consumption']
simple_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
simple_imputer.fit(x_train[alcohol_col])
x_train[alcohol_col] = simple_imputer.transform(x_train[alcohol_col])
x_test[alcohol_col] = simple_imputer.transform(x_test[alcohol_col])

In [58]:
# Category counts for Alcohol_Consumption after the fill: "unknown" now shows up as a level
x_train['Alcohol_Consumption'].value_counts()

Alcohol_Consumption
Moderate    2690
Heavy       2679
unknown     2631
Name: count, dtype: int64

In [59]:
# Confirm that no missing values remain in the training features after the fill
x_train.isna().sum()

Age                              0
Sex                              0
Ethnicity                        0
BMI                              0
Waist_Circumference              0
Fasting_Blood_Glucose            0
Blood_Pressure_Systolic          0
Blood_Pressure_Diastolic         0
Cholesterol_Total                0
Cholesterol_HDL                  0
Cholesterol_LDL                  0
GGT                              0
Serum_Urate                      0
Physical_Activity_Level          0
Dietary_Intake_Calories          0
Alcohol_Consumption              0
Smoking_Status                   0
Family_History_of_Diabetes       0
Previous_Gestational_Diabetes    0
dtype: int64

# encoding categorical columns

In [60]:
# Collect the object-dtype columns again; these are the columns handed to the encoder
cat_col = x_train.select_dtypes(include=['object']).columns
cat_col

Index(['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption',
       'Smoking_Status'],
      dtype='object')

In [61]:
# Inspect each categorical column in depth: its name, the number of distinct categories,
# the categories themselves and how often each one occurs, followed by a separator line
for col in cat_col:
    column = x_train[col]
    print(
        col,
        column.nunique(),
        column.unique(),
        column.value_counts(),
        "*" * 100,
        sep="\n",
    )

Sex
2
['Female' 'Male']
Sex
Male      4027
Female    3973
Name: count, dtype: int64
****************************************************************************************************
Ethnicity
4
['Hispanic' 'White' 'Black' 'Asian']
Ethnicity
Black       2020
Hispanic    2011
Asian       2000
White       1969
Name: count, dtype: int64
****************************************************************************************************
Physical_Activity_Level
3
['Low' 'High' 'Moderate']
Physical_Activity_Level
Low         2693
High        2687
Moderate    2620
Name: count, dtype: int64
****************************************************************************************************
Alcohol_Consumption
3
['unknown' 'Heavy' 'Moderate']
Alcohol_Consumption
Moderate    2690
Heavy       2679
unknown     2631
Name: count, dtype: int64
****************************************************************************************************
Smoking_Status
3
['Never' 'Current' 'Former']
Smoking_St

In [62]:
# One-hot encode the categorical columns. drop='first' leaves out the first level of every
# column and sparse_output=False returns a dense array. The encoder learns its categories
# from the training split only and is then applied to both splits.
# NOTE(review): Physical_Activity_Level (Low / Moderate / High) looks ordinal rather than
# nominal — confirm that one-hot encoding is the intended treatment for it.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit(x_train[cat_col])
ohe_arr_train = ohe.transform(x_train[cat_col])
ohe_arr_test = ohe.transform(x_test[cat_col])


In [63]:
# Encoded training matrix returned by the one-hot encoder
ohe_arr_train

array([[0., 0., 1., ..., 1., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 1.],
       [1., 1., 0., ..., 0., 0., 1.]], shape=(8000, 10))

In [64]:
# Encoded test matrix returned by the one-hot encoder
ohe_arr_test

array([[1., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 1., 0., 1.],
       [1., 1., 0., ..., 0., 0., 1.]], shape=(2000, 10))

In [65]:
# Wrap the encoded training array in a DataFrame labelled with the encoder's feature names.
# x_train's index is passed explicitly so that the axis=1 concat with x_train aligns
# row-for-row by construction, instead of relying on both frames happening to share the
# same default 0..n-1 index.
ohe_arr_train = pd.DataFrame(
    ohe_arr_train,
    index=x_train.index,
    columns=ohe.get_feature_names_out(),
)
ohe_arr_train

Unnamed: 0,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
7995,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7996,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7997,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7998,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [66]:
# Wrap the encoded test array in a DataFrame labelled with the encoder's feature names.
# x_test's index is passed explicitly so that the axis=1 concat with x_test aligns
# row-for-row by construction, instead of relying on both frames happening to share the
# same default 0..n-1 index.
ohe_arr_test = pd.DataFrame(
    ohe_arr_test,
    index=x_test.index,
    columns=ohe.get_feature_names_out(),
)
ohe_arr_test

Unnamed: 0,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1995,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1996,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1997,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1998,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [67]:
# Swap the raw categorical columns of x_train for their one-hot encoded counterparts.
# Not re-runnable as written: x_train is overwritten, so a second run fails on the
# already-dropped columns.
raw_cat_columns = ['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption', 'Smoking_Status']
x_train = pd.concat([x_train.drop(columns=raw_cat_columns), ohe_arr_train], axis=1)
x_train

Unnamed: 0,Age,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,...,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,59,32.6,93.3,113.7,110,85,171.2,61.4,103.3,62.6,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,67,30.2,89.3,174.1,114,75,229.5,71.9,113.1,39.6,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,21.5,89.4,92.0,172,112,248.8,78.1,116.0,33.7,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,38,35.1,90.9,105.5,139,74,279.9,76.6,176.9,11.4,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,60,24.0,86.0,74.5,163,83,159.0,74.3,123.2,44.6,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,57,23.4,82.7,154.2,107,117,258.9,30.4,74.8,50.7,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7996,27,30.8,102.3,106.2,134,102,294.9,69.4,90.5,29.5,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7997,59,26.1,87.6,75.0,176,104,264.4,46.5,185.1,91.1,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7998,61,23.7,119.3,93.1,119,115,252.2,60.7,114.5,57.3,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [68]:
# Swap the raw categorical columns of x_test for their one-hot encoded counterparts.
# Not re-runnable as written: x_test is overwritten, so a second run fails on the
# already-dropped columns.
raw_cat_columns = ['Sex', 'Ethnicity', 'Physical_Activity_Level', 'Alcohol_Consumption', 'Smoking_Status']
x_test = pd.concat([x_test.drop(columns=raw_cat_columns), ohe_arr_test], axis=1)
x_test

Unnamed: 0,Age,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,...,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,50,27.7,107.9,172.6,149,117,281.8,36.2,152.1,64.9,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,25,27.6,85.5,95.8,122,94,178.5,55.6,155.9,27.1,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,56,35.3,83.8,145.2,178,94,182.1,63.9,133.9,48.9,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,54,39.8,74.9,105.8,145,64,160.0,42.5,91.2,38.9,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,50,32.0,84.9,150.6,116,79,194.4,64.2,76.2,38.7,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,36,37.6,114.7,126.8,142,62,207.3,58.4,105.2,11.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1996,38,35.9,104.7,151.4,93,72,274.4,76.3,139.1,48.8,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1997,41,27.4,83.0,135.1,110,111,234.5,34.7,142.4,22.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1998,20,37.9,101.3,141.4,102,64,292.6,62.0,117.2,96.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


# Feature Scaling

In [69]:
# Every column of the encoded training frame is selected for scaling
# (this includes the 0/1 indicator columns produced by the one-hot encoder)
scaled_columns = x_train.columns
scaled_columns

Index(['Age', 'BMI', 'Waist_Circumference', 'Fasting_Blood_Glucose',
       'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic',
       'Cholesterol_Total', 'Cholesterol_HDL', 'Cholesterol_LDL', 'GGT',
       'Serum_Urate', 'Dietary_Intake_Calories', 'Family_History_of_Diabetes',
       'Previous_Gestational_Diabetes', 'Sex_Male', 'Ethnicity_Black',
       'Ethnicity_Hispanic', 'Ethnicity_White', 'Physical_Activity_Level_Low',
       'Physical_Activity_Level_Moderate', 'Alcohol_Consumption_Moderate',
       'Alcohol_Consumption_unknown', 'Smoking_Status_Former',
       'Smoking_Status_Never'],
      dtype='object')

In [70]:
# Feature scaling with RobustScaler: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both the training and the test split.
# NOTE(review): scaled_columns covers the binary / one-hot indicator columns as well, so they
# are rescaled too (Sex_Male is displayed as -1.0 / 0.0 afterwards). Consider limiting the
# scaler to the continuous features.
from sklearn.preprocessing import RobustScaler
rc = RobustScaler().fit(x_train[scaled_columns])
x_train[scaled_columns] = rc.transform(x_train[scaled_columns])
x_test[scaled_columns] = rc.transform(x_test[scaled_columns])

In [71]:
# Training features after scaling
x_train

Unnamed: 0,Age,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,...,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,0.56,0.295238,-0.066802,-0.316109,-0.533333,-0.172414,-0.728745,0.242187,-0.481481,0.159292,...,-1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,0.88,0.066667,-0.228745,0.601824,-0.444444,-0.517241,0.058030,0.652344,-0.330247,-0.349558,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.68,-0.761905,-0.224696,-0.645897,0.844444,0.758621,0.318489,0.894531,-0.285494,-0.480088,...,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.28,0.533333,-0.163968,-0.440729,0.111111,-0.551724,0.738192,0.835937,0.654321,-0.973451,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.60,-0.523810,-0.362348,-0.911854,0.644444,-0.241379,-0.893387,0.746094,-0.174383,-0.238938,...,-1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.48,-0.580952,-0.495951,0.299392,-0.600000,0.931034,0.454791,-0.968750,-0.921296,-0.103982,...,-1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7996,-0.72,0.123810,0.297571,-0.430091,0.000000,0.413793,0.940621,0.554687,-0.679012,-0.573009,...,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7997,0.56,-0.323810,-0.297571,-0.904255,0.933333,0.482759,0.529015,-0.339844,0.780864,0.789823,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7998,0.64,-0.552381,0.985830,-0.629179,-0.333333,0.862069,0.364372,0.214844,-0.308642,0.042035,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [72]:
# Test features after applying the scaler fitted on the training split
x_test

Unnamed: 0,Age,BMI,Waist_Circumference,Fasting_Blood_Glucose,Blood_Pressure_Systolic,Blood_Pressure_Diastolic,Cholesterol_Total,Cholesterol_HDL,Cholesterol_LDL,GGT,...,Sex_Male,Ethnicity_Black,Ethnicity_Hispanic,Ethnicity_White,Physical_Activity_Level_Low,Physical_Activity_Level_Moderate,Alcohol_Consumption_Moderate,Alcohol_Consumption_unknown,Smoking_Status_Former,Smoking_Status_Never
0,0.20,-0.171429,0.524291,0.579027,0.333333,0.931034,0.763833,-0.742187,0.271605,0.210177,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.80,-0.180952,-0.382591,-0.588146,-0.266667,0.137931,-0.630229,0.015625,0.330247,-0.626106,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.44,0.552381,-0.451417,0.162614,0.977778,0.137931,-0.581646,0.339844,-0.009259,-0.143805,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.36,0.980952,-0.811741,-0.436170,0.244444,-0.896552,-0.879892,-0.496094,-0.668210,-0.365044,...,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.20,0.238095,-0.406883,0.244681,-0.400000,-0.379310,-0.415655,0.351562,-0.899691,-0.369469,...,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.36,0.771429,0.799595,-0.117021,0.177778,-0.965517,-0.241565,0.125000,-0.452160,-0.982301,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1996,-0.28,0.609524,0.394737,0.256839,-0.911111,-0.620690,0.663968,0.824219,0.070988,-0.146018,...,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1997,-0.16,-0.200000,-0.483806,0.009119,-0.533333,0.724138,0.125506,-0.800781,0.121914,-0.738938,...,-1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1998,-1.00,0.800000,0.257085,0.104863,-0.711111,-0.896552,0.909582,0.265625,-0.266975,0.898230,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


In [73]:
!pip install pipreqs



In [74]:
# NOTE(review): the imported module is not referenced in the visible cells; pipreqs is run
# through the shell in the following cell — confirm whether this import is still needed.
import pipreqs

In [75]:
! pipreqs

INFO: Not scanning for jupyter notebooks.
