In [315]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [316]:
# Load the raw records from a CSV file resolved relative to the notebook's
# working directory, then preview the leading rows.
dataset = pd.read_csv('medical_students_dataset.csv')
dataset.head()

Unnamed: 0,Student ID,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,1.0,18.0,Female,161.777924,72.354947,O,27.645835,,95.0,109.0,203.0,No,
1,2.0,,Male,152.069157,47.630941,B,,98.714977,93.0,104.0,163.0,No,No
2,3.0,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,5.0,23.0,Female,,46.234173,O,,98.480008,95.0,,231.0,No,No


In [317]:
# (rows, columns) of the freshly loaded frame.
dataset.shape

(200000, 13)

In [318]:
# Column dtypes and non-null counts, to spot columns with missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Student ID      180000 non-null  float64
 1   Age             180000 non-null  float64
 2   Gender          180000 non-null  object 
 3   Height          180000 non-null  float64
 4   Weight          180000 non-null  float64
 5   Blood Type      180000 non-null  object 
 6   BMI             180000 non-null  float64
 7   Temperature     180000 non-null  float64
 8   Heart Rate      180000 non-null  float64
 9   Blood Pressure  180000 non-null  float64
 10  Cholesterol     180000 non-null  float64
 11  Diabetes        180000 non-null  object 
 12  Smoking         180000 non-null  object 
dtypes: float64(9), object(4)
memory usage: 19.8+ MB


In [319]:
# Number of missing entries in each column of the raw data.
dataset.isna().sum()

Student ID        20000
Age               20000
Gender            20000
Height            20000
Weight            20000
Blood Type        20000
BMI               20000
Temperature       20000
Heart Rate        20000
Blood Pressure    20000
Cholesterol       20000
Diabetes          20000
Smoking           20000
dtype: int64

In [320]:
# Imputer that replaces NaN entries with the per-column median learned at fit time.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')

In [321]:
# Remove the 'Student ID' column before any feature processing.
# `columns=` already identifies the axis, so the redundant `axis=1` is gone.
# Rebinding the result (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run once the column is gone;
# previously a second run raised KeyError.
dataset = dataset.drop(columns = ['Student ID'], errors = 'ignore')

In [322]:
# Partition the column names by dtype, keeping the frame's column order:
# object-typed columns are treated as categorical, everything else as numerical.
categorical_columns = [name for name in dataset.columns if dataset[name].dtype == 'object']
numerical_columns = [name for name in dataset.columns if dataset[name].dtype != 'object']

In [323]:
# Summary statistics of the numerical columns before imputation.
dataset.describe()

Unnamed: 0,Age,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol
count,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0
mean,26.021561,174.947103,69.971585,23.338869,98.600948,79.503767,114.558033,184.486361
std,4.890528,14.44756,17.322574,7.033554,0.50053,11.540755,14.403353,37.559678
min,18.0,150.000041,40.000578,10.074837,96.397835,60.0,90.0,120.0
25%,22.0,162.47611,54.969838,17.858396,98.26475,70.0,102.0,152.0
50%,26.0,174.899914,69.979384,22.671401,98.599654,80.0,115.0,184.0
75%,30.0,187.464417,84.980097,27.997487,98.940543,90.0,127.0,217.0
max,34.0,199.998639,99.999907,44.355113,100.824857,99.0,139.0,249.0


In [324]:
# Fill the NaNs of every numerical column with that column's median.
# NOTE(review): the imputer is fitted on the whole dataset, i.e. before the
# later train/test split, so rows that end up in the test set contribute to
# the medians. Consider fitting on the training portion only.
dataset[numerical_columns] = imputer.fit_transform(dataset[numerical_columns])

In [325]:
# Confirm that only the categorical columns still contain missing values.
dataset.isnull().sum()

Age                   0
Gender            20000
Height                0
Weight                0
Blood Type        20000
BMI                   0
Temperature           0
Heart Rate            0
Blood Pressure        0
Cholesterol           0
Diabetes          20000
Smoking           20000
dtype: int64

In [326]:
# Summary statistics after imputation, for comparison with the pre-imputation table.
dataset.describe()

Unnamed: 0,Age,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,26.019405,174.942384,69.972365,23.272123,98.600818,79.55339,114.60223,184.437725
std,4.639565,13.706162,16.433632,6.675617,0.474845,10.949531,13.66486,35.632528
min,18.0,150.000041,40.000578,10.074837,96.397835,60.0,90.0,120.0
25%,22.0,163.859439,56.626643,18.382809,98.306875,71.0,104.0,156.0
50%,26.0,174.899914,69.979384,22.671401,98.599654,80.0,115.0,184.0
75%,30.0,186.07936,83.316641,27.255521,98.897102,88.0,126.0,213.0
max,34.0,199.998639,99.999907,44.355113,100.824857,99.0,139.0,249.0


In [327]:
# Discard rows that lack any categorical value instead of imputing them.
# categorical_columns still includes the 'Diabetes' target at this point, so
# rows without a label are removed as well.
dataset.dropna(subset = categorical_columns, inplace = True)

In [328]:
# Verify that no column has missing entries left.
dataset.isna().sum()

Age               0
Gender            0
Height            0
Weight            0
Blood Type        0
BMI               0
Temperature       0
Heart Rate        0
Blood Pressure    0
Cholesterol       0
Diabetes          0
Smoking           0
dtype: int64

In [329]:
# (rows, columns) remaining after the incomplete rows were dropped.
dataset.shape

(131263, 12)

In [330]:
# Summary statistics of the rows that survived the dropna step.
dataset.describe()

Unnamed: 0,Age,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol
count,131263.0,131263.0,131263.0,131263.0,131263.0,131263.0,131263.0,131263.0
mean,26.032705,174.911534,69.985388,23.282855,98.601262,79.546841,114.623222,184.442973
std,4.642859,13.708091,16.441834,6.683392,0.474729,10.942657,13.682233,35.683262
min,18.0,150.000041,40.000578,10.074837,96.397835,60.0,90.0,120.0
25%,22.0,163.831119,56.641581,18.395065,98.306793,71.0,104.0,155.0
50%,26.0,174.899914,69.979384,22.671401,98.599654,80.0,115.0,184.0
75%,30.0,186.08607,83.334252,27.274271,98.898043,88.0,126.0,213.0
max,34.0,199.998639,99.999907,44.355113,100.824857,99.0,139.0,249.0


In [331]:
# Preview the leading rows; the index still carries the pre-filter row labels.
dataset.head()

Unnamed: 0,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
1,26.0,Male,152.069157,47.630941,B,22.671401,98.714977,93.0,104.0,163.0,No,No
2,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
3,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
4,23.0,Female,174.899914,46.234173,O,22.671401,98.480008,95.0,115.0,231.0,No,No
7,28.0,Male,186.489402,52.389752,AB,15.063921,98.227788,85.0,123.0,128.0,No,No


In [332]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded
# rather than kept as a column.
dataset = dataset.reset_index(drop = True)

In [333]:
# Preview the leading rows with the renumbered index.
dataset.head()

Unnamed: 0,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Diabetes,Smoking
0,26.0,Male,152.069157,47.630941,B,22.671401,98.714977,93.0,104.0,163.0,No,No
1,32.0,Female,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,Yes,No
2,30.0,Male,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,No,Yes
3,23.0,Female,174.899914,46.234173,O,22.671401,98.480008,95.0,115.0,231.0,No,No
4,28.0,Male,186.489402,52.389752,AB,15.063921,98.227788,85.0,123.0,128.0,No,No


In [334]:
# Label encoder instance used below to map string labels to integers.
le = LabelEncoder()

In [335]:
# Snapshot of the cleaned frame taken before the cells below modify `dataset`.
data = dataset.copy()

In [336]:
# Detach the target: pop() returns the 'Diabetes' column and removes it from
# the feature frame in a single step.
y = dataset.pop('Diabetes')

In [337]:
# Encode the target labels as integers; `y` becomes a NumPy array.
y = le.fit_transform(y)

In [338]:
# Integer-encode the binary feature columns.
# A fresh encoder is created per column instead of refitting the shared `le`:
# refitting overwrote the mapping learned for the target, so a later
# `le.inverse_transform(y)` / `le.classes_` would have described 'Smoking'
# rather than 'Diabetes'. The encoded values written to `dataset` are unchanged.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder/OneHotEncoder are the intended tools for feature columns.
cols1 = ['Gender', 'Smoking']
for column in cols1:
    dataset[column] = LabelEncoder().fit_transform(dataset[column])

In [339]:
# Check that 'Gender' and 'Smoking' now hold integer codes.
dataset.head()

Unnamed: 0,Age,Gender,Height,Weight,Blood Type,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Smoking
0,26.0,1,152.069157,47.630941,B,22.671401,98.714977,93.0,104.0,163.0,0
1,32.0,0,182.537664,55.741083,A,16.729017,98.260293,76.0,130.0,216.0,0
2,30.0,1,182.112867,63.332207,B,19.096042,98.839605,99.0,112.0,141.0,1
3,23.0,0,174.899914,46.234173,O,22.671401,98.480008,95.0,115.0,231.0,0
4,28.0,1,186.489402,52.389752,AB,15.063921,98.227788,85.0,123.0,128.0,0


In [340]:
# Append renamed copies of the two columns so they become the last columns of
# the frame.
for source_name, copy_name in (('Gender', 'Gender_le'), ('Blood Type', 'Blood Type ohe')):
    dataset[copy_name] = dataset[source_name]

In [341]:
# Remove the original columns now that their renamed copies exist.
dataset = dataset.drop(columns = ['Gender', 'Blood Type'])

In [342]:
# Confirm the new column order: 'Blood Type ohe' is now the last column.
dataset.head()

Unnamed: 0,Age,Height,Weight,BMI,Temperature,Heart Rate,Blood Pressure,Cholesterol,Smoking,Gender_le,Blood Type ohe
0,26.0,152.069157,47.630941,22.671401,98.714977,93.0,104.0,163.0,0,1,B
1,32.0,182.537664,55.741083,16.729017,98.260293,76.0,130.0,216.0,0,0,A
2,30.0,182.112867,63.332207,19.096042,98.839605,99.0,112.0,141.0,1,1,B
3,23.0,174.899914,46.234173,22.671401,98.480008,95.0,115.0,231.0,0,0,O
4,28.0,186.489402,52.389752,15.063921,98.227788,85.0,123.0,128.0,0,1,AB


In [343]:
# One-hot encode the blood-type column and pass every other column through
# unchanged; drop='first' omits the first category's indicator column.
# The column is selected by name instead of the hard-coded position 10, so the
# transformer keeps targeting the right column if columns are added, removed
# or reordered upstream. Selection by name requires a DataFrame input, which
# is what the fit_transform call below receives.
ct = ColumnTransformer(
    transformers = [('encoder', OneHotEncoder(drop = 'first', dtype = 'int32'), ['Blood Type ohe'])],
    remainder = 'passthrough'
)

In [344]:
# Fit the column transformer and replace the DataFrame with the encoded
# feature matrix.
# NOTE(review): `dataset` changes from a DataFrame to an array here, so this
# cell should not be run twice without re-running the cells above it.
dataset = np.array(ct.fit_transform(dataset))

In [345]:
# Inspect the encoded feature matrix.
dataset

array([[  0.,   1.,   0., ..., 163.,   0.,   1.],
       [  0.,   0.,   0., ..., 216.,   0.,   0.],
       [  0.,   1.,   0., ..., 141.,   1.,   1.],
       ...,
       [  0.,   0.,   0., ..., 184.,   0.,   1.],
       [  0.,   1.,   0., ..., 130.,   0.,   1.],
       [  0.,   0.,   0., ..., 225.,   0.,   0.]])

In [346]:
# Inspect the encoded target vector.
y

array([0, 1, 0, ..., 1, 0, 0])

In [347]:
# Shuffled 80/20 split, stratified on y and seeded for reproducibility.
split_parts = train_test_split(
    dataset,
    y,
    test_size = 0.2,
    random_state = 101,
    stratify = y,
    shuffle = True,
)
X_train, X_test, y_train, y_test = split_parts

In [348]:
# Show the training feature matrix (abbreviated by NumPy).
print(X_train)

[[  0.   0.   0. ... 122.   0.   1.]
 [  1.   0.   0. ... 211.   0.   0.]
 [  0.   0.   0. ... 150.   0.   0.]
 ...
 [  1.   0.   0. ... 180.   0.   1.]
 [  0.   0.   1. ... 215.   1.   0.]
 [  0.   1.   0. ... 221.   0.   0.]]


In [349]:
# Show the test feature matrix (abbreviated by NumPy).
print(X_test)

[[  0.   0.   1. ... 188.   0.   0.]
 [  0.   1.   0. ... 180.   0.   1.]
 [  1.   0.   0. ... 180.   1.   0.]
 ...
 [  0.   0.   0. ... 178.   0.   1.]
 [  0.   1.   0. ... 153.   1.   1.]
 [  1.   0.   0. ... 232.   0.   1.]]


In [350]:
# Show the training labels.
print(y_train)

[1 0 0 ... 0 0 0]


In [351]:
# Show the test labels.
print(y_test)

[0 0 0 ... 0 0 1]


In [352]:
# Standardiser used below on the continuous feature columns.
scaler = StandardScaler()

In [353]:
# Look at one unscaled training row to see which positions hold which kind of value.
X_train[0]

array([  0.        ,   0.        ,   0.        ,  21.        ,
       193.0393267 ,  78.02513837,  22.67140145,  99.41422002,
        81.        , 106.        , 122.        ,   0.        ,
         1.        ])

In [354]:
# Standardise columns 3 through 10 in place. The scaler is fitted on the
# training rows only and then reused, unchanged, for the test rows.
scaled_columns = slice(3, 11)
X_train[:, scaled_columns] = scaler.fit_transform(X_train[:, scaled_columns])
X_test[:, scaled_columns] = scaler.transform(X_test[:, scaled_columns])

In [355]:
# Inspect the training matrix after scaling.
X_train

array([[ 0.        ,  0.        ,  0.        , ..., -1.75273775,
         0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.74286307,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.9676049 ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.12639115,
         0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.85502491,
         1.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        , ...,  1.02326766,
         0.        ,  0.        ]])

In [356]:
# Inspect the test matrix after scaling with the training-set statistics.
X_test

array([[ 0.        ,  0.        ,  1.        , ...,  0.09793252,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        , ..., -0.12639115,
         0.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        , ..., -0.12639115,
         1.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.18247206,
         0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.        , ..., -0.88348353,
         1.        ,  1.        ],
       [ 1.        ,  0.        ,  0.        , ...,  1.3317127 ,
         0.        ,  1.        ]])