In [20]:
import pandas as pd
import numpy as np

# Load the dataset; the file uses ';' as its field delimiter.
data = pd.read_csv(
    'sleep_health_clean.csv',
    delimiter=';',
)

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the text columns are left out of the table by default.
data.describe()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Systolic Blood Pressure,Diastolic Blood Pressure,Heart Rate,Daily Steps
count,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0,374.0
mean,187.5,42.184492,7.132086,7.312834,59.171123,5.385027,128.553476,84.649733,70.165775,6816.84492
std,108.108742,8.673133,0.795657,1.196956,20.830804,1.774526,7.748118,6.161611,4.135676,1617.915679
min,1.0,27.0,5.8,4.0,30.0,3.0,115.0,75.0,65.0,3000.0
25%,94.25,35.25,6.4,6.0,45.0,4.0,125.0,80.0,68.0,5600.0
50%,187.5,43.0,7.2,7.0,60.0,5.0,130.0,85.0,70.0,7000.0
75%,280.75,50.0,7.8,8.0,75.0,7.0,135.0,90.0,72.0,8000.0
max,374.0,59.0,8.5,9.0,90.0,8.0,142.0,95.0,86.0,10000.0


In [22]:
# Show the column labels of the loaded frame.
data.columns

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Systolic Blood Pressure', 'Diastolic Blood Pressure',
       'Heart Rate', 'Daily Steps', 'Sleep Disorder'],
      dtype='object')

In [23]:
# For every column print its name, then how many distinct values it holds
# (a missing value counts as one distinct value), then an empty line.
for col in data.columns:
    print(col, data[col].nunique(dropna=False), "", sep="\n")

Person ID
374

Gender
2

Age
31

Occupation
11

Sleep Duration
27

Quality of Sleep
6

Physical Activity Level
16

Stress Level
6

BMI Category
4

Systolic Blood Pressure
18

Diastolic Blood Pressure
17

Heart Rate
19

Daily Steps
20

Sleep Disorder
3



In [24]:
# Give rows with a missing "Sleep Disorder" the explicit string label "None" so
# that "no disorder" becomes an ordinary class for the later one-hot encoding.
# NOTE(review): presumably the CSV stores the literal text "None", which
# read_csv parses as a missing value by default — confirm against the raw file.
# fillna states the intent directly instead of relying on replace() matching a
# None entry against NaN.
data["Sleep Disorder"] = data["Sleep Disorder"].fillna("None")
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Systolic Blood Pressure,Diastolic Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,2,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,
1,3,Male,28,Doctor,6.2,6,60,8,Normal,125,80,75,10000,
2,34,Male,31,Doctor,6.1,6,30,8,Normal,125,80,72,5000,
3,36,Male,31,Doctor,6.1,6,30,8,Normal,125,80,72,5000,
4,37,Male,31,Doctor,6.1,6,30,8,Normal,125,80,72,5000,


In [25]:
# Remove the per-row identifier (374 distinct values over 374 rows) and the
# diastolic reading from the working frame.
# NOTE(review): the reason for dropping diastolic pressure is not stated here —
# presumably redundancy with the systolic reading; confirm.
# Rebinding the name instead of using inplace=True avoids mutating the object
# that earlier cells already displayed.
data = data.drop(columns=["Person ID", "Diastolic Blood Pressure"])


In [26]:

# Target: the sleep-disorder label. Features: every remaining column.
y = data.loc[:, "Sleep Disorder"]
X = data.drop("Sleep Disorder", axis=1)

In [27]:
from sklearn.preprocessing import OrdinalEncoder

# Column labels of the feature frame, grouped by dtype family.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

# Ordinal encoders with an explicit category order, emitting int8 codes.
# NOTE(review): "Normal" and "Normal Weight" receive different codes (0 and 1);
# confirm they are meant to be distinct categories.
gender_encoder = OrdinalEncoder(
    categories=[["Female", "Male"]],
    dtype=np.int8,
)
bmi_encoder = OrdinalEncoder(
    categories=[["Normal", "Normal Weight", "Overweight", "Obese"]],
    dtype=np.int8,
)

# NOTE(review): these assignments rewrite columns of `data` only. X was split
# off `data` in the previous cell and still holds the original strings; the
# ColumnTransformer cell below applies the same encodings to X.
data["Gender"] = gender_encoder.fit_transform(data[["Gender"]].to_numpy())
data["BMI Category"] = bmi_encoder.fit_transform(data[["BMI Category"]].to_numpy())

data.head()

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Encode the three text features and pass every other column through untouched.
# Output column order: one-hot Occupation columns, BMI code, Gender code, then
# the remaining (numeric) columns in their original order.
# NOTE(review): "Normal" and "Normal Weight" are encoded as different ordinal
# levels — confirm they are meant to be distinct.
ct = ColumnTransformer(
    transformers=[
        ("one_hot_encoder", OneHotEncoder(), ["Occupation"]),
        (
            "bmi_encoder",
            OrdinalEncoder(
                categories=[["Normal", "Normal Weight", "Overweight", "Obese"]],
                dtype=np.int8,
            ),
            ["BMI Category"],
        ),
        (
            "gender_encoder",
            OrdinalEncoder(categories=[["Female", "Male"]], dtype=np.int8),
            ["Gender"],
        ),
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold the result turns
    # into a SciPy sparse matrix whenever the stacked output is sparse enough,
    # and the following cell wraps the result in a DataFrame.
    sparse_threshold=0,
)
X = ct.fit_transform(X)


In [29]:
# Wrap the transformed array back into a DataFrame, labelling the columns with
# the names reported by the fitted transformer.
cols = ct.get_feature_names_out()
X = pd.DataFrame(X, columns=cols)

In [30]:
# Sanity check: the target is still one-dimensional at this point.
y.shape

(374,)

In [31]:
# Turn the 1-D label Series into a single-column 2-D array for the encoder.
y = y.to_numpy().reshape(-1, 1)

In [32]:

# One-hot encode the label column: learn the categories, then transform.
# The result is densified with .toarray() in the next cell.
y_encoder = OneHotEncoder()
y = y_encoder.fit(y).transform(y)



In [33]:
# Densify the encoder output (presumably a SciPy sparse matrix, the
# OneHotEncoder default — confirm for the installed scikit-learn version).
y = y.toarray()

In [34]:
# Label the one-hot target columns with the names reported by the encoder.
y = pd.DataFrame(
    data=y,
    columns=y_encoder.get_feature_names_out(),
)

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [36]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# Robust-scale the last feature column (Daily Steps, the final passthrough
# column). The scaler is fitted on the training rows only and then reused
# unchanged on the test rows.
daily_steps_scaler = RobustScaler()

X_train.iloc[:, [-1]] = daily_steps_scaler.fit_transform(
    X_train.iloc[:, [-1]].to_numpy()
)
X_test.iloc[:, [-1]] = daily_steps_scaler.transform(
    X_test.iloc[:, [-1]].to_numpy()
)

In [37]:
# Min-max scale the numeric passthrough features other than Daily Steps (which
# is robust-scaled separately). The last eight columns are, in order: Age,
# Sleep Duration, Quality of Sleep, Physical Activity Level, Stress Level,
# Systolic Blood Pressure, Heart Rate, Daily Steps.
# The slice end is exclusive, so -8:-1 covers Age through Heart Rate; the
# previous -8:-2 stopped before Heart Rate and left it unscaled.
# The scaler is fitted on the training rows only and reused on the test rows.
scaler = MinMaxScaler()

X_train.iloc[:, -8:-1] = scaler.fit_transform(X_train.iloc[:, -8:-1])
X_test.iloc[:, -8:-1] = scaler.transform(X_test.iloc[:, -8:-1])

In [38]:
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
Path('data').mkdir(parents=True, exist_ok=True)

# Persist the four splits without the index column.
X_train.to_csv('data/X_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)