#### Installing necessary libraries

In [1]:
%pip install pandas openpyxl scikit-learn ydata-profiling pycaret

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install tensorflow keras

Note: you may need to restart the kernel to use updated packages.


<center><h1><b>Importing the dataset</b></h1><center>

In [3]:
import pandas as pd

data = pd.read_csv('data/Sleep_Efficiency.csv')
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


<center><h1><b>Exploratory Data Analysis (EDA)</b></h1><center>

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

In [5]:
data.describe()

Unnamed: 0,ID,Age,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Exercise frequency
count,452.0,452.0,452.0,452.0,452.0,452.0,452.0,432.0,427.0,438.0,446.0
mean,226.5,40.285398,7.465708,0.788916,22.615044,52.823009,24.561947,1.641204,23.653396,1.173516,1.79148
std,130.625419,13.17225,0.866625,0.135237,3.525963,15.654235,15.313665,1.356762,30.202785,1.621377,1.428134
min,1.0,9.0,5.0,0.5,15.0,18.0,7.0,0.0,0.0,0.0,0.0
25%,113.75,29.0,7.0,0.6975,20.0,48.25,15.0,1.0,0.0,0.0,0.0
50%,226.5,40.0,7.5,0.82,22.0,58.0,18.0,1.0,25.0,0.0,2.0
75%,339.25,52.0,8.0,0.9,25.0,63.0,32.5,3.0,50.0,2.0,3.0
max,452.0,69.0,10.0,0.99,30.0,75.0,63.0,4.0,200.0,5.0,5.0


In [6]:
data.columns

Index(['ID', 'Age', 'Gender', 'Bedtime', 'Wakeup time', 'Sleep duration',
       'Sleep efficiency', 'REM sleep percentage', 'Deep sleep percentage',
       'Light sleep percentage', 'Awakenings', 'Caffeine consumption',
       'Alcohol consumption', 'Smoking status', 'Exercise frequency'],
      dtype='object')

In [7]:
import ydata_profiling as ydp

ydp.ProfileReport(data)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



<center><h1><b>Preprocessing</b></h1><center>

#### Handling null and duplicate values

In [8]:
# data.isnull().sum()

In [9]:
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

data.isnull().sum()

ID                        0
Age                       0
Gender                    0
Bedtime                   0
Wakeup time               0
Sleep duration            0
Sleep efficiency          0
REM sleep percentage      0
Deep sleep percentage     0
Light sleep percentage    0
Awakenings                0
Caffeine consumption      0
Alcohol consumption       0
Smoking status            0
Exercise frequency        0
dtype: int64

#### Label encoding the Gender and Alcohol Consumption columns

In [10]:
data['Gender'] = data['Gender'].replace({'Male': 1, 'Female': 0})
data['Smoking status'] = data['Smoking status'].replace({'Yes': 1, 'No': 0})

#### Converting date to numerical format

In [11]:
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,1,3.0
1,2,69,1,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,1,3.0
2,3,40,0,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,0,3.0
3,4,40,0,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,1,1.0
4,5,57,1,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,0,3.0


In [12]:
import datetime as dt

data['Bedtime'] = pd.to_datetime(data['Bedtime']).dt.time
data['Bedtime'] = data['Bedtime'].apply(lambda x: dt.datetime.combine(dt.date(1, 1, 1), x).toordinal())

data['Wakeup time'] = pd.to_datetime(data['Wakeup time']).dt.time
data['Wakeup time'] = data['Wakeup time'].apply(lambda x: dt.datetime.combine(dt.date(1, 1, 1), x).toordinal())

data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,1,1,6.0,0.88,18,70,12,0.0,0.0,0.0,1,3.0
1,2,69,1,1,1,7.0,0.66,19,28,53,3.0,0.0,3.0,1,3.0
2,3,40,0,1,1,8.0,0.89,20,70,10,1.0,0.0,0.0,0,3.0
3,4,40,0,1,1,6.0,0.51,23,25,52,3.0,50.0,5.0,1,1.0
4,5,57,1,1,1,8.0,0.76,27,55,18,3.0,0.0,3.0,0,3.0


<center><h1><b>Training the Models</b></h1><center>

### Splitting the data

In [13]:
# feature selection
X = data[['Age', 'Gender', 'Sleep duration',
       'REM sleep percentage', 'Deep sleep percentage', 'Light sleep percentage', 
       'Awakenings', 'Caffeine consumption']]

# target variable
y = data['Sleep efficiency']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((310, 8), (78, 8), (310,), (78,))

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

<h3><b>A) Classic Machine Learning Algorithm : Linear Regression</b></h3>

In [16]:
import numpy as np
y_test = np.array(y_test)

In [17]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

#### Evaluation

In [18]:
print("Linear Regression Score: {:.2f}\n".format(lr_model.score(X_test, y_test)))

y_pred_lr = lr_model.predict(X_test)

lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mape = np.mean(np.abs((y_test - y_pred_lr) / y_test)) * 100

print(f"Mean Absolute Error (MAE): {lr_mae.round(3)}")
print(f"Mean Squared Error (MSE): {lr_mse.round(3)}")
print(f"Root Mean Squared Error (RMSE): {lr_rmse.round(3)}")
print(f"R-squared (R²): {lr_r2.round(3)}")
print(f"Mean Absolute Percentage Error (MAPE): {lr_mape.round(3)}")

Linear Regression Score: 0.77

Mean Absolute Error (MAE): 0.054
Mean Squared Error (MSE): 0.004
Root Mean Squared Error (RMSE): 0.066
R-squared (R²): 0.771
Mean Absolute Percentage Error (MAPE): 7.59


<h3><b>B) Ensemble Model : Random Forest</b></h3>

In [19]:
from sklearn.ensemble import RandomForestRegressor

rf_model  = RandomForestRegressor(n_jobs=-1, random_state=123)
rf_model.fit(X_train, y_train)

#### Evaluation

In [20]:
print("Random Forest Regressor Score: {:.2f}\n".format(rf_model.score(X_test, y_test)))

y_pred_rf = rf_model.predict(X_test)

rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mape = np.mean(np.abs((y_test - y_pred_rf) / y_test)) * 100

print(f"Mean Absolute Error (lr_MAE): {rf_mae.round(3)}")
print(f"Mean Squared Error (MSE): {rf_mse.round(3)}")
print(f"Root Mean Squared Error (RMEf): {rf_rmse.round(3)}")
print(f"R-squared (R²): {rf_r2.round(3)}")
print(f"Mean Absolute Percentage Error (MAPE): {rf_mape.round(3)}")

Random Forest Regressor Score: 0.87

Mean Absolute Error (lr_MAE): 0.041
Mean Squared Error (MSE): 0.002
Root Mean Squared Error (RMEf): 0.049
R-squared (R²): 0.87
Mean Absolute Percentage Error (MAPE): 5.66


<h3><b>C) Custom Neural Network</b></h3>

In [21]:
import keras
from keras.models import Sequential
from keras.layers import Dense




In [22]:
input_shape = X.shape[1]

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(input_shape,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')





In [23]:
epochs = 30
model.fit(X_train, y_train, epochs=epochs, verbose=1)

Epoch 1/30

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x285160a5ed0>

In [24]:
loss_val = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
loss_val



0.02703346125781536

<center><h1><b>Saving the best model</b></h1><center>

In [25]:
import pickle

with open('model/rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)