<a href="https://colab.research.google.com/github/ankirani/Sleep-and-Health-Lifestyle/blob/main/Sleep_Health_and_Lifestyle_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session to keep cell
# output compact. NOTE(review): this also hides deprecation and dtype warnings
# raised by pandas/scikit-learn; consider narrowing the filter.
warnings.filterwarnings('ignore')

### Load the dataset

In [None]:
# Location of the raw CSV inside the Colab runtime.
DATASET_PATH = '/content/Sleep_health_and_lifestyle_dataset.csv'

# Read the file into the working frame and preview its first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [None]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


*   Numerical columns: Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity Level, Stress Level, Heart Rate, Daily Steps
*   Categorical columns: Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder


### Check for missing values

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Unnamed: 0,0
Person ID,0
Gender,0
Age,0
Occupation,0
Sleep Duration,0
Quality of Sleep,0
Physical Activity Level,0
Stress Level,0
BMI Category,0
Blood Pressure,0


*   Sleep Disorder has 219 missing values (374 rows − 155 non-null entries reported by `df.info()`).



### Fill the missing value

In [None]:
# Check the categories present in the Sleep Disorder variable; NaN marks the
# rows that have no recorded value.
df['Sleep Disorder'].unique()

array([nan, 'Sleep Apnea', 'Insomnia'], dtype=object)

In [None]:
# Replace missing Sleep Disorder entries with an explicit 'Good Sleep' label,
# then write the completed column back onto the frame.
sleep_disorder_filled = df['Sleep Disorder'].fillna('Good Sleep')
df['Sleep Disorder'] = sleep_disorder_filled

*   Missing values are filled with a new "Good Sleep" category.



### Check for duplicate data

In [None]:
# `Person ID` appears to be a per-row identifier, so comparing entire rows can
# never flag a repeat: in the `df.head()` preview, IDs 2 and 3 (and 4 and 5)
# agree on every other column yet are not counted. Count duplicates over the
# remaining columns instead.
non_id_columns = [c for c in df.columns if c != 'Person ID']
df.duplicated(subset=non_id_columns).sum()

np.int64(0)

### Detect Outliers using Z-Score (for Numerical Variables)

In [None]:
from scipy.stats import zscore

# Numeric features screened for outliers (the `Person ID` column is left out).
numerical_columns = [
    'Age',
    'Sleep Duration',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
    'Daily Steps',
]

# Absolute z-score above which a value is treated as an outlier.
threshold = 3

# Keep only rows where every screened column is populated.
df_clean = df.dropna(subset=numerical_columns)

# Column-wise z-scores for the screened features.
z_scores = df_clean[numerical_columns].apply(zscore)

# True for each cell whose |z| exceeds the threshold.
outliers_mask = z_scores.abs().gt(threshold)

# A row is reported as soon as any one of its columns is flagged.
row_has_outlier = outliers_mask.any(axis=1)
outliers_combined = df_clean.loc[row_has_outlier]

# Show the flagged rows.
print(outliers_combined)


     Person ID  Gender  Age            Occupation  Sleep Duration  \
3            4    Male   28  Sales Representative             5.9   
4            5    Male   28  Sales Representative             5.9   
5            6    Male   28     Software Engineer             5.9   
93          94    Male   35                Lawyer             7.4   
145        146  Female   38                Lawyer             7.4   
264        265    Male   48                Doctor             7.3   
266        267    Male   48                Doctor             7.3   
276        277    Male   49                Doctor             8.1   
277        278    Male   49                Doctor             8.1   

     Quality of Sleep  Physical Activity Level  Stress Level BMI Category  \
3                   4                       30             8        Obese   
4                   4                       30             8        Obese   
5                   4                       30             8        Obese   
9

* In this preprocessing step, I used the Z-score to detect outliers in each numerical variable.
* The Z-score tells how many standard deviations away a data point is from the mean: $z = \frac{x - \mu}{\sigma}$, where $\mu$ is the column mean and $\sigma$ is the column standard deviation.
* Data points with a Z-score greater than 3 or less than -3 are considered outliers.
* After applying the Z-score, outliers were found in the Heart Rate variable, affecting a total of 9 rows.



### Drop the outliers

In [None]:
# A row is kept only when every screened column lies within the threshold.
within_limits = z_scores.abs().le(threshold)
non_outliers_mask = within_limits.all(axis=1)

# Take an independent copy of the surviving rows so later edits do not touch
# df_clean.
df_no_outliers = df_clean.loc[non_outliers_mask].copy()

# (rows, columns) remaining after the outliers are removed.
print(df_no_outliers.shape)

(365, 14)


In [None]:
# Bare expression so the notebook renders the preview as a table, matching the
# `df.head()` cell used right after loading, instead of wrapped plain text.
df_no_outliers.head()

   Person ID Gender  Age         Occupation  Sleep Duration  Quality of Sleep  \
0          1   Male   27  Software Engineer             6.1                 6   
1          2   Male   28             Doctor             6.2                 6   
2          3   Male   28             Doctor             6.2                 6   
6          7   Male   29            Teacher             6.3                 6   
7          8   Male   29             Doctor             7.8                 7   

   Physical Activity Level  Stress Level BMI Category Blood Pressure  \
0                       42             6   Overweight         126/83   
1                       60             8       Normal         125/80   
2                       60             8       Normal         125/80   
6                       40             7        Obese         140/90   
7                       75             6       Normal         120/80   

   Heart Rate  Daily Steps Sleep Disorder  z_score_heart_rate  
0          77   

In [None]:
# `z_score_heart_rate` is not created by any visible cell in this notebook; it
# only exists when left over from an earlier kernel session. A plain drop would
# therefore raise KeyError on Restart & Run All. errors='ignore' removes the
# column when it is present and is a no-op otherwise.
df = df_no_outliers.drop(columns=['z_score_heart_rate'], errors='ignore')

### Convert categorical variables into numerical using Label Encoding

In [None]:
# Columns stored with object dtype are the categorical features.
categorical_cols = df.select_dtypes(include='object').columns

# Print the frequency of each category, counting NaN as its own bucket.
for column_name in categorical_cols:
    counts = df[column_name].value_counts(dropna=False)
    print(f"\n{column_name} Value Counts:")
    print(counts)


Gender Value Counts:
Gender
Female    184
Male      181
Name: count, dtype: int64

Occupation Value Counts:
Occupation
Nurse                73
Doctor               67
Engineer             63
Lawyer               45
Teacher              40
Accountant           37
Salesperson          32
Scientist             4
Software Engineer     3
Manager               1
Name: count, dtype: int64

BMI Category Value Counts:
BMI Category
Normal           195
Overweight       148
Normal Weight     21
Obese              1
Name: count, dtype: int64

Blood Pressure Value Counts:
Blood Pressure
130/85    99
125/80    65
140/95    65
120/80    45
115/75    32
135/90    27
125/82     4
128/85     3
132/87     3
126/83     2
118/75     2
130/86     2
128/84     2
117/76     2
119/77     2
129/84     2
131/86     2
115/78     2
140/90     1
118/76     1
121/79     1
122/80     1
Name: count, dtype: int64

Sleep Disorder Value Counts:
Sleep Disorder
Good Sleep     219
Insomnia        74
Sleep Apnea     72
Name

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so `df` keeps the original string categories.
df_label_encoded = df.copy()

# One fitted encoder per column. Re-fitting a single shared encoder discards
# every mapping except the last column's, leaving no way to translate the
# integer codes back to labels (label_encoders[col].inverse_transform(...)).
label_encoders = {}

for col in categorical_cols:
    # `le` still ends up bound to the last column's encoder, as before.
    le = LabelEncoder()
    # LabelEncoder numbers the classes in sorted label order, so the codes
    # carry no meaningful ranking.
    df_label_encoded[col] = le.fit_transform(df_label_encoded[col].astype(str))
    label_encoders[col] = le

# NOTE(review): the BMI Category value counts list 'Normal' and 'Normal Weight'
# as separate labels, so they receive different codes here. Confirm whether
# they should be merged before encoding.

In [None]:
# Bare expression so the notebook renders the encoded preview as a table,
# matching the `df.head()` cell used right after loading.
df_label_encoded.head()

   Person ID  Gender  Age  Occupation  Sleep Duration  Quality of Sleep  \
0          1       1   27           8             6.1                 6   
1          2       1   28           1             6.2                 6   
2          3       1   28           1             6.2                 6   
6          7       1   29           9             6.3                 6   
7          8       1   29           1             7.8                 7   

   Physical Activity Level  Stress Level  BMI Category  Blood Pressure  \
0                       42             6             3              11   
1                       60             8             0               9   
2                       60             8             0               9   
6                       40             7             2              20   
7                       75             6             0               6   

   Heart Rate  Daily Steps  Sleep Disorder  
0          77         4200               0  
1          75 

In [None]:
# Write the label-encoded frame to the current working directory;
# index=False leaves the DataFrame index out of the file.
df_label_encoded.to_csv('df_label_encoded.csv', index=False)

### Standardize Numerical Variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale on a copy so the label-encoded frame stays available unscaled.
df_scaled = df_label_encoded.copy()

# Fit the scaler on the numerical features and replace them with their
# standardized values; all other columns are left unchanged.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_scaled[numerical_columns])
df_scaled[numerical_columns] = standardized_values

In [None]:
# Bare expression so the notebook renders the scaled preview as a table,
# matching the `df.head()` cell used right after loading.
df_scaled.head()

   Person ID  Gender       Age  Occupation  Sleep Duration  Quality of Sleep  \
0          1       1 -1.767343           8       -1.304636         -1.145467   
1          2       1 -1.651550           1       -1.178526         -1.145467   
2          3       1 -1.651550           1       -1.178526         -1.145467   
6          7       1 -1.535758           9       -1.052416         -1.145467   
7          8       1 -1.535758           1        0.839238         -0.286955   

   Physical Activity Level  Stress Level  BMI Category  Blood Pressure  \
0                -0.828552      0.350039             3              11   
1                 0.036883      1.480696             0               9   
2                 0.036883      1.480696             0               9   
6                -0.924711      0.915368             2              20   
7                 0.758079      0.350039             0               6   

   Heart Rate  Daily Steps  Sleep Disorder  
0    2.056611    -1.756870   

In [None]:
# Write the standardized frame to the current working directory;
# index=False leaves the DataFrame index out of the file.
df_scaled.to_csv('df_scaled.csv', index=False)