In [1]:
# I import pandas and numpy and set the processed data path
import pandas as pd
import numpy as np
import os

# Input file written by the previous (preprocessing) step and the output file
# that this notebook saves for modeling. Both sit in the same folder, so the
# folder to create is taken from the output path instead of being typed again.
PROCESSED_PATH = "../data/processed/sleepsense_processed.csv"
OUT_PATH = "../data/processed/sleepsense_features.csv"
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)  # no error if it already exists

print("ready")


ready


Inference: I set paths and ensured the processed folder exists.

In [2]:
# Read the processed dataset written by the previous step, report its
# dimensions, and preview the first two rows.
df = pd.read_csv(PROCESSED_PATH)
print(f"loaded shape: {df.shape}")
df.iloc[:2]


loaded shape: (12000, 38)


Unnamed: 0,age,family_size,work_hours,avg_sleep_hours,screen_time_hours,tea_cups,coffee_cups,late_snack,spice_intake,religious_freq,...,city_Kanpur,city_Kolkata,city_Mumbai,city_Navi Mumbai,city_Other,city_Pimpri-Chinchwad,city_Pune,city_Surat,city_Thane,city_Vasai-Virar
0,-0.55424,0.269326,1.398265,-1.712387,1.328569,0.596543,0.29285,-0.815221,-0.294897,0.228669,...,False,False,False,False,True,False,False,False,False,False
1,1.98961,-1.347035,1.843044,-0.795982,0.373011,-1.371705,1.362622,-0.815221,-0.294897,-0.94726,...,False,True,False,False,False,False,False,False,False,False


Inference: Data loaded successfully and ready for new features.

In [3]:
# 'sleep_deficit' is the sign-flipped 'avg_sleep_hours' column.
# The notebook treats 'avg_sleep_hours' as already scaled, so a larger positive
# value here means fewer sleep hours compared to the rest of the sample.
# If the source column is absent, nothing is added and a message is printed.
if 'avg_sleep_hours' not in df.columns:
    print("avg_sleep_hours not found")
else:
    df['sleep_deficit'] = df['avg_sleep_hours'].mul(-1)
    print("sleep_deficit created")


sleep_deficit created


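Inference: sleep_deficit is the negated, scaled avg_sleep_hours, so it shows how far below the sample average someone's sleep is (higher = less sleep than others), not the distance from an ideal sleep time.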

In [4]:
# 'digital_fatigue' is the unweighted average of 'screen_time_hours' and
# 'stress_level'. Rationale: heavy screen use and stress together are expected
# to harm sleep. The notebook treats both inputs as already scaled, which is
# what makes a plain average a balanced way to combine them.
# If either column is absent, nothing is added and a message is printed.
if all(name in df.columns for name in ('screen_time_hours', 'stress_level')):
    df['digital_fatigue'] = df['screen_time_hours'].add(df['stress_level']).div(2)
    print("digital_fatigue created")
else:
    print("needed columns not found")


digital_fatigue created


Inference: digital_fatigue captures total tiredness caused by both stress and device exposure.

In [5]:
# 'lifestyle_balance' = 'physical_activity_min' minus 'stress_level'
# (the notebook treats both as already scaled).
# Positive: activity outweighs stress. Negative: stress outweighs activity.
# If either column is absent, nothing is added and a message is printed.
if all(name in df.columns for name in ('physical_activity_min', 'stress_level')):
    df['lifestyle_balance'] = df['physical_activity_min'].sub(df['stress_level'])
    print("lifestyle_balance created")
else:
    print("columns missing for lifestyle_balance")


lifestyle_balance created


Inference: lifestyle_balance shows whether someone's physical activity outweighs their stress (positive) or their stress outweighs their activity (negative).

In [6]:
# 'late_snack_effect' is the element-wise product of 'late_snack' and
# 'spice_intake', intended to give late eaters of spicy food a stronger signal
# (late, spicy meals can disturb sleep).
# NOTE(review): both inputs appear to be standardized, so two below-average
# values also multiply to a positive number; read the sign with care.
# If either column is absent, nothing is added and a message is printed.
if all(name in df.columns for name in ('late_snack', 'spice_intake')):
    df['late_snack_effect'] = df['late_snack'].mul(df['spice_intake'])
    print("late_snack_effect created")
else:
    print("columns missing for late_snack_effect")


late_snack_effect created


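Inference: late_snack_effect is meant to show how much late spicy meals might reduce sleep quality. Because both inputs are scaled, the product is also positive when both values are below average, so a positive value does not always mean a late, spicy meal.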

In [9]:
# 'env_stress' is the row-wise mean of the environmental columns (city noise,
# light pollution, air quality), which the notebook treats as already scaled.
# Only the columns that actually exist in df are used; a missing one is left
# out of the mean instead of raising an error.
cols = list(filter(lambda name: name in df.columns,
                   ['city_noise_dB', 'light_pollution_index', 'air_quality_index']))

if not cols:
    print("No environmental columns found")
else:
    df['env_stress'] = df[cols].mean(axis=1)
    print("env_stress created using:", cols)


env_stress created using: ['city_noise_dB', 'light_pollution_index', 'air_quality_index']


Inference: env_stress was created as the mean of the three environmental factors (city noise, light pollution and air quality).

In [10]:
# 'fatigue_env_interaction' is the element-wise product of 'digital_fatigue'
# and 'env_stress', so that high values of both reinforce each other.
# NOTE(review): both inputs are built from columns the notebook treats as
# scaled, so two below-average values also multiply to a positive number.
# If either column is absent, nothing is added and a message is printed.
if all(name in df.columns for name in ('digital_fatigue', 'env_stress')):
    df['fatigue_env_interaction'] = df['digital_fatigue'].mul(df['env_stress'])
    print("fatigue_env_interaction created")
else:
    print("interaction columns missing")


fatigue_env_interaction created


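Inference: fatigue_env_interaction represents the combined impact of digital fatigue (screen time + stress) and environmental stress (noise, light and air quality) on sleep. Because it is a product of scaled values, it is also positive when both inputs are below average.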

In [11]:
# I create 'is_metro' to mark people living in the major metro cities listed below.
# Metro residents usually face higher noise, stress, and irregular routines.
# I find these cities from the one-hot encoded 'city_' columns and mark 1 for metro, 0 for others.
metro_words = ['Mumbai','Delhi','Bengaluru','Kolkata','Chennai','Hyderabad','Pune','Ahmedabad']

# Compare the full city name that follows the 'city_' prefix. A substring test
# would also pick up 'city_Navi Mumbai' (it contains 'Mumbai') even though
# Navi Mumbai is not in the list above.
# NOTE(review): a listed city with no dummy column (for example, a baseline
# category dropped during encoding) cannot be flagged here - confirm against
# the preprocessing step.
metro_cols = [c for c in df.columns
              if c.startswith('city_') and c[len('city_'):] in metro_words]

if metro_cols:
    # A row is metro when any of the selected city dummies is set.
    df['is_metro'] = df[metro_cols].any(axis=1).astype(int)
    print("is_metro created")
else:
    print("no city dummy columns found")


is_metro created


Inference: is_metro shows whether a person lives in a big, high-stress city.

In [12]:
# I save the dataset with all new features for the modeling notebook.
# OUT_PATH comes from the setup cell at the top of the notebook; it is not
# re-declared here so the output location is defined in exactly one place.
# index=False keeps the row index out of the CSV.
df.to_csv(OUT_PATH, index=False)
print("saved with new features:", df.shape)


saved with new features: (12000, 45)


### ***Inference: The dataset with engineered features is ready for modeling.***

## ***Checking New Features***

In [13]:
# Sanity check: preview the first three rows of every engineered feature.
# Selecting by name raises a KeyError if any feature cell above skipped its column.
cols_to_check = [
    'sleep_deficit', 'digital_fatigue', 'env_stress', 'lifestyle_balance',
    'late_snack_effect', 'fatigue_env_interaction', 'is_metro',
]
df.loc[:, cols_to_check].iloc[:3]


Unnamed: 0,sleep_deficit,digital_fatigue,env_stress,lifestyle_balance,late_snack_effect,fatigue_env_interaction,is_metro
0,1.712387,0.759338,0.553976,-2.179185,0.240407,0.420655,0
1,0.795982,-0.218681,0.257733,-1.224279,0.240407,-0.056361,1
2,0.502377,-0.801945,-0.661131,2.474885,0.79861,0.530191,0


Inference: New features are visible and seem logically consistent with expectations.