In [25]:
# I import simple libraries and set file paths
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# Input and output locations, relative to the notebook's working directory
RAW_PATH = "../data/raw/SleepSense_India_Full.csv"
PROCESSED_PATH = "../data/processed/sleepsense_processed.csv"
SCALER_PATH = "../models/scaler.joblib"

# Make sure both output folders exist before anything is written into them;
# exist_ok=True keeps the cell safe to re-run.
for out_dir in ("../data/processed", "../models"):
    os.makedirs(out_dir, exist_ok=True)

print("Paths ready")


Paths ready


Inference: I prepared paths and created folders for processed data and models.

In [26]:
# Read the raw CSV into memory, report its dimensions, and preview the first rows
df = pd.read_csv(filepath_or_buffer=RAW_PATH)
n_rows, n_cols = df.shape
print("loaded shape:", (n_rows, n_cols))
df.iloc[:3]


loaded shape: (12000, 23)


Unnamed: 0,city,age,sex,family_size,work_hours,avg_sleep_hours,screen_time_hours,tea_cups,coffee_cups,late_snack,...,physical_activity_min,bedtime_variability,stress_level,city_noise_dB,light_pollution_index,temperature_night,humidity_night,air_quality_index,screen_brightness_behavior,sleep_quality_score
0,Thrissur,25.3,Female,4,11.6,4.79,3.69,3,1,0,...,0.9,0.25,5.37,65.97,46.18,27.73,53.42,179.7,0.29,38.42
1,Kolkata,49.0,Female,2,12.3,5.82,2.71,0,2,0,...,0.0,0.43,3.41,57.56,57.09,21.27,50.0,144.7,0.31,48.63
2,Varanasi,16.4,Female,4,9.0,6.15,1.21,0,0,1,...,78.9,1.57,3.99,61.37,31.77,28.02,73.11,69.4,0.73,52.9


Inference: I confirm the raw data is available and check the first rows.

In [27]:
# Drop rows that are exact copies of an earlier row (the first occurrence is kept)
# and report how many rows disappeared.
before = df.shape[0]
df = df.drop_duplicates(keep='first')
after = df.shape[0]
n_removed = before - after
print("duplicates removed:", n_removed)


duplicates removed: 0


Inference: Exact duplicates (if any) are removed so training is not biased.

In [28]:
# Count nulls per column, keep only columns that have at least one,
# and list them from most to least affected.
miss = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print("columns with missing values (count):")
print(miss)


columns with missing values (count):
Series([], dtype: int64)


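Inference: No column has missing values (the listing above is empty). The next cell still imputes numeric columns with the median and categorical columns with the mode, as a safeguard in case missing values appear in the raw data.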

In [29]:
# I impute numeric columns with median and categorical with mode.
# Columns that contain no observed values at all are skipped (and reported),
# because neither a median nor a mode can be computed for them.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Every non-numeric column is treated as categorical, so 'category' and
# string-dtype columns are covered as well as plain 'object' columns.
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# NOTE(review): num_cols is built from every numeric column in df, so a numeric
# target column would be median-imputed too - confirm that is acceptable, or drop
# rows with a missing target before this cell.

# Impute numeric
for c in num_cols:
    if df[c].isnull().any():
        med = df[c].median()
        if pd.isna(med):
            # An all-missing column has a NaN median; filling with it would be a no-op
            print(f"skipped numeric {c}: no observed values to compute a median")
            continue
        df[c] = df[c].fillna(med)
        print(f"filled numeric {c} with median {med}")

# Impute categorical
for c in cat_cols:
    if df[c].isnull().any():
        modes = df[c].mode(dropna=True)
        if modes.empty:
            # An all-missing column has no mode; indexing the empty result would raise
            print(f"skipped categorical {c}: no observed values to compute a mode")
            continue
        mode = modes.iloc[0]  # on ties, mode() is sorted, so this picks the first value
        df[c] = df[c].fillna(mode)
        print(f"filled categorical {c} with mode '{mode}'")


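Inference: The cell printed nothing, so no values needed filling in this run. If missing values do appear, numeric columns are filled with the median and categorical columns with the most common value.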

In [30]:
# Collapse the long tail of cities: the 15 most frequent keep their name,
# every other value becomes 'Other'.
if 'city' not in df.columns:
    print("No 'city' column found")
else:
    top_cities = df['city'].value_counts().head(15).index
    is_rare = ~df['city'].isin(top_cities)
    df['city_simple'] = df['city'].mask(is_rare, 'Other')
    print(f"Unique cities before: {df['city'].nunique()}, after: {df['city_simple'].nunique()}")


Unique cities before: 79, after: 16


Inference: I reduce number of city categories to avoid too many sparse one-hot columns.

In [31]:
# Encode the remaining categorical columns as numbers / indicator columns.

# sex -> integer code: Male=0, Female=1, any other or unmapped value=2
if 'sex' in df.columns:
    sex_codes = {'Male': 0, 'Female': 1}
    df['sex_num'] = df['sex'].map(sex_codes).fillna(2).astype(int)

# city_simple -> one indicator column per category (every level is kept).
# NOTE(review): with all levels kept, the indicator columns sum to 1 on each row,
# which is collinear with a regression intercept - confirm this is intended.
if 'city_simple' in df.columns:
    dummies = pd.get_dummies(df['city_simple'], prefix='city')
    df = pd.concat([df, dummies], axis=1)
    print("created city one-hot columns:", dummies.columns[:5].tolist(), "...")


created city one-hot columns: ['city_Ahmedabad', 'city_Bengaluru', 'city_Chennai', 'city_Delhi', 'city_Hyderabad'] ...


Inference: I converted sex to numeric and turned city_simple into several one-hot columns for the model.

In [32]:
# Remove the original text columns now that sex_num and the city dummies exist.
# errors='ignore' skips any name that is not present instead of raising.
drop_cols = ['city', 'city_simple', 'sex']
df = df.drop(columns=drop_cols, errors='ignore')
print("remaining columns:", df.shape[1])


remaining columns: 38


Inference: I removed original categorical columns to keep the dataset clean.

In [33]:
# Name the prediction target and pick out the feature columns to standardize.
target = 'sleep_quality_score'

# Every column except the target is a candidate feature
features = df.columns[df.columns != target].tolist()

# Only numeric-dtype features are scaled; boolean indicator columns are not
# matched by the 'number' selector and therefore stay as they are.
num_to_scale = df.loc[:, features].select_dtypes(include='number').columns.tolist()

print("numeric features to scale (sample):", num_to_scale[:10])


numeric features to scale (sample): ['age', 'family_size', 'work_hours', 'avg_sleep_hours', 'screen_time_hours', 'tea_cups', 'coffee_cups', 'late_snack', 'spice_intake', 'religious_freq']


Inference: I collected numeric columns that need scaling before training.

In [34]:
# Standardize the selected numeric features and persist the fitted scaler.
# NOTE(review): the scaler is fit on every row of df; if a train/test split
# happens in a later notebook, test-set statistics leak into the scaling - confirm.
scaler = StandardScaler().fit(df[num_to_scale])
df[num_to_scale] = scaler.transform(df[num_to_scale])

# Keep the fitted scaler on disk so the same transform can be reapplied later
joblib.dump(scaler, SCALER_PATH)
print("scaler saved to", SCALER_PATH)


scaler saved to ../models/scaler.joblib


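Inference: Numeric features are standardized (mean 0, std 1) so regression coefficients are comparable; the boolean city one-hot columns are left unscaled. The fitted scaler is saved for reuse in the app or at inference time.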

In [35]:
# Report the final dimensions, then write the processed table to disk
# (index=False: the row index is not written as a column).
final_shape = df.shape
print("final shape:", final_shape)
df.to_csv(path_or_buf=PROCESSED_PATH, index=False)
print("saved processed data to:", PROCESSED_PATH)


final shape: (12000, 38)


saved processed data to: ../data/processed/sleepsense_processed.csv


Inference: The cleaned and scaled dataset is saved for modeling notebooks.

In [36]:
# Show the first three processed rows, followed by per-column summary
# statistics (one row per column, rounded to 3 decimals).
preview = df.iloc[:3]
summary = df.describe().transpose().round(3)
display(preview, summary)


Unnamed: 0,age,family_size,work_hours,avg_sleep_hours,screen_time_hours,tea_cups,coffee_cups,late_snack,spice_intake,religious_freq,...,city_Kanpur,city_Kolkata,city_Mumbai,city_Navi Mumbai,city_Other,city_Pimpri-Chinchwad,city_Pune,city_Surat,city_Thane,city_Vasai-Virar
0,-0.55424,0.269326,1.398265,-1.712387,1.328569,0.596543,0.29285,-0.815221,-0.294897,0.228669,...,False,False,False,False,True,False,False,False,False,False
1,1.98961,-1.347035,1.843044,-0.795982,0.373011,-1.371705,1.362622,-0.815221,-0.294897,-0.94726,...,False,True,False,False,False,False,False,False,False,False
2,-1.509525,0.269326,-0.253773,-0.502377,-1.089577,-1.371705,-0.776922,1.226661,0.651044,0.816634,...,False,False,False,False,True,False,False,False,False,False


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,12000.0,-0.0,1.0,-1.552,-0.769,-0.039,0.691,4.759
family_size,12000.0,0.0,1.0,-2.155,-0.539,0.269,0.269,1.886
work_hours,12000.0,-0.0,1.0,-3.431,-0.699,0.0,0.699,3.622
avg_sleep_hours,12000.0,0.0,1.0,-3.305,-0.671,0.005,0.681,2.923
screen_time_hours,12000.0,0.0,1.0,-2.269,-0.69,-0.037,0.656,3.961
tea_cups,12000.0,0.0,1.0,-1.372,-0.716,-0.06,0.597,4.533
coffee_cups,12000.0,0.0,1.0,-0.777,-0.777,-0.777,0.293,6.711
late_snack,12000.0,-0.0,1.0,-0.815,-0.815,-0.815,1.227,1.227
spice_intake,12000.0,0.0,1.0,-2.187,-0.295,-0.295,0.651,1.597
religious_freq,12000.0,0.0,1.0,-1.535,-0.947,-0.359,0.817,4.932


Inference: Processed data looks numeric-ready for Linear Regression.