In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import joblib
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV

In [2]:
data = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv')
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [4]:
print(data['Gender'].nunique())
print(data['Occupation'].nunique())
print(data['BMI Category'].nunique())
print(data['Sleep Disorder'].nunique())

2
11
4
2


In [5]:
print(data['Gender'].unique())

['Male' 'Female']


In [6]:
print(data['Occupation'].unique())

['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Salesperson' 'Manager']


In [7]:
print(data['BMI Category'].unique())

['Overweight' 'Normal' 'Obese' 'Normal Weight']


In [8]:
print(data['Sleep Disorder'].unique())

[nan 'Sleep Apnea' 'Insomnia']


In [9]:
print(data['Blood Pressure'].unique())

['126/83' '125/80' '140/90' '120/80' '132/87' '130/86' '117/76' '118/76'
 '128/85' '131/86' '128/84' '115/75' '135/88' '129/84' '130/85' '115/78'
 '119/77' '121/79' '125/82' '135/90' '122/80' '142/92' '140/95' '139/91'
 '118/75']


In [10]:
# Split the 'Blood Pressure' column into 'Systolic' and 'Diastolic'
data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True)

# Convert the new columns to numeric (in case they are strings)
data['Systolic'] = pd.to_numeric(data['Systolic'])
data['Diastolic'] = pd.to_numeric(data['Diastolic'])


In [11]:
data.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder,Systolic,Diastolic
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,,126,83
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,,125,80
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,,125,80
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea,140,90
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea,140,90


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
 13  Systolic                 374 non-null    int64  
 14  Diastolic                3

In [13]:
data.drop(['Person ID','Blood Pressure','Sleep Disorder'],axis=1,inplace=True)
data.head()

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Systolic,Diastolic
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,126,83
1,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,125,80
2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,125,80
3,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,140,90
4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,140,90


In [14]:
cat = data.select_dtypes(include='object').columns

In [15]:
le = LabelEncoder()
for i in cat:
    data[i] = le.fit_transform(data[i])

data.head(2)

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Systolic,Diastolic
0,1,27,9,6.1,6,42,6,3,77,4200,126,83
1,1,28,1,6.2,6,60,8,0,75,10000,125,80


In [18]:
sc = StandardScaler()
data[['Daily Steps']] = sc.fit_transform(data[['Daily Steps']])
data.head(2)

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Systolic,Diastolic
0,1,27,9,6.1,6,42,6,3,77,-1.619584,126,83
1,1,28,1,6.2,6,60,8,0,75,1.970077,125,80


In [19]:
X = data.drop(['Sleep Duration'],axis=1)
y = data['Sleep Duration']

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [22]:
from sklearn.pipeline import Pipeline

In [23]:
models = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}  # No hyperparameters for LinearRegression by default
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(),
        "params": {
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(),
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 10]
        }
    },
    "XGBoost": {
        "model": XGBRegressor(objective='reg:squarederror'),
        "params": {
            'n_estimators': [50, 100],
            'max_depth': [3, 5, 10],
            'learning_rate': [0.01, 0.1]
        }
    }
}


In [24]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [25]:
results = []

for name, m in models.items():
    print(f"\nRunning GridSearchCV for {name}")
    grid = GridSearchCV(m["model"], m["params"], cv=5, scoring='r2', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        'Model': name,
        'Best Params': grid.best_params_,
        'MSE': mse,
        'MAE': mae,
        'R2 Score': r2
    })

# Convert results to DataFrame
results_data = pd.DataFrame(results)
print(results_data)


Running GridSearchCV for Linear Regression

Running GridSearchCV for Decision Tree

Running GridSearchCV for Random Forest

Running GridSearchCV for XGBoost
               Model                                        Best Params  \
0  Linear Regression                                                 {}   
1      Decision Tree         {'max_depth': 10, 'min_samples_split': 10}   
2      Random Forest              {'max_depth': 10, 'n_estimators': 50}   
3            XGBoost  {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   

        MSE       MAE  R2 Score  
0  0.120364  0.279676  0.741601  
1  0.007957  0.062641  0.982918  
2  0.007597  0.064178  0.983691  
3  0.006820  0.056654  0.985358  


In [26]:
preprocessor = Pipeline([
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [28]:
from sklearn.model_selection import GridSearchCV

best_model = None
best_score = 0

for name, mp in models.items():
    model = mp["model"]
    params = mp["params"]
    
    # Preprocessing + model pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('model', model)
    ])
    
    # Prefix parameters with 'model__' for pipeline
    param_grid = {f"model__{key}": value for key, value in params.items()}
    
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='r2')
    grid.fit(X_train, y_train)
    
    print(f"{name} best R2 score: {grid.best_score_:.4f}")
    
    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_
best_score

Linear Regression best R2 score: 0.8750
Decision Tree best R2 score: 0.9758
Random Forest best R2 score: 0.9832
XGBoost best R2 score: 0.9819


0.9831674536322502

In [29]:
best_model

In [35]:
# Preprocessing pipeline
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Grid search over all models
best_model = None
best_score = 0

for name, mp in models.items():
    model = mp["model"]
    params = mp["params"]

    # Full pipeline
    pipe = Pipeline([
        ('prep', preprocessor),
        ('model', model)
    ])

    # Adjust parameter names for the pipeline
    param_grid = {f"model__{key}": value for key, value in params.items()}

    # GridSearchCV
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring='r2')
    grid.fit(X_train, y_train)

    print(f"\n{name}")
    print(f"Best R2 score: {grid.best_score_:.4f}")
    print("Best Parameters:", grid.best_params_)

    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_


Linear Regression
Best R2 score: 0.8750
Best Parameters: {}

Decision Tree
Best R2 score: 0.9767
Best Parameters: {'model__max_depth': 10, 'model__min_samples_split': 10}

Random Forest
Best R2 score: 0.9834
Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 50}

XGBoost
Best R2 score: 0.9819
Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 100}


In [36]:
joblib.dump(best_model,'Sleep health and lifestyle.joblib')

['Sleep health and lifestyle.joblib']

In [37]:
feature_columns = X.columns.to_list()
joblib.dump(feature_columns,'features_Sleep health and lifestyle.joblib')

['features_Sleep health and lifestyle.joblib']

In [None]:
# model deployement
import streamlit as st
import pandas as pd
import joblib

model = joblib.load('Sleep health and lifestyle.joblib')
features = joblib.load('features_Sleep health and lifestyle.joblib')
st.title('Sleep health and lifestyle')

user_input={}
for i in features:
    user_input[i] = st.number_input(i,value=0.0)

if st.button('Predict'):
    data = pd.DataFrame([user_input])
    pred = model.predict(data)[0]
    st.success(f'Prediction : {(pred)}')

In [None]:
# Terminal
cd .venu
streamlit run filename.py