In [1]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from utils import Utils, Imputation

In [2]:
# Load the sleep-efficiency CSV through the project's Utils helper
# (its implementation lives in utils.py and is not visible in this notebook).
df = Utils.load_data('Sleep_Efficiency.csv')
# Bare expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0_level_0,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,27,Female,2021-11-13 22:00:00,2021-11-13 05:30:00,7.5,0.91,22,57,21,0.0,0.0,0.0,No,5.0
449,52,Male,2021-03-31 21:00:00,2021-03-31 03:00:00,6.0,0.74,28,57,15,4.0,25.0,0.0,No,3.0
450,40,Female,2021-09-07 23:00:00,2021-09-07 07:30:00,8.5,0.55,20,32,48,1.0,,3.0,Yes,0.0
451,45,Male,2021-07-29 21:00:00,2021-07-29 04:00:00,7.0,0.76,18,72,10,3.0,0.0,0.0,No,3.0


In [3]:
# Ask the Utils helper which columns it treats as binary; the recorded output
# for this cell lists 'Gender' and 'Smoking status'.
categorical_columns = Utils.find_binary_columns(df)
categorical_columns


['Gender', 'Smoking status']

In [4]:
# Encode the binary columns found above via Utils.label_encode. In the recorded
# output, Gender and Smoking status appear as 0/1 integers afterwards.
# NOTE(review): whether `df` itself is modified depends on the helper -- confirm in utils.
df_encoded = Utils.label_encode(df, categorical_columns)
df_encoded

Unnamed: 0_level_0,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,65,0,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,1,3.0
2,69,1,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,1,3.0
3,40,0,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,0,3.0
4,40,0,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,1,1.0
5,57,1,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,27,0,2021-11-13 22:00:00,2021-11-13 05:30:00,7.5,0.91,22,57,21,0.0,0.0,0.0,0,5.0
449,52,1,2021-03-31 21:00:00,2021-03-31 03:00:00,6.0,0.74,28,57,15,4.0,25.0,0.0,0,3.0
450,40,0,2021-09-07 23:00:00,2021-09-07 07:30:00,8.5,0.55,20,32,48,1.0,,3.0,1,0.0
451,45,1,2021-07-29 21:00:00,2021-07-29 04:00:00,7.0,0.76,18,72,10,3.0,0.0,0.0,0,3.0


In [5]:
# Derive decimal-hour features from the two timestamp columns, then drop the raw
# timestamps. Built as a single chain so the frame produced by the encoding cell
# is not modified in place before the name is rebound.
# NOTE(review): Utils.date_to_decimal presumably maps a timestamp to hour-of-day as a
# float (the recorded output shows 21:30 -> 21.5) -- confirm in utils.
df_encoded = (
    df_encoded
    .assign(
        bed_hour=df_encoded['Bedtime'].apply(Utils.date_to_decimal),
        wake_hour=df_encoded['Wakeup time'].apply(Utils.date_to_decimal),
    )
    .drop(columns=['Bedtime', 'Wakeup time'])
)
df_encoded

Unnamed: 0_level_0,Age,Gender,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency,bed_hour,wake_hour
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,65,0,6.0,0.88,18,70,12,0.0,0.0,0.0,1,3.0,1.0,7.0
2,69,1,7.0,0.66,19,28,53,3.0,0.0,3.0,1,3.0,2.0,9.0
3,40,0,8.0,0.89,20,70,10,1.0,0.0,0.0,0,3.0,21.5,5.5
4,40,0,6.0,0.51,23,25,52,3.0,50.0,5.0,1,1.0,2.5,8.5
5,57,1,8.0,0.76,27,55,18,3.0,0.0,3.0,0,3.0,1.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448,27,0,7.5,0.91,22,57,21,0.0,0.0,0.0,0,5.0,22.0,5.5
449,52,1,6.0,0.74,28,57,15,4.0,25.0,0.0,0,3.0,21.0,3.0
450,40,0,8.5,0.55,20,32,48,1.0,,3.0,1,0.0,23.0,7.5
451,45,1,7.0,0.76,18,72,10,3.0,0.0,0.0,0,3.0,21.0,4.0


In [6]:
# Utils.missing_info prints a DataFrame.info()-style summary (see the recorded output);
# its return value is kept and passed to the imputation calls in the cells below.
# NOTE(review): presumably the names of the columns that contain NaNs -- confirm in utils.
columns_with_missing = Utils.missing_info(df_encoded)

<class 'pandas.core.frame.DataFrame'>
Index: 452 entries, 1 to 452
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     452 non-null    int64  
 1   Gender                  452 non-null    int32  
 2   Sleep duration          452 non-null    float64
 3   Sleep efficiency        452 non-null    float64
 4   REM sleep percentage    452 non-null    int64  
 5   Deep sleep percentage   452 non-null    int64  
 6   Light sleep percentage  452 non-null    int64  
 7   Awakenings              432 non-null    float64
 8   Caffeine consumption    427 non-null    float64
 9   Alcohol consumption     438 non-null    float64
 10  Smoking status          452 non-null    int32  
 11  Exercise frequency      446 non-null    float64
 12  bed_hour                452 non-null    float64
 13  wake_hour               452 non-null    float64
dtypes: float64(8), int32(2), int64(4)
memory usage:

Too much of the data would be lost. Columns with missing data are not completely independent. The usual imputation methods may not work well here.

In [13]:
# Try a range of imputation models through the project's Imputation helper; the
# recorded output prints one model name per line as it goes.
# NOTE(review): the recorded run of this cell ends in
# "ValueError: y_true and y_pred have different number of output (1!=10)",
# so `results` was probably never assigned and the cells below were not executed.
imputation = Imputation()
results = imputation.try_imputation_models(df_encoded, columns_with_missing)

Linear Regression
Decision Tree
Random Forest
SVR
Gradient Boosting Regressor
KNeighbors
Lasso
Ridge
HuberRegressor
ElasticNet
SGDRegressor
Simple Imputer (Mean)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


ValueError: y_true and y_pred have different number of output (1!=10)

In [None]:
# Hand the comparison results to the Imputation helper for evaluation.
# NOTE(review): presumably reports the best-scoring method per column -- confirm in utils.
# Depends on `results` from the previous cell, whose recorded run raised a ValueError.
imputation.evaluate_best_methods(results)

In [None]:
# Impute the columns that have missing values using a random forest.
# random_state is pinned so that Restart & Run All reproduces the same imputed
# values; an unseeded forest draws different bootstrap samples on every run.
# NOTE(review): the return value is only displayed, not bound to a name, while the
# commented-out grading cell further down refers to `df_imputed` -- confirm intent.
imputation.model_imputation(RandomForestRegressor(random_state=42), df_encoded, columns_with_missing)

In [None]:
def calculate_grade(x, target_range, decrease_factor):
    """Grade a value by how close it is to a target interval.

    Args:
        x: Value to grade.
        target_range: ``(low, high)`` pair; both bounds are inclusive.
        decrease_factor: Grade lost per unit of distance outside the interval.

    Returns:
        ``1`` when ``x`` lies inside the interval, otherwise
        ``1 - decrease_factor * distance`` clamped so it never drops below ``0``,
        where ``distance`` is measured to the nearer bound.
    """
    low, high = target_range[0], target_range[1]
    if low <= x <= high:
        return 1

    # Distance to whichever bound is closer.
    gap = min(abs(x - low), abs(x - high))
    return max(0, 1 - decrease_factor * gap)
    
# Spot check: 0.60 lies 0.25 below the (0.85, 1) band, so the grade comes out
# at about 1 - 2.25 * 0.25 = 0.4375.
calculate_grade(0.60, (0.85, 1), 2.25)

In [None]:
from scipy.stats import norm

# Define the range of values to plot
from matplotlib import pyplot as plt

def calculate_grade(x, target_range, decrease_factor):
    """Grade ``x`` against a target interval with a Gaussian-shaped falloff outside it.

    This redefines the linear ``calculate_grade`` from the earlier cell; once this
    cell has run, this version is the one every later call uses.

    Args:
        x: Value to grade.
        target_range: ``(low, high)`` pair; both bounds are inclusive.
        decrease_factor: Linear grade lost per unit of distance outside the interval.

    Returns:
        ``1`` when ``x`` lies inside the interval. Otherwise the linear grade
        ``max(0, 1 - decrease_factor * distance)`` is passed through a normal pdf
        with mean 1 and standard deviation 0.4, and that density is returned.
        The density peaks at roughly 0.997, so values just outside the interval
        score slightly below 1.
    """
    lower, upper = target_range[0], target_range[1]
    if lower <= x <= upper:
        return 1

    # Linear penalty based on the distance to the nearer bound, floored at 0.
    gap = min(abs(x - lower), abs(x - upper))
    linear_grade = max(0, 1 - decrease_factor * gap)

    # Reshape the linear grade with a bell curve centred on a perfect score.
    return norm.pdf(linear_grade, loc=1, scale=0.4)

# numpy is not imported in the notebook's top import cell; import it here, next to
# this cell's other local imports, so np.linspace resolves on a fresh kernel.
import numpy as np

# Sample points at which the grading curve is evaluated.
x_values = np.linspace(0, 7, 100)  # Adjust as needed


# Calculate the grades for each value in the range
grades = [calculate_grade(x, (5, 7), .225) for x in x_values]

# Plot the grades
plt.plot(x_values, grades)
plt.xlabel('Range')
plt.ylabel('Grade')
plt.show()

# Spot check at the far end: 0 is 5 units below the (5, 7) band, so the linear
# grade bottoms out at 0 before the Gaussian smoothing is applied.
calculate_grade(0, (5, 7), .2)

In [None]:

# # Getting at least 7 to 9 hours of sleep
# df_imputed['sleep_duration_grade'] = [calculate_grade(x, (7, 9), 0.3) for x in df_imputed['Sleep duration']]

# # Asleep for at least 85% of the night
# df_imputed['sleep_efficiency_grade'] = [calculate_grade(x, (0.85, 1), 3) for x in df_imputed['Sleep efficiency']]

# # Rem sleep between 20% and 30%
# df_imputed['rem_sleep_grade'] = [calculate_grade(x, (22, 28), 0.1) for x in df_imputed['REM sleep percentage']]

# # Deep sleep between 20% and 30% of sleep
# df_imputed['deep_sleep_grade'] = [calculate_grade(x, (22, 28), 0.1) for x in df_imputed['Deep sleep percentage']]

# # Light sleep between 45% and 55% of sleep
# df_imputed['light_sleep_grade'] = [calculate_grade(x, (47, 53), 0.05) for x in df_imputed['Light sleep percentage']]

# # Waking up 1 or less times
# df_imputed['awakenings_grade'] = [calculate_grade(x, (0, 1), .35) for x in df_imputed['Awakenings']]

# # Caffeine
# df_imputed['caffeine_grade'] = [calculate_grade(x, (0, 100), .004) for x in df_imputed['Caffeine consumption']]

# # Alcohol
# df_imputed['alcohol_grade'] = [calculate_grade(x, (0, 0), .15) for x in df_imputed['Alcohol consumption']]

# # Smoking negatively impacts sleep
# df_imputed['smoker_grade'] = [calculate_grade(x, (0, 0), .6) for x in df_imputed['Smoking status']]

# # Exercise
# df_imputed['exercise_grade'] = [calculate_grade(x, (5, 7), .225) for x in df_imputed['Exercise frequency']]

# # Summing each grade
# df_imputed['sleep_quality'] = df_imputed['sleep_duration_grade'] + df_imputed['sleep_efficiency_grade'] + \
#                               df_imputed['light_sleep_grade'] + df_imputed['awakenings_grade'] + \
#                               df_imputed['rem_sleep_grade'] + df_imputed['deep_sleep_grade'] + \
#                               df_imputed['caffeine_grade'] + df_imputed['alcohol_grade'] + \
#                               df_imputed['smoker_grade'] + df_imputed['exercise_grade']
