In [1]:
#%%capture
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from scipy import stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all" # Print multiple output in one cell

try:
    import seaborn as sns
    print("Module 'seaborn' is installed")
except ModuleNotFoundError:
    %pip install seaborn
    import seaborn as sns

#import datetime
#import random
#%pip install nbconvert[webpdf]
#%pip install latex

Module 'seaborn' is installed


In [2]:
df = pd.read_csv("Datasets/sleepdata.csv", delimiter=";", header = 0)

In [3]:
f"This dataset has {df.shape[0]} rows and {df.shape[1]} columns."

'This dataset has 887 rows and 8 columns.'

In [4]:
df.head(5)

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Sleep Notes,Heart rate,Activity (steps)
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100%,8:32,:),,59.0,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3%,0:16,:|,Stressful day,72.0,0
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98%,8:30,:|,,57.0,0
3,2014-12-31 22:31:01,2015-01-01 06:03:01,65%,7:32,,,,0
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72%,6:44,:),Drank coffee:Drank tea,68.0,0


In [5]:
df.dtypes

Start                object
End                  object
Sleep quality        object
Time in bed          object
Wake up              object
Sleep Notes          object
Heart rate          float64
Activity (steps)      int64
dtype: object

We'll change the start and end columns to the datetime data type, and convert time in bed to minutes.

In [6]:
#Convert the dates to datetime, and convert "time in bed" to a quantitative variable, measured in minutes
df["Start"] = pd.to_datetime(df["Start"])
df["End"] = pd.to_datetime(df["End"])

i = 0
for time in df["Time in bed"]:
    #print(time)
    df["Time in bed"].iloc[i] = int(df["Time in bed"].iloc[i].split(":")[0]) * 60 + int(df["Time in bed"].iloc[i].split(":")[1])
    i += 1
    if i >= df.shape[0]:
        break

df["Time in bed"] = pd.to_numeric(df["Time in bed"], downcast="float")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [7]:
df.dtypes

Start               datetime64[ns]
End                 datetime64[ns]
Sleep quality               object
Time in bed                float32
Wake up                     object
Sleep Notes                 object
Heart rate                 float64
Activity (steps)             int64
dtype: object

In [8]:
df.head(5)

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Sleep Notes,Heart rate,Activity (steps)
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100%,512.0,:),,59.0,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3%,16.0,:|,Stressful day,72.0,0
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98%,510.0,:|,,57.0,0
3,2014-12-31 22:31:01,2015-01-01 06:03:01,65%,452.0,,,,0
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72%,404.0,:),Drank coffee:Drank tea,68.0,0


Next, we'll split up the sleep notes into multiple boolean columns.


In [9]:
#Get all possible unique vales of the "sleep notes" column
uniqueSleepNotes = []
for notes in df["Sleep Notes"]:
    if pd.notna(notes):
        if ":" in notes:
            #print(notes.split(":"))
            for item in notes.split(":"):
                [uniqueSleepNotes.append(item) for item in notes.split(":") if item not in uniqueSleepNotes]

f"The potential values for Sleep Notes are: {uniqueSleepNotes}"

"The potential values for Sleep Notes are: ['Drank coffee', 'Drank tea', 'Ate late', 'Worked out', 'Stressful day']"

In [10]:
#This section replaces NaN in the "sleep notes" column with None, and codes the new categorical variables as booleans with 0 or 1 values.
df["Sleep Notes"] = df["Sleep Notes"].fillna("None")

for note in uniqueSleepNotes:
    df[note] = False

df.head(5)

i = 0
for notes in df["Sleep Notes"]:
    if "None" in notes:
        #print(notes)
        i += 1
    elif ":" not in notes:
        #print(notes)
        df[notes].iloc[i] = True
        i += 1
    elif ":" in notes:
        for item in notes.split(":"):
            df[item].iloc[i] = True
        i += 1

for note in uniqueSleepNotes:
    df[note] = df[note].astype("int")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [11]:
df.head(5)

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Sleep Notes,Heart rate,Activity (steps),Drank coffee,Drank tea,Ate late,Worked out,Stressful day
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100%,512.0,:),,59.0,0,0,0,0,0,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3%,16.0,:|,Stressful day,72.0,0,0,0,0,0,1
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98%,510.0,:|,,57.0,0,0,0,0,0,0
3,2014-12-31 22:31:01,2015-01-01 06:03:01,65%,452.0,,,,0,0,0,0,0,0
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72%,404.0,:),Drank coffee:Drank tea,68.0,0,1,1,0,0,0


In [12]:
#This section codes the "wake up column" as ordinal, with :) being 2 and :( being 0.
df = df.dropna()

i = 0 
for emoticon in df["Wake up"]:
    if ":)" in emoticon:
        df["Wake up"].iloc[i] = 2
        i += 1
    if ":|" in emoticon:
        df["Wake up"].iloc[i] = 1
        i += 1
    if ":(" in emoticon:
        df["Wake up"].iloc[i] = 0
        i += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [13]:
#print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
#print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
#print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

#print(r2_score(y_test, y_pred))
#df.head(5)

In [14]:
y = df["Time in bed"]
x = df[["Wake up", "Drank coffee", "Drank tea", "Ate late", "Worked out", "Stressful day", "Heart rate"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

dfTrain = pd.DataFrame({
    "Wake_up": x_train["Wake up"],
    "Drank_coffee": x_train["Drank coffee"],
    "Drank_tea": x_train["Drank tea"],
    "Ate_late": x_train["Ate late"],
    "Worked_out": x_train["Worked out"],
    "Stressful_day": x_train["Stressful day"],
    "Heart_rate": x_train["Heart rate"],
    "Time_in_bed": y_train
})

dfTrain.head(5)

mod = smf.ols(formula = "Time_in_bed ~  C(Drank_coffee) + C(Drank_tea)", data = dfTrain).fit()

print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:            Time_in_bed   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.046
Method:                 Least Squares   F-statistic:                     4.109
Date:                Fri, 13 Aug 2021   Prob (F-statistic):             0.0187
Time:                        21:39:13   Log-Likelihood:                -735.33
No. Observations:                 129   AIC:                             1477.
Df Residuals:                     126   BIC:                             1485.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              405.0409 

After doing manual stepwise adjustments (removing and adding variables as the p-value dictates), we are left over with only "Drank coffee" and "Drank tea".

Clearly, using only these categorical var
iables does little to explain the variance in sleep time on their own. While "Drank coffee" and "Drank tea" are significant, meaning they do have an effect on the response, the adjusted R-squared is very, very low. In the future, it may be worth it to consider these 2 variables on other analyses, as they explain something about "Time in bed", and may be bolstered by other, new variables.