# Sleep Data Analysis

In [1]:
import pandas as pd

In [2]:
# Load the first sleep export; the file is semicolon-delimited.
sleep_data_until_2019 = pd.read_csv('sleepdata.csv', delimiter=";")

In [3]:
sleep_data_until_2019

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Sleep Notes,Heart rate,Activity (steps)
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100%,8:32,:),,59.0,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3%,0:16,:|,Stressful day,72.0,0
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98%,8:30,:|,,57.0,0
3,2014-12-31 22:31:01,2015-01-01 06:03:01,65%,7:32,,,,0
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72%,6:44,:),Drank coffee:Drank tea,68.0,0
...,...,...,...,...,...,...,...,...
882,2018-02-12 21:54:14,2018-02-13 07:02:15,91%,9:08,,,,56
883,2018-02-13 23:49:19,2018-02-14 07:00:53,81%,7:11,,,,64
884,2018-02-14 21:24:05,2018-02-15 06:20:52,71%,8:56,,,,3316
885,2018-02-15 21:36:32,2018-02-16 06:50:31,80%,9:13,,,,6555


## Check data types

In [4]:
sleep_data_until_2019.dtypes

Start                object
End                  object
Sleep quality        object
Time in bed          object
Wake up              object
Sleep Notes          object
Heart rate          float64
Activity (steps)      int64
dtype: object

The `Start` and `End` columns should be datetimes rather than strings. `Sleep quality` should be a number, not a percentage string. Missing `Wake up` values need to be imputed.

## Converting the datatypes

Convert the `Start` and `End` columns to datetimes.

In [5]:
def change_dtypes(df, columns, result_types):
    """Convert the listed columns of ``df`` to the requested types, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their converted values.
    columns : sequence of str
        Names of the columns to convert.
    result_types : sequence
        Target type for each entry of ``columns``, in the same order. The
        special value ``'date'`` parses the column with ``pd.to_datetime``;
        any other value is handed to ``Series.astype``.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If ``columns`` and ``result_types`` differ in length. This is checked
        before any column is touched, so a bad call cannot leave ``df`` half
        converted or silently ignore surplus types.
    """
    if len(columns) != len(result_types):
        raise ValueError(
            "columns and result_types must have the same length "
            f"(got {len(columns)} and {len(result_types)})"
        )
    for column, result_type in zip(columns, result_types):
        if result_type == 'date':
            df[column] = pd.to_datetime(df[column])
        else:
            df[column] = df[column].astype(result_type)
# Parse the session start/end timestamps into datetime columns (in place).
timestamp_columns = ['Start', 'End']
change_dtypes(sleep_data_until_2019, timestamp_columns, ['date'] * len(timestamp_columns))

Change the sleep quality to a float.

In [6]:
# Drop the trailing percent sign, then store the score as a float (e.g. '98%' -> 98.0).
quality_text = sleep_data_until_2019['Sleep quality'].str.rstrip('%')
sleep_data_until_2019['Sleep quality'] = quality_text.astype('float')

Change the time in bed to number of minutes in bed.

In [7]:
# 'Time in bed' arrives as 'H:MM' text. Split it once and keep the two parts as
# integers, so the helper columns are numeric rather than strings such as '01'.
time_parts = sleep_data_until_2019['Time in bed'].str.split(':', expand=True)
sleep_data_until_2019['Hours in bed'] = time_parts[0].astype(int)
sleep_data_until_2019['Minutes in bed'] = time_parts[1].astype(int)

In [8]:
# Total minutes in bed = hours * 60 + minutes; this overwrites the original 'H:MM' text.
hours_in_bed = sleep_data_until_2019['Hours in bed'].astype(int)
minutes_in_bed = sleep_data_until_2019['Minutes in bed'].astype(int)
sleep_data_until_2019['Time in bed'] = hours_in_bed * 60 + minutes_in_bed

Convert the sleep notes into binary indicator variables, one per type of note.

In [9]:
# Nights without any note are NaN; replace them with the placeholder text "None"
# so every row holds a string.
notes_filled = sleep_data_until_2019['Sleep Notes'].fillna("None")
sleep_data_until_2019['Sleep Notes'] = notes_filled

In [10]:
# One 0/1 column per note type: 1 when the note text appears in 'Sleep Notes'
# (several notes can be joined with ':' in a single cell).
for note in ['Stressful day', 'Worked out', 'Drank tea', 'Drank coffee', 'Ate late']:
    note_present = sleep_data_until_2019['Sleep Notes'].str.contains(note)
    sleep_data_until_2019[note] = note_present.astype(int)

# The free-text column is no longer needed once the indicators exist.
sleep_data_until_2019 = sleep_data_until_2019.drop(columns='Sleep Notes')

Detect null values in the dataset

In [11]:
sleep_data_until_2019.isnull().sum()

Start                 0
End                   0
Sleep quality         0
Time in bed           0
Wake up             641
Heart rate          725
Activity (steps)      0
Hours in bed          0
Minutes in bed        0
Stressful day         0
Worked out            0
Drank tea             0
Drank coffee          0
Ate late              0
dtype: int64

Split the data set based on whether the mood upon waking up is null: rows with a known mood are used for training and testing, and rows with an unknown mood are the ones to be imputed.

In [12]:
# Boolean mask: True where a wake-up mood was recorded.
mood_recorded = pd.notna(sleep_data_until_2019['Wake up'])
known_mood = sleep_data_until_2019[mood_recorded]
unknown_mood = sleep_data_until_2019[~mood_recorded]

In [13]:
known_mood

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Heart rate,Activity (steps),Hours in bed,Minutes in bed,Stressful day,Worked out,Drank tea,Drank coffee,Ate late
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100.0,512,:),59.0,0,8,32,0,0,0,0,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3.0,16,:|,72.0,0,0,16,1,0,0,0,0
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98.0,510,:|,57.0,0,8,30,0,0,0,0,0
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72.0,404,:),68.0,0,6,44,0,0,1,1,0
5,2015-01-03 00:34:57,2015-01-03 07:47:23,83.0,432,:),60.0,0,7,12,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,2016-01-04 22:17:03,2016-01-05 05:45:29,74.0,448,:|,,0,7,28,1,0,1,1,0
244,2016-01-05 22:18:45,2016-01-06 06:20:45,66.0,481,:|,,0,8,01,0,1,0,1,0
245,2016-01-13 22:44:29,2016-01-14 06:20:25,72.0,455,:|,,0,7,35,1,0,1,1,1
246,2016-01-14 22:10:58,2016-01-15 05:28:34,73.0,437,:|,,0,7,17,0,1,1,1,0


Change the mood upon waking up into a quantitative variable.

In [14]:
# Build one indicator column per mood emoticon, then swap them in for the
# original 'Wake up' column (the join aligns on the row index).
one_hot = pd.get_dummies(known_mood['Wake up'])
known_mood = known_mood.drop(columns='Wake up').join(one_hot)

In [15]:
known_mood

Unnamed: 0,Start,End,Sleep quality,Time in bed,Heart rate,Activity (steps),Hours in bed,Minutes in bed,Stressful day,Worked out,Drank tea,Drank coffee,Ate late,:(,:),:|
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100.0,512,59.0,0,8,32,0,0,0,0,0,0,1,0
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3.0,16,72.0,0,0,16,1,0,0,0,0,0,0,1
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98.0,510,57.0,0,8,30,0,0,0,0,0,0,0,1
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72.0,404,68.0,0,6,44,0,0,1,1,0,0,1,0
5,2015-01-03 00:34:57,2015-01-03 07:47:23,83.0,432,60.0,0,7,12,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,2016-01-04 22:17:03,2016-01-05 05:45:29,74.0,448,,0,7,28,1,0,1,1,0,0,0,1
244,2016-01-05 22:18:45,2016-01-06 06:20:45,66.0,481,,0,8,01,0,1,0,1,0,0,0,1
245,2016-01-13 22:44:29,2016-01-14 06:20:25,72.0,455,,0,7,35,1,0,1,1,1,0,0,1
246,2016-01-14 22:10:58,2016-01-15 05:28:34,73.0,437,,0,7,17,0,1,1,1,0,0,0,1


In [17]:
# Collapse the indicators into a single ordinal score:
# ':(' -> 0, ':|' -> 1, ':)' -> 2.
mood_score = known_mood[':|'] + known_mood[":)"] * 2
known_mood['Wake up'] = mood_score
known_mood = known_mood.drop(columns=[':)', ':(', ':|'])

In [18]:
known_mood

Unnamed: 0,Start,End,Sleep quality,Time in bed,Heart rate,Activity (steps),Hours in bed,Minutes in bed,Stressful day,Worked out,Drank tea,Drank coffee,Ate late,Wake up
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100.0,512,59.0,0,8,32,0,0,0,0,0,2
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3.0,16,72.0,0,0,16,1,0,0,0,0,1
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98.0,510,57.0,0,8,30,0,0,0,0,0,1
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72.0,404,68.0,0,6,44,0,0,1,1,0,2
5,2015-01-03 00:34:57,2015-01-03 07:47:23,83.0,432,60.0,0,7,12,0,0,1,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,2016-01-04 22:17:03,2016-01-05 05:45:29,74.0,448,,0,7,28,1,0,1,1,0,1
244,2016-01-05 22:18:45,2016-01-06 06:20:45,66.0,481,,0,8,01,0,1,0,1,0,1
245,2016-01-13 22:44:29,2016-01-14 06:20:25,72.0,455,,0,7,35,1,0,1,1,1,1
246,2016-01-14 22:10:58,2016-01-15 05:28:34,73.0,437,,0,7,17,0,1,1,1,0,1


In [19]:
unknown_mood

Unnamed: 0,Start,End,Sleep quality,Time in bed,Wake up,Heart rate,Activity (steps),Hours in bed,Minutes in bed,Stressful day,Worked out,Drank tea,Drank coffee,Ate late
3,2014-12-31 22:31:01,2015-01-01 06:03:01,65.0,452,,,0,7,32,0,0,0,0,0
6,2015-01-04 00:23:06,2015-01-04 07:37:09,78.0,434,,,0,7,14,0,0,1,0,0
248,2016-01-18 21:56:26,2016-01-19 05:34:59,78.0,458,,,0,7,38,0,0,1,1,1
249,2016-01-19 22:46:50,2016-01-20 06:35:37,75.0,468,,,0,7,48,0,1,1,1,0
250,2016-01-20 21:13:40,2016-01-21 06:29:37,77.0,555,,,0,9,15,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,2018-02-12 21:54:14,2018-02-13 07:02:15,91.0,548,,,56,9,08,0,0,0,0,0
883,2018-02-13 23:49:19,2018-02-14 07:00:53,81.0,431,,,64,7,11,0,0,0,0,0
884,2018-02-14 21:24:05,2018-02-15 06:20:52,71.0,536,,,3316,8,56,0,0,0,0,0
885,2018-02-15 21:36:32,2018-02-16 06:50:31,80.0,553,,,6555,9,13,0,0,0,0,0


In [30]:
# Fill missing numeric values with the column median. numeric_only=True keeps
# the timestamp and text columns out of the median computation, which otherwise
# warns (or raises on recent pandas).
# fillna returns a new frame, so the result is assigned back; otherwise the
# imputation would only be displayed and then lost.
known_mood = known_mood.fillna(known_mood.median(numeric_only=True))
known_mood

  """Entry point for launching an IPython kernel.


Unnamed: 0,Start,End,Sleep quality,Time in bed,Heart rate,Activity (steps),Hours in bed,Minutes in bed,Stressful day,Worked out,Drank tea,Drank coffee,Ate late,Wake up
0,2014-12-29 22:57:49,2014-12-30 07:30:13,100.0,512,59.0,0,8,32,0,0,0,0,0,2
1,2014-12-30 21:17:50,2014-12-30 21:33:54,3.0,16,72.0,0,0,16,1,0,0,0,0,1
2,2014-12-30 22:42:49,2014-12-31 07:13:31,98.0,510,57.0,0,8,30,0,0,0,0,0,1
4,2015-01-01 22:12:10,2015-01-02 04:56:35,72.0,404,68.0,0,6,44,0,0,1,1,0,2
5,2015-01-03 00:34:57,2015-01-03 07:47:23,83.0,432,60.0,0,7,12,0,0,1,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,2016-01-04 22:17:03,2016-01-05 05:45:29,74.0,448,60.0,0,7,28,1,0,1,1,0,1
244,2016-01-05 22:18:45,2016-01-06 06:20:45,66.0,481,60.0,0,8,01,0,1,0,1,0,1
245,2016-01-13 22:44:29,2016-01-14 06:20:25,72.0,455,60.0,0,7,35,1,0,1,1,1,1
246,2016-01-14 22:10:58,2016-01-15 05:28:34,73.0,437,60.0,0,7,17,0,1,1,1,0,1


In [None]:
known_mood

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Predictors for the wake-up mood model. 'Heart rate' is left out because it is
# missing for every row of unknown_mood, so a model relying on it could not be
# applied to the rows whose mood needs imputing.
# TODO(review): confirm this feature selection.
feature_columns = [
    'Sleep quality',
    'Time in bed',
    'Activity (steps)',
    'Stressful day',
    'Worked out',
    'Drank tea',
    'Drank coffee',
    'Ate late',
]
X = known_mood[feature_columns]
# Target: the ordinal wake-up mood score (0, 1 or 2).
y = known_mood['Wake up']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
unknown_mood['Heart rate'].unique()

array([nan])

In [21]:
# Load the second sleep export; the file is semicolon-delimited.
sleep_data_from_2019 = pd.read_csv('sleepdata_2.csv', delimiter=";")

In [22]:
sleep_data_from_2019

Unnamed: 0,Start,End,Sleep Quality,Regularity,Mood,Heart rate (bpm),Steps,Alarm mode,Air Pressure (Pa),City,...,Time in bed (seconds),Time asleep (seconds),Time before sleep (seconds),Window start,Window stop,Did snore,Snore time,Weather temperature (°C),Weather type,Notes
0,2019-05-12 23:26:13,2019-05-13 06:11:03,60%,0%,,0,8350,Normal,,,...,24289.2,22993.8,161.9,2019-05-13 06:00:00,2019-05-13 06:00:00,True,92.0,0.0,No weather,
1,2019-05-13 22:10:31,2019-05-14 06:10:42,73%,0%,,0,4746,Normal,,,...,28810.2,25160.9,192.1,2019-05-14 05:50:00,2019-05-14 05:50:00,True,0.0,0.0,No weather,
2,2019-05-14 21:43:00,2019-05-15 06:10:41,86%,96%,,0,4007,Normal,,,...,30461.5,28430.8,203.1,2019-05-15 05:50:00,2019-05-15 05:50:00,True,74.0,0.0,No weather,
3,2019-05-15 23:11:51,2019-05-16 06:13:59,77%,92%,,0,6578,Normal,,,...,25327.6,23132.5,168.9,2019-05-16 05:50:00,2019-05-16 05:50:00,True,0.0,0.0,No weather,
4,2019-05-16 23:12:13,2019-05-17 06:20:32,78%,94%,,0,4913,Normal,,,...,25698.4,22614.6,171.3,2019-05-17 05:50:00,2019-05-17 05:50:00,True,188.0,0.0,No weather,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,2022-03-24 21:33:46,2022-03-25 04:21:40,71%,77%,,0,3903,Normal,95.2,Central Kootenay,...,24474.1,20803.0,489.5,2022-03-25 04:20:00,2022-03-25 04:20:00,True,695.7,4.0,Cloudy,
917,2022-03-25 16:48:05,2022-03-25 17:30:22,9%,14%,,0,495,Normal,83.8,North Okanagan,...,2536.2,0.0,0.0,2022-03-25 17:30:00,2022-03-25 17:30:00,True,0.0,5.0,Cloudy,
918,2022-03-26 21:14:23,2022-03-27 06:11:01,49%,-1%,,0,13388,Normal,83.5,North Okanagan,...,32198.1,24577.9,3649.1,2022-03-27 06:00:00,2022-03-27 06:00:00,True,506.4,-1.1,Cloudy,
919,2022-03-28 22:53:23,2022-03-29 04:50:36,77%,22%,,0,456,Normal,93.9,Central Kootenay,...,21433.6,15860.8,428.7,2022-03-29 04:20:00,2022-03-29 04:20:00,True,60.0,6.1,Sunny,


In [23]:
sleep_data_from_2019.columns

Index(['Start', 'End', 'Sleep Quality', 'Regularity', 'Mood',
       'Heart rate (bpm)', 'Steps', 'Alarm mode', 'Air Pressure (Pa)', 'City',
       'Movements per hour', 'Time in bed (seconds)', 'Time asleep (seconds)',
       'Time before sleep (seconds)', 'Window start', 'Window stop',
       'Did snore', 'Snore time', 'Weather temperature (°C)', 'Weather type',
       'Notes'],
      dtype='object')

In [24]:
sleep_data_from_2019.Mood.unique()

array([nan])