# Sleep Habits Data Prep

In [4]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

InteractiveShell.ast_node_interactivity = 'all'
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Sleep Habits Data

In [5]:
sleephabits_raw_id = 'syn7117919'
sleephabits = pd.read_csv(syn.get(sleephabits_raw_id).path)

## Replace white space with NaN

In [6]:
sleephabits = sleephabits.replace(r'^\s*$', np.nan, regex=True)

### Process Sleep Habits Data

In [7]:
#convert seconds to hours   
sleephabits.sleep_time_weekday = sleephabits.sleep_time_weekday.astype(float)/3600
sleephabits.sleep_time_weekend = sleephabits.sleep_time_weekend.astype(float)/3600
sleephabits.sleep_time_weekday = sleephabits.sleep_time_weekday.round(decimals=2)
sleephabits.sleep_time_weekend = sleephabits.sleep_time_weekend.round(decimals=2)

#rename sleep_time_weekday to sleep_time_workday so it's more consistent with the way the question was asked
sleephabits = sleephabits.rename(columns={'sleep_time_weekday':'sleep_time_workday'})

#convert seconds to minutes
sleephabits.sleep_lost = sleephabits.sleep_lost.astype(float)/60
sleephabits.sleep_lost = sleephabits.sleep_lost.round(decimals=2)

#remove brackets
def remove_brackets(col):
    return( col.astype(str).str.replace(']', '').str.replace('[',''))
sleephabits.alarm_dependency = remove_brackets(sleephabits.alarm_dependency)
sleephabits.driving_sleepy = remove_brackets(sleephabits.driving_sleepy)
sleephabits.falling_asleep = remove_brackets(sleephabits.falling_asleep)
sleephabits.morning_person = remove_brackets(sleephabits.morning_person)
sleephabits.nap_duration = remove_brackets(sleephabits.nap_duration)
sleephabits.sleep_partner = remove_brackets(sleephabits.sleep_partner)
sleephabits.wake_up_choices = remove_brackets(sleephabits.wake_up_choices)
sleephabits.weekly_naps = remove_brackets(sleephabits.weekly_naps)
sleephabits.what_wakes_you = remove_brackets(sleephabits.what_wakes_you)

### Filter based on age

In [8]:
underage_participants = pd.read_csv( syn.get('syn21905452').path, sep="\t")
sleephabits = sleephabits[~sleephabits.participantId.isin(underage_participants.participantId)]

### Clean Sleep Habits data

### Clean wake_ups

In [27]:
sleephabits.wake_ups.describe()
sleephabits.wake_ups.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100      0.0
0.500      2.0
0.600      2.0
0.800      3.0
0.850      4.0
0.900      4.0
0.950      5.0
0.970      6.0
0.990     10.0
0.999     25.0
1.000    100.0
Name: wake_ups, dtype: float64

count    5510.000000
mean        2.193103
std         2.848907
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max       100.000000
Name: wake_ups, dtype: float64

In [28]:
## Exclude if wake ups is > .99 percentile of the sample in this case > 10
sleephabits.wake_ups[ sleephabits.wake_ups > 10 ] = 'DATA QUALITY ISSUE'
sleephabits.wake_ups.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



1.0                   1435
2.0                   1406
3.0                    918
0.0                    894
4.0                    328
5.0                    294
6.0                     81
10.0                    44
7.0                     43
DATA QUALITY ISSUE      34
8.0                     29
9.0                      4
Name: wake_ups, dtype: int64

### Clean sleep_needed

In [30]:
sleephabits.sleep_needed.describe()
sleephabits.sleep_needed.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100     6.00
0.500     8.00
0.600     8.00
0.800     8.00
0.850     9.00
0.900     9.00
0.950    10.00
0.970    10.00
0.990    11.43
0.999    14.00
1.000    24.00
Name: sleep_needed, dtype: float64

count    5458.000000
mean        7.561927
std         1.373428
min         0.000000
25%         7.000000
50%         8.000000
75%         8.000000
max        24.000000
Name: sleep_needed, dtype: float64

In [31]:
## Exclude if sleep needed is > .99 percentile of the sample in this case > 11.43
sleephabits.sleep_needed[ sleephabits.sleep_needed > 11.43 ] = 'DATA QUALITY ISSUE'
sleephabits.sleep_needed.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



8.0                   2059
7.0                   1517
9.0                    644
6.0                    622
10.0                   239
5.0                    197
4.0                     73
DATA QUALITY ISSUE      55
11.0                    21
3.0                     12
2.0                      8
1.0                      7
0.0                      4
Name: sleep_needed, dtype: int64

### Clean sleep_lost

In [33]:
sleephabits.sleep_lost.describe()
sleephabits.sleep_lost.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100       1.000
0.500       1.000
0.600      10.000
0.800      30.000
0.850      60.000
0.900      60.000
0.950     120.000
0.970     121.000
0.990     181.000
0.999     604.477
1.000    1439.000
Name: sleep_lost, dtype: float64

count    2964.000000
mean       22.577598
std        55.521852
min         1.000000
25%         1.000000
50%         1.000000
75%        20.000000
max      1439.000000
Name: sleep_lost, dtype: float64

In [34]:
## Exclude if sleep lost is > .99 percentile of the sample in this case > 181
sleephabits.sleep_lost[ sleephabits.sleep_lost > 181 ] = 'LOW DATA QUALITY'
sleephabits.sleep_lost.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



1.0                   1588
30.0                   200
60.0                   186
10.0                   170
15.0                   163
20.0                   123
5.0                    116
61.0                    91
120.0                   56
121.0                   38
DATA QUALITY ISSUE      29
45.0                    27
180.0                   24
90.0                    19
3.0                     16
25.0                    12
181.0                   10
6.0                     10
2.0                      8
40.0                     8
4.0                      7
7.0                      5
12.0                     5
8.0                      4
13.0                     3
150.0                    3
18.0                     3
75.0                     3
22.0                     2
28.0                     2
34.0                     2
32.0                     2
66.0                     2
65.0                     2
35.0                     2
62.0                     2
46.0                     2
3

### Clean sleep_time_workday

In [36]:
sleep_time_workday.describe()
sleep_time_workday.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100     5.00
0.500     7.00
0.600     7.00
0.800     8.00
0.850     8.00
0.900     8.00
0.950     8.75
0.970     9.00
0.990    10.00
0.999    12.02
1.000    17.00
Name: sleep_time_workday, dtype: float64

count    5536.000000
mean        6.651006
std         1.526232
min         0.020000
25%         6.000000
50%         7.000000
75%         7.500000
max        17.000000
Name: sleep_time_workday, dtype: float64

In [37]:
## Exclude if sleep time workday is > .99 percentile of the sample in this case > 10
sleephabits.sleep_time_workday[ sleephabits.sleep_time_workday > 10 ] = 'DATA QUALITY ISSUE'
sleephabits.sleep_time_workday.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



7.0                   1101
6.0                    770
8.0                    571
6.5                    362
7.5                    354
7.02                   317
5.0                    303
6.02                   228
8.02                   162
9.0                    137
5.5                    126
5.02                   106
4.0                     89
0.02                    70
8.5                     69
6.75                    54
10.0                    47
7.25                    43
9.02                    36
6.25                    34
DATA QUALITY ISSUE      33
4.5                     30
4.02                    26
7.33                    24
6.67                    19
7.75                    19
5.75                    17
3.0                     17
7.17                    15
7.67                    12
                      ... 
6.65                     1
6.22                     1
7.37                     1
4.57                     1
6.77                     1
9.23                     1
6

### Clean sleep_time_weekend

In [39]:
sleephabits.sleep_time_weekend = sleephabits['sleep_time_weekend']

sleephabits.sleep_time_weekend.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
sleephabis.sleep_time_weekend.describe()

0.100     6.00
0.500     8.00
0.600     8.02
0.800     9.00
0.850     9.02
0.900    10.00
0.950    10.02
0.970    11.02
0.990    12.02
0.999    16.02
1.000    20.02
Name: sleep_time_weekend, dtype: float64

count    5538.000000
mean        7.843790
std         1.869131
min         0.020000
25%         7.000000
50%         8.000000
75%         9.000000
max        20.020000
Name: sleep_time_weekend, dtype: float64

In [40]:
## Exclude if sleep time weekend is > .99 percentile of the sample in this case > 12.02 
sleephabits.sleep_time_weekend[ sleephabits.sleep_time_weekend > 12.02 ] = 'DATA QUALITY ISSUE'
sleephabits.sleep_time_weekend.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



8.0                   1064
7.0                    674
9.0                    663
8.02                   343
10.0                   324
6.0                    286
7.02                   255
9.02                   238
7.5                    211
8.5                    177
10.02                  146
6.02                   143
5.0                    106
6.5                     81
0.02                    58
11.0                    58
12.0                    58
5.02                    56
12.02                   44
4.0                     39
9.5                     35
11.02                   34
DATA QUALITY ISSUE      31
5.5                     23
8.25                    20
7.75                    19
3.0                     15
4.02                    15
4.5                     13
7.25                    13
                      ... 
0.05                     1
0.07                     1
9.42                     1
8.55                     1
7.98                     1
5.27                     1
7

### Create Internal & External Copies of the Data

In [42]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21557215').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_SLEEPHABITS_DATA = sleephabits[sleephabits.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_SLEEPHABITS_DATA.shape

(3311, 16)

### External - Upload to Synapse

In [43]:
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(name='Sleep Habits',
                               columns=as_table_columns(EXTERNAL_SLEEPHABITS_DATA),
                               parent=SH_EXTERNAL_PROJECT)
sleephabits_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_SLEEPHABITS_DATA))
sleephabits_synTable_external

3163

<synapseclient.table.CsvFileTable at 0x1110d3a10>

### Internal - Upload to Synapse

In [44]:
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(name='Sleep Habits Internal',
                               columns=as_table_columns(sleephabits), 
                               parent=SH_INTERNAL_PROJECT)
sleephabits_synTable_internal = syn.store(Table(table_schema_internal,sleephabits))

### Set Provenance

In [45]:
activity=Activity(name= 'Sleep Habits data curation', 
                  description='Process and convert raw data to table format', 
                  used=sleephabits_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb')
#Internal 
syn.setProvenance(sleephabits_synTable_internal, activity)

#External
syn.setProvenance(sleephabits_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-08T20:35:34.045Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'd38a69d3-27ab-4a4d-a1ec-973ed325bf54',
 u'id': u'10232210',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-08T20:35:34.045Z',
 u'name': u'Sleep Habits data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_Sleep_Habits.ipynb',
   u'url': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_Sleep_Habits.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117919', u'targetVersionNumber': 774},
   u'wasExecuted': False}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-08T20:35:37.657Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'13734eca-1161-4031-8e85-c8ce43b37c2b',
 u'id': u'10232211',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-08T20:35:37.657Z',
 u'name': u'Sleep Habits data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_Sleep_Habits.ipynb',
   u'url': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_Sleep_Habits.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117919', u'targetVersionNumber': 774},
   u'wasExecuted': False}]}