# Sleep Habits Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Show the result of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and log in. No credentials are passed explicitly,
# so login presumably relies on locally cached/configured credentials -- confirm.
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Sleep Habits Data

In [2]:
# Synapse ID of the raw Sleep Habits file; it is referenced again when provenance is set.
sleephabits_raw_id = 'syn7117919'
# Fetch the entity from Synapse, then parse the downloaded file as CSV.
raw_entity = syn.get(sleephabits_raw_id)
sleephabits = pd.read_csv(raw_entity.path)

### Replace empty or whitespace-only values with NaN

In [3]:
# Values that are empty or consist only of whitespace are recorded as missing (NaN).
blank_value_pattern = r'^\s*$'
sleephabits = sleephabits.replace(to_replace=blank_value_pattern, value=np.nan, regex=True)

### Process Sleep Habits Data

In [4]:
# Convert the two sleep-time columns from seconds to hours, rounded to 2 decimals.
for duration_col in ('sleep_time_weekday', 'sleep_time_weekend'):
    hours = sleephabits[duration_col].astype(float) / 3600
    sleephabits[duration_col] = hours.round(decimals=2)

# Rename sleep_time_weekday to sleep_time_workday so the column name is more
# consistent with the way the question was asked.
sleephabits = sleephabits.rename(columns={'sleep_time_weekday': 'sleep_time_workday'})

# Convert sleep_lost from seconds to minutes, rounded to 2 decimals.
sleephabits['sleep_lost'] = (sleephabits['sleep_lost'].astype(float) / 60).round(decimals=2)
#remove brackets
def remove_brackets(col):
    """Return ``col`` as strings with every '[' and ']' character removed.

    Parameters
    ----------
    col : pandas.Series
        Column whose values may be wrapped in square brackets, e.g. '[1]'.

    Returns
    -------
    pandas.Series
        The values converted to strings with all square brackets stripped.
        Entries that are missing in ``col`` stay missing instead of being
        turned into the literal text 'nan'.
    """
    # regex=False: '[' and ']' are regex metacharacters, so match them literally
    # instead of depending on how the installed pandas treats 1-character patterns.
    stripped = (col.astype(str)
                   .str.replace(']', '', regex=False)
                   .str.replace('[', '', regex=False))
    # astype(str) renders NaN as the string 'nan'; put the missing values back.
    return stripped.where(col.notna())
# Columns whose values have their square brackets stripped.
bracketed_columns = [
    'alarm_dependency',
    'driving_sleepy',
    'falling_asleep',
    'morning_person',
    'nap_duration',
    'sleep_partner',
    'wake_up_choices',
    'weekly_naps',
    'what_wakes_you',
]
for bracketed_col in bracketed_columns:
    sleephabits[bracketed_col] = remove_brackets(sleephabits[bracketed_col])

### Filter based on age

In [5]:
# Tab-separated list of underage participants, downloaded from Synapse.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")
# Keep only the records whose participantId is not in that list.
is_underage = sleephabits.participantId.isin(underage_participants.participantId)
sleephabits = sleephabits[~is_underage]

### Remove test accounts

In [6]:
# Spreadsheet of test accounts, downloaded from Synapse.
test_accounts = pd.read_excel(syn.get('syn21958537').path)
# Keep only the records whose participantId is not a test account.
is_test_account = sleephabits.participantId.isin(test_accounts.participantId)
sleephabits = sleephabits[~is_test_account]

### Clean Sleep Habits data (the full dataset is provided; the outlier-cleaning code below is left commented out)

### Clean wake_ups

In [7]:
# Inspect the distribution of wake_ups: summary statistics, then selected quantiles.
wake_ups = sleephabits['wake_ups']
wake_ups.describe()
wake_ups.quantile(q=[0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99, 0.999, 1])

count    5381.000000
mean        2.210741
std         2.870987
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max       100.000000
Name: wake_ups, dtype: float64

0.010      0.0
0.050      0.0
0.500      2.0
0.600      2.0
0.800      3.0
0.850      4.0
0.900      4.0
0.950      5.0
0.970      6.0
0.990     10.0
0.999     25.0
1.000    100.0
Name: wake_ups, dtype: float64

In [8]:
## Exclude wake_ups values above the 99th percentile of the sample (here, > 10)
#sleephabits.loc[sleephabits.wake_ups > 10, 'wake_ups'] = 'LOW DATA QUALITY'
#sleephabits.wake_ups.value_counts()

### Clean sleep_needed

In [9]:
# Inspect the distribution of sleep_needed: summary statistics, then selected quantiles.
sleep_needed = sleephabits['sleep_needed']
sleep_needed.describe()
sleep_needed.quantile(q=[0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99, 0.999, 1])

count    5333.000000
mean        7.556535
std         1.358906
min         0.000000
25%         7.000000
50%         8.000000
75%         8.000000
max        24.000000
Name: sleep_needed, dtype: float64

0.010     4.0
0.050     5.0
0.500     8.0
0.600     8.0
0.800     8.0
0.850     9.0
0.900     9.0
0.950    10.0
0.970    10.0
0.990    11.0
0.999    14.0
1.000    24.0
Name: sleep_needed, dtype: float64

In [10]:
## Exclude sleep_needed values above the 99th percentile of the sample (here, > 11)
#sleephabits.loc[sleephabits.sleep_needed > 11, 'sleep_needed'] = 'LOW DATA QUALITY'
#sleephabits.sleep_needed.value_counts()

### Clean sleep_lost

In [11]:
# Inspect the distribution of sleep_lost: summary statistics, then selected quantiles.
sleep_lost = sleephabits['sleep_lost']
sleep_lost.describe()
sleep_lost.quantile(q=[0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99, 0.999, 1])

count    2934.000000
mean       22.726653
std        55.770488
min         1.000000
25%         1.000000
50%         1.000000
75%        23.500000
max      1439.000000
Name: sleep_lost, dtype: float64

0.010       1.000
0.050       1.000
0.500       1.000
0.600      10.000
0.800      30.000
0.850      60.000
0.900      60.000
0.950     120.000
0.970     121.000
0.990     181.000
0.999     608.107
1.000    1439.000
Name: sleep_lost, dtype: float64

In [12]:
## Exclude sleep_lost values above the 99th percentile of the sample (here, > 181)
#sleephabits.loc[sleephabits.sleep_lost > 181, 'sleep_lost'] = 'LOW DATA QUALITY'
#sleephabits.sleep_lost.value_counts()

### Clean sleep_time_workday

In [13]:
# Inspect the distribution of sleep_time_workday: summary statistics, then selected quantiles.
sleep_time_workday = sleephabits['sleep_time_workday']
sleep_time_workday.describe()
sleep_time_workday.quantile(q=[0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99, 0.999, 1])

count    5408.000000
mean        6.644638
std         1.521031
min         0.020000
25%         6.000000
50%         7.000000
75%         7.500000
max        17.000000
Name: sleep_time_workday, dtype: float64

0.010     0.020
0.050     4.500
0.500     7.000
0.600     7.000
0.800     7.912
0.850     8.000
0.900     8.000
0.950     8.500
0.970     9.000
0.990    10.000
0.999    12.020
1.000    17.000
Name: sleep_time_workday, dtype: float64

In [14]:
## Exclude sleep_time_workday values above the 99th percentile of the sample (here, > 10)
#sleephabits.loc[sleephabits.sleep_time_workday > 10, 'sleep_time_workday'] = 'LOW DATA QUALITY'
#sleephabits.sleep_time_workday.value_counts()

### Clean sleep_time_weekend

In [15]:
# Inspect the distribution of sleep_time_weekend: selected quantiles, then summary statistics.
# The column is used as-is: re-assigning it to itself would change nothing and can raise
# a SettingWithCopyWarning, because sleephabits is a filtered slice at this point.
sleephabits.sleep_time_weekend.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
sleephabits.sleep_time_weekend.describe()

0.010     0.02090
0.050     5.00000
0.500     8.00000
0.600     8.02000
0.800     9.00000
0.850     9.02000
0.900    10.00000
0.950    10.02000
0.970    11.02000
0.990    12.02000
0.999    16.01182
1.000    20.02000
Name: sleep_time_weekend, dtype: float64

count    5410.000000
mean        7.830980
std         1.855141
min         0.020000
25%         7.000000
50%         8.000000
75%         9.000000
max        20.020000
Name: sleep_time_weekend, dtype: float64

In [16]:
## Exclude sleep_time_weekend values above the 99th percentile of the sample (here, > 12.02)
#sleephabits.loc[sleephabits.sleep_time_weekend > 12.02, 'sleep_time_weekend'] = 'LOW DATA QUALITY'
#sleephabits.sleep_time_weekend.value_counts()

### Create Internal & External Copies of the Data

In [17]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21958546').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_SLEEPHABITS_DATA = sleephabits[sleephabits.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_SLEEPHABITS_DATA.shape

(3303, 16)

### Number of unique participants

In [18]:
# Number of distinct participantId values (a missing ID, if any, counts as one value).
EXTERNAL_SLEEPHABITS_DATA.participantId.nunique(dropna=False)

3155

### Total number of observations

In [19]:
# Number of rows in the external dataset.
len(EXTERNAL_SLEEPHABITS_DATA.index)

3303

### External - Upload to Synapse

In [20]:
# Synapse ID used as the parent of the external table.
SH_EXTERNAL_PROJECT = 'syn18492837'
# Derive the table columns from the DataFrame, then build the schema from them.
external_columns = as_table_columns(EXTERNAL_SLEEPHABITS_DATA)
table_schema_external = Schema(name='Sleep Habits', columns=external_columns, parent=SH_EXTERNAL_PROJECT)
# Store the external data as a Synapse table and display the stored object.
external_table = Table(table_schema_external, EXTERNAL_SLEEPHABITS_DATA)
sleephabits_synTable_external = syn.store(external_table)
sleephabits_synTable_external

<synapseclient.table.CsvFileTable at 0x110044e50>

### Internal - Upload to Synapse

In [21]:
# Synapse ID used as the parent of the internal table.
SH_INTERNAL_PROJECT = 'syn7066726'
# Derive the table columns from the full DataFrame, then build the schema from them.
internal_columns = as_table_columns(sleephabits)
table_schema_internal = Schema(name='Sleep Habits Internal', columns=internal_columns, parent=SH_INTERNAL_PROJECT)
# Store the full (internal) data as a Synapse table.
internal_table = Table(table_schema_internal, sleephabits)
sleephabits_synTable_internal = syn.store(internal_table)

### Set Provenance

In [22]:
# A single Activity records how both tables were produced: the raw Synapse
# entity that was used and the notebook that was executed.
activity = Activity(
    name='Sleep Habits data curation',
    description='Process and convert raw data to table format',
    used=sleephabits_raw_id,
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
)

# Attach the provenance to the internal table first, then to the external one.
# Both calls stay top-level expressions so each result is displayed.
syn.setProvenance(sleephabits_synTable_internal, activity)
syn.setProvenance(sleephabits_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-07-20T18:08:14.318Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'5405ed43-5903-4d6f-8519-aaacd385c1e8',
 u'id': u'10282142',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-07-20T18:08:14.318Z',
 u'name': u'Sleep Habits data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117919', u'targetVersionNumber': 774},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
   u'wasExecuted': True}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-07-20T18:08:19.942Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'4908378d-4750-40c4-b087-5551417a4c70',
 u'id': u'10282143',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-07-20T18:08:19.942Z',
 u'name': u'Sleep Habits data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117919', u'targetVersionNumber': 774},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
   u'wasExecuted': True}]}