# Sleep Habits Data Prep

In [2]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Show the result of every expression statement in a cell, not only the last one
# (several cells below rely on this to print two summaries at once).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client used by every download/upload call in this notebook.
syn = synapseclient.Synapse()
# No credentials are passed explicitly — presumably they come from a local
# Synapse config/cache; confirm before running in a new environment.
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Sleep Habits Data

In [3]:
# Synapse ID of the raw Sleep Habits file; it is reused later as the `used`
# entry of the provenance record.
sleephabits_raw_id = 'syn7117919'
# Fetch the entity, then read the local file it points at as CSV.
sleephabits_raw_entity = syn.get(sleephabits_raw_id)
sleephabits = pd.read_csv(sleephabits_raw_entity.path)

## Replace empty or whitespace-only values with NaN

In [4]:
# Cells that are empty or contain only whitespace are treated as missing.
sleephabits = sleephabits.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Process Sleep Habits Data

In [5]:
# Convert both sleep-time columns from seconds to hours, rounded to two decimals.
for duration_column in ('sleep_time_weekday', 'sleep_time_weekend'):
    duration_hours = sleephabits[duration_column].astype(float) / 3600
    sleephabits[duration_column] = duration_hours.round(decimals=2)

# Rename sleep_time_weekday to sleep_time_workday so the column name is
# consistent with the way the question was asked.
sleephabits = sleephabits.rename(columns={'sleep_time_weekday': 'sleep_time_workday'})

# Convert sleep_lost from seconds to minutes, rounded to two decimals.
sleep_lost_minutes = sleephabits['sleep_lost'].astype(float) / 60
sleephabits['sleep_lost'] = sleep_lost_minutes.round(decimals=2)

#remove brackets
def remove_brackets(col):
    """Strip every literal '[' and ']' character from the values of a Series.

    Parameters
    ----------
    col : pandas.Series
        Column whose values are converted to strings before the brackets
        are removed.

    Returns
    -------
    pandas.Series
        Series with the same index as ``col``. Non-missing values are
        returned as strings without brackets; missing values stay missing
        rather than becoming the literal string 'nan'.
    """
    # Remember which entries are missing before the string conversion,
    # because astype(str) would turn NaN into the text 'nan'.
    missing = col.isna()
    as_text = col.astype(str)
    # regex=False: the brackets are plain characters, not regex syntax, and the
    # default value of `regex` differs between pandas versions.
    without_brackets = (
        as_text.str.replace(']', '', regex=False)
               .str.replace('[', '', regex=False)
    )
    # Restore the missing entries (mask() fills with NaN by default).
    return without_brackets.mask(missing)
# Answer columns whose values carry list brackets that need to be stripped.
bracketed_columns = [
    'alarm_dependency',
    'driving_sleepy',
    'falling_asleep',
    'morning_person',
    'nap_duration',
    'sleep_partner',
    'wake_up_choices',
    'weekly_naps',
    'what_wakes_you',
]
for bracketed_column in bracketed_columns:
    sleephabits[bracketed_column] = remove_brackets(sleephabits[bracketed_column])

### Filter based on age

In [6]:
# Tab-separated file of participants to exclude based on age.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")
is_underage = sleephabits.participantId.isin(underage_participants.participantId)
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
sleephabits = sleephabits[~is_underage].copy()

### Remove test accounts

In [7]:
# Spreadsheet listing the participantIds of test accounts.
test_accounts = pd.read_excel(syn.get('syn21958537').path)
is_test_account = sleephabits.participantId.isin(test_accounts.participantId)
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
sleephabits = sleephabits[~is_test_account].copy()

### Clean Sleep Habits data

### Clean wake_ups

In [8]:
# Summary statistics and upper-tail quantiles of wake_ups, used to pick the outlier cut-off.
wake_ups_values = sleephabits['wake_ups']
wake_ups_values.describe()
wake_ups_values.quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

count    5381.000000
mean        2.210741
std         2.870987
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max       100.000000
Name: wake_ups, dtype: float64

0.010      0.0
0.050      0.0
0.500      2.0
0.600      2.0
0.800      3.0
0.850      4.0
0.900      4.0
0.950      5.0
0.970      6.0
0.990     10.0
0.999     25.0
1.000    100.0
Name: wake_ups, dtype: float64

In [9]:
## Flag wake_ups above the .99 percentile of the sample (in this case > 10) as low quality
# Compare on a numeric view: values already flagged by an earlier run of this cell
# are strings, which coerce to NaN and are therefore left untouched instead of
# raising a TypeError in the comparison.
wake_ups_too_high = pd.to_numeric(sleephabits['wake_ups'], errors='coerce') > 10
# Build the flagged column and assign it back in one step. Chained assignment
# (frame.col[mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to modify the frame. The cast to object lets numbers and the text flag coexist.
sleephabits['wake_ups'] = (
    sleephabits['wake_ups'].astype(object).mask(wake_ups_too_high, 'LOW DATA QUALITY')
)
sleephabits['wake_ups'].value_counts()

1.0                 1395
2.0                 1381
3.0                  904
0.0                  855
4.0                  323
5.0                  291
6.0                   80
10.0                  44
7.0                   42
LOW DATA QUALITY      34
8.0                   28
9.0                    4
Name: wake_ups, dtype: int64

### Clean sleep_needed

In [10]:
# Summary statistics and upper-tail quantiles of sleep_needed, used to pick the outlier cut-off.
sleep_needed_values = sleephabits['sleep_needed']
sleep_needed_values.describe()
sleep_needed_values.quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

count    5333.000000
mean        7.556535
std         1.358906
min         0.000000
25%         7.000000
50%         8.000000
75%         8.000000
max        24.000000
Name: sleep_needed, dtype: float64

0.010     4.0
0.050     5.0
0.500     8.0
0.600     8.0
0.800     8.0
0.850     9.0
0.900     9.0
0.950    10.0
0.970    10.0
0.990    11.0
0.999    14.0
1.000    24.0
Name: sleep_needed, dtype: float64

In [11]:
## Flag sleep_needed above the .99 percentile of the sample (in this case > 11) as low quality
# Compare on a numeric view: values already flagged by an earlier run of this cell
# are strings, which coerce to NaN and are therefore left untouched instead of
# raising a TypeError in the comparison.
sleep_needed_too_high = pd.to_numeric(sleephabits['sleep_needed'], errors='coerce') > 11
# Build the flagged column and assign it back in one step. Chained assignment
# (frame.col[mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to modify the frame. The cast to object lets numbers and the text flag coexist.
sleephabits['sleep_needed'] = (
    sleephabits['sleep_needed'].astype(object).mask(sleep_needed_too_high, 'LOW DATA QUALITY')
)
sleephabits['sleep_needed'].value_counts()

8.0                 2018
7.0                 1498
9.0                  616
6.0                  611
10.0                 230
5.0                  192
4.0                   72
LOW DATA QUALITY      52
11.0                  19
3.0                    8
1.0                    7
2.0                    6
0.0                    4
Name: sleep_needed, dtype: int64

### Clean sleep_lost

In [12]:
# Summary statistics and upper-tail quantiles of sleep_lost, used to pick the outlier cut-off.
sleep_lost_values = sleephabits['sleep_lost']
sleep_lost_values.describe()
sleep_lost_values.quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

count    2934.000000
mean       22.726653
std        55.770488
min         1.000000
25%         1.000000
50%         1.000000
75%        23.500000
max      1439.000000
Name: sleep_lost, dtype: float64

0.010       1.000
0.050       1.000
0.500       1.000
0.600      10.000
0.800      30.000
0.850      60.000
0.900      60.000
0.950     120.000
0.970     121.000
0.990     181.000
0.999     608.107
1.000    1439.000
Name: sleep_lost, dtype: float64

In [13]:
## Flag sleep_lost above the .99 percentile of the sample (in this case > 181) as low quality
# Compare on a numeric view: values already flagged by an earlier run of this cell
# are strings, which coerce to NaN and are therefore left untouched instead of
# raising a TypeError in the comparison.
sleep_lost_too_high = pd.to_numeric(sleephabits['sleep_lost'], errors='coerce') > 181
# Build the flagged column and assign it back in one step. Chained assignment
# (frame.col[mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to modify the frame. The cast to object lets numbers and the text flag coexist.
sleephabits['sleep_lost'] = (
    sleephabits['sleep_lost'].astype(object).mask(sleep_lost_too_high, 'LOW DATA QUALITY')
)
sleephabits['sleep_lost'].value_counts()

1.0                 1569
30.0                 198
60.0                 185
10.0                 168
15.0                 159
20.0                 123
5.0                  115
61.0                  91
120.0                 56
121.0                 38
LOW DATA QUALITY      29
45.0                  27
180.0                 24
90.0                  19
3.0                   16
25.0                  12
181.0                 10
6.0                   10
2.0                    8
40.0                   8
4.0                    7
7.0                    5
12.0                   5
8.0                    4
13.0                   3
150.0                  3
18.0                   3
75.0                   3
22.0                   2
28.0                   2
35.0                   2
32.0                   2
66.0                   2
65.0                   2
34.0                   2
62.0                   2
46.0                   2
31.0                   1
42.0                   1
68.0                   1


### Clean sleep_time_workday

In [14]:
# Summary statistics and upper-tail quantiles of sleep_time_workday, used to pick the outlier cut-off.
sleep_time_workday_values = sleephabits['sleep_time_workday']
sleep_time_workday_values.describe()
sleep_time_workday_values.quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

count    5408.000000
mean        6.644638
std         1.521031
min         0.020000
25%         6.000000
50%         7.000000
75%         7.500000
max        17.000000
Name: sleep_time_workday, dtype: float64

0.010     0.020
0.050     4.500
0.500     7.000
0.600     7.000
0.800     7.912
0.850     8.000
0.900     8.000
0.950     8.500
0.970     9.000
0.990    10.000
0.999    12.020
1.000    17.000
Name: sleep_time_workday, dtype: float64

In [15]:
## Flag sleep_time_workday above the .99 percentile of the sample (in this case > 10) as low quality
# Compare on a numeric view: values already flagged by an earlier run of this cell
# are strings, which coerce to NaN and are therefore left untouched instead of
# raising a TypeError in the comparison.
workday_sleep_too_high = pd.to_numeric(sleephabits['sleep_time_workday'], errors='coerce') > 10
# Build the flagged column and assign it back in one step. Chained assignment
# (frame.col[mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to modify the frame. The cast to object lets numbers and the text flag coexist.
sleephabits['sleep_time_workday'] = (
    sleephabits['sleep_time_workday'].astype(object).mask(workday_sleep_too_high, 'LOW DATA QUALITY')
)
sleephabits['sleep_time_workday'].value_counts()

7.0                 1084
6.0                  759
8.0                  551
6.5                  353
7.5                  346
7.02                 309
5.0                  299
6.02                 227
8.02                 157
9.0                  128
5.5                  125
5.02                 103
4.0                   86
0.02                  68
8.5                   67
6.75                  54
10.0                  45
7.25                  42
9.02                  35
6.25                  33
LOW DATA QUALITY      31
4.5                   30
4.02                  26
7.33                  24
6.67                  19
5.75                  17
7.75                  17
3.0                   16
7.17                  14
9.5                   11
                    ... 
7.78                   1
3.73                   1
6.2                    1
5.27                   1
6.22                   1
4.08                   1
6.1                    1
9.23                   1
6.82                   1


### Clean sleep_time_weekend

In [16]:
# Upper-tail quantiles and summary statistics of sleep_time_weekend, used to pick
# the outlier cut-off. (The former self-assignment of the column to itself was a
# no-op and has been dropped.)
sleephabits.sleep_time_weekend.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
sleephabits.sleep_time_weekend.describe()

0.010     0.02090
0.050     5.00000
0.500     8.00000
0.600     8.02000
0.800     9.00000
0.850     9.02000
0.900    10.00000
0.950    10.02000
0.970    11.02000
0.990    12.02000
0.999    16.01182
1.000    20.02000
Name: sleep_time_weekend, dtype: float64

count    5410.000000
mean        7.830980
std         1.855141
min         0.020000
25%         7.000000
50%         8.000000
75%         9.000000
max        20.020000
Name: sleep_time_weekend, dtype: float64

In [17]:
## Flag sleep_time_weekend above the .99 percentile of the sample (in this case > 12.02) as low quality
# Compare on a numeric view: values already flagged by an earlier run of this cell
# are strings, which coerce to NaN and are therefore left untouched instead of
# raising a TypeError in the comparison.
weekend_sleep_too_high = pd.to_numeric(sleephabits['sleep_time_weekend'], errors='coerce') > 12.02
# Build the flagged column and assign it back in one step. Chained assignment
# (frame.col[mask] = value) raises SettingWithCopyWarning and is not guaranteed
# to modify the frame. The cast to object lets numbers and the text flag coexist.
sleephabits['sleep_time_weekend'] = (
    sleephabits['sleep_time_weekend'].astype(object).mask(weekend_sleep_too_high, 'LOW DATA QUALITY')
)
sleephabits['sleep_time_weekend'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



8.0                 1048
7.0                  667
9.0                  649
8.02                 336
10.0                 308
6.0                  283
7.02                 249
9.02                 230
7.5                  210
8.5                  171
6.02                 141
10.02                138
5.0                  105
6.5                   80
12.0                  57
5.02                  56
11.0                  56
0.02                  55
12.02                 43
4.0                   36
9.5                   32
11.02                 32
LOW DATA QUALITY      28
5.5                   23
8.25                  20
7.75                  19
3.0                   14
7.25                  13
4.02                  13
4.5                   13
                    ... 
8.15                   1
0.05                   1
8.47                   1
0.07                   1
8.55                   1
7.15                   1
6.7                    1
5.83                   1
7.35                   1


### Create Internal & External Copies of the Data

In [18]:
# Download the sharing-information spreadsheet and keep the rows whose
# `sharing` value is 'all_qualified_researchers'.
sharing_info = pd.read_excel(syn.get('syn21557215').path)
shares_with_all_researchers = sharing_info.sharing == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[shares_with_all_researchers]

# The external copy contains only the participants found in that subset.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
in_external_release = sleephabits.participantId.isin(broad_sharing_ids)
EXTERNAL_SLEEPHABITS_DATA = sleephabits[in_external_release]
EXTERNAL_SLEEPHABITS_DATA.shape

(3237, 16)

### External - Upload to Synapse

In [None]:
# Synapse project that receives the external copy.
SH_EXTERNAL_PROJECT = 'syn18492837'
# Derive the table columns from the DataFrame, then describe the table.
external_columns = as_table_columns(EXTERNAL_SLEEPHABITS_DATA)
table_schema_external = Schema(
    name='Sleep Habits',
    columns=external_columns,
    parent=SH_EXTERNAL_PROJECT,
)
# Store schema and rows together; the stored table is kept for the provenance step.
external_table = Table(table_schema_external, EXTERNAL_SLEEPHABITS_DATA)
sleephabits_synTable_external = syn.store(external_table)
sleephabits_synTable_external

### Internal - Upload to Synapse

In [None]:
# Synapse project that receives the internal copy (all remaining participants).
SH_INTERNAL_PROJECT = 'syn7066726'
# Derive the table columns from the DataFrame, then describe the table.
internal_columns = as_table_columns(sleephabits)
table_schema_internal = Schema(
    name='Sleep Habits Internal',
    columns=internal_columns,
    parent=SH_INTERNAL_PROJECT,
)
# Store schema and rows together; the stored table is kept for the provenance step.
internal_table = Table(table_schema_internal, sleephabits)
sleephabits_synTable_internal = syn.store(internal_table)

### Set Provenance

In [None]:
# One provenance record, shared by both stored tables: it names the raw input
# that was used and the notebook that was executed.
activity = Activity(
    name='Sleep Habits data curation',
    description='Process and convert raw data to table format',
    used=sleephabits_raw_id,
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleep_Habits.ipynb',
)

# Attach the record to the internal table ...
syn.setProvenance(sleephabits_synTable_internal, activity)

# ... and to the external table.
syn.setProvenance(sleephabits_synTable_external, activity)