# Nap Tracker Data Prep

In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client that the later cells use to fetch and store entities.
syn = synapseclient.Synapse()
# No credentials are passed here; presumably they come from a local Synapse
# config file or cached session - TODO confirm before running on a new machine.
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Nap Tracker Data

In [2]:
# Synapse ID of the raw Nap Tracker export; it is reused when provenance is recorded.
nap_tracker_raw_id = 'syn7117931'

# Fetch the entity from Synapse, then parse the downloaded file as CSV.
nap_tracker_raw_entity = syn.get(nap_tracker_raw_id)
nap_tracker = pd.read_csv(nap_tracker_raw_entity.path)

### Process Nap Tracker Data

In [3]:
# Drop columns that are not needed in the curated table.
nap_tracker = nap_tracker.drop(columns=['NapTrackerNapStartTime', 'timestamp.1',
                                        'NapTrackerNapComments', 'NapTrackerNapUniqueId'])

# Rename these variables so that they are easier to interpret; the
# "NapTracker" prefix is redundant inside a nap-tracker table.
nap_tracker = nap_tracker.rename(columns={'NapTrackerNapDuration': 'NapDuration',
                                          'NapTrackerNapQuality': 'NapQuality',
                                          'transaction': 'NapType'})

# Recode the raw transaction-type labels into ones that are easier to interpret.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `nap_tracker['NapType']`: an in-place call on a
# column selection is chained assignment, which pandas warns about and which
# does not modify the parent frame once Copy-on-Write is enabled.
nap_type_labels = {'NapTrackerTransactionTypeNewNap': 'New nap',
                   'NapTrackerTransactionTypeDeleteNap': 'Deleted nap',
                   'NapTrackerTransactionTypeChangeNap': 'Changed nap'}
nap_tracker['NapType'] = nap_tracker['NapType'].replace(nap_type_labels)

### Filter based on age

In [4]:
# Tab-separated list of participants to exclude on the basis of age.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")

# Keep only rows whose participantId is NOT in that list.
is_underage = nap_tracker['participantId'].isin(underage_participants['participantId'])
nap_tracker = nap_tracker[~is_underage]

### Remove test accounts

In [5]:
# Spreadsheet listing the participantIds that belong to test accounts.
test_accounts = pd.read_excel(syn.get('syn21958537').path)

# Keep only rows whose participantId is NOT a test account.
is_test_account = nap_tracker['participantId'].isin(test_accounts['participantId'])
nap_tracker = nap_tracker[~is_test_account]

### Replace empty or whitespace-only values with NaN

In [6]:
# Cells that are empty or contain only whitespace become NaN.
nap_tracker = nap_tracker.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Clean Nap Tracker Data

In [7]:
# Summary statistics of nap duration before any cleaning is applied.
nap_tracker['NapDuration'].describe()

count      3731.000000
mean       5356.555382
std       16488.702612
min           0.000000
25%           0.000000
50%        2700.000000
75%        6034.305764
max      539178.000000
Name: NapDuration, dtype: float64

In [8]:
# Inspect the tails of the duration distribution to choose a plausibility cut-off.
duration_quantile_levels = [.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1]
nap_tracker['NapDuration'].quantile(duration_quantile_levels)

0.010         0.00
0.050         0.00
0.500      2700.00
0.600      3600.00
0.800      7200.00
0.850      7882.00
0.900     10800.00
0.950     21600.00
0.970     26122.00
0.990     35050.90
0.999    245001.99
1.000    539178.00
Name: NapDuration, dtype: float64

In [9]:
# Flag (rather than drop) implausible nap durations: naps of exactly zero
# seconds, or longer than 35050.90 seconds, which was the 99th percentile of
# NapDuration when this cut-off was chosen.
MAX_PLAUSIBLE_NAP_SECONDS = 35050.90
LOW_QUALITY_LABEL = 'LOW DATA QUALITY'

# Build the complete mask from the numeric values BEFORE writing any label.
# Writing the label first turns the column into mixed str/float, and comparing
# that against a number raises TypeError on Python 3. `errors='coerce'` also
# makes the cell safe to re-run: labels written by a previous run become NaN
# here, match neither condition, and are left untouched.
nap_duration_seconds = pd.to_numeric(nap_tracker['NapDuration'], errors='coerce')
is_low_quality = (nap_duration_seconds == 0) | (nap_duration_seconds > MAX_PLAUSIBLE_NAP_SECONDS)

# Assign a whole new column instead of using chained indexing
# (`nap_tracker.NapDuration[mask] = ...`), which triggers SettingWithCopyWarning
# and is not guaranteed to write through to the frame. The column is cast to
# object first so that it can hold both the float durations and the label.
nap_tracker = nap_tracker.assign(
    NapDuration=nap_tracker['NapDuration'].astype(object).mask(is_low_quality, LOW_QUALITY_LABEL)
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [10]:
# Check how many rows were flagged and which durations occur most often.
nap_tracker['NapDuration'].value_counts()

LOW DATA QUALITY    1026
3600.0               177
1800.0               158
7200.0               155
5400.0               148
2700.0               125
7500.0               101
3900.0               100
900.0                 91
1200.0                91
2400.0                74
4500.0                66
6300.0                55
10800.0               55
1500.0                54
2100.0                53
600.0                 52
3000.0                48
9000.0                47
300.0                 41
5700.0                35
8100.0                28
3300.0                27
11100.0               26
5100.0                25
4800.0                25
14400.0               24
4200.0                23
6000.0                17
6900.0                17
                    ... 
1343.94984102          1
1135.14196402          1
6312.12161601          1
30300.0                1
2870.93177402          1
24139.0                1
1566.0                 1
597.0                  1
3674.0                 1


### Create Internal & External Copies of the Data

In [11]:
# Download the per-participant data-sharing settings.
sharing_info = pd.read_excel(syn.get('syn21958546').path)

# Participants whose sharing setting is 'all_qualified_researchers'.
has_broad_sharing = sharing_info['sharing'] == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[has_broad_sharing]

# The external copy contains only the rows that belong to those participants.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
EXTERNAL_NAP_TRACKER_DATA = nap_tracker[nap_tracker['participantId'].isin(broad_sharing_ids)]
EXTERNAL_NAP_TRACKER_DATA.shape

(2270, 5)

### Number of unique participants

In [12]:
# Number of distinct participantId values in the external copy.
len(EXTERNAL_NAP_TRACKER_DATA['participantId'].unique())

592

### Total number of observations

In [13]:
# Number of rows (nap records) in the external copy.
EXTERNAL_NAP_TRACKER_DATA.shape[0]

2270

### External - Upload to Synapse

In [None]:
# Sanity check shown before uploading: distinct participants in the external copy.
len(EXTERNAL_NAP_TRACKER_DATA['participantId'].unique())

# Synapse project that receives the externally shareable table.
SH_EXTERNAL_PROJECT = 'syn18492837'

# Derive the table columns from the DataFrame, then describe the table.
external_columns = as_table_columns(EXTERNAL_NAP_TRACKER_DATA)
table_schema_external = Schema(name='Nap Tracker',
                               columns=external_columns,
                               parent=SH_EXTERNAL_PROJECT)

# Store the table in Synapse and keep the stored result for the provenance step.
external_table = Table(table_schema_external, EXTERNAL_NAP_TRACKER_DATA)
nap_tracker_synTable_external = syn.store(external_table)
nap_tracker_synTable_external

592

### Internal - Upload to Synapse

In [None]:
# Synapse project that receives the full internal table.
SH_INTERNAL_PROJECT = 'syn7066726'

# Derive the table columns from the DataFrame, then describe the table.
internal_columns = as_table_columns(nap_tracker)
table_schema_internal = Schema(name='Nap Tracker Internal',
                               columns=internal_columns,
                               parent=SH_INTERNAL_PROJECT)

# Store the table in Synapse and keep the stored result for the provenance step.
internal_table = Table(table_schema_internal, nap_tracker)
nap_tracker_synTable_internal = syn.store(internal_table)

### Set Provenance

In [None]:
# Describe how the curated tables were produced: the raw input entity and the
# notebook that was executed.
# NOTE(review): `used` lists only the raw Nap Tracker table, although this
# notebook also reads syn21905452 (underage participants), syn21958537 (test
# accounts) and syn21958546 (sharing settings) - confirm whether those should
# be recorded as inputs too.
activity=Activity(name= 'Nap Tracker data curation', 
                  description='Process and convert raw data to table format', 
                  used=nap_tracker_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Nap_Tracker.ipynb')
# Internal: attach the activity to the table stored in the internal project.
syn.setProvenance(nap_tracker_synTable_internal, activity)

# External: attach the same activity to the table stored in the external project.
syn.setProvenance(nap_tracker_synTable_external, activity)