#  Nap Tracker Data Prep

In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Echo the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to show several outputs (e.g. a count followed by a table object).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client used for every download and upload in this notebook.
syn = synapseclient.Synapse()
# No credentials are passed here, so the client has to locate them itself
# (presumably a cached login or local config file -- confirm for your environment).
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Nap Tracker Data

In [2]:
# Synapse ID of the raw Nap Tracker file; reused at the end of the notebook as the provenance input.
nap_tracker_raw_id = 'syn7117931'

# Fetch the entity from Synapse and parse its local file path as CSV.
nap_tracker = pd.read_csv(syn.get(nap_tracker_raw_id).path)

### Process Nap Tracker Data

In [3]:
# Drop unneeded columns.
nap_tracker = nap_tracker.drop(columns=['NapTrackerNapStartTime', 'timestamp.1',
                                        'NapTrackerNapComments', 'NapTrackerNapUniqueId'])

# Rename these variables so that they are easier to interpret; the "NapTracker" prefix is redundant.
nap_tracker = nap_tracker.rename(columns={'NapTrackerNapDuration': 'NapDuration',
                                          'NapTrackerNapQuality': 'NapQuality',
                                          'transaction': 'NapType'})

# Recode the raw transaction labels into ones that are easier to interpret.
# The result is assigned back to the column instead of calling
# `nap_tracker['NapType'].replace(..., inplace=True)`: an in-place method on a column
# selection is chained assignment, which warns in recent pandas and does not update the
# parent frame at all once Copy-on-Write is enabled. Values not listed here are left unchanged.
nap_tracker['NapType'] = nap_tracker['NapType'].replace({
    'NapTrackerTransactionTypeNewNap': 'New nap',
    'NapTrackerTransactionTypeDeleteNap': 'Deleted nap',
    'NapTrackerTransactionTypeChangeNap': 'Changed nap',
})

### Remove duplicated data

In [4]:
# Row count before de-duplication (echoed as this cell's output).
nap_tracker.shape[0]
# Remove rows that are identical across all remaining columns; the first occurrence is kept.
nap_tracker = nap_tracker.drop_duplicates()

3775

In [5]:
# Row count after de-duplication.
nap_tracker.shape[0]

3402

### Filter based on age

In [6]:
# Tab-separated list of participants to exclude on age grounds, fetched from Synapse.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")
# Keep only rows whose participantId does not appear in that list.
nap_tracker = nap_tracker.loc[
    ~nap_tracker['participantId'].isin(underage_participants['participantId'])
]

### Remove test accounts

In [7]:
# Spreadsheet of test accounts, fetched from Synapse.
test_accounts = pd.read_excel(syn.get('syn21958537').path)

# Keep only rows whose participantId is not a test account.
nap_tracker = nap_tracker.loc[
    ~nap_tracker['participantId'].isin(test_accounts['participantId'])
]

### Replace white space with NaN

In [8]:
# Any cell that is empty or contains only whitespace becomes NaN.
nap_tracker = nap_tracker.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Clean Nap Tracker Data

In [9]:
# Summary statistics for nap duration.
nap_tracker['NapDuration'].describe()

count      3358.000000
mean       5951.550962
std       17278.417256
min           0.000000
25%         900.000000
50%        3054.500000
75%        6900.000000
max      539178.000000
Name: NapDuration, dtype: float64

In [10]:
# Inspect the tails of the duration distribution to judge where outliers begin.
nap_tracker['NapDuration'].quantile(
    q=[0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.9, 0.95, 0.97, 0.99, 0.999, 1]
)

0.010         0.000000
0.050         0.000000
0.500      3054.500000
0.600      3911.234551
0.800      7379.425659
0.850      8674.150000
0.900     11100.000000
0.950     22524.158566
0.970     26925.210000
0.990     35556.769062
0.999    245761.791000
1.000    539178.000000
Name: NapDuration, dtype: float64

In [11]:
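# NOTE: this exclusion is currently disabled -- the two statements below are commented out,
# so zero-length and extreme nap durations are kept in the data.
# Intended rule: flag naps equal to zero or longer than 35050.90 seconds (stated as the 99th percentile).
# The quantile output above reports the 99th percentile as 35556.77 s; confirm the threshold before re-enabling.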
#nap_tracker.NapDuration[ nap_tracker.NapDuration == 0 ] = 'LOW DATA QUALITY'
#nap_tracker.NapDuration[ nap_tracker.NapDuration > 35050.90 ] = 'LOW DATA QUALITY'

In [12]:
# Frequency of each distinct nap duration, most common first.
nap_tracker['NapDuration'].value_counts()

0.000000         615
3600.000000      177
1800.000000      158
7200.000000      155
5400.000000      148
2700.000000      125
7500.000000      101
3900.000000      100
1200.000000       91
900.000000        91
2400.000000       74
4500.000000       66
10800.000000      55
6300.000000       55
1500.000000       54
2100.000000       53
600.000000        52
3000.000000       48
9000.000000       47
300.000000        41
5700.000000       35
8100.000000       28
3300.000000       27
11100.000000      26
4800.000000       25
5100.000000       25
14400.000000      24
4200.000000       23
6900.000000       17
6000.000000       17
                ... 
1485.227500        1
3682.434882        1
20874.000000       1
31565.000000       1
3105.663550        1
4476.679595        1
27134.000000       1
22350.000000       1
1775.000000        1
27534.074856       1
22498.000000       1
5779.607574        1
17229.000000       1
3025.843470        1
27599.000000       1
8540.000000        1
724.000000   

### Create Internal & External Copies of the Data

In [13]:
# Load the sharing-scope spreadsheet from Synapse.
sharing_info = pd.read_excel(syn.get('syn21958546').path)
# Keep only the rows whose sharing value is 'all_qualified_researchers'.
healthCodes_with_broadsharing = sharing_info.loc[
    sharing_info['sharing'] == 'all_qualified_researchers'
]

# The external copy contains only rows from participants in that broad-sharing subset.
# Note the id column is named 'participant id' in the sharing sheet but 'participantId' here.
EXTERNAL_NAP_TRACKER_DATA = nap_tracker.loc[
    nap_tracker['participantId'].isin(healthCodes_with_broadsharing['participant id'])
]
EXTERNAL_NAP_TRACKER_DATA.shape

(2093, 5)

### Number of unique participants

In [14]:
# Number of distinct participant ids in the external copy.
len(EXTERNAL_NAP_TRACKER_DATA['participantId'].unique())

592

### Total number of observations

In [15]:
# Number of rows (observations) in the external copy.
EXTERNAL_NAP_TRACKER_DATA.shape[0]

2093

### External - Upload to Synapse

In [16]:
# Same unique-participant count as the "Number of unique participants" cell; it is echoed as
# the first output of this cell. NOTE(review): looks like a leftover check -- confirm before removing.
len(EXTERNAL_NAP_TRACKER_DATA.participantId.unique())
# Synapse ID of the parent under which the external table is created.
SH_EXTERNAL_PROJECT = 'syn18492837'
# Table schema named 'Nap Tracker', with columns built from the external DataFrame.
table_schema_external = Schema(name='Nap Tracker',
                               columns=as_table_columns(EXTERNAL_NAP_TRACKER_DATA),
                               parent=SH_EXTERNAL_PROJECT)
# Store the external rows as a Synapse table; the returned object is reused when setting provenance.
nap_tracker_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_NAP_TRACKER_DATA))
# Echo the stored table object.
nap_tracker_synTable_external

592

<synapseclient.table.CsvFileTable at 0x114eb4050>

### Internal - Upload to Synapse

In [None]:
# Synapse ID of the parent under which the internal table is created.
SH_INTERNAL_PROJECT = 'syn7066726'
# Table schema named 'Nap Tracker Internal', with columns built from the full cleaned DataFrame
# (i.e. not restricted to the broad-sharing subset used for the external copy).
table_schema_internal = Schema(name='Nap Tracker Internal',
                               columns=as_table_columns(nap_tracker), 
                               parent=SH_INTERNAL_PROJECT)
# Store the internal rows as a Synapse table; the returned object is reused when setting provenance.
nap_tracker_synTable_internal = syn.store(Table(table_schema_internal,nap_tracker))

### Set Provenance

In [None]:
# Provenance record describing how the uploaded tables were produced: the raw input (`used`)
# and the notebook that did the processing (`executed`).
# NOTE(review): only the raw Nap Tracker file is listed under `used`; the underage-participant,
# test-account and sharing-scope files fetched earlier in this notebook are not recorded --
# confirm whether that omission is intended.
activity=Activity(name= 'Nap Tracker data curation', 
                  description='Process and convert raw data to table format', 
                  used=nap_tracker_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Nap_Tracker.ipynb')
# Internal table: requires the internal upload cell to have been run first.
syn.setProvenance(nap_tracker_synTable_internal, activity)

# External table: the same activity is attached.
syn.setProvenance(nap_tracker_synTable_external, activity)