#  AM Check-in Data Prep

In [38]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

InteractiveShell.ast_node_interactivity = 'all'
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw AM Check-in Data

In [39]:
amcheckin_raw_id = 'syn7115672'
amcheckin = pd.read_csv(syn.get(amcheckin_raw_id).path)

Unnamed: 0,participantId,AMCH-1,AMCH-2,AMCH-2a,AMCH-3,AMCH-3a,AMCH-4,AMCH-5,timestamp
0,2ff2a147-32ab-433d-aeb6-c672a8e792d9,,,,,,,,2016-03-04T21:59:40+05:30
1,2ff2a147-32ab-433d-aeb6-c672a8e792d9,,False,,,10860.0,2016-03-08T22:45:06+05:30,7260.0,2016-03-08T22:45:14+05:30
2,2ff2a147-32ab-433d-aeb6-c672a8e792d9,,,,,,,,2016-03-12T22:47:54+05:30
3,482657e9-1bd3-4e8c-b29a-c0b8d8d7282a,2016-09-26T07:15:42-04:00,True,3600.0,3.0,14400.0,2016-09-26T07:15:13-04:00,14400.0,2016-09-26T08:42:33-04:00
4,10313862-8ee9-4155-8ffd-c93b3d902de5,2016-03-02T21:00:39-06:00,False,,,120.0,2016-03-02T05:09:07-06:00,,2016-03-02T11:19:22-06:00


### Process AM Check-in Data

In [40]:
#convert seconds to minutes    
amcheckin['AMCH-2a'] = amcheckin['AMCH-2a']/60
amcheckin['AMCH-3a'] = amcheckin['AMCH-3a']/60
amcheckin['AMCH-5'] = amcheckin['AMCH-5']/60

#renaming these so they are consistent with PM Check-in
amcheckin = amcheckin.rename(columns = 
                             { 'AMCH-1':'AMCH1',
                              'AMCH-2':'AMCH2',
                              'AMCH-2a':'AMCH2A',
                              'AMCH-3':'AMCH3',
                              'AMCH-3a':'AMCH3A',
                              'AMCH-4':'AMCH4',
                              'AMCH-5':'AMCH5'})

### Filter based on age

In [None]:
underage_participants = pd.read_csv( syn.get('syn21905452').path, sep="\t")
amcheckin = amcheckin[~amcheckin.participantId.isin(underage_participants.participantId)]

### Replace white space with NaN

In [42]:
amcheckin = amcheckin.replace(r'^\s*$', np.nan, regex=True)

### Clean  AM-Checkin Data

### AMCH2A Cleaning (Sleep Onset Latency)

In [45]:
amcheckin.AMCH2A.describe()

count    14778.000000
mean        67.606374
std         87.412342
min          1.000000
25%         25.000000
50%         45.000000
75%         68.000000
max       1439.000000
Name: AMCH2A, dtype: float64

In [44]:
amcheckin.AMCH2A.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100      15.00
0.500      45.00
0.600      60.00
0.800      90.00
0.850     120.00
0.900     121.00
0.950     200.15
0.970     245.00
0.990     420.00
0.999    1332.15
Name: AMCH2A, dtype: float64

In [46]:
#exclude data above 99th percentile, which in this case is 420 minutes
amcheckin.AMCH2A[ amcheckin.AMCH2A > 420 ] = 'DATA QUALITY ISSUE'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [47]:
amcheckin.AMCH2A.value_counts()

30.0                  2136
60.0                  1923
20.0                  1171
61.0                   990
15.0                   883
45.0                   752
120.0                  744
10.0                   507
90.0                   503
121.0                  471
40.0                   361
1.0                    353
25.0                   345
180.0                  264
181.0                  196
35.0                   172
240.0                  140
DATA QUALITY ISSUE     125
75.0                   122
150.0                  120
5.0                    111
50.0                    95
241.0                   76
80.0                    69
300.0                   67
210.0                   66
105.0                   60
16.0                    58
70.0                    52
17.0                    51
                      ... 
97.0                     1
99.0                     1
102.0                    1
109.0                    1
113.0                    1
117.0                    1
1

### AMCH 3 Cleaning (# of wakes during the night)

In [51]:
amcheckin.AMCH3.describe()

count      73452.000000
mean          17.000463
std         3708.164615
min            0.000000
25%            0.000000
50%            1.000000
75%            3.000000
max      1000000.000000
Name: AMCH3, dtype: float64

In [50]:
amcheckin.AMCH3.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100     0.0
0.500     1.0
0.600     2.0
0.800     3.0
0.850     3.0
0.900     4.0
0.950     5.0
0.970     7.0
0.990    10.0
0.999    25.0
Name: AMCH3, dtype: float64

In [52]:
#exclude anything above 99th percentile, which in this case is 10 wakes
amcheckin.AMCH3[ amcheckin.AMCH3 > 10 ] = 'DATA QUALITY ISSUE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [53]:
amcheckin.AMCH3.value_counts()

1.0                   19445
0.0                   19068
2.0                   15147
3.0                    9407
4.0                    4108
5.0                    2766
6.0                    1153
DATA QUALITY ISSUE      653
8.0                     574
7.0                     482
10.0                    446
9.0                     203
Name: AMCH3, dtype: int64

### AMCH3A Data Cleaning (Total Time Awake Overnight)

In [57]:
amcheckin.AMCH3A.describe()

count    54551.000000
mean        43.507103
std         98.828155
min          1.000000
25%          5.000000
50%         15.000000
75%         45.000000
max       1439.000000
Name: AMCH3A, dtype: float64

In [56]:
amcheckin.AMCH3A.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100       2.0
0.500      15.0
0.600      20.0
0.800      60.0
0.850      61.0
0.900      90.0
0.950     180.0
0.970     241.0
0.990     450.0
0.999    1201.0
Name: AMCH3A, dtype: float64

In [58]:
#exclude anything above 99th percentile, which in this case is 450 minutes
amcheckin.AMCH3A[ amcheckin.AMCH3A > 450 ] = 'DATA QUALITY ISSUE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [59]:
amcheckin.AMCH3A.value_counts()

5.0                   6322
10.0                  5502
1.0                   4419
15.0                  4036
30.0                  3712
20.0                  2608
60.0                  2504
2.0                   2123
3.0                   2042
61.0                  1403
4.0                   1374
6.0                   1153
45.0                  1127
120.0                  988
90.0                   826
8.0                    676
121.0                  643
25.0                   640
7.0                    617
40.0                   536
DATA QUALITY ISSUE     529
12.0                   438
180.0                  405
9.0                    348
35.0                   344
181.0                  300
75.0                   295
11.0                   282
150.0                  267
14.0                   238
                      ... 
272.0                    1
265.0                    1
359.0                    1
263.0                    1
256.0                    1
224.0                    1
2

### AMCH5 Data Cleaning (Total Hours Slept)

In [63]:
amcheckin.AMCH5.describe()

count    54693.000000
mean       395.127530
std        126.614827
min          1.000000
25%        360.000000
50%        420.000000
75%        480.000000
max       1439.000000
Name: AMCH5, dtype: float64

In [62]:
amcheckin.AMCH5.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.100    262.00
0.500    420.00
0.600    430.00
0.800    481.00
0.850    495.00
0.900    523.00
0.950    546.00
0.970    584.00
0.990    635.00
0.999    791.54
Name: AMCH5, dtype: float64

In [64]:
#exclude anything above 99th percentile, which in this case is 635 minutes
amcheckin.AMCH5[ amcheckin.AMCH5 > 635 ] = 'DATA QUALITY ISSUE'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [65]:
amcheckin.AMCH5.value_counts()

420.0                 3555
480.0                 3230
360.0                 2689
421.0                 2594
1.0                   2254
450.0                 2238
361.0                 1993
390.0                 1990
481.0                 1840
300.0                 1501
540.0                 1240
510.0                 1138
330.0                 1114
301.0                 1072
435.0                  677
541.0                  661
405.0                  636
465.0                  587
240.0                  572
375.0                  553
DATA QUALITY ISSUE     544
270.0                  534
241.0                  486
495.0                  418
440.0                  410
600.0                  391
430.0                  357
380.0                  339
345.0                  336
570.0                  328
                      ... 
52.0                     1
124.0                    1
129.0                    1
132.0                    1
119.0                    1
133.0                    1
1

### Create Internal & External Copies of the Data

In [67]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21557215').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_AMCHECKIN_DATA = amcheckin[amcheckin.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_AMCHECKIN_DATA.shape

(49270, 9)

### External - Upload to Synapse

In [68]:
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(name='AM Check-in',
                               columns=as_table_columns(EXTERNAL_AMCHECKIN_DATA),
                               parent=SH_EXTERNAL_PROJECT)
amcheckin_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_AMCHECKIN_DATA))
amcheckin_synTable_external

5253

<synapseclient.table.CsvFileTable at 0x110802110>

### Internal - Upload to Synapse

In [69]:
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(name='AM Check-in Internal',
                               columns=as_table_columns(amcheckin), 
                               parent=SH_INTERNAL_PROJECT)
amcheckin_synTable_internal = syn.store(Table(table_schema_internal,amcheckin))

### Set Provenance

In [71]:
activity=Activity(name= 'AM Check-in data curation', 
                  description='Process and convert raw data to table format', 
                  used=amcheckin_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_AM_Checkin.ipynb')
#Internal 
syn.setProvenance(amcheckin_synTable_internal, activity)

#External
syn.setProvenance(amcheckin_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-08T18:22:14.799Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'e571ae60-087b-48ae-99ac-0552f1489952',
 u'id': u'10232190',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-08T18:22:14.799Z',
 u'name': u'AM Check-in data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_AM_Checkin.ipynb',
   u'url': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_AM_Checkin.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115672', u'targetVersionNumber': 928},
   u'wasExecuted': False}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-08T18:23:01.922Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'9792ba4f-936c-4d34-8ac1-dd0c863ea345',
 u'id': u'10232191',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-08T18:23:01.922Z',
 u'name': u'AM Check-in data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_AM_Checkin.ipynb',
   u'url': u'https://github.com/deerings/SLEEPHEALTHv2-Data-Release/Create_AM_Checkin.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115672', u'targetVersionNumber': 928},
   u'wasExecuted': False}]}