#  AM Check-in Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client used by every later cell to fetch and store data.
syn = synapseclient.Synapse()
# login() is called without arguments, so no credentials are passed from this notebook.
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw AM Check-in Data

In [2]:
# Synapse ID of the raw AM Check-in file; reused later as the provenance input.
amcheckin_raw_id = 'syn7115672'
# Fetch the entity from Synapse and parse its local file path as CSV.
amcheckin = pd.read_csv(
    syn.get(amcheckin_raw_id).path
)

### Process AM Check-in Data

In [3]:
# The three duration questions are stored in seconds; express them in minutes.
amcheckin['AMCH-2a'] = amcheckin['AMCH-2a'].div(60)
amcheckin['AMCH-3a'] = amcheckin['AMCH-3a'].div(60)
amcheckin['AMCH-5'] = amcheckin['AMCH-5'].div(60)

# Rename the question columns so they are consistent with PM Check-in
# (hyphen removed, suffix letter upper-cased).
amcheckin = amcheckin.rename(columns={
    'AMCH-1': 'AMCH1',
    'AMCH-2': 'AMCH2',
    'AMCH-2a': 'AMCH2A',
    'AMCH-3': 'AMCH3',
    'AMCH-3a': 'AMCH3A',
    'AMCH-4': 'AMCH4',
    'AMCH-5': 'AMCH5',
})

### Filter based on age

In [4]:
# Tab-separated list of participants to exclude on the basis of age.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep='\t')
# Keep only rows whose participantId does NOT appear in that list.
amcheckin = amcheckin.loc[
    ~amcheckin['participantId'].isin(underage_participants['participantId'])
]

### Remove test accounts

In [5]:
# Spreadsheet listing the participantIds of test accounts.
test_accounts = pd.read_excel(syn.get('syn21958537').path)

# Keep only rows whose participantId is NOT a test account.
amcheckin = amcheckin.loc[
    ~amcheckin['participantId'].isin(test_accounts['participantId'])
]

### Replace Empty or Whitespace-Only Values with NaN

In [6]:
# Any cell that is empty or contains only whitespace becomes NaN.
amcheckin = amcheckin.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Clean AM Check-in Data

### AMCH2A Cleaning (Sleep Onset Latency)

In [7]:
# Summary statistics for AMCH2A (minutes) before any outlier flagging.
amcheckin['AMCH2A'].describe()

count    14544.000000
mean        67.704277
std         87.526330
min          1.000000
25%         25.000000
50%         45.000000
75%         70.000000
max       1439.000000
Name: AMCH2A, dtype: float64

In [8]:
# Inspect the tails of AMCH2A; the 0.99 row supplies the cutoff used in the next cell.
amcheckin['AMCH2A'].quantile(
    [0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.90, 0.95, 0.97, 0.99, 0.999, 1]
)

0.010       1.00
0.050      10.00
0.500      45.00
0.600      60.00
0.800      90.00
0.850     120.00
0.900     121.00
0.950     200.00
0.970     244.42
0.990     409.13
0.999    1343.85
1.000    1439.00
Name: AMCH2A, dtype: float64

In [9]:
# Flag (rather than drop) AMCH2A values above the 99th percentile, which the
# quantile check above reports as 409.13 minutes. Flagged rows are kept; only
# the AMCH2A value is replaced by the string marker 'LOW DATA QUALITY'.
AMCH2A_MAX_MINUTES = 409.13
# Coerce to numeric before comparing so the cell can be re-run safely: once the
# column already contains the string marker, a plain `> 409.13` would compare
# str with float. Non-numeric entries become NaN and are never selected.
amch2a_too_high = pd.to_numeric(amcheckin['AMCH2A'], errors='coerce') > AMCH2A_MAX_MINUTES
if amch2a_too_high.any():
    # The column will hold both floats and a string, so make it object dtype
    # explicitly instead of relying on an implicit upcast during assignment.
    amcheckin['AMCH2A'] = amcheckin['AMCH2A'].astype(object)
    # A single .loc assignment writes into the frame itself; the previous
    # chained form (amcheckin.AMCH2A[mask] = ...) raised SettingWithCopyWarning
    # and is not guaranteed to modify the original frame.
    amcheckin.loc[amch2a_too_high, 'AMCH2A'] = 'LOW DATA QUALITY'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [10]:
# Frequency of each AMCH2A value, including the 'LOW DATA QUALITY' marker.
amcheckin['AMCH2A'].value_counts()

30.0                2107
60.0                1898
20.0                1147
61.0                 981
15.0                 868
45.0                 741
120.0                735
90.0                 502
10.0                 497
121.0                466
40.0                 358
1.0                  345
25.0                 334
180.0                259
181.0                192
35.0                 167
LOW DATA QUALITY     146
240.0                137
75.0                 120
150.0                120
5.0                  110
50.0                  92
241.0                 76
80.0                  69
300.0                 65
210.0                 64
105.0                 60
16.0                  58
17.0                  50
12.0                  49
                    ... 
290.0                  1
298.0                  1
299.0                  1
178.0                  1
166.0                  1
164.0                  1
133.0                  1
99.0                   1
302.0                  1


### AMCH3 Cleaning (Number of Wakes During the Night)

In [11]:
# Summary statistics for AMCH3 before any outlier flagging.
amcheckin['AMCH3'].describe()

count      72609.000000
mean          17.182250
std         3729.628531
min            0.000000
25%            0.000000
50%            1.000000
75%            3.000000
max      1000000.000000
Name: AMCH3, dtype: float64

In [12]:
# Inspect the tails of AMCH3; the 0.99 row supplies the cutoff used in the next cell.
amcheckin['AMCH3'].quantile(
    [0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.90, 0.95, 0.97, 0.99, 0.999, 1]
)

0.010          0.0
0.050          0.0
0.500          1.0
0.600          2.0
0.800          3.0
0.850          3.0
0.900          4.0
0.950          5.0
0.970          7.0
0.990         10.0
0.999         25.0
1.000    1000000.0
Name: AMCH3, dtype: float64

In [13]:
# Flag (rather than drop) AMCH3 values above the 99th percentile, which the
# quantile check above reports as 10 wakes. Flagged rows are kept; only the
# AMCH3 value is replaced by the string marker 'LOW DATA QUALITY'.
AMCH3_MAX_WAKES = 10
# Coerce to numeric before comparing so the cell can be re-run safely: once the
# column already contains the string marker, a plain `> 10` would compare str
# with a number. Non-numeric entries become NaN and are never selected.
amch3_too_high = pd.to_numeric(amcheckin['AMCH3'], errors='coerce') > AMCH3_MAX_WAKES
if amch3_too_high.any():
    # The column will hold both numbers and a string, so make it object dtype
    # explicitly instead of relying on an implicit upcast during assignment.
    amcheckin['AMCH3'] = amcheckin['AMCH3'].astype(object)
    # A single .loc assignment writes into the frame itself; the previous
    # chained form (amcheckin.AMCH3[mask] = ...) raised SettingWithCopyWarning
    # and is not guaranteed to modify the original frame.
    amcheckin.loc[amch3_too_high, 'AMCH3'] = 'LOW DATA QUALITY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [14]:
# Frequency of each AMCH3 value, including the 'LOW DATA QUALITY' marker.
amcheckin['AMCH3'].value_counts()

1.0                 19230
0.0                 18713
2.0                 15007
3.0                  9328
4.0                  4088
5.0                  2749
6.0                  1149
LOW DATA QUALITY      647
8.0                   572
7.0                   481
10.0                  443
9.0                   202
Name: AMCH3, dtype: int64

### AMCH3A Data Cleaning (Total Time Awake Overnight)

In [15]:
# Summary statistics for AMCH3A (minutes) before any outlier flagging.
amcheckin['AMCH3A'].describe()

count    54065.000000
mean        43.547785
std         98.876173
min          1.000000
25%          5.000000
50%         15.000000
75%         45.000000
max       1439.000000
Name: AMCH3A, dtype: float64

In [16]:
# Inspect the tails of AMCH3A; the 0.99 row supplies the cutoff used in the next cell.
amcheckin['AMCH3A'].quantile(
    [0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.90, 0.95, 0.97, 0.99, 0.999, 1]
)

0.010       1.0
0.050       1.0
0.500      15.0
0.600      20.0
0.800      60.0
0.850      61.0
0.900      91.6
0.950     180.0
0.970     241.0
0.990     450.0
0.999    1201.0
1.000    1439.0
Name: AMCH3A, dtype: float64

In [17]:
# Flag (rather than drop) AMCH3A values above the 99th percentile, which the
# quantile check above reports as 450 minutes. Flagged rows are kept; only the
# AMCH3A value is replaced by the string marker 'LOW DATA QUALITY'.
AMCH3A_MAX_MINUTES = 450
# Coerce to numeric before comparing so the cell can be re-run safely: once the
# column already contains the string marker, a plain `> 450` would compare str
# with a number. Non-numeric entries become NaN and are never selected.
amch3a_too_high = pd.to_numeric(amcheckin['AMCH3A'], errors='coerce') > AMCH3A_MAX_MINUTES
if amch3a_too_high.any():
    # The column will hold both floats and a string, so make it object dtype
    # explicitly instead of relying on an implicit upcast during assignment.
    amcheckin['AMCH3A'] = amcheckin['AMCH3A'].astype(object)
    # A single .loc assignment writes into the frame itself; the previous
    # chained form (amcheckin.AMCH3A[mask] = ...) raised SettingWithCopyWarning
    # and is not guaranteed to modify the original frame.
    amcheckin.loc[amch3a_too_high, 'AMCH3A'] = 'LOW DATA QUALITY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [19]:
# Frequency of each AMCH3A value, including the 'LOW DATA QUALITY' marker.
amcheckin['AMCH3A'].value_counts()

5.0                 6259
10.0                5431
1.0                 4373
15.0                4011
30.0                3686
20.0                2576
60.0                2491
2.0                 2105
3.0                 2024
61.0                1393
4.0                 1364
6.0                 1149
45.0                1116
120.0                988
90.0                 822
8.0                  666
121.0                642
25.0                 627
7.0                  609
40.0                 530
LOW DATA QUALITY     523
12.0                 436
180.0                400
9.0                  344
35.0                 340
181.0                296
75.0                 287
11.0                 280
150.0                267
14.0                 235
                    ... 
274.0                  1
271.0                  1
359.0                  1
265.0                  1
263.0                  1
256.0                  1
224.0                  1
227.0                  1
248.0                  1


### AMCH5 Data Cleaning (Total Time Slept, in Minutes)

In [20]:
# Summary statistics for AMCH5 (minutes) before any outlier flagging.
amcheckin['AMCH5'].describe()

count    54106.000000
mean       395.307230
std        126.233963
min          1.000000
25%        360.000000
50%        420.000000
75%        480.000000
max       1439.000000
Name: AMCH5, dtype: float64

In [21]:
# Inspect the tails of AMCH5; the 0.99 row supplies the cutoff used in the next cell.
amcheckin['AMCH5'].quantile(
    [0.01, 0.05, 0.5, 0.6, 0.8, 0.85, 0.90, 0.95, 0.97, 0.99, 0.999, 1]
)

0.010       1.0
0.050     120.0
0.500     420.0
0.600     430.0
0.800     481.0
0.850     495.0
0.900     522.0
0.950     545.0
0.970     580.0
0.990     630.0
0.999     781.0
1.000    1439.0
Name: AMCH5, dtype: float64

In [22]:
# Flag (rather than drop) AMCH5 values above the 99th percentile, which the
# quantile check above reports as 630 minutes. Flagged rows are kept; only the
# AMCH5 value is replaced by the string marker 'LOW DATA QUALITY'.
AMCH5_MAX_MINUTES = 630
# Coerce to numeric before comparing so the cell can be re-run safely: once the
# column already contains the string marker, a plain `> 630` would compare str
# with a number. Non-numeric entries become NaN and are never selected.
amch5_too_high = pd.to_numeric(amcheckin['AMCH5'], errors='coerce') > AMCH5_MAX_MINUTES
if amch5_too_high.any():
    # The column will hold both floats and a string, so make it object dtype
    # explicitly instead of relying on an implicit upcast during assignment.
    amcheckin['AMCH5'] = amcheckin['AMCH5'].astype(object)
    # A single .loc assignment writes into the frame itself; the previous
    # chained form (amcheckin.AMCH5[mask] = ...) raised SettingWithCopyWarning
    # and is not guaranteed to modify the original frame.
    amcheckin.loc[amch5_too_high, 'AMCH5'] = 'LOW DATA QUALITY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [23]:
# Frequency of each AMCH5 value, including the 'LOW DATA QUALITY' marker.
amcheckin['AMCH5'].value_counts()

420.0               3531
480.0               3205
360.0               2658
421.0               2575
1.0                 2227
450.0               2224
361.0               1980
390.0               1980
481.0               1816
300.0               1482
540.0               1224
510.0               1130
330.0               1105
301.0               1067
435.0                675
541.0                654
405.0                630
465.0                583
240.0                565
375.0                547
LOW DATA QUALITY     535
270.0                525
241.0                481
495.0                412
440.0                406
600.0                380
430.0                353
380.0                334
345.0                333
570.0                325
                    ... 
124.0                  1
38.0                   1
42.0                   1
102.0                  1
69.0                   1
143.0                  1
98.0                   1
91.0                   1
144.0                  1


### Create Internal & External Copies of the Data

In [24]:
# Download the per-participant sharing settings.
sharing_info = pd.read_excel(syn.get('syn21557215').path)
# Participants whose sharing setting is 'all_qualified_researchers'.
healthCodes_with_broadsharing = sharing_info.loc[
    sharing_info['sharing'] == 'all_qualified_researchers'
]

# The external release contains only rows from those participants; the full
# `amcheckin` frame is left untouched for the internal release.
EXTERNAL_AMCHECKIN_DATA = amcheckin.loc[
    amcheckin['participantId'].isin(healthCodes_with_broadsharing['participant id'])
]
EXTERNAL_AMCHECKIN_DATA.shape

(48753, 9)

### External - Upload to Synapse

In [None]:
# Store the external subset as a Synapse table named 'AM Check-in' under the
# external project; column definitions are derived from the DataFrame.
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(
    name='AM Check-in',
    columns=as_table_columns(EXTERNAL_AMCHECKIN_DATA),
    parent=SH_EXTERNAL_PROJECT,
)
amcheckin_synTable_external = syn.store(
    Table(table_schema_external, EXTERNAL_AMCHECKIN_DATA)
)
# Bare expression so the stored table object is shown as cell output.
amcheckin_synTable_external

### Internal - Upload to Synapse

In [None]:
# Store the full cleaned frame as a Synapse table named 'AM Check-in Internal'
# under the internal project; column definitions are derived from the DataFrame.
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(
    name='AM Check-in Internal',
    columns=as_table_columns(amcheckin),
    parent=SH_INTERNAL_PROJECT,
)
amcheckin_synTable_internal = syn.store(
    Table(table_schema_internal, amcheckin)
)

### Set Provenance

In [None]:
# One Activity record describing how both tables were produced: the raw
# AM Check-in entity as input and this notebook's GitHub URL as the executed code.
activity = Activity(
    name='AM Check-in data curation',
    description='Process and convert raw data to table format',
    used=amcheckin_raw_id,
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_AM_Checkin.ipynb',
)

# Attach the same activity to the internal table first, then the external one.
syn.setProvenance(amcheckin_synTable_internal, activity)
syn.setProvenance(amcheckin_synTable_external, activity)