#  Sleepiness Checker Data Prep

In [18]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

InteractiveShell.ast_node_interactivity = 'all'
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Sleepiness Checker Data

In [19]:
sleepiness_checker_raw_id = 'syn7115682'

sleepiness_checker = pd.read_csv(syn.get(sleepiness_checker_raw_id).path)

### Replace white space with NaN

In [20]:
sleepiness_checker = sleepiness_checker.replace(r'^\s*$', np.nan, regex=True)

### Process Sleepiness Checker Data

In [21]:
#rename columns
sleepiness_checker = sleepiness_checker.rename(columns={
    'AlertnessResultIdentifier':'sc_score',})

### Remove duplicated data

In [22]:
len(sleepiness_checker)
sleepiness_checker = sleepiness_checker.drop_duplicates(keep='first')

76755

In [23]:
len(sleepiness_checker)

75903

### Filter based on age

In [24]:
underage_participants = pd.read_csv( syn.get('syn21905452').path, sep="\t")
sleepiness_checker = sleepiness_checker[~sleepiness_checker.participantId.isin(underage_participants.participantId)]

### Remove test accounts

In [25]:
test_accounts = pd.read_excel(syn.get('syn21958537').path)

sleepiness_checker = sleepiness_checker[~sleepiness_checker.participantId.isin(test_accounts.participantId)]

### Clean SC score data (remove zeroes) - kept this in the code because it is removing legitimately bad data.

In [26]:
#remove zeroes from sc_score column, as 0 was not a possible selection and should not be there.
## If the sc_score value is equal to zero, set to nan.
sleepiness_checker.sc_score[ sleepiness_checker.sc_score == 0 ] = 'nan'
sleepiness_checker.sc_score.value_counts()

4      14021
6      12521
5       9264
7       9041
8       8329
3       8007
2       6969
1       3463
9       2896
nan      421
Name: sc_score, dtype: int64

### Create Internal & External Copies of the Data

In [27]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21958546').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_SLEEPINESS_CHECKER_DATA = sleepiness_checker[sleepiness_checker.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_SLEEPINESS_CHECKER_DATA.shape

(49188, 3)

### Number of unique participants

In [28]:
len(EXTERNAL_SLEEPINESS_CHECKER_DATA.participantId.unique())

4636

### Total number of observations

In [29]:
len(EXTERNAL_SLEEPINESS_CHECKER_DATA)

49188

### External - Upload to Synapse

In [30]:
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(name='Sleepiness Checker',
                               columns=as_table_columns(EXTERNAL_SLEEPINESS_CHECKER_DATA),
                               parent=SH_EXTERNAL_PROJECT)
sleepiness_checker_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_SLEEPINESS_CHECKER_DATA))
sleepiness_checker_synTable_external

<synapseclient.table.CsvFileTable at 0x10c4c1f10>

### Internal - Upload to Synapse

In [31]:
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(name='Sleepiness Checker Internal',
                               columns=as_table_columns(sleepiness_checker), 
                               parent=SH_INTERNAL_PROJECT)
sleepiness_checker_synTable_internal = syn.store(Table(table_schema_internal,sleepiness_checker))

### Set Provenance

In [32]:
activity=Activity(name= 'Sleepiness Checker data curation', 
                  description='Process and convert raw data to table format', 
                  used=sleepiness_checker_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleepiness_Checker.ipynb')
#Internal 
syn.setProvenance(sleepiness_checker_synTable_internal, activity)

#External
syn.setProvenance(sleepiness_checker_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-07-20T19:35:46.697Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'a0f75645-1cc9-479c-8c0d-2030113a1883',
 u'id': u'10282156',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-07-20T19:35:46.697Z',
 u'name': u'Sleepiness Checker data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115682', u'targetVersionNumber': 948},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleepiness_Checker.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleepiness_Checker.ipynb',
   u'wasExecuted': True}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-07-20T19:36:11.962Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'da4f50af-7e64-4f32-9308-a13a498e5c37',
 u'id': u'10282157',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-07-20T19:36:11.962Z',
 u'name': u'Sleepiness Checker data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115682', u'targetVersionNumber': 948},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleepiness_Checker.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Sleepiness_Checker.ipynb',
   u'wasExecuted': True}]}