# Onboarding Demographics Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Show the value of every top-level expression in a cell, not just the last one
# (several cells in this notebook rely on this to display more than one result).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client used by every download/upload in this notebook and log in.
# NOTE(review): no credentials are passed here, so login presumably relies on locally
# configured/cached credentials -- confirm before running on a new machine.
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw Demographics Data

In [2]:
# Synapse ID of the raw onboarding demographics file; reused later for provenance.
demog_raw_id = 'syn7115683'

# Download the raw file from Synapse and load it into a DataFrame.
demog = pd.read_csv(syn.get(demog_raw_id).path)
# Preview the first rows of the raw data (shown before any column is dropped).
demog.head(4)

# Two clean-up steps, applied in order:
#   1. drop the unneeded wake-up / go-to-sleep columns (missing data due to an App bug);
#   2. turn empty or whitespace-only cells into NaN.
demog = (
    demog
    .drop(['patientWakeUpTime', 'patientGoSleepTime'], axis=1)
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
)

Unnamed: 0,participantId,patientWeightPounds,patientBiologicalSex,patientHeightInches,patientWakeUpTime,patientCurrentAge,patientGoSleepTime,timestamp
0,6c9018a1-6e62-477e-a5b5-36145bf4ea51,170.0,Male,71.0,,40.0,,2016-02-27T22:04:59-05:00
1,e1d8427f-924b-4bd1-a08f-421c1805a405,177.0,Male,68.0,,39.0,,2016-03-10T10:24:26-06:00
2,3d6c0442-9150-4974-8257-3a94461c8790,190.0,Male,75.0,2016-07-21T08:00:00-07:00,27.0,2016-07-21T22:30:00-07:00,2016-07-21T14:44:06-07:00
3,a12e2ca0-2cf4-498f-a51d-1644630511ec,177.0,Female,64.0,2016-07-21T08:00:00-04:00,19.0,2016-07-21T00:00:00-04:00,2016-07-21T20:29:50-04:00


### Process Demographics Data

In [3]:
# Map the raw app column names onto the shorter names used in the released tables.
demog = demog.rename(
    {
        'patientWeightPounds': 'weight_pounds',
        'patientBiologicalSex': 'gender',
        'patientHeightInches': 'height_inches',
        'patientCurrentAge': 'age_years',
    },
    axis='columns',
)

# Keep at most two decimal places for the self-reported height and weight.
demog['height_inches'] = demog['height_inches'].round(2)
demog['weight_pounds'] = demog['weight_pounds'].round(2)

### Filter based on Age 

In [4]:
# Every record whose reported age is below 18.
# NOTE(review): rows with a missing age do not satisfy `< 18`, so they are not
# selected here and stay in the data -- confirm that this is intended.
underage_participants = demog.loc[demog['age_years'].lt(18)]

# Save the exclusion list as a TSV file and upload it to Synapse with provenance
# pointing at this notebook and the raw demographics file.
underage_participants.to_csv(
    'SleepHealth_underage_participants_tobeExcluded.tsv',
    sep="\t",
    index=False,
)
syn.store(
    synapseclient.File(
        "SleepHealth_underage_participants_tobeExcluded.tsv",
        parentId='syn7066726',
    ),
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
    used="syn7115683",
)

# Remove *all* records of any participant that appears in the exclusion list,
# not only the individual rows with age < 18.
demog = demog.loc[~demog['participantId'].isin(underage_participants['participantId'])]


##################################################
 Uploading file to Synapse storage 
##################################################



File(versionNumber=26, createdOn=u'2020-04-09T22:44:11.245Z', concreteType=u'org.sagebionetworks.repo.model.FileEntity', modifiedBy=u'3334346', cacheDir='', files=['SleepHealth_underage_participants_tobeExcluded.tsv'], versionLabel=u'26', dataFileHandleId=u'52068772', _file_handle={u'contentType': u'text/tab-separated-values', u'createdBy': u'3334346', u'createdOn': u'2020-04-10T16:05:42.000Z', u'contentMd5': u'229035449a1cd9c67288f94f2e71b821', u'isPreview': False, u'fileName': u'SleepHealth_underage_participants_tobeExcluded.tsv', u'etag': u'1f268792-39b4-4e7a-91e8-b56db8559b53', u'bucketName': u'proddata.sagebase.org', u'key': u'3334346/6ef9f0b7-07e3-438f-a8a6-b458a99e02a8/SleepHealth_underage_participants_tobeExcluded.tsv', u'id': u'52068772', u'previewId': u'52068773', u'externalURL': None, u'storageLocationId': 1, u'concreteType': u'org.sagebionetworks.repo.model.file.S3FileHandle', u'contentSize': 22014}, path='SleepHealth_underage_participants_tobeExcluded.tsv', synapseStore=Tr

### Remove test accounts

In [5]:
# Spreadsheet of test accounts, downloaded from Synapse.
test_accounts = pd.read_excel(syn.get('syn21958537').path)

# Keep only the records whose participantId is not listed as a test account.
demog = demog.loc[~demog['participantId'].isin(test_accounts['participantId'])]

### Clean height_inches

In [7]:
# Inspect the distribution of reported heights before applying plausibility cut-offs.
demog.height_inches.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
demog.height_inches.describe()

# Heights below 50 in or above 90 in are replaced by the 'LOW DATA QUALITY' sentinel:
# such values are unlikely according to normative data, and masking them also
# protects participants with unique physical features. Missing heights are left as NaN.
#
# The column is cast to object first so that numbers and the string sentinel can
# coexist, and the whole column is assigned back in one step. Chained indexing
# (`demog.height_inches[mask] = ...`) writes through an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to update `demog` at all.
demog = demog.assign(
    height_inches=demog.height_inches.astype(object).mask(
        (demog.height_inches < 50) | (demog.height_inches > 90),
        'LOW DATA QUALITY',
    )
)

0.010      0.0
0.050     60.0
0.500     69.0
0.600     70.0
0.800     72.0
0.850     73.0
0.900     73.0
0.950     75.0
0.970     75.0
0.990     77.0
0.999     83.0
1.000    107.0
Name: height_inches, dtype: float64

count    13921.000000
mean        65.764433
std         14.392243
min          0.000000
25%         66.000000
50%         69.000000
75%         71.000000
max        107.000000
Name: height_inches, dtype: float64

### Clean weight_pounds

In [8]:
# Inspect the distribution of reported weights before applying plausibility cut-offs.
demog.weight_pounds.quantile([.01, .05,  .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
demog.weight_pounds.describe()

# Weights below 80 lb or above 869 lb are replaced by the 'LOW DATA QUALITY' sentinel
# (cut-offs chosen from normative weight data). Missing weights are left as NaN.
# NOTE(review): the lower bound applied here is 80 lb, although this step has also
# been described as using 90 lb -- confirm which threshold is intended.
#
# The column is cast to object first so that numbers and the string sentinel can
# coexist, and the whole column is assigned back in one step. Chained indexing
# (`demog.weight_pounds[mask] = ...`) writes through an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to update `demog` at all.
demog = demog.assign(
    weight_pounds=demog.weight_pounds.astype(object).mask(
        (demog.weight_pounds < 80) | (demog.weight_pounds > 869),
        'LOW DATA QUALITY',
    )
)

0.010    0.000000e+00
0.050    0.000000e+00
0.500    1.800000e+02
0.600    1.940000e+02
0.800    2.260000e+02
0.850    2.400000e+02
0.900    2.580000e+02
0.950    2.850000e+02
0.970    3.080000e+02
0.990    3.540000e+02
0.999    5.090800e+02
1.000    1.877018e+12
Name: weight_pounds, dtype: float64

count    1.392100e+04
mean     1.348338e+08
std      1.590865e+10
min      0.000000e+00
25%      1.500000e+02
50%      1.800000e+02
75%      2.160000e+02
max      1.877018e+12
Name: weight_pounds, dtype: float64

### Create Internal & External Copies of the Data

In [9]:
# Download the spreadsheet holding each participant's data-sharing setting.
sharing_info = pd.read_excel(syn.get('syn21958546').path)

# Participants who agreed to share with all qualified researchers.
healthCodes_with_broadsharing = sharing_info.loc[
    sharing_info['sharing'].eq('all_qualified_researchers')
]

# The external copy contains only records from those broad-sharing participants
# (note that the sharing sheet names its key column 'participant id', with a space).
EXTERNAL_DEMO_DATA = demog.loc[
    demog['participantId'].isin(healthCodes_with_broadsharing['participant id'])
]
EXTERNAL_DEMO_DATA.shape

(8130, 6)

### Number of unique participants

In [10]:
# Count distinct participant IDs in the external data set.
len(EXTERNAL_DEMO_DATA['participantId'].drop_duplicates())

7250

### Total number of observations

In [11]:
# Number of rows (observations) in the external data set.
EXTERNAL_DEMO_DATA.shape[0]

8130

### External - Upload to Synapse

In [12]:
# Synapse project that receives the externally shareable table.
SH_EXTERNAL_PROJECT = 'syn18492837'

# Derive the table columns from the DataFrame and describe the new table.
table_schema_external = Schema(
    name='Onboarding Demographics',
    parent=SH_EXTERNAL_PROJECT,
    columns=as_table_columns(EXTERNAL_DEMO_DATA),
)

# Upload the external records as a Synapse table and display the returned object.
demo_synTable_external = syn.store(
    Table(table_schema_external, EXTERNAL_DEMO_DATA)
)
demo_synTable_external

<synapseclient.table.CsvFileTable at 0x10a62be50>

### Internal - Upload to Synapse

In [13]:
# Synapse project that receives the internal table.
SH_INTERNAL_PROJECT = 'syn7066726'

# Derive the table columns from the full cleaned DataFrame and describe the new table.
table_schema_internal = Schema(
    name='Participant Demographics Internal',
    parent=SH_INTERNAL_PROJECT,
    columns=as_table_columns(demog),
)

# Upload all cleaned records (not only the broad-sharing subset) as a Synapse table.
demo_synTable_internal = syn.store(
    Table(table_schema_internal, demog)
)

### Set Provenance

In [14]:
# Provenance record shared by both uploaded tables: the raw demographics file is
# the input that was used, and this notebook is the code that was executed.
activity = Activity(
    name='Onboarding Demographics data curation',
    description='Process and convert raw data to table format',
    used=demog_raw_id,
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
)

# Attach the provenance to the internal table ...
syn.setProvenance(demo_synTable_internal, activity)

# ... and to the external table. Both calls stay as separate top-level
# expressions so that each returned record is displayed.
syn.setProvenance(demo_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-22T17:36:07.265Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'27db587d-9ea9-4ec5-9a34-d35a3683b7ab',
 u'id': u'10233917',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-22T17:36:07.265Z',
 u'name': u'Onboarding Demographics data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115683', u'targetVersionNumber': 913},
   u'wasExecuted': False}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-22T17:36:13.049Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'761702b2-b075-4e58-86ab-555f1910515b',
 u'id': u'10233918',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-22T17:36:13.049Z',
 u'name': u'Onboarding Demographics data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115683', u'targetVersionNumber': 913},
   u'wasExecuted': False}]}