# Onboarding Demographics Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Show the value of every top-level expression in a cell, not only the last one
# (several cells below rely on this to display more than one result).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and log in. No credentials are passed here;
# presumably they come from a cached session or config file -- confirm.
syn = synapseclient.Synapse()
syn.login()

Welcome, Abhishek Pratap!




UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



### Get Raw Demographics Data

In [2]:
# Synapse ID of the raw onboarding demographics file.
demog_raw_id = 'syn7115683'

# Download the file from Synapse and read it into a DataFrame.
raw_demog_entity = syn.get(demog_raw_id)
demog = pd.read_csv(raw_demog_entity.path)
# Preview the first rows of the download before any cleaning.
demog.head(4)

# Drop unneeded columns (missing data due to an app bug).
unneeded_columns = ['patientWakeUpTime', 'patientGoSleepTime']
demog = demog.drop(unneeded_columns, axis=1)

# Replace empty or whitespace-only cells with NaN.
demog = demog.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

Unnamed: 0,participantId,patientWeightPounds,patientBiologicalSex,patientHeightInches,patientWakeUpTime,patientCurrentAge,patientGoSleepTime,timestamp
0,6c9018a1-6e62-477e-a5b5-36145bf4ea51,170.0,Male,71.0,,40.0,,2016-02-27T22:04:59-05:00
1,e1d8427f-924b-4bd1-a08f-421c1805a405,177.0,Male,68.0,,39.0,,2016-03-10T10:24:26-06:00
2,3d6c0442-9150-4974-8257-3a94461c8790,190.0,Male,75.0,2016-07-21T08:00:00-07:00,27.0,2016-07-21T22:30:00-07:00,2016-07-21T14:44:06-07:00
3,a12e2ca0-2cf4-498f-a51d-1644630511ec,177.0,Female,64.0,2016-07-21T08:00:00-04:00,19.0,2016-07-21T00:00:00-04:00,2016-07-21T20:29:50-04:00


### Process Demographics Data

In [3]:
# Map the raw app field names to the column names used in the released table.
column_renames = {
    'patientWeightPounds': 'weight_pounds',
    'patientBiologicalSex': 'gender',
    'patientHeightInches': 'height_inches',
    'patientCurrentAge': 'age_years',
}
demog = demog.rename(column_renames, axis='columns')

# Drop rows in which every demographic field is missing.
demographic_fields = ['weight_pounds', 'gender', 'height_inches', 'age_years']
demog = demog.dropna(subset=demographic_fields, how='all')

# Round the body measurements to two decimal places.
for measurement in ('height_inches', 'weight_pounds'):
    demog[measurement] = demog[measurement].round(2)

### Filter based on Age 

In [4]:
# Rows whose reported age is below 18 (rows with a missing age compare False and are kept).
is_underage = demog['age_years'] < 18
underage_participants = demog.loc[is_underage]

# Write the excluded rows to a TSV and store that file in Synapse with provenance.
underage_tsv = 'SleepHealth_underage_participants_tobeExcluded.tsv'
underage_participants.to_csv(underage_tsv, sep="\t", index=False)
underage_file = synapseclient.File(underage_tsv, parentId='syn7066726')
syn.store(underage_file,
          executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
          used="syn7115683")

# Remove every row belonging to a participant who has at least one underage row.
excluded_ids = underage_participants['participantId']
demog = demog[~demog['participantId'].isin(excluded_ids)]


##################################################
 Uploading file to Synapse storage 
##################################################



File(id='syn21905452', synapseStore=True, cacheDir='', concreteType='org.sagebionetworks.repo.model.FileEntity', createdOn='2020-04-09T22:44:11.245Z', modifiedBy='2166046', files=['SleepHealth_underage_participants_tobeExcluded.tsv'], modifiedOn='2020-04-09T23:18:35.069Z', name='SleepHealth_underage_participants_tobeExcluded.tsv', parentId='syn7066726', versionNumber=7, dataFileHandleId='52031715', _file_handle={'id': '52031715', 'etag': '0f41206e-a718-4ada-b44d-b717acdf326b', 'createdBy': '2166046', 'createdOn': '2020-04-09T22:44:10.000Z', 'concreteType': 'org.sagebionetworks.repo.model.file.S3FileHandle', 'contentType': 'text/tab-separated-values', 'contentMd5': '229035449a1cd9c67288f94f2e71b821', 'fileName': 'SleepHealth_underage_participants_tobeExcluded.tsv', 'storageLocationId': 1, 'contentSize': 22014, 'bucketName': 'proddata.sagebase.org', 'key': '2166046/2a419cac-3069-4af5-add6-570a5b2e6e16/SleepHealth_underage_participants_tobeExcluded.tsv', 'previewId': '52031716', 'isPrevie

### Clean height_inches

In [5]:
# Inspect the height distribution before flagging implausible values.
demog.height_inches.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
demog.height_inches.describe()

# A height of exactly 0 inches is impossible (0 is also the .01 quantile of this
# sample), so those entries are flagged as low quality rather than kept as numbers.
zero_height = demog['height_inches'] == 0
# The flag is a string, so make the float column object-typed explicitly instead of
# relying on an implicit upcast during assignment.
demog['height_inches'] = demog['height_inches'].astype(object)
# Assign through .loc on the frame itself. The previous chained form
# `demog.height_inches[mask] = ...` writes to an intermediate Series and is not
# guaranteed to update `demog` (it has no effect under pandas Copy-on-Write).
demog.loc[zero_height, 'height_inches'] = 'LOW DATA QUALITY'

0.010      0.0
0.050     58.0
0.500     69.0
0.600     70.0
0.800     72.0
0.850     73.0
0.900     73.0
0.950     75.0
0.970     75.0
0.990     77.0
0.999     83.0
1.000    107.0
Name: height_inches, dtype: float64

count    14085.000000
mean        65.421021
std         15.144391
min          0.000000
25%         65.000000
50%         69.000000
75%         71.000000
max        107.000000
Name: height_inches, dtype: float64

### Clean weight_pounds

In [6]:
# Inspect the weight distribution before flagging implausible values.
demog.weight_pounds.quantile([.01, .05,  .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])
demog.weight_pounds.describe()

# Flag odd data: weights of 0 or less, and weights above 510 lb
# (510 is the 0.999 quantile of this sample, not the 0.99 quantile).
implausible_weight = (demog['weight_pounds'] <= 0) | (demog['weight_pounds'] > 510)
# The flag is a string, so make the float column object-typed explicitly instead of
# relying on an implicit upcast during assignment.
demog['weight_pounds'] = demog['weight_pounds'].astype(object)
# Assign through .loc on the frame itself. The previous chained form
# `demog.weight_pounds[mask] = ...` writes to an intermediate Series and is not
# guaranteed to update `demog` (it has no effect under pandas Copy-on-Write).
demog.loc[implausible_weight, 'weight_pounds'] = 'LOW DATA QUALITY'

0.010    0.000000e+00
0.050    0.000000e+00
0.500    1.800000e+02
0.600    1.930000e+02
0.800    2.260000e+02
0.850    2.400000e+02
0.900    2.580000e+02
0.950    2.850000e+02
0.970    3.080000e+02
0.990    3.540000e+02
0.999    5.100000e+02
1.000    1.877018e+12
Name: weight_pounds, dtype: float64

count    1.408500e+04
mean     1.332639e+08
std      1.581576e+10
min      0.000000e+00
25%      1.500000e+02
50%      1.800000e+02
75%      2.150000e+02
max      1.877018e+12
Name: weight_pounds, dtype: float64

### Create Internal & External Copies of the Data

In [10]:
# Download the participant data-sharing spreadsheet from Synapse.
sharing_info_entity = syn.get('syn21557215')
sharing_info = pd.read_excel(sharing_info_entity.path)

# Keep only participants whose sharing setting is 'all_qualified_researchers'.
shares_broadly = sharing_info['sharing'] == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info.loc[shares_broadly]

# The external copy contains only rows for those broadly-sharing participants.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
EXTERNAL_DEMO_DATA = demog.loc[demog['participantId'].isin(broad_sharing_ids)]
EXTERNAL_DEMO_DATA.shape

(7942, 6)

### External - Upload to Synapse

In [11]:
# Synapse project that receives the externally shared table.
SH_EXTERNAL_PROJECT = 'syn18492837'

# Derive the table columns from the external DataFrame and define the schema.
external_columns = as_table_columns(EXTERNAL_DEMO_DATA)
table_schema_external = Schema(name='Onboarding Demographics',
                               columns=external_columns,
                               parent=SH_EXTERNAL_PROJECT)

# Store the data as a Synapse table and display the returned table object.
external_table = Table(table_schema_external, EXTERNAL_DEMO_DATA)
demo_synTable_external = syn.store(external_table)
demo_synTable_external

<synapseclient.table.CsvFileTable at 0x11af1a3d0>

### Internal - Upload to Synapse

In [13]:
# Synapse project that receives the internal (unfiltered by sharing scope) table.
SH_INTERNAL_PROJECT = 'syn7066726'

# Derive the table columns from the full DataFrame and define the schema.
internal_columns = as_table_columns(demog)
table_schema_internal = Schema(name='Participant Demographics Internal',
                               columns=internal_columns,
                               parent=SH_INTERNAL_PROJECT)

# Store the data as a Synapse table.
internal_table = Table(table_schema_internal, demog)
demo_synTable_internal = syn.store(internal_table)

### Set Provenance

In [14]:
# Provenance record: the raw demographics entity was used, this notebook was executed.
notebook_url = 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb'
activity = Activity(
    name='Onboarding Demographics data curation',
    description='Process and convert raw data to table format',
    used=demog_raw_id,
    executed=notebook_url,
)

# Attach the activity to the internal table first ...
syn.setProvenance(demo_synTable_internal, activity)

# ... and then to the external table.
syn.setProvenance(demo_synTable_external, activity)

{'id': '10232287',
 'name': 'Onboarding Demographics data curation',
 'description': 'Process and convert raw data to table format',
 'etag': '5d68b2ea-f046-46e5-85ec-d0baa7b40264',
 'createdOn': '2020-04-09T23:22:01.168Z',
 'modifiedOn': '2020-04-09T23:22:01.168Z',
 'createdBy': '2166046',
 'modifiedBy': '2166046',
 'used': [{'wasExecuted': True,
   'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedURL',
   'name': 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   'url': 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb'},
  {'wasExecuted': False,
   'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedEntity',
   'reference': {'targetId': 'syn7115683', 'targetVersionNumber': 913}}]}

{'id': '10232288',
 'name': 'Onboarding Demographics data curation',
 'description': 'Process and convert raw data to table format',
 'etag': 'dfad85d7-2c8b-41bb-ba38-23d0d6aa5071',
 'createdOn': '2020-04-09T23:22:10.849Z',
 'modifiedOn': '2020-04-09T23:22:10.849Z',
 'createdBy': '2166046',
 'modifiedBy': '2166046',
 'used': [{'wasExecuted': True,
   'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedURL',
   'name': 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb',
   'url': 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_Onboarding_Demog_table.ipynb'},
  {'wasExecuted': False,
   'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedEntity',
   'reference': {'targetId': 'syn7115683', 'targetVersionNumber': 913}}]}