# My Health Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Display the value of every top-level expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Synapse client used by the later cells to download inputs, store tables and set provenance.
syn = synapseclient.Synapse()
# No credentials are passed here -- presumably they come from a cached session or a local
# Synapse config file; confirm before running on a new machine.
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw My Health Data

In [2]:
# Synapse ID of the raw My Health data file; the provenance cell reuses it as the input.
myhealth_raw_id = 'syn7117922'

# Fetch the file from Synapse, then parse the downloaded local copy as CSV.
myhealth_raw_entity = syn.get(myhealth_raw_id)
myhealth = pd.read_csv(myhealth_raw_entity.path)

### Replace empty or whitespace-only values with NaN

In [3]:
# Cells that are empty or hold only whitespace become missing values (NaN).
myhealth = myhealth.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Process My Health Data

In [4]:
#remove brackets from categorical columns
def remove_brackets(col):
    """Strip every '[' and ']' character from the values of a column.

    Non-missing values are converted to their string form and have all square
    brackets removed. Missing values stay missing: a plain ``astype(str)``
    would turn NaN into the literal text 'nan', which would then look like a
    real answer in the cleaned data.

    Parameters
    ----------
    col : pandas.Series
        Column to clean.

    Returns
    -------
    pandas.Series
        Series with the same index as ``col`` holding the bracket-free strings,
        with NaN wherever ``col`` was missing.
    """
    cleaned = (
        col.astype(str)
        # regex=False: '[' and ']' are literal characters here, not patterns.
        .str.replace(']', '', regex=False)
        .str.replace('[', '', regex=False)
    )
    # Keep the cleaned text only where the input had a value; elsewhere restore NaN.
    return cleaned.where(col.notna())
# Columns whose values need their square brackets stripped.
# Names (including their inconsistent capitalisation) match the raw data's column headers.
BRACKETED_COLUMNS = [
    'current_allergies', 'impactsleep_allergies', 'sleepimpact_allergies', 'allergies',
    'anxiety', 'current_anxiety', 'impactsleep_anxiety', 'sleepimpact_anxiety',
    'anxious',
    'apnea', 'current_apnea', 'impactsleep_apnea', 'sleepimpact_apnea',
    'asthma', 'current_asthma', 'impactsleep_asthma', 'sleepimpact_asthma',
    'atrial', 'current_atrial', 'impactsleep_atrial', 'sleepimpact_atrial',
    'hi_blood_pressure', 'current_hbp', 'impactsleep_hbp', 'sleepimpact_hbp',
    'cancer', 'current_cancer', 'impactsleep_cancer', 'sleepimpact_cancer',
    'cardiovascular',
    'compare_one_year',
    'day_to_day',
    'depressed',
    'depression', 'current_depression', 'impactsleep_depression', 'sleepimpact_depression',
    'diabetes', 'current_Diabetes', 'impactsleep_diabetes', 'sleepimpact_diabetes',
    'diabetes_type',
    'emotional',
    'erectile', 'current_ed', 'impactsleep_ed', 'sleepimpact_ed',
    'fatigued',
    'gastroesophageal', 'current_gastroesophageal',
    'impactsleep_gastroesophageal', 'sleepimpact_gastroesophageal',
    'general_health',
    'heart_disease', 'current_heart_disease',
    'impactsleep_heart_disease', 'sleepimpact_heart_disease',
    'insomnia', 'current_insomnia', 'impactsleep_insomnia', 'sleepimpact_insomnia',
    'lung', 'current_lung', 'impactsleep_lung', 'sleepimpact_lung',
    'mental_health',
    'narcolepsy', 'current_Narcolepsy', 'impactsleep_Narcolepsy', 'sleepimpact_Narcolepsy',
    'nocturia', 'current_Nocturia', 'impactsleep_Nocturia', 'sleepimpact_Nocturia',
    'restless_legs_syndrome', 'current_rls', 'impactsleep_rls', 'sleepimpact_rls',
    'stroke', 'current_Stroke', 'impactsleep_stroke', 'sleepimpact_stroke',
    'physical_activities',
    'physical_health',
    'risk',
    'sleep_trouble',
    'social_activities',
    'stressed',
    'uars', 'current_uars', 'impactsleep_uars', 'sleepimpact_uars',
]

# Bracket indexing (rather than attribute assignment) guarantees a column is updated and
# raises a KeyError naming the column if the raw export ever stops providing it.
for column_name in BRACKETED_COLUMNS:
    myhealth[column_name] = remove_brackets(myhealth[column_name])

#drop free text fields
myhealth = myhealth.drop(columns=['cancer_selected'])

### Filter based on age

In [5]:
# Load the tab-separated participant list named for under-age participants and drop
# every My Health row whose participantId appears in it.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")
is_underage = myhealth.participantId.isin(underage_participants.participantId)
myhealth = myhealth[~is_underage]

### Remove test accounts

In [6]:
# Load the spreadsheet of test accounts and drop every row whose participantId is listed there.
test_accounts = pd.read_excel(syn.get('syn21958537').path)

is_test_account = myhealth.participantId.isin(test_accounts.participantId)
myhealth = myhealth[~is_test_account]

### Clean My Health Data

#### Clean Age High Blood Pressure Column

In [7]:
# Set ages 120 and 140 years old to LOW DATA QUALITY - no assumption is made about
# what these participants meant.
IMPLAUSIBLE_HBP_AGES = [120, 140]

# Build the flagged column explicitly instead of using chained assignment
# (frame.column[mask] = value): that form raises SettingWithCopyWarning and is a
# silent no-op when pandas Copy-on-Write is enabled.
# Casting to object first lets the text flag sit next to the numeric ages.
age_hbp_flagged = myhealth.age_hbp.astype(object).mask(
    myhealth.age_hbp.isin(IMPLAUSIBLE_HBP_AGES), 'LOW DATA QUALITY'
)
myhealth = myhealth.assign(age_hbp=age_hbp_flagged)
myhealth.age_hbp.value_counts()

40.0                44
30.0                29
45.0                24
35.0                23
25.0                22
50.0                21
38.0                19
55.0                18
42.0                13
44.0                13
18.0                11
32.0                10
48.0                10
28.0                10
33.0                 9
31.0                 9
19.0                 9
52.0                 9
34.0                 8
24.0                 8
20.0                 8
41.0                 8
36.0                 7
22.0                 7
26.0                 7
29.0                 7
16.0                 7
39.0                 7
53.0                 6
43.0                 6
17.0                 5
23.0                 5
21.0                 5
56.0                 5
57.0                 4
54.0                 4
65.0                 4
37.0                 4
60.0                 4
66.0                 3
58.0                 3
27.0                 3
51.0                 3
46.0       

### Create Internal & External Copies of the Data

In [8]:
# Download the sharing preferences and keep the rows whose `sharing` value is
# 'all_qualified_researchers'.
sharing_info = pd.read_excel(syn.get('syn21557215').path)
shares_broadly = sharing_info.sharing == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[shares_broadly]

# The external copy holds only the rows of participants found in that broad-sharing list.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
EXTERNAL_MYHEALTH_DATA = myhealth[myhealth.participantId.isin(broad_sharing_ids)]
EXTERNAL_MYHEALTH_DATA.shape

(1512, 114)

### External - Upload to Synapse

In [None]:
# Number of distinct participants in the external copy (shown as cell output).
len(EXTERNAL_MYHEALTH_DATA.participantId.unique())

# Describe the table under the external project, then upload the rows.
SH_EXTERNAL_PROJECT = 'syn18492837'
external_columns = as_table_columns(EXTERNAL_MYHEALTH_DATA)
table_schema_external = Schema(
    name='My Health',
    columns=external_columns,
    parent=SH_EXTERNAL_PROJECT,
)
external_table = Table(table_schema_external, EXTERNAL_MYHEALTH_DATA)
myhealth_synTable_external = syn.store(external_table)
myhealth_synTable_external

1439

### Internal - Upload to Synapse

In [None]:
# Describe the table under the internal project, then upload the full filtered frame.
SH_INTERNAL_PROJECT = 'syn7066726'
internal_columns = as_table_columns(myhealth)
table_schema_internal = Schema(
    name='My Health Internal',
    columns=internal_columns,
    parent=SH_INTERNAL_PROJECT,
)
internal_table = Table(table_schema_internal, myhealth)
myhealth_synTable_internal = syn.store(internal_table)

### Set Provenance

In [None]:
# One activity describes how both tables were produced: the raw input they derive from
# and the notebook that generated them.
CURATION_NOTEBOOK_URL = 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Health.ipynb'
activity = Activity(
    name='My Health data curation',
    description='Process and convert raw data to table format',
    used=myhealth_raw_id,
    executed=CURATION_NOTEBOOK_URL,
)

# Attach the activity to the internal table.
syn.setProvenance(myhealth_synTable_internal, activity)

# Attach the same activity to the external table.
syn.setProvenance(myhealth_synTable_external, activity)