#  About Me Data Prep

In [10]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

InteractiveShell.ast_node_interactivity = 'all'
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw About Me Data

In [11]:
aboutme_raw_id = 'syn7115681'
aboutme = pd.read_csv(syn.get(aboutme_raw_id).path)

### Remove duplicated data

In [None]:
#remove data points that are identical but appear more than once
len(aboutme)
aboutme = aboutme.drop_duplicates(keep='first')

In [None]:
len(aboutme)

### Process About Me Data

In [13]:
# drop uneeded columns
aboutme = aboutme.drop(columns=['Birthdate','race2'])
# rename columns
aboutme = aboutme.rename(columns={'Income':'income'})

#remove brackets
def remove_brackets(col):
    return( col.astype(str).str.replace(']', '').str.replace('[',''))
aboutme.alcohol = remove_brackets(aboutme.alcohol)
aboutme.basic_expenses = remove_brackets(aboutme.basic_expenses)
aboutme.daily_activities = remove_brackets(aboutme.daily_activities)
aboutme.daily_smoking = remove_brackets(aboutme.daily_smoking)
aboutme.education = remove_brackets(aboutme.education)
aboutme.flexible_work_hours = remove_brackets(aboutme.flexible_work_hours)
aboutme.gender = remove_brackets(aboutme.gender)
aboutme.good_life = remove_brackets(aboutme.good_life)
aboutme.hispanic = remove_brackets(aboutme.hispanic)
aboutme.income = remove_brackets(aboutme.income)
aboutme.marital = remove_brackets(aboutme.marital)
aboutme.race = remove_brackets(aboutme.race)
aboutme.smoking_status = remove_brackets(aboutme.smoking_status)
aboutme.menopause = remove_brackets(aboutme.menopause)
aboutme.recent_births = remove_brackets(aboutme.recent_births)
aboutme.current_pregnant = remove_brackets(aboutme.current_pregnant)
aboutme.work_schedule = remove_brackets(aboutme.work_schedule)

#Replace white space with NaN
aboutme = aboutme.replace(r'^\s*$', np.nan, regex=True)

### Filter based on age

In [14]:
underage_participants = pd.read_csv( syn.get('syn21905452').path, sep="\t")
aboutme = aboutme[~aboutme.participantId.isin(underage_participants.participantId)]

### Remove Test Accounts

In [15]:
test_accounts = pd.read_excel(syn.get('syn21958537').path)

aboutme = aboutme[~aboutme.participantId.isin(test_accounts.participantId)]

### Clean About Me Data (We have included recommended thresholds for data cleaning, but full data is provided except where doing so could compromise privacy).

### Clean Weight Data

In [16]:
aboutme.weight.describe()

count    8922.000000
mean      190.796682
std        53.847247
min        50.000000
25%       155.000000
50%       183.000000
75%       218.000000
max       557.000000
Name: weight, dtype: float64

In [17]:
aboutme.weight.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010    100.210
0.050    120.000
0.500    183.000
0.600    195.000
0.800    228.000
0.850    240.000
0.900    260.000
0.950    290.000
0.970    310.000
0.990    364.790
0.999    499.711
1.000    557.000
Name: weight, dtype: float64

In [18]:
#Filter odd data 
## If the weight is less than 80lbs or above 350bs, exclude (chosen based on threshold established in Asthma Data Descriptor).
aboutme.weight[ (aboutme.weight < 80) |  (aboutme.weight > 350) ] = 'CENSORED'

In [19]:
aboutme.weight.value_counts()

185.0    234
170.0    211
160.0    208
180.0    202
165.0    200
200.0    195
175.0    189
190.0    184
210.0    164
195.0    162
150.0    162
140.0    159
220.0    150
155.0    144
145.0    144
135.0    127
230.0    126
215.0    123
130.0    110
240.0    110
205.0    102
225.0     92
250.0     91
168.0     85
125.0     83
203.0     80
158.0     79
178.0     77
245.0     73
162.0     72
        ... 
399.0      2
304.0      2
323.0      2
407.0      2
301.0      2
334.0      2
414.0      2
557.0      2
322.0      2
439.0      1
317.0      1
308.0      1
87.0       1
291.0      1
324.0      1
511.0      1
99.0       1
328.0      1
338.0      1
347.0      1
381.0      1
382.0      1
397.0      1
478.0      1
470.0      1
402.0      1
409.0      1
411.0      1
417.0      1
332.0      1
Name: weight, Length: 302, dtype: int64

### Clean caffeine data

In [20]:
aboutme.caffeine.describe()

count    9042.000000
mean        3.289648
std         4.866727
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        50.000000
Name: caffeine, dtype: float64

In [21]:
aboutme.caffeine.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010     0.0
0.050     0.0
0.500     2.0
0.600     3.0
0.800     4.0
0.850     5.0
0.900     6.0
0.950    10.0
0.970    16.0
0.990    24.0
0.999    50.0
1.000    50.0
Name: caffeine, dtype: float64

In [22]:
#remove caffeine intake above 99th percentile, in this case is 24.
#aboutme.caffeine[aboutme.caffeine > 24 ] = 'LOW DATA QUALITY'

In [23]:
aboutme.caffeine.value_counts()

2.0                 2424
1.0                 1687
3.0                 1498
0.0                 1114
4.0                  854
5.0                  475
6.0                  311
8.0                  174
LOW DATA QUALITY      90
12.0                  85
24.0                  80
10.0                  80
16.0                  65
20.0                  42
7.0                   29
9.0                   11
18.0                   9
15.0                   7
14.0                   4
13.0                   2
21.0                   1
Name: caffeine, dtype: int64

### Create Internal & External Copies of the Data

In [24]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21958546').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_ABOUTME_DATA = aboutme[aboutme.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_ABOUTME_DATA.shape

(5581, 21)

### Total number of unique participants

In [25]:
len(EXTERNAL_ABOUTME_DATA.participantId.unique())

3262

### Total number of observations

In [26]:
len(EXTERNAL_ABOUTME_DATA)

5581

### External - Upload to Synapse

In [27]:
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(name='About Me',
                               columns=as_table_columns(EXTERNAL_ABOUTME_DATA),
                               parent=SH_EXTERNAL_PROJECT)
aboutme_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_ABOUTME_DATA))
aboutme_synTable_external

<synapseclient.table.CsvFileTable at 0x1091ec750>

### Internal - Upload to Synapse

In [28]:
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(name='About Me Internal',
                               columns=as_table_columns(aboutme), 
                               parent=SH_INTERNAL_PROJECT)
aboutme_synTable_internal = syn.store(Table(table_schema_internal,aboutme))

### Set Provenance

In [29]:
activity=Activity(name= 'About Me data curation', 
                  description='Process and convert raw data to table format', 
                  used=aboutme_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb')
#Internal 
syn.setProvenance(aboutme_synTable_internal, activity)

#External
syn.setProvenance(aboutme_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-23T16:45:31.271Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'e36cf63a-3dea-4918-a938-fcc0dec819a7',
 u'id': u'10233988',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-23T16:45:31.271Z',
 u'name': u'About Me data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115681', u'targetVersionNumber': 950},
   u'wasExecuted': False}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-23T16:45:37.394Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'091b407b-e86f-4558-8125-651e5f4eb1ab',
 u'id': u'10233989',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-23T16:45:37.394Z',
 u'name': u'About Me data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
   u'wasExecuted': True},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7115681', u'targetVersionNumber': 950},
   u'wasExecuted': False}]}