#  About Me Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Display the value of every bare expression in a cell, not just the last one
# (later cells rely on this to show intermediate results such as row counts).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and authenticate. No credentials are passed here --
# presumably login() picks them up from a local config/cache; confirm for your setup.
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw About Me Data

In [2]:
# Synapse ID of the raw About Me file; it is reused later when provenance is recorded.
aboutme_raw_id = 'syn7115681'
# Fetch the entity from Synapse, then read its downloaded file as CSV.
aboutme_raw_entity = syn.get(aboutme_raw_id)
aboutme = pd.read_csv(aboutme_raw_entity.path)

### Remove duplicated data

In [3]:
# Show the row count before de-duplication.
aboutme.shape[0]
# Rows that are identical across every column are collapsed to their first
# occurrence (the default `keep` behaviour of drop_duplicates).
aboutme = aboutme.drop_duplicates()

9345

In [4]:
# Row count after duplicate rows were removed.
aboutme.shape[0]

5762

### Process About Me Data

In [5]:
# Drop the columns that are not needed, then rename 'Income' to 'income'.
aboutme = (
    aboutme
    .drop(columns=['Birthdate', 'race2'])
    .rename(columns={'Income': 'income'})
)

def remove_brackets(col):
    """Strip every literal '[' and ']' character from a column's values.

    Parameters
    ----------
    col : pandas.Series
        Column whose values may contain square brackets.

    Returns
    -------
    pandas.Series
        Series with the same index as `col`. Non-missing values are converted
        to `str` with all brackets removed; missing values stay missing.
    """
    # regex=False makes both replacements plain substring replacements, so the
    # result does not depend on the pandas version's default for `regex`.
    without_brackets = (
        col.astype(str)
        .str.replace(']', '', regex=False)
        .str.replace('[', '', regex=False)
    )
    # astype(str) turns NaN/None into the literal text 'nan'/'None'; restore the
    # original missing values so they are not stored as fake answers.
    return without_brackets.where(col.notna(), col)
# Strip square brackets from each of these columns, in the listed order.
for bracketed_column in (
    'alcohol',
    'basic_expenses',
    'daily_activities',
    'daily_smoking',
    'education',
    'flexible_work_hours',
    'gender',
    'good_life',
    'hispanic',
    'income',
    'marital',
    'race',
    'smoking_status',
    'menopause',
    'recent_births',
    'current_pregnant',
    'work_schedule',
):
    aboutme[bracketed_column] = remove_brackets(aboutme[bracketed_column])

# Cells that are empty or contain only whitespace are treated as missing.
aboutme = aboutme.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Filter based on age

In [6]:
# Tab-separated file of participants to exclude on the basis of age.
underage_file = syn.get('syn21905452').path
underage_participants = pd.read_csv(underage_file, sep="\t")
# Keep only the rows whose participantId does not appear in that file.
is_underage = aboutme.participantId.isin(underage_participants.participantId)
aboutme = aboutme[~is_underage]

### Remove Test Accounts

In [7]:
# Spreadsheet listing the test accounts to exclude.
test_accounts_file = syn.get('syn21958537').path
test_accounts = pd.read_excel(test_accounts_file)

# Keep only the rows whose participantId is not a listed test account.
is_test_account = aboutme.participantId.isin(test_accounts.participantId)
aboutme = aboutme[~is_test_account]

### Clean About Me Data

We have included recommended thresholds for data cleaning, but the full data is provided except where doing so could compromise privacy.

### Clean Weight Data

In [8]:
# Summary statistics for the reported weights.
aboutme['weight'].describe()

count    5504.000000
mean      190.154433
std        54.149631
min        50.000000
25%       153.000000
50%       182.000000
75%       218.000000
max       557.000000
Name: weight, dtype: float64

In [9]:
# Quantiles of the reported weights, with extra resolution in the upper tail.
aboutme['weight'].quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010    100.000
0.050    119.000
0.500    182.000
0.600    195.000
0.800    229.000
0.850    240.000
0.900    260.000
0.950    289.000
0.970    309.910
0.990    364.970
0.999    503.473
1.000    557.000
Name: weight, dtype: float64

In [10]:
# Censor implausible weights.
# Values below 80 lbs or above 350 lbs are replaced with the marker 'CENSORED'
# (thresholds chosen based on those established in the Asthma Data Descriptor).
# Missing weights compare False on both sides and are therefore left as NaN.
implausible_weight = (aboutme.weight < 80) | (aboutme.weight > 350)
# astype returns a new frame, and the object dtype lets the column hold the text
# marker next to the numeric values. Assigning through .loc (rather than the
# chained aboutme.weight[mask] = ...) guarantees that aboutme itself is updated
# and avoids SettingWithCopyWarning on the previously filtered frame.
aboutme = aboutme.astype({'weight': object})
aboutme.loc[implausible_weight, 'weight'] = 'CENSORED'

In [11]:
# Frequency of each weight value, including the 'CENSORED' marker.
aboutme['weight'].value_counts()

185.0       139
170.0       137
180.0       127
160.0       127
165.0       126
200.0       125
175.0       111
190.0       109
140.0       105
150.0       102
195.0       100
210.0        99
220.0        93
145.0        93
155.0        88
CENSORED     81
230.0        80
135.0        80
215.0        77
240.0        72
130.0        70
205.0        64
250.0        59
225.0        56
125.0        55
168.0        50
158.0        48
178.0        44
203.0        43
115.0        43
           ... 
331.0         2
276.0         2
313.0         2
91.0          1
92.0          1
291.0         1
281.0         1
301.0         1
87.0          1
85.0          1
99.0          1
308.0         1
81.0          1
338.0         1
316.0         1
317.0         1
319.0         1
322.0         1
323.0         1
348.0         1
324.0         1
347.0         1
327.0         1
328.0         1
332.0         1
343.0         1
342.0         1
334.0         1
341.0         1
304.0         1
Name: weight, Length: 25

### Clean caffeine data

In [12]:
# Summary statistics for the reported caffeine intake.
aboutme['caffeine'].describe()

count    5577.000000
mean        3.302313
std         4.929166
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        50.000000
Name: caffeine, dtype: float64

In [13]:
# Quantiles of the reported caffeine intake, with extra resolution in the upper tail.
aboutme['caffeine'].quantile(q=[.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010     0.00
0.050     0.00
0.500     2.00
0.600     3.00
0.800     4.00
0.850     5.00
0.900     6.00
0.950    10.00
0.970    16.00
0.990    25.72
0.999    50.00
1.000    50.00
Name: caffeine, dtype: float64

In [14]:
# Disabled on purpose: this cell changes nothing, so caffeine values are left as reported.
# The intent was to flag caffeine intake above the 99th percentile. The line below uses 24
# as that cutoff, while the quantile output above reports 25.72 for the 0.99 quantile --
# NOTE(review): confirm the intended threshold before re-enabling.
#aboutme.caffeine[aboutme.caffeine > 24 ] = 'LOW DATA QUALITY'

In [15]:
# Frequency of each reported caffeine value.
aboutme['caffeine'].value_counts()

2.0     1508
1.0     1035
3.0      918
0.0      684
4.0      529
5.0      290
6.0      189
8.0      111
12.0      52
10.0      50
24.0      48
16.0      40
20.0      25
7.0       20
36.0      17
32.0      10
50.0      10
9.0        7
40.0       5
18.0       5
15.0       4
28.0       3
48.0       2
14.0       2
44.0       2
30.0       2
25.0       2
35.0       2
42.0       1
13.0       1
33.0       1
38.0       1
21.0       1
Name: caffeine, dtype: int64

### Create Internal & External Copies of the Data

In [16]:
# Load the spreadsheet that records each participant's sharing choice.
sharing_info = pd.read_excel(syn.get('syn21958546').path)
# Keep only the rows whose sharing value is 'all_qualified_researchers'.
shares_with_all = sharing_info.sharing == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[shares_with_all]

# The external copy holds only those participants. The sharing sheet's ID column is
# named 'participant id' (with a space), unlike the participantId column of aboutme.
in_external_release = aboutme.participantId.isin(healthCodes_with_broadsharing['participant id'])
EXTERNAL_ABOUTME_DATA = aboutme[in_external_release]
EXTERNAL_ABOUTME_DATA.shape

(3448, 21)

### Total number of unique participants

In [17]:
# Number of distinct participantId values in the external copy
# (dropna=False so a missing ID would be counted, exactly as len(unique()) does).
EXTERNAL_ABOUTME_DATA['participantId'].nunique(dropna=False)

3262

### Total number of observations

In [18]:
# Number of rows (observations) in the external copy.
EXTERNAL_ABOUTME_DATA.shape[0]

3448

### External - Upload to Synapse

In [None]:
# Synapse project that receives the external copy.
SH_EXTERNAL_PROJECT = 'syn18492837'
# Derive the table columns from the DataFrame, then define the table schema.
external_columns = as_table_columns(EXTERNAL_ABOUTME_DATA)
table_schema_external = Schema(
    name='About Me',
    columns=external_columns,
    parent=SH_EXTERNAL_PROJECT,
)
# Store the data under that schema and keep the result for the provenance step.
external_table = Table(table_schema_external, EXTERNAL_ABOUTME_DATA)
aboutme_synTable_external = syn.store(external_table)
aboutme_synTable_external

### Internal - Upload to Synapse

In [None]:
# Synapse project that receives the internal copy.
SH_INTERNAL_PROJECT = 'syn7066726'
# Derive the table columns from the DataFrame, then define the table schema.
internal_columns = as_table_columns(aboutme)
table_schema_internal = Schema(
    name='About Me Internal',
    columns=internal_columns,
    parent=SH_INTERNAL_PROJECT,
)
# Store the data under that schema and keep the result for the provenance step.
internal_table = Table(table_schema_internal, aboutme)
aboutme_synTable_internal = syn.store(internal_table)

### Set Provenance

In [None]:
# Provenance record: what was done, which raw entity was used, and which
# notebook was executed to produce the tables.
provenance_details = {
    'name': 'About Me data curation',
    'description': 'Process and convert raw data to table format',
    'used': aboutme_raw_id,
    'executed': 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
}
activity = Activity(**provenance_details)

# Attach the same activity to the internal table ...
syn.setProvenance(aboutme_synTable_internal, activity)

# ... and to the external table.
syn.setProvenance(aboutme_synTable_external, activity)