#  About Me Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Open a Synapse session; login() is called without explicit credentials.
syn = synapseclient.Synapse()
syn.login()

# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw About Me Data

In [2]:
# Synapse ID of the raw About Me data.
aboutme_raw_id = 'syn7115681'

# Fetch the entity from Synapse and read the CSV found at its path.
aboutme_raw_entity = syn.get(aboutme_raw_id)
aboutme = pd.read_csv(aboutme_raw_entity.path)

### Process About Me Data

In [3]:
# Drop the columns that are not needed, then rename 'Income' to lower case.
aboutme = (
    aboutme
    .drop(columns=['Birthdate', 'race2'])
    .rename(columns={'Income': 'income'})
)

# Remove brackets
def remove_brackets(col):
    """Return ``col`` as strings with every '[' and ']' character removed.

    Parameters
    ----------
    col : pandas.Series
        Column to clean. Values are converted with ``astype(str)`` first.

    Returns
    -------
    pandas.Series
        The cleaned strings. Entries that were missing in ``col`` stay missing
        instead of becoming the literal text 'nan' or 'None'.
    """
    # regex=False: '[' and ']' are regex metacharacters, so match them literally.
    cleaned = (
        col.astype(str)
        .str.replace(']', '', regex=False)
        .str.replace('[', '', regex=False)
    )
    # astype(str) stringifies missing values; put the missing markers back.
    return cleaned.where(col.notna())
# Columns whose values have their square brackets stripped.
bracketed_columns = [
    'alcohol',
    'basic_expenses',
    'daily_activities',
    'daily_smoking',
    'education',
    'flexible_work_hours',
    'gender',
    'good_life',
    'hispanic',
    'income',
    'marital',
    'race',
    'smoking_status',
    'menopause',
    'recent_births',
    'current_pregnant',
    'work_schedule',
]
for column_name in bracketed_columns:
    aboutme[column_name] = remove_brackets(aboutme[column_name])

# Replace empty or whitespace-only cells with NaN.
aboutme = aboutme.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

### Filter based on age

In [4]:
# Read the tab-separated list of underage participants from Synapse.
underage_participants = pd.read_csv(syn.get('syn21905452').path, sep="\t")

# Keep only the rows whose participantId does not appear in that list.
is_underage = aboutme.participantId.isin(underage_participants.participantId)
aboutme = aboutme[~is_underage]

### Clean About Me Data

### Clean Weight Data

In [5]:
# Summary statistics of the reported weights.
aboutme['weight'].describe()

count    8964.000000
mean      190.675033
std        53.886526
min        50.000000
25%       155.000000
50%       183.000000
75%       218.000000
max       557.000000
Name: weight, dtype: float64

In [6]:
# Inspect both tails of the weight distribution before choosing cut-offs.
weight_quantile_levels = [.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1]
aboutme['weight'].quantile(weight_quantile_levels)

0.010    100.000
0.050    120.000
0.500    183.000
0.600    195.000
0.800    228.000
0.850    240.000
0.900    260.000
0.950    290.000
0.970    310.000
0.990    364.370
0.999    499.333
1.000    557.000
Name: weight, dtype: float64

In [None]:
# Flag implausible weights as low-quality data.
#   Lower bound: values at or below 100 lbs, the 1st percentile (0.01 quantile) of this sample.
#   Upper bound: values above 869 lbs, a cut-off chosen based on normative data.
# The comparison runs on a numeric view of the column so the cell can be re-run
# after some entries have already been replaced by the marker string.
weight_numeric = pd.to_numeric(aboutme['weight'], errors='coerce')
weight_is_implausible = (weight_numeric <= 100) | (weight_numeric > 869)

# assign()/mask() build a new frame. The previous chained form
# `aboutme.weight[...] = ...` could write to a temporary copy and leave
# `aboutme` unchanged.
aboutme = aboutme.assign(
    weight=aboutme['weight'].mask(weight_is_implausible, 'LOW DATA QUALITY')
)

In [7]:
# Frequency of each distinct weight value.
aboutme['weight'].value_counts()

185.0    236
170.0    213
160.0    212
180.0    202
165.0    200
200.0    195
175.0    189
190.0    184
210.0    166
140.0    163
195.0    162
150.0    162
220.0    150
155.0    144
145.0    144
135.0    129
215.0    128
230.0    126
240.0    110
130.0    110
205.0    102
225.0     92
250.0     91
168.0     85
125.0     83
203.0     80
158.0     79
178.0     77
245.0     73
162.0     72
        ... 
438.0      2
301.0      2
394.0      2
420.0      2
72.0       1
99.0       1
409.0      1
332.0      1
478.0      1
382.0      1
291.0      1
64.0       1
328.0      1
68.0       1
411.0      1
439.0      1
417.0      1
76.0       1
324.0      1
71.0       1
338.0      1
347.0      1
381.0      1
87.0       1
308.0      1
402.0      1
397.0      1
317.0      1
511.0      1
470.0      1
Name: weight, Length: 315, dtype: int64

### Clean Caffeine Data

In [None]:
# Summary statistics of the reported caffeine intake.
aboutme['caffeine'].describe()

In [None]:
# Inspect both tails of the caffeine distribution before choosing a cut-off.
caffeine_quantile_levels = [.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1]
aboutme['caffeine'].quantile(caffeine_quantile_levels)

In [None]:
# Flag caffeine intake above 24 (noted as the 99th percentile of this sample) as
# low-quality data. Zero is left untouched because it is an acceptable answer.
# The comparison runs on a numeric view of the column so the cell can be re-run
# after some entries have already been replaced by the marker string.
caffeine_numeric = pd.to_numeric(aboutme['caffeine'], errors='coerce')

# assign()/mask() build a new frame. The previous chained form
# `aboutme.caffeine[...] = ...` could write to a temporary copy and leave
# `aboutme` unchanged.
aboutme = aboutme.assign(
    caffeine=aboutme['caffeine'].mask(caffeine_numeric > 24, 'LOW DATA QUALITY')
)

In [None]:
# Frequency of each distinct caffeine value.
aboutme['caffeine'].value_counts()

### Create Internal & External Copies of the Data

In [None]:
# Download the sharing information spreadsheet from Synapse.
sharing_info = pd.read_excel(syn.get('syn21557215').path)

# Rows whose sharing value is 'all_qualified_researchers'.
shares_broadly = sharing_info.sharing == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[shares_broadly]

# The external copy keeps only participants present in that subset.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
EXTERNAL_ABOUTME_DATA = aboutme[aboutme.participantId.isin(broad_sharing_ids)]
EXTERNAL_ABOUTME_DATA.shape

### External - Upload to Synapse

In [None]:
# Synapse project that receives the external table.
SH_EXTERNAL_PROJECT = 'syn18492837'

# Derive the table columns from the DataFrame, then describe the table.
external_columns = as_table_columns(EXTERNAL_ABOUTME_DATA)
table_schema_external = Schema(
    name='About Me',
    columns=external_columns,
    parent=SH_EXTERNAL_PROJECT,
)

# Store the table in Synapse and display the stored object.
external_table = Table(table_schema_external, EXTERNAL_ABOUTME_DATA)
aboutme_synTable_external = syn.store(external_table)
aboutme_synTable_external

### Internal - Upload to Synapse

In [None]:
# Synapse project that receives the internal table.
SH_INTERNAL_PROJECT = 'syn7066726'

# Derive the table columns from the full DataFrame, then describe the table.
internal_columns = as_table_columns(aboutme)
table_schema_internal = Schema(
    name='About Me Internal',
    columns=internal_columns,
    parent=SH_INTERNAL_PROJECT,
)

# Store the table in Synapse.
internal_table = Table(table_schema_internal, aboutme)
aboutme_synTable_internal = syn.store(internal_table)

### Set Provenance

In [None]:
# A single activity record describes the curation step for both tables.
activity = Activity(
    name='About Me data curation',
    description='Process and convert raw data to table format',
    used=aboutme_raw_id,
    executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb',
)

# Attach the provenance to the internal table.
syn.setProvenance(aboutme_synTable_internal, activity)

# Attach the same provenance to the external table.
syn.setProvenance(aboutme_synTable_external, activity)