#  About Me Data Prep

In [5]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

InteractiveShell.ast_node_interactivity = 'all'
syn = synapseclient.Synapse()
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw About Me Data

In [6]:
aboutme_raw_id = 'syn7115681'
aboutme = pd.read_csv(syn.get(aboutme_raw_id).path)

### Process About Me Data

In [7]:
# drop uneeded columns
aboutme = aboutme.drop(columns=['Birthdate','race2',''])
# rename columns
aboutme = aboutme.rename(columns={'Income':'income'})

#remove brackets
def remove_brackets(col):
    return( col.astype(str).str.replace(']', '').str.replace('[',''))
aboutme.alcohol = remove_brackets(aboutme.alcohol)
aboutme.basic_expenses = remove_brackets(aboutme.basic_expenses)
aboutme.daily_activities = remove_brackets(aboutme.daily_activities)
aboutme.daily_smoking = remove_brackets(aboutme.daily_smoking)
aboutme.education = remove_brackets(aboutme.education)
aboutme.flexible_work_hours = remove_brackets(aboutme.flexible_work_hours)
aboutme.gender = remove_brackets(aboutme.gender)
aboutme.good_life = remove_brackets(aboutme.good_life)
aboutme.hispanic = remove_brackets(aboutme.hispanic)
aboutme.income = remove_brackets(aboutme.income)
aboutme.marital = remove_brackets(aboutme.marital)
aboutme.race = remove_brackets(aboutme.race)
aboutme.smoking_status = remove_brackets(aboutme.smoking_status)
aboutme.menopause = remove_brackets(aboutme.menopause)
aboutme.recent_births = remove_brackets(aboutme.recent_births)
aboutme.current_pregnant = remove_brackets(aboutme.current_pregnant)
aboutme.work_schedule = remove_brackets(aboutme.work_schedule)

#Replace white space with NaN
aboutme = aboutme.replace(r'^\s*$', np.nan, regex=True)

KeyError: "[''] not found in axis"

### Filter based on age

In [None]:
underage_participants = pd.read_csv( syn.get('syn21905452').path, sep="\t")
aboutme = aboutme[~aboutme.participantId.isin(underage_participants.participantId)]

### Remove Test Accounts

In [8]:
test_accounts = pd.read_excel(syn.get('syn21958537').path)

aboutme = aboutme[~aboutme.participantId.isin(test_accounts.participantId)]

### Clean About Me Data

### Clean Weight Data

In [9]:
aboutme.weight.describe()

count    9091.000000
mean      189.925971
std        53.994596
min        50.000000
25%       154.000000
50%       182.000000
75%       217.000000
max       557.000000
Name: weight, dtype: float64

In [10]:
aboutme.weight.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010    100.0
0.050    118.0
0.500    182.0
0.600    195.0
0.800    228.0
0.850    240.0
0.900    260.0
0.950    289.0
0.970    310.0
0.990    364.0
0.999    499.0
1.000    557.0
Name: weight, dtype: float64

In [11]:
#Filter odd data 
## If the weight is less than 80lbs or above 869bs, exclude (chosen based on normative data).
aboutme.weight[ (aboutme.weight < 80) |  (aboutme.weight > 869) ] = 'LOW DATA QUALITY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)



In [12]:
aboutme.weight.value_counts()

185.0    235
160.0    218
170.0    215
180.0    204
165.0    202
200.0    195
175.0    190
190.0    184
150.0    170
140.0    167
210.0    164
195.0    162
220.0    150
155.0    149
145.0    146
135.0    130
230.0    126
215.0    125
130.0    119
240.0    110
205.0    102
225.0     92
250.0     91
125.0     86
168.0     86
158.0     81
203.0     80
178.0     77
162.0     74
245.0     73
        ... 
394.0      2
334.0      2
323.0      2
407.0      2
440.0      2
414.0      2
420.0      2
322.0      2
438.0      2
338.0      1
347.0      1
87.0       1
439.0      1
470.0      1
317.0      1
478.0      1
332.0      1
397.0      1
381.0      1
402.0      1
382.0      1
291.0      1
328.0      1
417.0      1
308.0      1
411.0      1
409.0      1
324.0      1
511.0      1
99.0       1
Name: weight, Length: 302, dtype: int64

### Clean caffeine data

In [13]:
aboutme.caffeine.describe()

count    9208.000000
mean        3.278454
std         4.889993
min         0.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        50.000000
Name: caffeine, dtype: float64

In [14]:
aboutme.caffeine.quantile([.01, .05, .5, .6, .8, .85, .90, .95, .97, 0.99, 0.999, 1])

0.010     0.0
0.050     0.0
0.500     2.0
0.600     3.0
0.800     4.0
0.850     5.0
0.900     6.0
0.950    10.0
0.970    16.0
0.990    25.0
0.999    50.0
1.000    50.0
Name: caffeine, dtype: float64

In [15]:
#remove caffeine intake above 99th percentile, in this case is 24.
aboutme.caffeine[aboutme.caffeine > 24 ] = 'LOW DATA QUALITY'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  



In [16]:
aboutme.caffeine.value_counts()

2.0                 2458
1.0                 1726
3.0                 1507
0.0                 1166
4.0                  866
5.0                  480
6.0                  316
8.0                  178
LOW DATA QUALITY      94
12.0                  87
24.0                  80
10.0                  80
16.0                  65
20.0                  42
7.0                   29
9.0                   11
18.0                   9
15.0                   7
14.0                   4
13.0                   2
21.0                   1
Name: caffeine, dtype: int64

### Create Internal & External Copies of the Data

In [17]:
#download the data 
sharing_info = pd.read_excel(syn.get('syn21958546').path)
healthCodes_with_broadsharing = sharing_info[sharing_info.sharing == 'all_qualified_researchers']

EXTERNAL_ABOUTME_DATA = aboutme[aboutme.participantId.isin(healthCodes_with_broadsharing['participant id'])]
EXTERNAL_ABOUTME_DATA.shape

(5690, 23)

### Total number of unique participants

In [18]:
len(EXTERNAL_ABOUTME_DATA.participantId.unique())

3329

### Total number of observations

In [19]:
len(EXTERNAL_ABOUTME_DATA)

5690

### External - Upload to Synapse

In [None]:
SH_EXTERNAL_PROJECT = 'syn18492837'
table_schema_external = Schema(name='About Me',
                               columns=as_table_columns(EXTERNAL_ABOUTME_DATA),
                               parent=SH_EXTERNAL_PROJECT)
aboutme_synTable_external = syn.store(Table(table_schema_external,EXTERNAL_ABOUTME_DATA))
aboutme_synTable_external

### Internal - Upload to Synapse

In [None]:
SH_INTERNAL_PROJECT = 'syn7066726'
table_schema_internal = Schema(name='About Me Internal',
                               columns=as_table_columns(aboutme), 
                               parent=SH_INTERNAL_PROJECT)
aboutme_synTable_internal = syn.store(Table(table_schema_internal,aboutme))

### Set Provenance

In [None]:
activity=Activity(name= 'About Me data curation', 
                  description='Process and convert raw data to table format', 
                  used=aboutme_raw_id, 
                  executed='https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_About_Me.ipynb')
#Internal 
syn.setProvenance(aboutme_synTable_internal, activity)

#External
syn.setProvenance(aboutme_synTable_external, activity)