# My Family Data Prep

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Display the value of every expression statement in a cell, not only the last
# one (this is why cells with several bare expressions show several outputs).
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client used by every download/upload cell below.
syn = synapseclient.Synapse()
# No credentials are passed here; presumably the client falls back on cached
# or config-file credentials -- confirm before running in a new environment.
syn.login()


UPGRADE AVAILABLE

A more recent version of the Synapse Client (2.0.0) is available. Your version (1.9.4) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 2.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Sean Deering!



### Get Raw My Family Data

In [2]:
# Synapse id of the raw My Family table (also recorded as provenance later on).
myfamily_raw_id = 'syn7117879'

# Fetch the entity, then read the file it was downloaded to as CSV.
myfamily_raw_entity = syn.get(myfamily_raw_id)
myfamily = pd.read_csv(myfamily_raw_entity.path)

### Process My Family Data

In [3]:
#remove brackets
def remove_brackets(col):
    """Strip every '[' and ']' character from the string values of a column.

    Parameters
    ----------
    col : pandas.Series
        One column of the table (this function is applied column-wise).

    Returns
    -------
    pandas.Series
        ``col`` with all square brackets removed from its string values.
        A column that holds no string data (i.e. has no ``.str`` accessor,
        such as a numeric or all-NaN column) is returned unchanged instead
        of raising ``AttributeError``.
    """
    try:
        text = col.str
    except AttributeError:
        # Non-string columns cannot contain brackets; pass them through.
        return col
    # regex=False: '[' on its own is not a valid regular expression, so both
    # characters are matched literally regardless of the pandas default.
    return text.replace(']', '', regex=False).str.replace('[', '', regex=False)
# Strip the square brackets from every column of the raw table.
myfamily = myfamily.apply(remove_brackets, axis=0)

# Columns left out of the curated table: the free-text field and the
# brothers/sisters breakdown.
columns_to_drop = ['language2',
                   'brothers_only', 'brothers&sisters1', 'brothers&sisters2', 'sisters_only']
myfamily = myfamily.drop(columns=columns_to_drop)

# Discard rows in which every one of the survey fields is blank.
survey_fields = ['fam_history', 'family_size', 'language', 'underage_family']
myfamily = myfamily.dropna(subset=survey_fields, how='all')
myfamily.columns

Index([u'participantId', u'fam_history', u'family_size', u'language',
       u'underage_family', u'timestamp'],
      dtype='object')

### Filter based on age

In [4]:
# Tab-separated list of participants to exclude on the basis of age.
underage_path = syn.get('syn21905452').path
underage_participants = pd.read_csv(underage_path, sep="\t")

# Keep only the rows whose participantId is absent from that list.
is_underage = myfamily.participantId.isin(underage_participants.participantId)
myfamily = myfamily[~is_underage]

### Remove test accounts

In [5]:
# Spreadsheet listing the test accounts.
test_accounts_path = syn.get('syn21958537').path
test_accounts = pd.read_excel(test_accounts_path)

# Keep only the rows whose participantId is not a test account.
is_test_account = myfamily.participantId.isin(test_accounts.participantId)
myfamily = myfamily[~is_test_account]

### Create Internal & External Copies of the Data

In [6]:
# Download the per-participant sharing settings.
sharing_info = pd.read_excel(syn.get('syn21958546').path)

# Participants whose sharing setting is 'all_qualified_researchers'.
has_broad_sharing = sharing_info.sharing == 'all_qualified_researchers'
healthCodes_with_broadsharing = sharing_info[has_broad_sharing]

# The external copy is limited to rows belonging to those participants.
broad_sharing_ids = healthCodes_with_broadsharing['participant id']
EXTERNAL_MYFAMILY_DATA = myfamily[myfamily.participantId.isin(broad_sharing_ids)]
EXTERNAL_MYFAMILY_DATA.shape

(3003, 6)

### Number of unique participants

In [7]:
# dropna=False so a missing id would be counted, exactly as len(unique()) does.
EXTERNAL_MYFAMILY_DATA['participantId'].nunique(dropna=False)

2760

### Total number of observations

In [8]:
# Row count of the external table.
EXTERNAL_MYFAMILY_DATA.shape[0]

3003

### External - Upload to Synapse

In [9]:
# Synapse project that receives the external table.
SH_EXTERNAL_PROJECT = 'syn18492837'

# Derive the column definitions from the data frame, then build the schema.
external_columns = as_table_columns(EXTERNAL_MYFAMILY_DATA)
table_schema_external = Schema(name='My Family',
                               columns=external_columns,
                               parent=SH_EXTERNAL_PROJECT)

# Store the schema together with the rows; keep the result for provenance.
external_table = Table(table_schema_external, EXTERNAL_MYFAMILY_DATA)
myfamily_synTable_external = syn.store(external_table)
myfamily_synTable_external

<synapseclient.table.CsvFileTable at 0x115623950>

### Internal - Upload to Synapse

In [10]:
# Synapse project that receives the internal table.
SH_INTERNAL_PROJECT = 'syn7066726'

# Derive the column definitions from the data frame, then build the schema.
internal_columns = as_table_columns(myfamily)
table_schema_internal = Schema(name='My Family Internal',
                               columns=internal_columns,
                               parent=SH_INTERNAL_PROJECT)

# Store the schema together with the rows; keep the result for provenance.
internal_table = Table(table_schema_internal, myfamily)
myfamily_synTable_internal = syn.store(internal_table)

### Set Provenance

In [11]:
# One activity record: the raw table as the input that was used, and this
# notebook's URL as the code that was executed.
notebook_url = 'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Family.ipynb'
activity = Activity(
    name='My Family data curation',
    description='Process and convert raw data to table format',
    used=myfamily_raw_id,
    executed=notebook_url,
)

# Attach the activity to each stored table. The two calls are kept as separate
# top-level statements so that each call's return value is displayed.
# Internal
syn.setProvenance(myfamily_synTable_internal, activity)

# External
syn.setProvenance(myfamily_synTable_external, activity)

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-20T20:24:08.954Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'f79f0162-52e3-4225-a8e5-7745b4995c4a',
 u'id': u'10233750',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-20T20:24:08.954Z',
 u'name': u'My Family data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117879', u'targetVersionNumber': 791},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Family.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Family.ipynb',
   u'wasExecuted': True}]}

{u'createdBy': u'3334346',
 u'createdOn': u'2020-04-20T20:24:10.924Z',
 u'description': u'Process and convert raw data to table format',
 u'etag': u'50ea847c-5252-42dd-9aaf-414334c1c2fa',
 u'id': u'10233751',
 u'modifiedBy': u'3334346',
 u'modifiedOn': u'2020-04-20T20:24:10.924Z',
 u'name': u'My Family data curation',
 u'used': [{u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedEntity',
   u'reference': {u'targetId': u'syn7117879', u'targetVersionNumber': 791},
   u'wasExecuted': False},
  {u'concreteType': u'org.sagebionetworks.repo.model.provenance.UsedURL',
   u'name': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Family.ipynb',
   u'url': u'https://github.com/apratap/SleepHealth_Data_Release/blob/master/Create_My_Family.ipynb',
   u'wasExecuted': True}]}