# Curate_MentalHealthServices_data

In [3]:
import datetime as dt
import itertools
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm

# display the value of every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
# create the Synapse client and log in (no credentials are passed explicitly here)
syn = synapseclient.Synapse()
syn.login()

# register tqdm's pandas integration; `progress_apply` is used later in this notebook
tqdm.pandas()

def isnum(x):
    """Report whether ``x`` can be converted to a number with ``float()``.

    Parameters
    ----------
    x : object
        Any value. ``None`` is treated as not numeric.

    Returns
    -------
    bool
        True when ``float(x)`` succeeds, False otherwise. Anything ``float``
        accepts counts as numeric, including strings such as ``'nan'``.
    """
    if x is None:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: text that does not parse as a number.
        # TypeError: objects float() cannot convert at all (lists, dicts, ...),
        # which previously escaped as an exception instead of returning False.
        return False
    return True

Welcome, Abhishek Pratap!



In [4]:
# Synapse IDs of the two raw source files
v1sid = 'syn12181333'
v2sid = 'syn12181343'

# fetch each file through the Synapse client and read it as CSV,
# parsing the `timestamp` column as datetimes
v1r, v2r = (
    pd.read_csv(syn.get(sid).path, parse_dates=['timestamp'])
    for sid in (v1sid, v2sid)
)

v1r.head()
v2r.head()

Unnamed: 0,userid,brightenid,timestamp,timestampUTC,mhs_1,mhs_2,mhs_3,mhs_4,mhs_5
0,10431,BLUE-00048,2014-08-01 07:00:47,2014-08-01 11:00:48,No,No,No,No,No
1,10470,BLUE-00049,2014-08-01 12:32:32,2014-08-01 16:32:31,No,No,No,No,Yes
2,10519,BLUE-00050,2014-08-08 19:32:57,2014-08-09 02:32:57,No,No,No,No,No
3,10519,BLUE-00050,2014-09-08 18:23:10,2014-09-09 01:23:09,No,No,No,No,No
4,10519,BLUE-00050,2014-11-03 08:37:44,2014-11-03 16:37:43,No,No,No,No,No


Unnamed: 0,userid,timestamp,mhs_1,mhs_2,mhs_3,mhs_4,mhs_5
0,EN05337,2017-03-26 17:52:57,No,No,No,No,No
1,EN00252,2016-09-28 19:56:34,No,No,No,No,Yes
2,EN00599,2016-12-05 08:12:32,No,No,No,No,No
3,EN00625,2016-12-19 17:14:09,No,No,No,No,No
4,EN00150,2016-10-14 01:32:47,No,No,No,No,No


In [None]:
# # upload v2 to public release portal as csv
# v2p = v2r.copy()
# name = 'V2_MentalHealthSvc.csv'

# v2p.to_csv(name, index=None)
# v2p = syn.setProvenance(
#     syn.store(File(name=name, path=name, parent='syn10848316')),
#     activity=Activity(
#         name='Public Release',
#         description='Prepare data for public release',
#         used=[v2sid],
#         executed=[
#             dict(
#                 name='IPython Notebook',
#                 url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_ImpactMHS_datafiles.ipynb'
#             )
#         ]
#     )
# )

In [None]:
# Synapse ID of the second raw source file (same value as assigned in the loading cell)
v2sid = 'syn12181343'

In [None]:
# rename the identifier and timestamp columns to the shared names, and
# discard the `userid` and `timestampUTC` columns
v1 = (
    v1r
    .rename(columns=dict(brightenid='participant_id', timestamp='dt_response'))
    .drop(columns=['userid', 'timestampUTC'])
)

# convert the response columns to appropriate indicators
def fx(x):
    """Encode one response as an integer indicator.

    'No' maps to 0 and 'Yes' maps to 1; any other value maps to -1.
    """
    for answer, indicator in (('No', 0), ('Yes', 1)):
        if x == answer:
            return indicator
    return -1
    
# recode mhs_1 .. mhs_5 in place using fx
for column in [f'mhs_{n}' for n in range(1, 6)]:
    v1[column] = v1[column].apply(fx)

v1.head()

In [None]:
# rename the identifier and timestamp columns to the shared names
v2 = v2r.rename(columns=dict(userid='participant_id', timestamp='dt_response'))

# convert the response columns to appropriate indicators
def fx(x):
    """Encode one response as an integer indicator.

    'No' maps to 0 and 'Yes' maps to 1; any other value maps to -1.
    """
    for answer, indicator in (('No', 0), ('Yes', 1)):
        if x == answer:
            return indicator
    return -1
    
# recode mhs_1 .. mhs_5 in place using fx
for column in [f'mhs_{n}' for n in range(1, 6)]:
    v2[column] = v2[column].apply(fx)

v2.head()

### Combine the DataFrames

In [None]:
# stack the v1 rows on top of the v2 rows without sorting the columns
frames = [v1, v2]
combined = pd.concat(frames, sort=False)
combined.head()

### Add week into study

In [6]:
# Synapse ID of the table queried for participant_id / startdate when computing
# the study week (the same value is assigned again at the top of the next cell)
metasid = 'syn27082597'

In [None]:
metasid = 'syn27082597'

# fetch each participant's start date from the Synapse metadata table
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# Start from the responses without any `week` / `startdate` column left behind by
# an earlier run of this cell, so a re-run neither collides in the merge
# (startdate_x / startdate_y) nor ends up with a duplicated `week` column.
responses = combined.drop(columns=['week', 'startdate'], errors='ignore')

# add in the participant's start date as a new column
with_start = pd.merge(responses, metadata, on='participant_id', how='left')

# An integer week cannot be derived from a missing date. Fail with a readable
# message, and do so before `combined` is reassigned, so the notebook state is
# unchanged when this check trips.
undated = with_start.dt_response.isnull() | with_start.startdate.isnull()
if undated.any():
    undated_ids = sorted(set(with_start.loc[undated, 'participant_id'].astype(str)))
    raise ValueError(
        f'{int(undated.sum())} response(s) have no response timestamp or no start date; '
        f'affected participant_id values: {undated_ids}'
    )

# whole days between the calendar day of the response (time of day removed)
# and the participant's start date
days_in_study = (with_start.dt_response.dt.normalize() - with_start.startdate).dt.days

# floor to whole weeks and make the count 1-based (days 0-6 -> week 1, 7-13 -> week 2, ...)
week = days_in_study // 7 + 1

# remove the start date and place `week` directly after the first two columns;
# inserting by position does not depend on where `week` previously sat
with_start = with_start.drop(columns=['startdate'])
with_start.insert(2, 'week', week)
combined = with_start

combined.head()

In [None]:
# histogram of the computed study week
combined['week'].hist()

### Localize timestamps

In [None]:
# label every response timestamp as UTC and store its string form
combined['dt_response'] = list(
    map(lambda stamp: str(stamp.tz_localize('UTC')), combined['dt_response'])
)

combined.head()

### Set provenance and upload to Synapse

In [None]:
# query all rows of syn17022660 and pass the result to syn.delete
# NOTE(review): presumably this clears the table that the store cell below refills — confirm
existing_rows = syn.tableQuery('select * from syn17022660')
t = syn.delete(existing_rows)

In [None]:
# describe the table: column definitions are derived from the combined DataFrame
mhs_schema = Schema(
    name='Mental Health Services',
    columns=as_table_columns(combined),
    parent='syn10848316',
)

# upload the combined rows under that schema
final = syn.store(Table(mhs_schema, combined))

In [7]:
# the notebook recorded as the executed code for this activity
notebook_ref = dict(
    name='Curate_MentalHealthServices_data',
    url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_MentalHealthServices_data.ipynb'
)

# provenance record: the two raw files and the metadata table are the inputs
combine_activity = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=[v1sid, v2sid, metasid],
    executed=[notebook_ref]
)

final = syn.setProvenance('syn17022660', activity=combine_activity)