# Curate_GAD7_Data

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import datetime as dt
import os

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Register tqdm's `progress_apply` helpers on pandas objects.
tqdm.pandas()

# Create the Synapse client and log in (no explicit credentials are passed here).
syn = synapseclient.Synapse()
syn.login()

Welcome, Abhishek Pratap!



In [3]:
# Synapse IDs of the raw V1 and V2 GAD-7 files.
v1sid = 'syn12181331'
v2sid = 'syn9974013'

# Fetch each file from Synapse and read it, parsing its response-time column as datetimes.
v1_file = syn.get(v1sid)
v1r = pd.read_csv(v1_file.path, parse_dates=['timestamp'])

v2_file = syn.get(v2sid)
v2r = pd.read_csv(v2_file.path, parse_dates=['createdAt'])

# Preview both raw frames.
v1r.head()
v2r.head()

Unnamed: 0,userid,brightenid,timestamp,timestampUTC,gad7_1,gad7_2,gad7_3,gad7_4,gad7_5,gad7_6,gad7_7,gad7_S
0,10431,BLUE-00048,2014-08-01 06:59:31,2014-08-01 10:59:30,1,0,1,1,0,0,0,0
1,10470,BLUE-00049,2014-08-01 12:31:14,2014-08-01 16:31:13,1,0,1,1,0,1,1,1
2,10519,BLUE-00050,2014-08-08 19:30:09,2014-08-09 02:30:10,1,1,1,1,1,1,1,0
3,10617,BLUE-00051,2014-08-05 13:34:21,2014-08-05 17:34:20,1,1,1,1,1,1,0,1
4,10672,BLUE-00052,2014-08-04 14:27:28,2014-08-04 21:27:28,3,3,3,3,3,3,3,3


Unnamed: 0,username,"Over the last 2 weeks, how often have you been bothered by any of the following problems? Feeling nervous, anxious, or on edge","Over the last 2 weeks, how often have you been bothered by any of the following problems? Not being able to stop or control worrying","Over the last 2 weeks, how often have you been bothered by any of the following problems? Worrying too much about different things","Over the last 2 weeks, how often have you been bothered by any of the following problems? Trouble relaxing","Over the last 2 weeks, how often have you been bothered by any of the following problems? Being so restless that it's hard to sit still","Over the last 2 weeks, how often have you been bothered by any of the following problems? Becoming easily annoyed or irritable","Over the last 2 weeks, how often have you been bothered by any of the following problems? Feeling afraid as if something awful might happen","If you checked off any problems, how difficult have these made it for you to do your work, take care of things at home, or get along with other people?",day,createdAt
0,EN00469,0,1,1,1,0,1,0,2,1,2016-11-11 14:03:15
1,EN05051,0,1,1,1,0,0,0,0,1,2016-09-03 15:53:34
2,EN05023,2,1,2,3,1,3,1,1,1,2016-09-03 12:51:17
3,EN05184,3,3,3,3,1,3,2,2,1,2016-11-06 17:57:18
4,EN00486,2,3,3,2,2,2,1,1,1,2016-11-11 18:22:08


### Process V1 data

In [None]:
# Drop the unneeded columns and rename the remaining ones to the shared schema.
v1 = v1r.drop(columns=[
    'userid', 'timestampUTC'
]).rename(columns={
    'brightenid': 'participant_id',
    'gad7_S': 'gad7_8',
    'timestamp': 'dt_response'
})

# Row-wise total over the eight gad7_* columns, selected by name rather than
# by position so the result does not depend on column order. Vectorized
# replacement for the per-row `iterrows()` loop; the values are the same.
# NOTE(review): gad7_8 (renamed from gad7_S) is included in this total. The
# published GAD-7 score is the sum of the seven symptom items only -- confirm
# that including gad7_8 is intended.
v1_item_cols = [f'gad7_{i}' for i in range(1, 9)]
v1['gad7_sum'] = v1[v1_item_cols].sum(axis=1)
v1.head()

### Process V2 data

In [None]:
# Replace the long question-text headers with short names. The assignment is
# positional, so it relies on the column order of the raw V2 file.
v2_item_cols = [f'gad7_{i}' for i in range(1, 9)]
v2r.columns = ['participant_id'] + v2_item_cols + ['day', 'dt_response']

# Keep only the shared-schema columns, in the shared order ('day' is left out).
# The explicit copy makes `v2` independent of `v2r` before a column is added.
v2 = v2r.loc[:, ['participant_id', 'dt_response'] + v2_item_cols].copy()

# Row-wise total over the eight gad7_* columns, selected by name. Vectorized
# replacement for the per-row `iterrows()` loop; the values are the same.
# NOTE(review): gad7_8 is the "how difficult have these made it..." question
# and is included in this total. The published GAD-7 score is the sum of the
# seven symptom items only -- confirm that including gad7_8 is intended.
v2['gad7_sum'] = v2[v2_item_cols].sum(axis=1)
v2.head()

### Combine the DataFrames

In [None]:
# Stack the V1 and V2 frames row-wise; sort=False leaves the columns unsorted.
frames = [v1, v2]
combined = pd.concat(frames, sort=False)
combined.head()

In [None]:
# Count rows that repeat an earlier (participant_id, dt_response) pair.
is_repeat = combined.duplicated(subset=['participant_id', 'dt_response'])
sum(is_repeat)

### Add week in study

In [4]:
# Synapse ID of the table queried below for participant_id and startdate.
metasid = 'syn27082597'

In [None]:
# Fetch each participant's start date from the metadata table.
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# add in the participants start date as a new column
combined = pd.merge(combined, metadata, on='participant_id', how='left')

# Time between the calendar date of the response (time of day removed) and
# the participant's start date.
elapsed = combined['dt_response'].dt.normalize() - combined['startdate']

# 1-based week in study: days 0-6 -> week 1, days 7-13 -> week 2, and so on.
# Floor division gives the same result as floor(days / 7), including for
# responses dated before the start date.
week = elapsed.dt.days // 7 + 1

# Participants with no start date in the metadata table yield NaN here. The
# previous per-row np.int16(...) conversion raised on NaN; keep plain integers
# when nothing is missing and use the nullable integer dtype otherwise.
combined['week'] = week.astype('int64') if week.notna().all() else week.astype('Int64')

# remove the start date
combined = combined.drop(columns=['startdate'], errors='ignore')

# Put 'week' third, right after participant_id and dt_response. Its position
# is looked up by name so re-running this cell neither duplicates 'week' nor
# drops the last column.
ordered_cols = [col for col in combined.columns if col != 'week']
ordered_cols.insert(2, 'week')
combined = combined[ordered_cols]

combined.head()

### Localize timestamps

In [None]:
# Label every response time as UTC and store its string representation.
# After this cell dt_response holds plain strings, so the cell cannot be
# re-run without rebuilding `combined` first.
# NOTE(review): in the V1 preview `timestamp` differs from `timestampUTC` by
# several hours, so the V1 values may be local times that are labelled as UTC
# here -- confirm.
combined['dt_response'] = combined['dt_response'].apply(
    lambda stamp: str(stamp.tz_localize('UTC'))
)

combined.head()

### Set provenance and upload to Synapse

In [None]:
# Select every row currently in table syn17022655, then delete that row set.
# NOTE(review): the rows are removed before the upload in the following cell;
# presumably a failed upload would leave the table empty -- confirm this is
# acceptable.
existing_rows = syn.tableQuery('SELECT * FROM syn17022655')
t = syn.delete(existing_rows)

In [None]:
# Build the table schema (columns derived from `combined`) under parent
# syn10848316, then store the schema together with the rows in one call.
gad7_schema = Schema(
    name='GAD-7',
    columns=as_table_columns(combined),
    parent='syn10848316',
)
final = syn.store(Table(gad7_schema, combined))

In [5]:
# Provenance for table syn17022655: the two raw files are recorded as inputs
# and this notebook's GitHub URL as the executed code.
notebook_ref = {
    'name': 'Curate_GAD7_Data',
    'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_GAD7_Data.ipynb',
}
curation_activity = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=[v1sid, v2sid],
    executed=[notebook_ref],
)
final = syn.setProvenance('syn17022655', activity=curation_activity)