# PHQ2 Final Data Prep

In [None]:
import datetime as dt
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns

# Have IPython echo every top-level expression in a cell (not only the last
# one), so cells that end with several `.head()` calls display all of them.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and log in. No credentials are passed explicitly,
# so they presumably come from a cached session or config file -- TODO confirm.
syn = synapseclient.Synapse()
syn.login()

### Get the data

In [None]:
# Synapse entity IDs of the raw PHQ2 exports for study 1 (V1) and study 2 (V2)
v1_raw_id = 'syn10250486'
v2_raw_id = 'syn9974012'

# V1 is fetched from Synapse and read as an Excel workbook
v1_path = syn.get(v1_raw_id).path
v1r = pd.read_excel(v1_path)

# V2 is fetched from Synapse and read as a CSV; `createdAt` is parsed into
# timestamps at load time
v2_path = syn.get(v2_raw_id).path
v2r = pd.read_csv(v2_path, parse_dates=['createdAt'])

# preview both raw frames
v1r.head()
v2r.head()

| field         | data type | description                                                                                                                                                          |
|---------------|-----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| participant_id | str       |                                                                                                                                                                     |
| dt_response  | DateTime  | A timestamp representing the time when the survey was completed. Timestamps are local to the survey location.                                                        |
| dt_yesterday  | Date      | The date for which the mood of this survey represents                                                                                                                |
| day_of_study  | int       | The number of days into the study the participant was when they completed this survey                                                                                |
| q1_response   | int       | A response to "Yesterday, were you feeling down, depressed, or hopeless?" $\in \{1, 2, 3, 4, 5\}$ as an ordinal level of intensity with 5 being the highest          |
| q2_response   | int       | A response to "Yesterday, did you have little interest or pleasure in doing things?" $\in \{1, 2, 3, 4, 5\}$ as an ordinal level of intensity with 5 being the highest |
| qsum          | int       | q1_response + q2_response                                                                                                                                            |

### Process V1 data

In [None]:
# give the participant and question columns their short names, then drop the
# unneeded columns
v1r = v1r.rename(columns={
    'Feeling down, depressed, or hopeless.': 'q1',
    'Little interest or pleasure in doing things.': 'q2',
    'brightenid': 'participant_id',
}).drop(columns=[
    'sent_time_local',
    'sent_time_utc',
    'response_utc',
    'response_id',
    'user_id',
    'start',
    'phq2',
])

# add in yesterday's date
def dx(x):
    """Return the calendar date of the day before timestamp ``x``.

    Parameters
    ----------
    x : datetime-like
        A value supporting subtraction of ``datetime.timedelta`` and exposing
        ``year``/``month``/``day`` (e.g. ``datetime.datetime`` or
        ``pandas.Timestamp``). May be missing (``NaT``/``None``/``NaN``).

    Returns
    -------
    datetime.date or pandas.NaT
        The date one day before ``x``; ``pandas.NaT`` when ``x`` is missing.
    """
    # A missing timestamp has no "yesterday". Without this guard, NaT's
    # year/month/day are NaN and dt.date() raises.
    if pd.isna(x):
        return pd.NaT
    t = x - dt.timedelta(days=1)
    return dt.date(year=t.year, month=t.month, day=t.day)

# the survey asks about "yesterday", so the day it describes is the day
# before the local response timestamp
v1r['dt_yesterday'] = v1r.response_local.apply(dx)

# add qsum (element-wise column addition rather than a Python-level row loop)
v1r['qsum'] = v1r['q1'] + v1r['q2']

### Process V2 Data

In [None]:
# rename some columns so V2 uses the same names as the processed V1 frame
# NOTE(review): q2 is taken from the "Irritable or Anxious?" item here, while
# the data dictionary describes q2 as "little interest or pleasure in doing
# things" -- confirm this is the intended V2 source column.
v2r = v2r.rename(columns={
    'YESTERDAY, were you bothered by any of the following problems? Feeling down, depressed, or hopeless.':'q1',
    'YESTERDAY, were you bothered by any of the following problems? Irritable or Anxious?':'q2',
    'username':'participant_id',
    'createdAt': 'response_local'
})

# add yesterday's date (the day the survey answers refer to)
v2r['dt_yesterday'] = v2r.response_local.apply(dx)

# add qsum (element-wise column addition rather than a Python-level row loop)
v2r['qsum'] = v2r['q1'] + v2r['q2']

### Combine the DataFrames

In [None]:
# stack V1 on top of V2, keep only the published columns (in this order),
# and expose the response timestamp under its final name
# NOTE(review): the data dictionary names these columns day_of_study,
# q1_response and q2_response, whereas the frame keeps day, q1 and q2 --
# confirm which naming is intended.
combined = (
    pd.concat([v1r, v2r], sort=False)[[
        'participant_id',
        'response_local',
        'dt_yesterday',
        'day',
        'q1',
        'q2',
        'qsum',
    ]]
    .rename(columns={'response_local':'dt_response'})
)
combined.head()

### Set provenance and upload to Synapse

In [None]:
# describe the Synapse table: its columns are derived from the combined
# DataFrame and it is created under the given parent entity
phq2_schema = Schema(
    name='Daily PHQ2 Survey',
    columns=as_table_columns(combined),
    parent='syn10848316',
)

# upload the combined responses as the rows of that table
phq2_final = syn.store(Table(phq2_schema, combined))

In [None]:
# provenance record: which raw entities were used and which notebook ran
combine_activity = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=[v1_raw_id, v2_raw_id],
    executed=[
        {
            'name': 'IPython Notebook',
            'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_PHQ2_datafiles.ipynb',
        },
    ],
)

# NOTE(review): the target entity ID is hard-coded rather than taken from the
# object returned by the upload step, and this rebinds `phq2_final` to the
# value returned by setProvenance -- confirm both are intended.
phq2_final = syn.setProvenance('syn17020587', activity=combine_activity)