# GAD7 Final Data Prep

In [None]:
%matplotlib inline

In [None]:
import datetime as dt
import os

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns

# Echo the value of every top-level expression in a cell, not only the last
# one, so that several `.head()` previews in one cell are all displayed.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and authenticate. login() is called without
# arguments, so credentials are presumably taken from a cached session or
# local configuration -- confirm for the environment this runs in.
syn = synapseclient.Synapse()
syn.login()

In [None]:
# Synapse IDs of the raw GAD-7 exports for study 1 (V1) and study 2 (V2).
v1sid = 'syn12181331'
v2sid = 'syn9974013'

# Fetch each entity and read its file as CSV, parsing the timestamp column of
# each export into datetimes.
v1r = pd.read_csv(
    syn.get(v1sid).path,
    parse_dates=['timestamp'],
)
v2r = pd.read_csv(
    syn.get(v2sid).path,
    parse_dates=['createdAt'],
)

# Preview both raw frames.
v1r.head()
v2r.head()

### Process V1 data

In [None]:
# Drop the unneeded columns and rename the remaining ones to the shared
# naming scheme (participant_id, dt_response, gad7_8).
v1 = v1r.drop(columns=[
    'userid', 'timestampUTC'
]).rename(columns={
    'brightenid': 'participant_id',
    'gad7_S':'gad7_8',
    'timestamp':'dt_response'
})

# Total per response: row-wise sum of every column from the third onward,
# computed as a single vectorized sum instead of a Python loop over iterrows().
# NOTE(review): this assumes the first two columns are the participant id and
# the response timestamp and that all remaining columns are numeric item
# scores -- confirm against the raw V1 export.
v1['gad7_sum'] = v1.iloc[:, 2:].sum(axis=1)
v1.head()

### Process V2 data

In [None]:
# Rename the raw V2 columns positionally. This mutates v2r in place; the
# public-release cell reads v2r under these names.
v2r.columns = ['participant_id', 'gad7_1', 'gad7_2', 'gad7_3', 'gad7_4', 'gad7_5', 'gad7_6', 'gad7_7', 'gad7_8', 'day', 'dt_response']

# Keep the id, the timestamp and the eight item columns, in that order.
# 'day' is left out simply by not selecting it, so no separate drop is needed.
# The explicit copy keeps v2 independent of v2r when gad7_sum is added.
gad7_items = ['gad7_{}'.format(i) for i in range(1, 9)]
v2 = v2r.loc[:, ['participant_id', 'dt_response'] + gad7_items].copy()

# Total per response: vectorized row-wise sum over the item columns (the same
# columns the positional slice from the third column onward used to cover).
v2['gad7_sum'] = v2[gad7_items].sum(axis=1)
v2.head()

In [None]:
# Build the V2 file for public distribution on the Synapse BRIGHTEN portal as
# a csv, using the public column names ('username', 'timestamp').
# Relies on v2r already carrying the renamed columns assigned in the V2
# processing cell. rename() returns a new frame, so v2r itself is unchanged.
v2p = v2r.rename(columns={
    'participant_id':'username',
    'dt_response':'timestamp'
}).loc[:,  ['username', 'timestamp', 'gad7_1', 'gad7_2', 'gad7_3', 'gad7_4', 'gad7_5', 'gad7_6', 'gad7_7', 'gad7_8']]

name = 'V2_GAD7.csv'
# index=False leaves the row labels out of the written file.
v2p.to_csv(name, index=False)

# Upload to the public portal and record which entity and notebook produced
# the file. Note that v2p is rebound here to the value setProvenance returns.
v2p = syn.setProvenance(
    syn.store(File(name=name, path=name, parent='syn10848316')),
    activity=Activity(
        name='Public Release',
        used=[v2sid],
        executed=dict(
                name='IPython Notebook',
                url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_GAD7_datafiles.ipynb'
        )
    )
)

# From here on v2sid names a different Synapse ID, so the provenance recorded
# for the combined table cites it instead of the raw export.
# NOTE(review): presumably the ID of the file uploaded above -- confirm.
v2sid='syn17023069'

### Combine the DataFrames

In [None]:
# Stack the V1 and V2 responses into one frame. sort=False leaves the column
# axis unsorted. Each input frame keeps its own row labels, so labels may
# repeat across the two halves.
combined = pd.concat(objs=[v1, v2], sort=False)
combined.head()

### Set provenance and upload to Synapse

In [None]:
# Describe the table: column definitions are derived from the combined frame.
gad7_schema = Schema(
    name='GAD7',
    columns=as_table_columns(combined),
    parent='syn10848316',
)

# Upload the combined data to Synapse as a table under that schema.
final = syn.store(Table(gad7_schema, combined))

In [None]:
# Record provenance on entity syn17022655: the source entities the data came
# from and the notebook that produced it. `final` is rebound to the value
# setProvenance returns.
# NOTE(review): the target ID is hard-coded; presumably it is the table stored
# in the previous cell -- confirm.
final = syn.setProvenance(
    'syn17022655',
    activity=Activity(
        name='Combine V1 and V2 data',
        description='Process and combine the data collected during study 1 and study 2',
        used=[v1sid, v2sid],
        executed=[
            {
                'name': 'IPython Notebook',
                'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_GAD7_datafiles.ipynb',
            },
        ],
    ),
)