# GAD7 Final Data Prep

In [None]:
%matplotlib inline

In [None]:
import datetime as dt
import os

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm

# Make IPython echo the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Synapse client session used by the rest of the notebook.
# NOTE(review): login() is called with no arguments, so it presumably relies
# on cached or configured credentials -- confirm before running elsewhere.
syn = synapseclient.Synapse()
syn.login()

# Register tqdm's `progress_apply` methods on pandas objects.
tqdm.pandas()

In [None]:
# Synapse IDs of the two raw GAD-7 exports (V1 and V2 of the study)
v1sid = 'syn12181331'
v2sid = 'syn9974013'


def _read_synapse_csv(synapse_id, date_column):
    """Fetch a Synapse file entity and read it as CSV, parsing one date column."""
    local_path = syn.get(synapse_id).path
    return pd.read_csv(local_path, parse_dates=[date_column])


# the two exports name their response-timestamp column differently
v1r = _read_synapse_csv(v1sid, 'timestamp')
v2r = _read_synapse_csv(v2sid, 'createdAt')

# preview both raw frames
v1r.head()
v2r.head()

### Process V1 data

In [None]:
# Drop the unneeded columns and rename the remaining ones to the shared
# naming scheme used for the combined table.
v1 = v1r.drop(columns=[
    'userid', 'timestampUTC'
]).rename(columns={
    'brightenid': 'participant_id',
    'gad7_S': 'gad7_8',
    'timestamp': 'dt_response'
})

# Row-wise total of every column from the third one onward, computed in a
# single vectorised call instead of a Python-level loop over iterrows().
# Missing values are skipped, as they were with np.sum on each row.
# NOTE(review): the selection is positional, so it assumes the first two
# columns are the participant id and the response timestamp, and it includes
# gad7_8 (renamed from gad7_S) in the total -- confirm both are intended.
v1['gad7_sum'] = v1.iloc[:, 2:].sum(axis=1)
v1.head()

### Process V2 data

In [None]:
# Names of the eight questionnaire item columns, shared by the steps below.
gad7_items = [f'gad7_{i}' for i in range(1, 9)]

# Rename the raw V2 columns (by position) to the shared naming scheme.
v2r.columns = ['participant_id'] + gad7_items + ['day', 'dt_response']

# Keep only the columns of interest, in the final order. Selecting by name
# already leaves 'day' out, so no separate drop is needed; .copy() makes sure
# the new column below is added to an independent frame.
v2 = v2r.loc[:, ['participant_id', 'dt_response'] + gad7_items].copy()

# Row-wise total of the item columns in one vectorised call (missing values
# are skipped), replacing the Python-level loop over iterrows().
v2['gad7_sum'] = v2[gad7_items].sum(axis=1)
v2.head()

### Combine the DataFrames

In [None]:
# Stack the V1 and V2 rows. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each source keeps its own row labels and the
# combined frame ends up with duplicate index values.
combined = pd.concat([v1, v2], sort=False, ignore_index=True)
combined.head()

In [None]:
# number of rows that repeat an earlier (participant, response time) pair
int(combined.duplicated(subset=['participant_id', 'dt_response']).sum())

### Add week in study

In [None]:
metasid = 'syn17023349'
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata['startdate'] = pd.to_datetime(metadata['startdate'])

# add in the participant's start date as a new column
combined = pd.merge(combined, metadata, on='participant_id', how='left')


def _response_day(ts):
    """Return the calendar day of ``ts`` as a naive midnight datetime.

    Missing timestamps are passed through as NaT instead of raising.
    """
    if pd.isnull(ts):
        return pd.NaT
    return dt.datetime(year=ts.year, month=ts.month, day=ts.day)


# whole days between the participant's start date and the day of the response
response_day = pd.to_datetime(combined['dt_response'].apply(_response_day))
elapsed_days = (response_day - combined['startdate']).dt.days

# 1-based week in study: days 0-6 -> week 1, days 7-13 -> week 2, ...
week = elapsed_days // 7 + 1

# Rows with no start date (unmatched in the left merge) or no response
# timestamp cannot be assigned a week. They are kept as NaN rather than
# raising on the integer conversion; the column is integer-typed whenever
# every row has a value.
combined['week'] = week if week.isna().any() else week.astype('int64')

# remove the start date
combined = combined.drop(columns=['startdate'], errors='ignore')

# Reorder the columns so 'week' sits right after the first two. The position
# is computed without assuming 'week' is currently last, so re-running this
# cell neither duplicates 'week' nor drops the final column.
cols = [c for c in combined.columns if c != 'week']
cols = cols[:2] + ['week'] + cols[2:]
combined = combined.reindex(columns=cols)

combined.head()

### Set provenance and upload to Synapse

In [None]:
# clear out the rows currently stored in the target table
existing_rows = syn.tableQuery('SELECT * FROM syn17022655')
t = syn.delete(existing_rows)

In [None]:
# describe the table: column definitions are derived from the combined frame
gad7_schema = Schema(
    name='GAD-7',
    columns=as_table_columns(combined),
    parent='syn10848316',
)

# pair the schema with the data and upload it
gad7_table = Table(gad7_schema, combined)
final = syn.store(gad7_table)

In [None]:
# the notebook that produced the table, recorded as the executed code
notebook_ref = {
    'name': 'IPython Notebook',
    'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_GAD7_datafiles.ipynb',
}

# provenance record: the two raw inputs plus the notebook above
combine_activity = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=[v1sid, v2sid],
    executed=[notebook_ref],
)

final = syn.setProvenance('syn17022655', activity=combine_activity)