In [None]:
%matplotlib inline

import datetime as dt
import itertools as it

from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
from tabulate import tabulate
from tqdm import tqdm

# Register `progress_apply` on pandas objects so long row-wise operations show a progress bar.
tqdm.pandas()

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Open the Synapse session used by every download/upload below.
# No credentials are passed here, so they must already be available to the client.
syn = synapseclient.Synapse()
syn.login()

In [None]:
# Download the source CSV from Synapse and load it, parsing `date` as a timestamp.
v2sid = 'syn17037408'
v2r = pd.read_csv(syn.get(v2sid).path, parse_dates=['date'])

# Switch to the column names used throughout the rest of this notebook.
v2r = v2r.rename(columns={'username': 'participant_id', 'date': 'dt_passive'})

# Identifying columns go first; every other column follows in alphabetical order.
leading = ['participant_id', 'dt_passive']
trailing = sorted(set(v2r.columns).difference(leading))
v2 = v2r.reindex(columns=leading + trailing)

# Keep three decimal places of the location variance.
v2.location_variance = v2.location_variance.round(3)

### Add the week in the study

In [None]:
# Look up each participant's study start date in the Synapse metadata table.
metasid = 'syn17023349'
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# add in the participants start date as a new column
v2 = pd.merge(v2, metadata, on='participant_id', how='left')

# exclude any rows before the start date; rows without a date are deliberately kept.
# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning on a filtered slice.
v2 = v2.loc[(v2.dt_passive >= v2.startdate) | v2.dt_passive.isnull()].copy()

# get the time difference in weeks as a float
# .dt.normalize() truncates each timestamp to midnight and leaves missing dates
# as NaT, so the null rows kept by the filter above yield NaN instead of raising.
# NOTE(review): assumes startdate carries no time-of-day — TODO confirm.
v2['week'] = (v2.dt_passive.dt.normalize() - v2.startdate).dt.days / 7

# convert the week number to an int by taking the floor (missing values stay NaN)
v2.week = v2.week.progress_apply(lambda x: np.int16(np.floor(x)) if not pd.isnull(x) else np.nan)

# remove the start date
v2 = v2.drop(columns=['startdate'], errors='ignore')

# reorder the columns: `week` goes right after the two leading columns.
# Built from the non-week columns so the result is the same wherever `week`
# currently sits (e.g. when this cell is re-run).
others = [c for c in v2.columns if c != 'week']
cols = others[:2] + ['week'] + others[2:]
v2 = v2.reindex(columns=cols)

v2.head()

### Reduce dt_passive to date only

In [None]:
# Keep only the calendar date of each timestamp.
# The .dt accessor passes missing values through as NaT, whereas calling
# .date() on a NaT element raises.
v2.dt_passive = v2.dt_passive.dt.date

### Upload to Synapse

In [None]:
# Select every row currently in table syn17061218, then delete those rows.
# NOTE(review): presumably this is the table the following upload re-populates — confirm.
existing_rows = syn.tableQuery('select * from syn17061218')
t = syn.delete(existing_rows)

In [None]:
# Describe the destination table; its column definitions are derived from the frame itself.
gps_schema = Schema(
    name='GPS Features (v2)',
    columns=as_table_columns(v2),
    parent='syn10848316',
)

# Upload the frame under that schema.
final = syn.store(Table(gps_schema, v2))

In [None]:
# The notebook that produced the table, recorded as the executed code.
notebook_ref = {
    'name': 'IPython Notebook',
    'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_V2GpsFeatures_datafiles.ipynb',
}

# Provenance record: the source file that was used and the code that was run.
provenance = Activity(
    name='Generate Public GPS (v2) Table Data',
    description='Process the data collected during study ',
    used=[v2sid],
    executed=[notebook_ref],
)

# Attach the record to the table; note this rebinds `final` to the call's return value.
final = syn.setProvenance('syn17061218', activity=provenance)

### Make data dictionary

In [None]:
headers = ['#', 'Variable Name', 'Data Type', 'Description']

# (pandas dtype text, label shown in the data dictionary).
# Applied in this order as plain substring replacements.
dtype_labels = [
    ('object', 'str'),
    ('float64', 'float'),
    ('int16', 'int'),
    ('int64', 'int'),
    ('datetime64[ns]', 'DateTime'),
]


def _dtype_label(dtype):
    """Return the data-dictionary label for a pandas dtype."""
    label = str(dtype)
    for pandas_name, shown_as in dtype_labels:
        label = label.replace(pandas_name, shown_as)
    return label


# One (column name, type label) pair per column, in column order.
cheat = [(column, _dtype_label(v2[column].dtype)) for column in v2.columns]
    
# One description per column of v2, in column order: participant_id, dt_passive,
# week, then the feature columns in alphabetical order.
# NOTE(review): entries are matched to columns purely by position — verify the
# feature entries against the actual sorted column names.
# Entries containing LaTeX are raw strings so that backslash sequences such as
# `\text` are emitted literally instead of being read as escapes (`\t` is a TAB).
descriptions = [
    'Unique ID',                    # participant id
    'Date of aggregated data',      # dt_passive
    'Number of whole weeks between the participant start date and the date of the data (0 for the first week)',  # week
    r'$$\(\in \)$$ { _true_, _false_ } indicating whether or not the participant came to work',
    'The cumulative distance traveled in the _active_ velocity bin (_meters_)', # distance_active
    'The cumulative distance traveled in the _high speed transportation_ velocity bin (_meters_)',
    'The cumulative distance traveled in the _powered vehicle_ velocity bin (_meters_)',
    'The cumulative distance traveled in the _walking_ velocity bin (_meters_)',
    'The count of hours in the day for which GPS records exist ',
    'The cumulative time spent in the _active_ velocity bin (_hours_)',
    'The cumulative time spent in the _home_ cluster (_hours_)',
    'The cumulative time spent in the _work_ cluster  (_hours_)',
    'The cumulative time spent in the _high speed transportation_ velocity bin (_hours_)',
    'The estimated hours of sleep accrued during the previous night (_hours_)',
    'The cumulative time spent in the _powered vehicle_ velocity bin (_hours_)',
    'The cumulative time spent in the participants top 3 most visited clusters throughout the study period (_hours_)',
    'The cumulative time spent in the _stationary_ velocity bin (_hours_)',
    'The cumulative time spent in the _stationary_ velocity bin excluding the time spent in the home and work clusters (_hours_)',
    'The cumulative time spent in the _walking_ velocity bin (_hours_)',
    'The count of hours in the day for which GPS does not exist',
    r'log($$\(\sigma^2_{\text{lat}}+\sigma^2_{\text{lon}}\)$$)',
    'The distinct number of clusters visited'
]

# Turn the (name, type) pairs into a frame numbered from 1; the index becomes the '#' column.
cheat = pd.DataFrame(
    cheat,
    columns=headers[1:-1],
    index=np.arange(1, len(cheat)+1)
)

# Attach the descriptions, padding with empty strings for columns that have none.
cheat['Description'] = descriptions + [''] * (len(cheat) - len(descriptions))

# Render as an org-mode table, then turn the '+' column joints of the separator
# row (the line right under the header) into '|'. Only that row is rewritten:
# a table-wide replace would also alter any '+' inside the descriptions
# themselves (e.g. in a formula).
table_lines = tabulate(
    cheat,
    headers=headers,
    tablefmt='orgtbl'
).split('\n')
if len(table_lines) > 1:
    table_lines[1] = table_lines[1].replace('+', '|')

print('\n'.join(table_lines))