In [None]:
%matplotlib inline

import datetime as dt
import itertools as it

from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
from tabulate import tabulate

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Synapse client shared by every later cell in this notebook.
syn = synapseclient.Synapse()
# No credentials are passed here; presumably login() falls back to cached or
# config-file credentials -- TODO confirm for the environment running this.
syn.login()

### Prep the data

Source data file on Synapse: https://www.synapse.org/#!Synapse:syn15667977

In [None]:
# Synapse ID of the raw V2 feature file; reused later as the provenance input.
v2sid = 'syn15667977'

# Fetch the file through the Synapse client, read the local copy with both
# study-boundary columns parsed as timestamps, then switch to the published
# column names in the same chain.
v2 = pd.read_csv(
    syn.get(v2sid).path,
    parse_dates=['end_date', 'enrollment_date'],
).rename(columns={
    'username': 'participant_id',
    'end_date': 'dt_end',
    'enrollment_date': 'dt_enrollment',
})

v2.head()

In [None]:
# List every available column name alphabetically, one per line.
# A plain loop is used because print() is called purely for its side effect;
# a list comprehension would build and keep a throwaway list of None values.
for column_name in sorted(v2.columns):
    print(column_name)

In [None]:
# take the features to make public
cols = [
    'participant_id', 'dt_enrollment', 'dt_end', 'days_in_study',
    'days_with_data', 'home_work_distance', 'hours_clustered', 'hours_with_data', 
    'mti_between_home_visits', 'mti_between_work_visits', 'n_clusters'
]
v2 = v2.reindex(columns=cols)

# round and cast
v2.days_in_study = [np.int16(i) if not pd.isnull(i) else np.nan for i in v2.days_in_study]
v2.hours_clustered = np.around(v2.hours_clustered, 1)
v2.home_work_distance = np.around(v2.home_work_distance, 1)
v2.mti_between_home_visits = np.around(v2.mti_between_home_visits, 1)
v2.mti_between_work_visits = np.around(v2.mti_between_work_visits, 1)
v2.n_clusters = [np.int16(i) if not pd.isnull(i) else np.nan for i in v2.n_clusters]

v2.head()

### Reduce Timestamps to dates

In [None]:
# Drop the time-of-day component so only calendar dates are published.
# pd.to_datetime() first normalises each column back to datetime64, which keeps
# this cell safe to re-run: once the columns hold datetime.date objects, calling
# .date() on each element (the previous approach) raises AttributeError.
for date_column in ('dt_enrollment', 'dt_end'):
    v2[date_column] = pd.to_datetime(v2[date_column]).dt.date

### Upload to Synapse

In [None]:
# Select every row currently held in Synapse table syn17097503.
results = syn.tableQuery('select * from syn17097503')
# Hand the query result to delete(); presumably this removes those rows from the
# remote table so the upload cell starts from an empty table -- destructive,
# TODO confirm syn17097503 is the table the store step writes to.
# The return value is bound to `a`, presumably just to keep it from being echoed.
a = syn.delete(results)

In [None]:
# Describe the destination table: column definitions are derived from the
# DataFrame, and the table is stored under parent syn10848316.
v2_schema = Schema(
    name='Passive GPS Features (v2)',
    columns=as_table_columns(v2),
    parent='syn10848316',
)

# Pair the schema with the rows and upload both in a single store call.
final = syn.store(Table(v2_schema, v2))

In [None]:
# The notebook recorded as the code that was executed to produce the table.
notebook_reference = {
    'name': 'IPython Notebook',
    'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_V2GPSPassiveAggregatedFeatures_datafiles.ipynb',
}

# Provenance record: the raw file (v2sid) is the input, the notebook is the code.
generation_activity = Activity(
    name='Generate V2 Table Data',
    description='Process the data collected during study ',
    used=[v2sid],
    executed=[notebook_reference],
)

# Attach the record to table syn17097503; `final` now holds this call's result.
final = syn.setProvenance('syn17097503', activity=generation_activity)

### Make the data dictionary

In [None]:
# Build and print the data dictionary: one row per published column giving its
# position, name, simplified data type and a human-readable description.
headers = ['#', 'Variable Name', 'Data Type', 'Description']

# Ordered substring replacements turning pandas dtype names into simpler labels.
# NOTE(review): columns reduced to datetime.date objects have dtype `object`, so
# they are labelled 'str' here -- confirm that is the intended label.
dtype_replacements = [
    ('object', 'str'),
    ('float64', 'float'),
    ('int16', 'int'),
    ('int64', 'int'),
    ('datetime64[ns]', 'DateTime'),
]


def _dtype_label(dtype):
    """Return the simplified label for a pandas dtype."""
    label = str(dtype)
    for pandas_name, simple_name in dtype_replacements:
        label = label.replace(pandas_name, simple_name)
    return label


# Descriptions are keyed by column name rather than matched by list position, so
# reordering or adding published columns can never attach text to the wrong row.
description_by_column = {
    'participant_id': 'Unique ID',
    'dt_enrollment': 'Date for when the participant enrolled in the study',
    'dt_end': 'Last date of PHQ2 response',
    'days_in_study': 'Number of days between dt_enrollment and dt_end',
    'days_with_data': 'Number of days with recorded GPS data',
    'home_work_distance': 'Distance in meters between the home and work clusters',
    'hours_clustered': 'Number of hours spent in a known cluster',
    'hours_with_data': 'Number of unique hours with GPS data present',
    'mti_between_home_visits': 'Mean time interval between visits to the home cluster',
    'mti_between_work_visits': 'Mean time interval between visits to the work cluster',
    'n_clusters': 'Number of unique clusters identified',
}

# The frame index starts at 1 and is printed under the '#' header.
cheat = pd.DataFrame(
    [(c, _dtype_label(v2[c].dtype)) for c in v2.columns],
    columns=headers[1:-1],
    index=np.arange(1, len(v2.columns) + 1)
)

# Columns without an entry get an empty description instead of raising.
descriptions = [description_by_column.get(c, '') for c in v2.columns]
cheat['Description'] = descriptions

# The orgtbl format puts '+' in its separator row; swap each '+' for '|'.
print(tabulate(
    cheat,
    headers=headers,
    tablefmt='orgtbl'
).replace('+', '|'))