In [1]:
import datetime as dt
import itertools as it
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import scipy.stats as stats
from tqdm import tqdm

# Open the Synapse session used by every download below. No credentials are
# passed explicitly here.
syn = synapseclient.Synapse()
syn.login()

# Notebook display configuration: echo every top-level expression of a cell
# (not only the last one) and enable plotly's offline notebook rendering.
InteractiveShell.ast_node_interactivity = 'all'
init_notebook_mode(connected=True)

Welcome, Luke Waninger!



In [2]:
# Participant id / zipcode pairs pulled from the Synapse table syn17023349.
meta = (
    syn.tableQuery('select participant_id, zipcode from syn17023349')
    .asDataFrame()
)

In [3]:
# (Synapse entity id, file format) for every V1 export to download.
v1sids = [
    ('syn10250484', 'xls'),
    ('syn10250483', 'xls'),
    ('syn12204723', 'csv'),
    ('syn12181335', 'csv'),
    ('syn12181333', 'csv'),
    ('syn12181340', 'csv'),
    ('syn12181339', 'csv'),
    ('syn12181331', 'csv'),
    ('syn12181332', 'csv'),
    ('syn10250481', 'xls'),
    ('syn10250486', 'xls')
]

# Fetch each entity and parse it with the reader matching its declared format:
# 'csv' goes through read_csv, every other format through read_excel.
v1data = [
    (pd.read_csv if fmt == 'csv' else pd.read_excel)(syn.get(synid).path)
    for synid, fmt in v1sids
]

In [4]:
# Collect the UTC and local timestamps of every V1 response into two flat
# lists. Frames without a `timestampUTC` column are skipped entirely.
v1utc, v1local = [], []
for df in v1data:
    if 'timestampUTC' not in df.columns:
        continue

    # make sure to only use US participants (those listed in `meta`);
    # `isin` does the membership test in one vectorized pass instead of
    # rebuilding and scanning the participant list once per row
    df = df.loc[df.brightenid.isin(meta.participant_id)]

    v1utc.extend(pd.to_datetime(df.timestampUTC.tolist()))
    v1local.extend(pd.to_datetime(df.timestamp.tolist()))

In [5]:
def _read_v2(synapse_id):
    """Download one V2 export from Synapse and read it as a CSV.

    The `createdAt` column is parsed into datetimes on load.
    """
    return pd.read_csv(syn.get(synapse_id).path, parse_dates=['createdAt'])


# One frame per V2 file (each entity is fetched exactly once; the original
# cell downloaded and parsed syn9974011 twice).
v2auditc = _read_v2('syn9974011')
v2mood = _read_v2('syn9974012')
v2gad7 = _read_v2('syn9974013')
v2mhs = _read_v2('syn9974017')
v2pgic = _read_v2('syn9974019')
v2sds = _read_v2('syn9974021')
v2sleep = _read_v2('syn9974022')
v2impact = _read_v2('syn9974018')

First, get an idea of how the UTC and local time-of-day densities compare.

In [7]:
def seconds_into_day(x):
    """Return how far into its own calendar day `x` lies, in fractional hours.

    Despite the name, the elapsed seconds since midnight are divided by 3600,
    so the result is expressed in hours (e.g. 06:30 -> 6.5).

    `x` is a datetime-like value (`datetime.datetime` or `pandas.Timestamp`).
    """
    midnight = dt.datetime.combine(x.date(), dt.time.min)
    elapsed = x - midnight
    return elapsed.total_seconds() / 3600

# One trace per source, in this order: the two V1 timestamp lists first,
# then each V2 frame's `createdAt` column.
labels = ['v1utc', 'v1local', 'auditc', 'mood','gad7', 'mhs', 'pgic', 'sds', 'sleep', 'impact']

# V1 timestamps are plain Python lists, so they are mapped element by element.
data = [
    [seconds_into_day(stamp) for stamp in stamps]
    for stamps in (v1utc, v1local)
]

# V2 timestamps live in DataFrames; convert each `createdAt` column in place.
data += [
    frame.createdAt.apply(seconds_into_day)
    for frame in (
        v2auditc, v2mood, v2gad7, v2mhs,
        v2pgic, v2sds, v2sleep, v2impact,
    )
]

fig = ff.create_distplot(data, labels)
iplot(fig)

It looks very much like the V2 data is already in UTC. The V1 local times have a pronounced mode around 9 in the morning that does not appear in any of the other distributions. I'll run Welch's t-tests (unequal variances), comparing each V2 survey against both the V1 UTC and the V1 local times, to confirm.

In [64]:
# Welch's t-test (equal_var=False) of each V2 series against both V1
# controls. `data[0]` / `data[1]` are the V1 UTC / V1 local series and
# `labels[0]` / `labels[1]` their names; indices 2+ are the V2 series.
results = []
for i in range(2, len(data)):
    for ctrl_label, ctrl in zip(labels[:2], data[:2]):
        test = stats.ttest_ind(ctrl, data[i], equal_var=False)
        results.append(
            (ctrl_label, labels[i], len(ctrl), len(data[i]), test.statistic, test.pvalue)
        )

# Round p-values with pandas rather than numpy: numpy is never imported in
# this notebook, so the previous `np.round` call failed on a fresh kernel.
df = pd.DataFrame(
    results,
    columns=['control', 'test', 'n_ctrl', 'n_test', 'statistic', 'pvalue']
).round({'pvalue': 4})
df

Unnamed: 0,control,test,n_ctrl,n_test,statistic,pvalue
0,v1utc,auditc,12480,102,-2.352105,0.0206
1,v1local,auditc,12480,102,-4.103345,0.0001
2,v1utc,mood,12480,9020,19.259601,0.0
3,v1local,mood,12480,9020,8.320857,0.0
4,v1utc,gad7,12480,87,-2.083015,0.0402
5,v1local,gad7,12480,87,-3.759693,0.0003
6,v1utc,mhs,12480,393,3.031264,0.0026
7,v1local,mhs,12480,393,0.089197,0.929
8,v1utc,pgic,12480,1220,8.770928,0.0
9,v1local,pgic,12480,1220,3.774733,0.0002


In [65]:
# NOTE(review): this is the same query that populates `meta` and it selects
# only participant_id and zipcode, yet `apply_offset` below reads
# `zips.utc_offset`. Presumably a zipcode -> UTC-offset table was intended;
# confirm the correct Synapse table and columns before running.
zips = (
    syn.tableQuery('select participant_id, zipcode from syn17023349')
    .asDataFrame()
)

In [None]:
def apply_offset(t):
    """Subtract the participant's zipcode-based UTC offset from a response time.

    Parameters
    ----------
    t : row object (e.g. a namedtuple from `DataFrame.itertuples`) exposing
        `participant_id` and `dt_response`.

    Returns
    -------
    `t.dt_response` minus `utc_offset` hours, where the offset is looked up
    in the module-level `zips` frame via the participant's zipcode from the
    module-level `meta` frame. The timestamp is returned unchanged when the
    participant has no row in `meta` or has a missing zipcode; an unknown
    zipcode or a missing offset is treated as an offset of 0.

    NOTE(review): assumes `dt_response` is local time and `utc_offset` is the
    number of hours local time is ahead of UTC - confirm the sign convention.
    """
    zipcodes = meta.loc[meta.participant_id == t.participant_id, 'zipcode'].values

    # Participant absent from the metadata: nothing to look up, so leave the
    # timestamp untouched instead of raising IndexError on `.values[0]`.
    if len(zipcodes) == 0:
        return t.dt_response

    zipcode = zipcodes[0]

    # A missing zipcode may be a real NaN/None or the literal string 'nan'.
    if pd.isna(zipcode) or zipcode == 'nan':
        return t.dt_response

    offsets = zips.loc[zips.zipcode == zipcode, 'utc_offset'].values
    has_offset = len(offsets) > 0 and not pd.isna(offsets[0])
    hours = int(offsets[0]) if has_offset else 0

    return t.dt_response - dt.timedelta(hours=hours)

# NOTE(review): `v2` is not defined in any cell visible above - confirm where
# it is built before relying on Restart & Run All.
# `itertuples()` is a generator with no length, so pass `total` explicitly to
# let tqdm render a real progress bar with an ETA.
v2['dt_utc'] = [apply_offset(t) for t in tqdm(v2.itertuples(), total=len(v2))]

In [None]:
# Time-of-day (in hours, see `seconds_into_day`) of V1 responses versus the
# offset-corrected V2 responses.
# NOTE(review): `v1` is not defined in any cell visible above - confirm.
data = [
    v1.dt_response.apply(seconds_into_day),
    v2.dt_utc.apply(seconds_into_day)
]

labels = ['v1', 'v2']

# `seconds_into_day` returns hours, so the bin width must be given in hours.
# The previous bin_size=3000 (seconds) was wider than the whole 0-24 range
# and collapsed every sample into one bin; 3000 s == 3000 / 3600 h.
iplot(ff.create_distplot(data, labels, bin_size=3000 / 3600))