# Curate_Sleep_Data

In [1]:
%matplotlib inline

import datetime as dt
import itertools
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
from tqdm import tqdm

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Create the Synapse client and log in; `syn` is used for every Synapse call below.
syn = synapseclient.Synapse()
syn.login()

# Enable tqdm's pandas integration (provides `progress_apply` on pandas objects).
tqdm.pandas()

def isnum(x):
    """Report whether ``x`` can be converted to a float.

    Parameters
    ----------
    x : object
        Any value (number, string, None, container, ...).

    Returns
    -------
    bool
        True when ``float(x)`` succeeds, False when ``x`` is None or when the
        conversion is rejected.
    """
    if x is None:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: unparsable string such as 'abc'.
        # TypeError: objects float() cannot accept at all (lists, dicts, ...).
        return False
    return True

Welcome, Abhishek Pratap!



### Data load, rename, combine

In [2]:
# Synapse IDs of the raw sleep survey files for the two study versions.
v1sid = 'syn12181340'
v2sid = 'syn12181349'

# Download each file through Synapse and read it as CSV, parsing the
# 'timestamp' column as datetimes (V1 first, then V2).
v1r, v2r = (
    pd.read_csv(syn.get(sid).path, parse_dates=['timestamp'])
    for sid in (v1sid, v2sid)
)

# Preview both raw frames.
v1r.head()
v2r.head()

Unnamed: 0,userid,brightenid,timestamp,timestampUTC,sleep_1,sleep_2,sleep_3
0,10431,BLUE-00048,2014-08-01 07:03:18,2014-08-01 11:03:19,3,2,2
1,10470,BLUE-00049,2014-08-01 12:33:21,2014-08-01 16:33:20,1,3,1
2,10519,BLUE-00050,2014-08-08 19:36:07,2014-08-09 02:36:07,3,2,1
3,10519,BLUE-00050,2014-09-06 09:31:55,2014-09-06 16:31:54,1,3,1
4,10519,BLUE-00050,2014-11-02 09:14:45,2014-11-02 17:14:45,2,3,1


Unnamed: 0,userid,timestamp,sleep_1,sleep_2,sleep_3
0,EN00249,2016-09-28 00:11:24,2,4.0,1
1,EN00658,2016-11-29 08:08:53,2,2.0,3
2,ES05062,2017-02-02 16:10:49,3,3.0,3
3,EN05022,2016-09-30 06:15:23,4,3.0,2
4,EN00185,2016-09-23 01:36:56,2,3.0,1


In [None]:
# V1: discard the columns that are not carried forward, then use 'brightenid'
# as the participant identifier and 'timestamp' as the response time.
v1_unused = ['userid', 'timestampUTC']
v1_names = {'brightenid': 'participant_id', 'timestamp': 'dt_response'}
v1 = v1r.drop(columns=v1_unused).rename(columns=v1_names)

# V2: 'userid' is the participant identifier here.
v2_names = {'userid': 'participant_id', 'timestamp': 'dt_response'}
v2 = v2r.rename(columns=v2_names)

# Stack V1 on top of V2, keeping the column order as-is (no sorting).
combined = pd.concat([v1, v2], sort=False)
combined.head()

In [None]:
# Histogram of the sleep_1 responses across both studies.
combined['sleep_1'].hist()

### Add week of study

In [3]:
# Synapse ID of the table that is queried for each participant's start date.
metasid = 'syn27082597'

In [None]:
metasid = 'syn27082597'

# Pull each participant's study start date from the Synapse metadata table.
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# Discard any 'week'/'startdate' columns left by an earlier run of this cell so
# that running it again cannot produce duplicated or suffixed columns.
combined = combined.drop(columns=['week', 'startdate'], errors='ignore')

# add in the participants start date as a new column
combined = pd.merge(combined, metadata, on='participant_id', how='left')

# Whole days from the start date to the calendar day of the response (the
# time-of-day part of dt_response is dropped first). Because of the left merge,
# rows whose participant has no start date come out as NaN.
days_in_study = (combined.dt_response.dt.normalize() - combined.startdate).dt.days

# 1-based week of the study: days 0-6 -> week 1, days 7-13 -> week 2, ...
week = np.floor(days_in_study / 7) + 1

# Store compact integers when every row has a start date; otherwise keep floats
# so that missing weeks stay NaN instead of raising during the int conversion.
combined['week'] = week if week.isna().any() else week.astype(np.int16)

# Place 'week' in the third position and leave the helper 'startdate' column out.
# Columns are selected by name rather than by slice position, so no other column
# can be dropped or duplicated.
other_cols = [c for c in combined.columns if c not in ('week', 'startdate')]
combined = combined[other_cols[:2] + ['week'] + other_cols[2:]]

combined.head()

### Localize timestamps

In [None]:
# Tag every response time as UTC and store it as its string representation.
# NOTE(review): for V1 the 'timestampUTC' column was dropped earlier and the
# 'timestamp' column kept; the two differ in the raw preview, so labelling
# 'timestamp' as UTC may be off by the participant's UTC offset -- confirm.
combined['dt_response'] = combined.dt_response.map(
    lambda stamp: str(stamp.tz_localize('UTC'))
)

combined.head()

### Upload to Synapse and set provenance

In [None]:
# Select every row currently in the Synapse table and delete them.
# The result is bound to `t` so the cell does not echo it.
existing_rows = syn.tableQuery('select * from syn17022659')
t = syn.delete(existing_rows)

In [None]:
# Describe the table: named 'Sleep', columns derived from the combined frame,
# created under the parent 'syn10848316'.
sleep_schema = Schema(
    name='Sleep',
    columns=as_table_columns(combined),
    parent='syn10848316',
)

# Upload the combined data with that schema.
final = syn.store(Table(sleep_schema, combined))

In [None]:
# Provenance record: the two raw files that were used and the notebook that
# produced the table.
# NOTE(review): the URL file name (Create_Sleep_datafiles.ipynb) differs from the
# name 'Curate_Sleep_Data' -- confirm the link is current.
provenance = Activity(
    name='Combine V1 and V2 data',
    description='Process and combine the data collected during study 1 and study 2',
    used=[v1sid, v2sid],
    executed=[
        {
            'name': 'Curate_Sleep_Data',
            'url': 'https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Create_Sleep_datafiles.ipynb',
        }
    ],
)

# Attach the record to the Synapse table.
# NOTE(review): this rebinds `final`, replacing the value returned by the store step.
final = syn.setProvenance('syn17022659', activity=provenance)