# Curate_V2_PassiveFeatures_data

In [1]:
%matplotlib inline

import datetime as dt
import itertools as it

from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import synapseclient
from synapseclient import Activity, Schema, Table, as_table_columns
from tabulate import tabulate
from tqdm import tqdm

# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'

# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

# Create the Synapse client and log in (no credentials are passed explicitly).
syn = synapseclient.Synapse()
syn.login()

Welcome, Abhishek Pratap!



## V2 Passive Phone Metrics

In [2]:
# Synapse ID of the tab-separated V2 passive-feature export.
v2sid = 'syn10165199'

# Fetch the entity, then parse its local file with `passive_date` as datetimes.
v2_file = syn.get(v2sid)
v2r = pd.read_csv(v2_file.path, sep='\t', parse_dates=['passive_date'])
v2r.head()

Unnamed: 0,username,passive_date,call_duration_secs-Incoming,call_duration_secs-Missed,call_duration_secs-Outgoing,call_hour_0-Incoming,call_hour_0-Missed,call_hour_0-Outgoing,call_hour_1-Incoming,call_hour_1-Missed,...,text_hour_9-Sent,text_length-,text_length-Received,text_length-Sent,uniqueNumbers-Incoming,uniqueNumbers-Missed,uniqueNumbers-Outgoing,uniqueNumbersTexted-,uniqueNumbersTexted-Received,uniqueNumbersTexted-Sent
0,EN00066,2016-08-24,0,0,0,0,0,0,0,0,...,0,0,2954,2278,0,2,0,0,6,4
1,EN00066,2016-08-25,340,0,0,0,0,0,0,0,...,0,0,10979,10306,1,0,0,0,15,12
2,EN00066,2016-08-26,0,0,326,0,0,0,0,0,...,0,0,14748,8056,0,5,3,0,11,8
3,EN00066,2016-08-27,0,0,1681,0,0,0,0,0,...,0,0,12864,10158,0,1,3,0,13,12
4,EN00066,2016-08-28,0,0,449,0,0,449,0,0,...,0,0,4019,3194,0,0,1,0,9,10


In [None]:
# Rename the identifier and date columns; every other column keeps its name.
id_and_date_names = {'username': 'participant_id', 'passive_date': 'dt_passive'}
v2 = v2r.rename(columns=id_and_date_names)

v2.head()

No columns contain NA values.

All `call_hour_<i>-Missed` columns are 0.

In [None]:
# Columns kept from the raw export, in output order.
labels = [
    'participant_id', 'dt_passive',
    'call_duration_secs-Incoming', 'call_duration_secs-Outgoing',
    'numCalls_Incoming', 'numCalls_Missed', 'numCalls_Outgoing',
    'numTexts-', 'numTexts-Received', 'numTexts-Sent',
    'text_length-Received', 'text_length-Sent',
    'uniqueNumbers-Incoming', 'uniqueNumbers-Missed', 'uniqueNumbers-Outgoing',
    'uniqueNumbersTexted-', 'uniqueNumbersTexted-Received', 'uniqueNumbersTexted-Sent'
]

# `reindex` creates any label that is absent from the frame and `fillna(0)`
# then turns it into an all-zero column, so a misspelled label (or re-running
# this cell after the rename below) would silently zero out real data.
# Report absent labels instead of hiding them; the result itself is unchanged.
# NOTE(review): the numCalls_* labels use '_' where the other raw columns use
# '-' -- confirm the spelling against the export header.
missing_labels = [label for label in labels if label not in v2.columns]
if missing_labels:
    print(f'WARNING: columns absent from v2 will be created as all zeros: {missing_labels}')

v2 = v2.reindex(labels, axis=1).fillna(0)

# Map the raw export names to the release naming scheme.
v2 = v2.rename(columns={
    'call_duration_secs-Incoming':'callDuration_incoming',
    'call_duration_secs-Outgoing':'callDuration_outgoing',
    'numCalls_Incoming':'callCount_incoming',
    'numCalls_Missed':'callCount_missed',
    'numCalls_Outgoing':'callCount_outgoing',
    'numTexts-':'textCount',
    'numTexts-Received':'textCount_received',
    'numTexts-Sent':'textCount_sent',
    'text_length-Received':'textLength_received',
    'text_length-Sent':'textLength_sent',
    'uniqueNumbers-Incoming':'uniqueNumbers_calls_incoming',
    'uniqueNumbers-Missed':'uniqueNumbers_calls_missed',
    'uniqueNumbers-Outgoing':'uniqueNumbers_calls_outgoing',
    'uniqueNumbersTexted-':'uniqueNumbers_texts',
    'uniqueNumbersTexted-Received':'uniqueNumbers_texts_received',
    'uniqueNumbersTexted-Sent':'uniqueNumbers_texts_sent'
})

v2.head()

### Add week in study

In [4]:
# Synapse ID of the table that is queried for participant_id and startdate;
# also listed as a `used` input when provenance is recorded.
metasid = 'syn17023349'

In [None]:
metasid = 'syn17023349'
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata.startdate = pd.to_datetime(metadata.startdate)

# Add each participant's start date as a new column. Helper columns left over
# from an earlier (possibly interrupted) run of this cell are dropped first so
# the merge never yields suffixed duplicates and `week` is always rebuilt.
v2 = pd.merge(
    v2.drop(columns=['startdate', 'week'], errors='ignore'),
    metadata,
    on='participant_id',
    how='left'
)

# Exclude rows before the start date. Rows without a start date compare False
# and are dropped too. `.copy()` detaches the result so the column assignment
# below does not write to a slice of the merged frame.
passive_ts = pd.to_datetime(v2['dt_passive'])
v2 = v2.loc[passive_ts >= v2['startdate']].copy()

# Week in study, 1-based: whole days between the calendar day of the record and
# the start date, floor-divided by 7, plus 1 (days 0-6 -> week 1, 7-13 -> 2...).
elapsed = pd.to_datetime(v2['dt_passive']).dt.normalize() - v2['startdate']
v2['week'] = elapsed.dt.days // 7 + 1

# remove the start date
v2 = v2.drop(columns=['startdate'], errors='ignore')

# Reorder by name rather than by position so re-running the cell can neither
# duplicate `week` nor drop the last feature column.
leading = ['participant_id', 'dt_passive', 'week']
v2 = v2[leading + [col for col in v2.columns if col not in leading]]

v2.head()

Make sure the date isn't a timestamp.

In [None]:
# Replace each timestamp in dt_passive with its calendar date (datetime.date).
v2['dt_passive'] = v2['dt_passive'].dt.date

In [None]:
# Histogram of the week-in-study values as a quick sanity check.
v2['week'].hist()

### Upload to Synapse

In [None]:
# Select every row currently in table syn17060502 and hand the result to
# syn.delete (presumably clearing the table before the re-upload -- confirm).
current_rows = syn.tableQuery('select * from syn17060502')
t = syn.delete(current_rows)

In [None]:
# Describe the table: column definitions are derived from the curated frame.
passive_schema = Schema(
    name='Passive Features (v2)',
    columns=as_table_columns(v2),
    parent='syn10848316'
)

# Upload the curated frame under that schema.
final = syn.store(Table(passive_schema, v2))

In [5]:
# Reference to this notebook, recorded as the executed code.
notebook_ref = dict(
    name='Curate_V2_PassiveFeatures_data',
    url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_V2_PassiveFeatures_data.ipynb'
)

# Activity linking the two input entities and the notebook.
curation_activity = Activity(
    name='Generate V2 Table Data',
    description='Process the data collected during study ',
    used=[v2sid, metasid],
    executed=[notebook_ref]
)

# Attach the activity to the published table.
final = syn.setProvenance('syn17060502', activity=curation_activity)