# Curate_StudyAppDownloadReason_Data

In [1]:
%matplotlib inline

import datetime as dt
import itertools as it
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
import synapseclient
from synapseclient import Activity, File, Schema, Table, as_table_columns
from tqdm import tqdm

# echo the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
# NOTE(review): login() is called with no arguments, so credentials presumably
# come from a cached Synapse config/session - confirm before running elsewhere
syn = synapseclient.Synapse()
syn.login()

# register `progress_apply` on pandas objects (used later for the week calculation)
tqdm.pandas()

def isnum(x):
    """Return True when ``x`` can be converted by ``float()``, otherwise False.

    ``None`` is reported as non-numeric. NaN is itself a float, so a missing
    value represented as NaN counts as numeric here; callers rely on that to
    tell free-text strings apart from missing cells.
    """
    if x is None:
        return False
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: an object float() rejects outright (list, dict, ...).
        return False
    return True

Welcome, Abhishek Pratap!



In [2]:
# Synapse ids of the raw V1 (Excel) and V2 (CSV) download-reason files
v1sid = 'syn10250489'
v2sid = 'syn17023091'

# fetch each file through the Synapse client and read it from its local path
v1_file = syn.get(v1sid)
v1r = pd.read_excel(v1_file.path)

v2_file = syn.get(v2sid)
v2r = pd.read_csv(v2_file.path, parse_dates=['createdAt'])

v1r.head()
v2r.head()

Unnamed: 0,brightenid,start,week,user_id,sent_time_local,sent_time_utc,response_local,response_utc,response_id,Why did you download this app?,"For ""other"", please type in box"
0,BLUE-00048,2014-08-01,0,10431,NaT,NaT,2014-08-01 07:00:09,2014-08-01 11:00:09,166331,for fun|for mental health reasons [e.g. depres...,
1,BLUE-00049,2014-08-01,0,10470,NaT,NaT,2014-08-01 12:31:58,2014-08-01 16:31:58,166824,for brain health [e.g. better memory]|to impro...,
2,BLUE-00050,2014-08-09,0,10519,NaT,NaT,2014-08-08 19:31:06,2014-08-09 02:31:06,173157,for management of daily problems|for brain hea...,
3,BLUE-00050,2014-08-09,4,10519,2014-09-06 09:00:01,2014-09-06 16:00:01,2014-09-08 18:21:29,2014-09-09 01:21:29,198454,for management of daily problems|for brain hea...,
4,BLUE-00050,2014-08-09,12,10519,2014-11-01 08:00:05,2014-11-01 16:00:05,2014-11-02 09:16:22,2014-11-02 17:16:22,256858,for mood [e.g. sadness]|for brain health [e.g....,


Unnamed: 0,username,Why did you download this app?,createdAt
0,EN05039,"Fun,My mental health,My mood,Managing daily is...",2016-09-03 17:13:21
1,EN05331,"My mental health,Brain health,Improve work",2017-01-22 23:04:11
2,EN00387,"My mental health,My mood,Brain health,Fun",2016-10-28 08:37:06
3,EN00322,My mental health,2016-09-07 21:32:32
4,EN00478,"My mental health,Managing daily issues,My mood...",2016-11-11 07:21:42


### V1 Data Prep

In [None]:
# V1: discard the columns that are not needed and adopt the shared column names
v1_unused_columns = [
    'week', 'user_id', 'sent_time_local', 'sent_time_utc', 'response_utc', 'response_id'
]
v1_column_names = {
    'brightenid': 'participant_id',
    'response_local': 'dt_response',
    'Why did you download this app?': 'apps',
    'For "other", please type in box': 'other_description'
}
v1 = v1r.drop(columns=v1_unused_columns).rename(columns=v1_column_names)

# lowercase the answers for the substring lookups; anything isnum() accepts
# (numbers, NaN) is replaced by the placeholder 'none'
v1.apps = v1.apps.apply(lambda x: 'none' if isnum(x) else x.lower())

# one 0/1 indicator column per download reason, keyed by the V1 wording of that reason
v1_reason_markers = {
    'fun': 'fun',
    'mental_health': 'mental health',
    'mood': 'mood',
    'managing_daily_issues': 'management of daily ',
    'improve_work': 'improve work',
    'brain_health': 'brain health',
    'improve_relationships': 'improve relationships',
    'other': 'other',
}
for column, marker in v1_reason_markers.items():
    # bind `marker` as a default so each lambda keeps its own substring
    v1[column] = v1.apps.apply(lambda x, marker=marker: int(marker in x))

v1.head()

In [None]:
# print each distinct download reason found in V1, in order of first appearance
# (a single answer holds several reasons separated by '|')
v1_reasons = pd.Series(list(it.chain(*[answer.split('|') for answer in v1.apps])))
for reason in v1_reasons.unique():
    print(reason)

In [None]:
# histogram of the character length of each free-text "other" answer;
# entries that isnum() accepts (e.g. NaN for a blank cell) are left out
other_lengths = [
    len(answer)
    for answer in v1.other_description
    if not isnum(answer)
]
plt.hist(other_lengths)

In [None]:
# 1-based day of the response, counted from the participant's `start` date
v1['day'] = [
    (responded - started).days + 1
    for responded, started in zip(v1['dt_response'], v1['start'])
]
v1['day'].hist()

#### Extract two common topics I noticed during a quick read-through of the "other" descriptions

In [None]:
# lowercase substrings that mark a free-text answer as mentioning payment/compensation
incentive_words = [
    'paid', 'pay', 'mone', 'compens', 'gift', 'incentive', '$', 'finan', 'incom', 'reimb', 'craigs', 'pd'
]

def is_incentive(s):
    """Return 1 if the free-text answer ``s`` mentions an incentive, else 0.

    Matching is a case-insensitive substring search against
    ``incentive_words``. Non-string input (e.g. NaN for a blank answer)
    returns 0.
    """
    if not isinstance(s, str):
        return 0
    # the keyword list is lowercase, so normalise the answer before searching;
    # otherwise "Paid" or "Craigslist" would be missed
    text = s.lower()
    return int(any(word in text for word in incentive_words))
    
# flag every V1 free-text answer with is_incentive, then store the 0/1 flags
# in the smallest integer dtype that holds them
incentive_flags = v1['other_description'].apply(is_incentive)
v1['happ_inc'] = pd.to_numeric(incentive_flags, downcast='integer')

In [None]:
# lowercase substrings that mark a free-text answer as "downloaded because of the study"
for_the_study_words = [
    'part', 'require', 'for study', 'told to', 'asked to', 'request', 'to be', 'to do', 'brighten', 'assignment', 'ucsf', 'for a', 'study'
]

def is_for_the_study(s):
    """Return 1 if the free-text answer ``s`` refers to the study itself, else 0.

    Matching is a case-insensitive substring search against
    ``for_the_study_words``. Non-string input (e.g. NaN for a blank answer)
    returns 0.
    """
    if not isinstance(s, str):
        return 0
    # keywords such as 'ucsf' and 'brighten' are lowercase while answers are
    # typed freely ("UCSF", "Brighten"), so normalise before searching
    text = s.lower()
    return int(any(word in text for word in for_the_study_words))
    
# flag every V1 free-text answer that refers to the study itself (0/1)
study_flags = v1['other_description'].apply(is_for_the_study)
v1['happ_fts'] = study_flags

In [None]:
# 'start' and 'day' were only needed to compute and plot the study day;
# errors='ignore' keeps this cell safe to re-run once they are already gone
v1 = v1.drop(columns=['start', 'day'], errors='ignore')

### V2 Data Prep

In [None]:
# V2: adopt the shared column names (no columns are dropped here)
v2_column_names = {
    'username': 'participant_id',
    'createdAt': 'dt_response',
    'Why did you download this app?': 'apps'
}
v2 = v2r.rename(columns=v2_column_names)

# lowercase the answers for the substring lookups; anything isnum() accepts
# (numbers, NaN) is replaced by the placeholder 'none'
v2.apps = v2.apps.apply(lambda x: 'none' if isnum(x) else x.lower())

# one 0/1 indicator column per download reason, keyed by the V2 wording of that reason
v2_reason_markers = {
    'fun': 'fun',
    'mental_health': 'my mental health',
    'mood': 'my mood',
    'managing_daily_issues': 'managing daily issues',
    'improve_work': 'improve work',
    'brain_health': 'brain health',
    'improve_relationships': 'improve relationships',
    'other': 'other',
}
for column, marker in v2_reason_markers.items():
    # bind `marker` as a default so each lambda keeps its own substring
    v2[column] = v2.apps.apply(lambda x, marker=marker: int(marker in x))

# preview the prepared V2 frame
v2.head()

In [None]:
# print each distinct download reason found in V2, in order of first appearance
# (a single answer holds several reasons separated by ',')
v2_reasons = pd.Series(list(it.chain(*[answer.split(',') for answer in v2.apps])))
for reason in v2_reasons.unique():
    print(reason)

### Combine the DataFrames

In [None]:
# stack both study versions; a column present in only one version is NaN for the other
stacked = pd.concat([v1, v2], sort=False)

# short `happ_*` names used in the released table
released_names = {
    'brain_health': 'happ_bh',
    'fun': 'happ_f',
    'improve_relationships': 'happ_ir',
    'improve_work': 'happ_iw',
    'managing_daily_issues': 'happ_mdi',
    'mental_health': 'happ_mh',
    'mood':'happ_m',
    'other': 'happ_o',
    'other_description':'happ_o_description'
}

# columns kept in the released table, in output order
released_columns = [
    'participant_id', 'dt_response',
    'happ_bh', 'happ_f', 'happ_fts', 'happ_inc', 'happ_ir', 'happ_iw', 'happ_m', 'happ_mdi', 'happ_mh', 'happ_o',
    'happ_o_description'
]

combined = stacked.drop(columns=['apps']).rename(columns=released_names)
combined = combined.loc[:, released_columns]

# NOTE(review): this fills every column, so a missing happ_o_description becomes
# the integer 0 rather than an empty string - confirm that is intended
combined = combined.fillna(0)

# the indicator columns that came back as floats after the concat are cast
# to int so that every indicator has the same dtype
indicator_columns = [
    name for name in combined.columns
    if 'description' not in name and 'app' in name
]
for c in indicator_columns:
    combined[c] = combined[c].astype(int)

combined.head()

### Add week into study

In [3]:
# Synapse id of the table queried later for each participant's `startdate`
metasid = 'syn27082597'

In [None]:
# pull each participant's study start date from the Synapse metadata table
metadata = syn.tableQuery(
    f'SELECT participant_id, startdate FROM {metasid}'
).asDataFrame(convert_to_datetime=True)
metadata['startdate'] = pd.to_datetime(metadata['startdate'])

# add in the participant's start date as a new column; the left join keeps
# responses whose participant has no metadata row (their startdate is NaT)
combined = pd.merge(combined, metadata, on='participant_id', how='left')

# get the time difference in weeks as a float, measured from the calendar
# date of the response (time of day discarded) to the start date
response_dates = combined.dt_response.apply(
    lambda x: dt.datetime(year=x.year, month=x.month, day=x.day))
combined['week'] = [d.days / 7 for d in (response_dates - combined.startdate)]

# convert to a 1-based week number by taking the floor; a missing start date
# yields NaN, which cannot be cast to an integer, so it is passed through
combined['week'] = combined['week'].progress_apply(
    lambda x: np.int16(np.floor(x)) + 1 if not pd.isnull(x) else np.nan)

# remove the start date
combined = combined.drop(columns=['startdate'], errors='ignore')

# reorder the columns so 'week' is third; locating it by name (instead of
# assuming it is last) keeps the result correct if this cell is re-run
cols = [c for c in combined.columns if c != 'week']
cols = cols[0:2] + ['week'] + cols[2:]
combined = combined.reindex(columns=cols)

combined.head()

### Localize timestamps

In [None]:
# tag each (timezone-naive) response timestamp as UTC and store it as a string
# NOTE(review): the V1 values come from `response_local`, so labelling them
# UTC does not convert them - confirm this is the intended convention
def _as_utc_string(stamp):
    """Return ``stamp`` localized to UTC, rendered with ``str``."""
    return str(stamp.tz_localize('UTC'))

combined['dt_response'] = [_as_utc_string(stamp) for stamp in combined.dt_response]

combined.head()

### Set provenance and upload to Synapse

In [None]:
# select every row currently in syn17022426 and delete them
# NOTE(review): presumably clears the table ahead of the re-upload - confirm
existing_rows = syn.tableQuery('select * from syn17022426')
t = syn.delete(existing_rows)

In [None]:
# describe the table (columns are derived from the DataFrame) and upload the rows
health_apps_schema = Schema(
    name='Health Applications',
    columns=as_table_columns(combined),
    parent='syn10848316',
)
final = syn.store(Table(health_apps_schema, combined))

In [4]:
# record which Synapse entities and which notebook produced the curated table
final = syn.setProvenance(
    'syn17022426',
    activity=Activity(
        name='Combine V1 and V2 data',
        description='Process and combine the data collected during study 1 and study 2',
        # v2sid is 'syn17023091'; it used to be listed a second time as a literal
        used=[v1sid, v2sid, 'syn12181332', metasid],
        executed=[
            dict(
                name='Curate_StudyAppDownloadReason_Data',
                url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_StudyAppDownloadReason_Data.ipynb'
            )
        ]
    )
)