In [1]:
import json
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from datetime import date, datetime

In [2]:
with open('owletplatform-us_data.json', 'r') as myfile:
    data=myfile.read()
    
obj = json.loads(data)

## Connected care baby info (Downloaded April 18, 2019)

In [3]:
# This is the "table" I want -> dataframe?
cc_users = obj['ConnectedCare']['DeviceUser']

In [4]:
user1 = obj['ConnectedCare']['DeviceUser']['fd8377c2-3ad1-11e6-897c-0e947fb56105']
#user1

In [5]:
all_rows = []
for user, data in tqdm_notebook(cc_users.items()):
    birthLength, birthWeight = 0, 0
    healthConditions, races, userIds = frozenset([]), frozenset([]), frozenset([])
    birthDate, dsn, dueDate, firstName, lastName, gender, plurality = '','','','','','',''
    keys = set(data.keys())
    
    if 'birthDate' in keys:
        birthDate = data['birthDate']
    if 'birthLength' in keys:
        birthLength = data['birthLength']
    if 'birthWeight' in keys:
        birthWeight = data['birthWeight']['pounds']*16 + data['birthWeight']['ounces'] # Weight in oz
    if 'deviceIds' in keys:
        if '0' in set(data['deviceIds'].keys()):
            dsn = data['deviceIds']['0'] # What if this key doesn't exist??
    if 'dueDate' in keys:
        dueDate = data['dueDate']
    if 'firstName' in keys:
        firstName = data['firstName']
    if 'lastName' in keys:
        lastName = data['lastName']
    if 'gender' in keys:
        gender = data['gender']
    if 'healthConditions' in keys:
        healthConditions = frozenset([info for key,info in data['healthConditions'].items()])
    if 'plurality' in keys:
        plurality = data['plurality']
    if 'races' in keys:
        races = frozenset([info for key,info in data['races'].items()])
    if 'userIds' in keys:
        userIds = frozenset([info for key,info in data['userIds'].items()])
    
    row = (user, birthDate, birthLength, birthWeight, dsn, dueDate, firstName,
           lastName, gender, healthConditions, plurality, races, userIds)
    all_rows.append(row)

HBox(children=(IntProgress(value=0, max=107088), HTML(value='')))




In [6]:
columns = ['deviceUser','birthDate','birthLength','birthWeight','dsn','dueDate',
 'firstName','lastName','gender','healthConditions','plurality','races','userIds']

In [7]:
cc_baby_info = pd.DataFrame(all_rows, columns=columns)

In [311]:
# cc_baby_info

In [313]:
# cc_baby_info.drop_duplicates(['dsn','birthDate']).dsn.value_counts()

In [8]:
cc_baby_info.userIds.value_counts()

(a338ba14-c82d-4c4a-9dfb-f4a63029f921)    24
(575fb85f-d883-4537-aa46-e77a0c379f82)     7
()                                         6
(18aafcd3-c9f9-43ee-81f9-41b43033e708)     5
(1cc86682-0465-45bd-917a-2431fb99a786)     5
(b4553c1a-23cc-4083-85a9-03abf0ce9a66)     5
(52e513e9-cada-490e-aa06-be469968e10b)     5
(0988ea47-9a7a-4470-98e2-8d7c84a53185)     4
(56a7f766-3c91-49c8-80fb-95fef34a1920)     4
(198c8e33-9ac8-48ed-a47f-102c8b9f9728)     4
(21fe0ea2-e217-47ea-8ab5-582e82938ec0)     4
(39213e12-f61e-4f6d-9f33-7077d5b0df6c)     4
(6dca2b28-4c84-400c-924b-b4f300e0ddef)     4
(75fc075b-8ffd-4b72-86b3-c14aac7113fc)     4
(bb6d8510-84ae-4eaa-814a-f32b649a716a)     4
(1cfa9e3b-4ee6-4821-899f-02baae136ab0)     4
(005becaf-9aa1-42bd-9497-367d9652bd66)     4
(0f3b4a62-d50f-4495-aa1e-f868e485f9c4)     3
(80cbc251-196b-44f8-8610-a475fac685aa)     3
(c5420fe0-7482-4bad-8454-039f22bf9478)     3
(b3735dd3-6ad9-4804-a76c-4a548aa58001)     3
(f00038a9-36f1-47fa-8067-b536a67c1145)     3
(062fef06-

In [11]:
cc_baby_info[(cc_baby_info.dsn == 'AC000W002431442')]

Unnamed: 0,deviceUser,birthDate,birthLength,birthWeight,dsn,dueDate,firstName,lastName,gender,healthConditions,plurality,races,userIds
81994,c406b11e-2245-11e8-be48-0e75f1ecdf28,2017-09-24T00:00:00Z,0.0,0,AC000W002431442,2017-09-30T00:00:00Z,Sophie,Downing,female,(notApplicable),not_applicable,(white),(d2fc0ccd-c668-4e10-850e-0f818cc63e42)


In [11]:
cc_parent_info[(cc_parent_info.email == 'spentz@mail.usf.edu')]

Unnamed: 0,userId,created_at,deviceUsers,relationship,firstName,lastName,email,phone,birthDate,city,state,zip,country
87825,d2fc0ccd-c668-4e10-850e-0f818cc63e42,2018-03-07T20:26:07Z,(c406b11e-2245-11e8-be48-0e75f1ecdf28),(biological child),Stephanie,Downing,spentz@mail.usf.edu,(941) 662-0267,2017-09-24T00:00:00Z,Rotonda west,FL,,us


In [290]:
# Save dsns that had connected care
# pickle.dump(set(cc_baby_info.dsn.values), open("cc_dsns.p", "wb"))

deviceUser is unique (no duplicated info?)

## Connected care parent info

In [8]:
cc_parent = obj['ConnectedCare']['User']
#cc_parent['fd751001-d7e5-4f6e-895e-5d193e148d52']

In [9]:
parent_rows = []
for user, data in tqdm_notebook(cc_parent.items()):
    deviceUsers, relationship = frozenset([]), frozenset([])
    userId, created_at, firstName, lastName, email = '','','','',''
    phone, birthDate, city, state, zip_, country = '','','','','',''
    keys = set(data.keys())
    
    if 'created_at' in keys:
        created_at = data['created_at']
    if 'deviceUsers' in keys:
        temp = []
#         for key, info in data['deviceUsers'].items():
#             if 'deviceUserID' in set(info.keys()):
#                 temp.append(info['deviceUserId'])
        deviceUsers = frozenset([info['deviceUserId'] for key,info in data['deviceUsers'].items()]) #CHECK
        relationship = frozenset([info['relationship'] for key,info in data['deviceUsers'].items()])
    if 'firstName' in keys:
        firstName = data['firstName']
    if 'lastName' in keys:
        lastName = data['lastName']
    if 'email' in keys:
        email = data['email']
    if 'phone' in keys:
        phone = data['phone']
    if 'birthDate' in keys:
        birthDate = data['birthDate']
    if 'city' in keys:
        city = data['city']
    if 'state' in keys:
        state = data['state']
    if 'zip_' in keys:
        zip_ = data['zip_']
    if 'country' in keys:
        country = data['country']
    
    row = (user, created_at, deviceUsers, relationship, firstName, lastName, 
           email, phone, birthDate, city, state, zip_, country)
    parent_rows.append(row)

HBox(children=(IntProgress(value=0, max=106329), HTML(value='')))




In [10]:
parent_columns = ['userId','created_at','deviceUsers', 'relationship','firstName',
                  'lastName','email','phone','birthDate','city','state','zip','country']
cc_parent_info = pd.DataFrame(parent_rows, columns=parent_columns)

In [436]:
# cc_parent_info

## Feature engineering

#### Clean dates

In [15]:
def clean_date(day):
    '''Input: string of the date with time
       Returns: date object'''
    if day == '':
        return ''
    if day == '2018--1-5-07T00:00Z':
        return date(2018,7,15)
    try:
        int(day[:4])
    except:
        return ''
    return date(int(day[:4]), int(day[5:7]), int(day[8:10]))

def clean_datetime(day):
    '''Input: string of the date with time
       Returns: date object'''
    if day == '':
        return ''
#     if day == '2018--1-5-07T00:00Z':
#         return date(2018,7,15)
    try:
        int(day[:4])
    except:
        return ''
    return datetime(int(day[:4]), int(day[5:7]), int(day[8:10]), int(day[11:13]), int(day[14:16]), int(day[17:19]))

In [16]:
cc_baby_info.birthDate = cc_baby_info.birthDate.apply(clean_date)
cc_baby_info.dueDate = cc_baby_info.dueDate.apply(clean_date)

In [17]:
cc_parent_info.created_at = cc_parent_info.created_at.apply(clean_datetime)
cc_parent_info.birthDate = cc_parent_info.birthDate.apply(clean_date)

#### Gestational age/what week they gave birth

In [None]:
# cc_baby_info[(cc_baby_info.birthWeight > 17) & (cc_baby_info.birthWeight <= 88)]

In [None]:
# cc_baby_info[cc_baby_info.birthWeight > 17]

In [18]:
# Don't know gestational age @ birth if the birthdate or duedate is missing 
cc_baby_gestation = cc_baby_info[(cc_baby_info.birthDate != '') & (cc_baby_info.dueDate != '')]

# Disregard bdays/duedates that are not in the time frame of use of the sock (pre 2015 post 2019)
cc_baby_gestation = cc_baby_gestation[(cc_baby_gestation.birthDate > date(2015,1,1)) & (cc_baby_gestation.birthDate < date(2019,3,31))]
cc_baby_gestation = cc_baby_gestation[(cc_baby_gestation.dueDate > date(2015,1,1)) & (cc_baby_gestation.dueDate < date(2019,3,31))]


In [19]:
def days_to_wk(x):
    n = x.days
    return -1 * ((n - 1) // 7) + 39

In [20]:
cc_baby_gestation['week_of_birth'] = (cc_baby_gestation.dueDate - cc_baby_gestation.birthDate).apply(days_to_wk)

In [21]:
# Not later than 4 weeks after, not earlier than?? try week 21
valid_gestation = cc_baby_gestation[(cc_baby_gestation.week_of_birth > 20) & (cc_baby_gestation.week_of_birth < 45)]

In [22]:
valid_gestation[['deviceUser','userIds','week_of_birth']]

Unnamed: 0,deviceUser,userIds,week_of_birth
0,0001153c-f1a9-11e8-92b8-0ab32160e8e4,(3780481c-f52a-4089-a768-2ba1239a6cc8),40
1,00012bd4-c1ae-11e7-a66d-0ac49f1ecf36,(e6b20f27-193f-4aa1-bf3b-8de4e3b19472),38
2,00020cba-a647-11e8-9949-0e9382159dc5,(1bb5d052-4e26-4510-90a4-8f83ea5f8401),40
3,0003dc14-4f46-11e8-ad7b-0ef7636f67fa,(6afac83e-ba35-4c2e-9c29-7cb88f8e60ac),40
4,00042868-9d93-11e8-a629-0a5d62639812,(8bfd4977-4d0e-49e2-ba41-b08b95fa85dc),37
5,0004c630-ad13-11e8-8454-0a3272bbb876,(5900b291-cfc6-4f01-9e95-3c886ec4c44f),40
6,0005e982-9821-11e7-8c60-0e9382159dc5,(84950bde-7448-4848-9fe4-78711c275396),40
7,0005f4ce-4a80-11e8-a326-0e9382159dc5,(aa41f25a-69a0-477d-8f74-08bca8cbb6ce),38
8,000646c2-d19f-11e7-91eb-0e3fb1df70d2,(7f2915db-de7b-4680-bdc9-ce5be56f9f9e),41
9,00069c24-cf43-11e8-aed5-0ea451014b92,(619e40ac-be8c-4b8c-95da-a54eaf56a84a),39


In [23]:
valid_gestation[valid_gestation.week_of_birth < 37]

Unnamed: 0,deviceUser,birthDate,birthLength,birthWeight,dsn,dueDate,firstName,lastName,gender,healthConditions,plurality,races,userIds,week_of_birth
26,0012b480-63ed-11e8-848e-0e9382159dc5,2018-05-16,19.094488,84,AC000W002457926,2018-06-16,Luka,Mete,male,(notApplicable),twin,(white),(fcd783b8-2134-410e-ba75-61b5aaa51915),35
33,001453da-9bd3-11e8-8afa-0ab3f06930bb,2018-05-14,14.500000,41,AC000W002673785,2018-08-10,Arielle,Dallas,female,(notApplicable),not_applicable,"(other, black)",(8ffa9aad-7f5e-4c35-a9bf-5e97dd3a0bbd),27
34,00152420-e851-11e8-8cf5-0a415a41cd98,2018-10-09,17.700000,60,AC000W002603577,2018-11-15,Isla,Scripture,female,(notApplicable),not_applicable,(white),(10cca583-2b30-420a-b714-c71d45a70ab3),34
70,002ae768-8bbf-11e8-8755-0a3272bbb876,2018-07-17,20.000000,95,AC000W002631264,2018-08-10,Connor,Theriac,male,(notApplicable),not_applicable,(white),(a47cbf1e-98d6-426b-8f8a-06e51d8b2e05),36
104,003f99ea-92dd-11e8-98ca-0a3272bbb876,2018-06-21,19.000000,114,AC000W002648779,2018-07-25,Enzo,Mendoza,male,(notApplicable),not_applicable,(other),(9b86af4b-de4b-4586-8f45-d65a68b26de9),35
123,004a7dbe-4531-11e8-9f4e-0ab3f06930bb,2018-02-21,13.582677,31,AC000W002581651,2018-05-28,Akin,Susumpow,male,(chronic lung disease in infants),not_applicable,(asian),(3ad1ba3c-47af-4acf-9de8-cc714250ab11),26
152,0057b06a-b43b-11e7-aab8-0e9382159dc5,2017-10-09,17.500000,108,AC000W001149020,2017-11-17,Henry,Riley,male,(notApplicable),not_applicable,(white),(e678c735-b9ea-4a58-a545-caf2498551a1),34
156,005d300a-a818-11e7-b4d6-0ee51d704787,2017-07-19,14.000000,34,AC000W001109425,2017-10-16,Oliver,McGuire,male,(notApplicable),not_applicable,(white),(a2f5a266-d78b-47bc-8468-6f3c35d9d4f5),27
160,005e2b76-f620-11e7-b815-0ea451014b92,2017-11-19,15.500000,44,AC000W001055284,2018-01-14,Ivy,Smith,female,(),twin,(),(8014a9e4-7d00-40bb-adb9-87da33fe2b98),32
161,005ec082-6a74-11e7-b18e-0ab553feed59,2017-06-15,18.700000,79,AC000W001136169,2017-07-27,Amelia Jane,Ryland,female,(notApplicable),not_applicable,(white),(f9d33d3c-e2b3-4651-bb1e-3d6e0965a298),34


In [49]:
print(f'percent premature of people that used c.c. and reported birth date and due date: {9050/89181*100:.2f}%')

percent premature of people that used c.c. and reported birth date and due date: 10.15%


#### Age of parent at baby's birth

In [235]:
# need birth day from baby_info based on deviceUser/deviceUserIds 

# Make series,
# cc_baby_info = cc_baby_info[cc_baby_info.userIds != frozenset([])]
parent_bdays = []
for id_ in tqdm_notebook(list(cc_baby_info.userIds.values)):
    id_, = id_
    bday_values = cc_parent_info[cc_parent_info.userId == id_]
    if bday_values.shape[0] != 0:
        birthDate = bday_values.birthDate.values[0]
    else:
        birthDate = ''
    parent_bdays.append(birthDate)

HBox(children=(IntProgress(value=0, max=103511), HTML(value='')))

In [None]:
cc_baby_info['parent_bday'] = parent_bdays
cc_baby_valid = cc_baby_info[(cc_baby_info.birthDate != '')]
cc_baby_valid = cc_baby_valid[(cc_baby_valid.birthDate > date(2015,1,1)) & (cc_baby_valid.birthDate < date(2019,3,31))]

In [278]:
# def parent_age_at_birth(x):
parent_age = cc_baby_valid[(cc_baby_valid.parent_bday != '')]
parent_age = parent_age[(parent_age.parent_bday > date(1955,2,1)) & (parent_age.parent_bday < date(2007,3,1))]
parent_age['parent_age'] = parent_age.birthDate - parent_age.parent_bday

In [279]:
days_to_yr = lambda x : x.days // 365
parent_age.parent_age = parent_age.parent_age.apply(days_to_yr)

In [433]:
parent_age[['deviceUser','userIds','parent_age']]

Unnamed: 0,deviceUser,userIds,parent_age
1,00012bd4-c1ae-11e7-a66d-0ac49f1ecf36,(e6b20f27-193f-4aa1-bf3b-8de4e3b19472),23
2,00020cba-a647-11e8-9949-0e9382159dc5,(1bb5d052-4e26-4510-90a4-8f83ea5f8401),26
3,0003dc14-4f46-11e8-ad7b-0ef7636f67fa,(6afac83e-ba35-4c2e-9c29-7cb88f8e60ac),33
5,0004c630-ad13-11e8-8454-0a3272bbb876,(5900b291-cfc6-4f01-9e95-3c886ec4c44f),19
7,0005f4ce-4a80-11e8-a326-0e9382159dc5,(aa41f25a-69a0-477d-8f74-08bca8cbb6ce),31
8,000646c2-d19f-11e7-91eb-0e3fb1df70d2,(7f2915db-de7b-4680-bdc9-ce5be56f9f9e),34
10,0007c31c-03f8-11e9-bddd-0a460ee905a4,(f9a02270-0e16-4911-b5aa-fabb8dd3d17c),31
11,00083334-a7bd-11e8-9949-0e9382159dc5,(d3cc09cb-d25d-4225-8885-13d21188cf77),41
12,000878b2-3571-11e8-906d-0ea451014b92,(711bbaef-02ba-492a-9b1d-88632a0fddea),30
13,000882fe-b9af-11e8-9fb1-0e9382159dc5,(52b20684-8bfe-43bf-9879-e4a0f569c80a),35


## Baby's born with health conditions

In [228]:
conditions = cc_baby_info[(cc_baby_info.healthConditions != frozenset([])) & (cc_baby_info.healthConditions != frozenset(['notApplicable']))].healthConditions.value_counts()
conditions

(other)                                                                              4954
(congenital heart disease)                                                            844
(chronic lung disease in infants)                                                     768
(chronic lung disease in infants, other)                                              194
(down syndrome)                                                                       175
(other, congenital heart disease)                                                     174
(down syndrome, congenital heart disease)                                              88
(chronic lung disease in infants, congenital heart disease)                            67
(chronic lung disease in infants, other, congenital heart disease)                     41
(other, down syndrome)                                                                 16
(chronic lung disease in infants, congenital heart disease, down syndrome)             10
(other, do

In [233]:
print('Total babies with reported health conditions:',  sum(conditions.values), ';', sum(conditions.values)/103511*100,'%')


Total babies with reported health conditions: 7351 ; 7.101660693066438 %


In [38]:
total = 103511
other = 4952 + 194 + 174 + 41 + 16 + 7 + 7 + 1
CHD = 844 + 174 + 88 + 67 + 41 + 10 + 7 + 7
CLD = 768 + 194 + 67 + 41 + 10 + 7 + 5 + 1
down_syndrome = 175 + 88 + 16 + 10 + 7 + 7 + 5 + 1
print('Reported health conditions:', f'\ncongenital heart disease: {CHD/total*100:.2f}%', f'\nchronic lung disease in infants: {CLD/total*100:.2f}%' , f'\ndown syndrome: {down_syndrome/total*100:.2f}%', f'\nother: {other/total*100:.2f}%')


Reported health conditions: 
congenital heart disease: 1.20% 
chronic lung disease in infants: 1.06% 
down syndrome: 0.30% 
other: 5.21%


In [229]:
cc_baby_info

Unnamed: 0,deviceUser,birthDate,birthLength,birthWeight,dsn,dueDate,firstName,lastName,gender,healthConditions,plurality,races,userIds
0,0001153c-f1a9-11e8-92b8-0ab32160e8e4,2018-11-16,0.000000,0,AC000W002694016,2018-11-16,Wyatt,Bennett,male,(),not_applicable,(),(3780481c-f52a-4089-a768-2ba1239a6cc8)
1,00012bd4-c1ae-11e7-a66d-0ac49f1ecf36,2018-04-07,21.500000,115,AC000W001114798,2018-04-17,Rowan,Rifanburg,male,(notApplicable),not_applicable,(),(e6b20f27-193f-4aa1-bf3b-8de4e3b19472)
2,00020cba-a647-11e8-9949-0e9382159dc5,2018-09-17,20.700000,131,AC000W003377772,2018-09-17,Elian,Garcia,male,(notApplicable),not_applicable,(hispanic),(1bb5d052-4e26-4510-90a4-8f83ea5f8401)
3,0003dc14-4f46-11e8-ad7b-0ef7636f67fa,2018-04-23,20.000000,109,AC000W002429765,2018-04-23,Graham,Guidry,male,(notApplicable),not_applicable,(white),(6afac83e-ba35-4c2e-9c29-7cb88f8e60ac)
4,00042868-9d93-11e8-a629-0a5d62639812,2018-06-06,0.000000,100,AC000W002688083,2018-06-22,Delilah,Ortiz,female,(notApplicable),not_applicable,"(white, hispanic)",(8bfd4977-4d0e-49e2-ba41-b08b95fa85dc)
5,0004c630-ad13-11e8-8454-0a3272bbb876,2018-10-17,14.173228,112,AC000W001203832,2018-10-17,Aria,Harbour,female,(notApplicable),not_applicable,(white),(5900b291-cfc6-4f01-9e95-3c886ec4c44f)
6,0005e982-9821-11e7-8c60-0e9382159dc5,2017-09-03,0.000000,0,AC000W001133798,2017-09-03,M,M,female,(notApplicable),not_applicable,(white),(84950bde-7448-4848-9fe4-78711c275396)
7,0005f4ce-4a80-11e8-a326-0e9382159dc5,2018-04-25,21.000000,140,AC000W001202478,2018-05-06,Dillon,Roose,female,(notApplicable),not_applicable,(white),(aa41f25a-69a0-477d-8f74-08bca8cbb6ce)
8,000646c2-d19f-11e7-91eb-0e3fb1df70d2,2017-11-19,22.000000,127,AC000W001199278,2017-11-11,Nolan,Gullerud,male,(),not_applicable,"(hispanic, white)",(7f2915db-de7b-4680-bdc9-ce5be56f9f9e)
9,00069c24-cf43-11e8-aed5-0ea451014b92,2018-10-18,20.000000,103,AC000W002454532,2018-10-24,Kensley,Robinson,female,(notApplicable),not_applicable,"(black, white, other)",(619e40ac-be8c-4b8c-95da-a54eaf56a84a)
