In [1]:
import os
import pickle
import statistics
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm_notebook
from datetime import date, datetime, timedelta

### *Before running the flatline algorithm, look at the statistics for the 16,000 DSNs to understand what the data looks like*

## Code to run in preparation for flatline algorithm

In [19]:
# Per-round settings, read by position in the next cell:
#   [0] folder name, [1] last day of data (YYYYMMDD), [2] timestamp two weeks before that day.
round_1_inputs = ['16k_round_1', '20190130', '2019-01-16 00:00:00'] # Or '20181017', '2018-10-03 00:00:00'
round_2_inputs = ['16k_round_2', '20181218', '2018-12-04 00:00:00'] # Once we have data: '20190228', '2019-02-14 00:00:00'

In [20]:
# Settings for the round being processed (round 2), unpacked by position:
#   folder                     - data folder name for this round
#   date_data_received         - last day of data, or last day we are using for the study
#   two_weeks_before_received  - needs to be 2 weeks before data was received (or 2019-02-14)
folder, date_data_received, two_weeks_before_received = round_2_inputs
# Only the header row of column_names.txt is used: it supplies the 2 second data column names.
column_names = pd.read_csv('column_names.txt').columns

#### Some of this is only necessary when the rollups are stored by day (instead of 1 file for all of a DSN's data)

In [11]:
# Previous per-day rollup locations, kept for reference:
# all_file_paths = glob(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_day_rollups/*/*.p')
# additional_paths = glob(f'/Volumes/baileyWD/{folder}/16000_day_rollups/*/*.p')
# all_file_paths += additional_paths

# Every ten-minute rollup file on the external drive, in sorted path order.
all_file_paths = sorted(glob('/Volumes/baileyWD/TenMinRollups16000/*/*.pdpkl'))

In [16]:
# Modified for starting with 2 second data that is divided into folders containing data for 2000 dsns
def get_all_dsns(all_file_paths):
    '''Pickle the set of unique dsns found in the rollup file names.

    The dsn is taken to be the last 15 characters of each file name once its
    extension has been removed. For `.p` files this is the same text as the
    previous fixed slice `path[-17:-2]`, but it also works for other extension
    lengths (the paths globbed in this notebook end in `.pdpkl`).

    Parameters:
        all_file_paths: iterable of rollup file paths.

    Side effects:
        Writes the set to `rollups_dsns.p` under the module-level `folder`.
    '''
    dsns = set()
    for path in all_file_paths:
        stem = os.path.splitext(os.path.basename(path))[0]
        dsns.add(stem[-15:])

    # `with` closes the output file even if pickling fails
    with open(f"/Users/brodriguez/Documents/Owlet-code/{folder}/rollups_dsns.p", "wb") as out_file:
        pickle.dump(dsns, out_file)
    

def get_dsn_files(dsn, all_file_paths):
    '''Return, sorted, every path in all_file_paths whose text contains dsn.'''
    matching = [path for path in all_file_paths if dsn in path]
    matching.sort()
    return matching

def all_dsn_files(dsn_list, all_file_paths):
    '''Build a dictionary mapping each dsn in dsn_list to the sorted list of
            file paths that contain that dsn (a progress bar is shown while it runs)'''
    return {dsn: get_dsn_files(dsn, all_file_paths) for dsn in tqdm_notebook(dsn_list)}

def load_all_days(all_days):
    '''Load the pickled dataframes in all_days and combine them into 1 dataframe.

    Parameters:
        all_days: list of paths to pickled dataframes.

    Returns:
        An empty DataFrame when all_days is empty; otherwise the concatenation
        of all loaded frames after clean_df has been applied.
    '''
    frames = []
    for path in all_days:
        # NOTE: pickle can execute code while loading; only use on trusted files.
        # `with` closes each file instead of leaving one open handle per path.
        with open(path, 'rb') as day_file:
            frames.append(pickle.load(day_file))

    if len(frames) == 0:
        return pd.DataFrame()

    return clean_df(pd.concat(frames))

def clean_df(df):
    '''Drop rows whose index falls on 2016-01-01 (00:00:00 through 23:59:59 inclusive)'''
    if df.shape[0] == 0:
        return df
    after_that_day = df.index > '2016-01-01 23:59:59'
    before_that_day = df.index < '2016-01-01 00:00:00'
    return df.loc[after_that_day | before_that_day]

def pickle_full_dfs(dsn_list, all_files):
    '''For each dsn, combine all of its rollup files and pickle the result.

    Parameters:
        dsn_list: dsns to process.
        all_files: dictionary mapping each dsn to its list of file paths.

    Side effects:
        Writes `{dsn}_df.p` into the `16000_dfs` directory of the module-level `folder`.
    '''
    for dsn in tqdm_notebook(dsn_list):
        df_all = load_all_days(all_files[dsn])
        # `with` closes the output file before moving on to the next dsn
        with open(f'/Volumes/baileyWD/{folder}/16000_dfs/{dsn}_df.p', 'wb') as out_file:
            pickle.dump(df_all, out_file)

Get list of all dsns

In [21]:
# get_all_dsns(all_file_paths) # pickles a list of the dsns for this 16000
# Load the previously pickled dsns; `with` closes the file once it has been read.
with open(f"/Users/brodriguez/Documents/Owlet-code/{folder}/rollups_dsns.p", "rb") as dsn_file:
    list_of_dsns = pickle.load(dsn_file)

Get mapping of dsn to filenames

In [17]:
# Map every dsn to its rollup files, then cache the mapping on disk so it can be
# reloaded (commented line below) instead of being rebuilt.
dictionary = all_dsn_files(list_of_dsns, all_file_paths)
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dictionary.p','wb') as dict_file:
    pickle.dump(dictionary, dict_file)
# dictionary = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dictionary.p', 'rb'))

HBox(children=(IntProgress(value=0, max=15997), HTML(value='')))




In [30]:
# NOTE(review): inclusion_dsns is a set, so the order of list(inclusion_dsns) - and
# therefore which dsns a slice picks - may differ between kernel sessions.
list(inclusion_dsns)[5:] #already pickled these

['AC000W000336882',
 'AC000W001090172',
 'AC000W001133941',
 'AC000W002496670',
 'AC000W001118151']

Pickle dataframe for each dsn

In [71]:
# should I just do this for inclusion dsns? or at least do it over night
# Processes one batch of 50 dsns (positions 450-499 of the listed set).
# NOTE(review): slicing list(set) is not a stable way to batch across kernel restarts;
# sorting the dsns first would make batches reproducible.
pickle_full_dfs(list(inclusion_dsns)[450:500], dictionary) 

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Get updated cc and registration data then pickle 

(should only need to be done 1 more time when I have registration data through Feb 2019)

In [6]:
def get_missing_bdays(cc_bdays, registration_bdays):
    '''Return the connected-care birthdays that are absent from the registration data.

    Parameters:
        cc_bdays: DataFrame with `dsn` and `birthDate` (YYYYMMDD string) columns.
        registration_bdays: DataFrame with `dsn` and `birthDate` columns.

    Returns:
        DataFrame with columns `dsn`, `birthDate` and `created_at`, one row per
        cc_bdays row whose (dsn, birthDate) pair does not occur in
        registration_bdays. `created_at` is the birthday itself formatted as
        'YYYY-MM-DD 00:00:00'.
    '''
    # Index the registration birthdays once instead of filtering the whole
    # registration frame for every connected-care row.
    reported = {}
    for reg_dsn, reg_bday in zip(registration_bdays.dsn.values, registration_bdays.birthDate.values):
        reported.setdefault(reg_dsn, set()).add(reg_bday)

    dsns = []
    birthDates = []
    for row in cc_bdays.itertuples():
        if row.birthDate not in reported.get(row.dsn, ()):
            dsns.append(row.dsn)
            birthDates.append(row.birthDate)

    missing_birthdays = pd.DataFrame({'dsn': dsns, 'birthDate': birthDates})

    def get_created_at(x):
        # 'YYYYMMDD' -> 'YYYY-MM-DD 00:00:00'
        return x[:4] + '-' + x[4:6] + '-' + x[6:] + ' 00:00:00'
    missing_birthdays['created_at'] = missing_birthdays.birthDate.apply(get_created_at)

    return missing_birthdays

def save_bday_info(registration, bday_gender, cc_baby_info):
    '''Combine connected-care and registration birthdays and pickle the results.

    Parameters:
        registration: DataFrame with a `dsn` column.
        bday_gender: DataFrame with `dsn`, `property_name`, `val_string` and
            `created_at` columns; rows with property_name 'BIRTHDATE' are used.
        cc_baby_info: DataFrame with `dsn` and `birthDate` columns.

    Side effects:
        Writes '16k_reg_and_cc_bdays.p' (birthdays after 20160101 and before the
        module-level `date_data_received`) and 'all_dsns_with_info.p' (dsns that
        have connected-care birthdays, or both registration and birthday rows)
        in the working directory.
    '''
    def clean_bdays(x):
        # keep the date part of the string and drop the dashes -> 'YYYYMMDD'
        return x[:10].replace('-','')

    # combine bdays from cc & baby profile
    cc_bdays = cc_baby_info[['dsn','birthDate']]
    cc_bdays = cc_bdays.query("birthDate != '--T00:00Z' and birthDate != '201815'")
    # assign() builds a new frame, avoiding the SettingWithCopyWarning raised by
    # setting a column on the filtered slice
    cc_bdays = cc_bdays.assign(birthDate=cc_bdays.birthDate.apply(clean_bdays))
    cc_bdays = cc_bdays.loc[cc_bdays.birthDate != '']

    registration_bdays = bday_gender.loc[bday_gender.property_name == 'BIRTHDATE', ['dsn','val_string','created_at']].copy()
    registration_bdays.columns = ['dsn','birthDate','created_at']

    # only want to use "missing birthdays"
    missing_birthdays = get_missing_bdays(cc_bdays, registration_bdays)

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides
    all_bdays = pd.concat([missing_birthdays, registration_bdays])
    all_bdays = all_bdays.drop_duplicates()
    all_bdays = all_bdays[(all_bdays.birthDate > '20160101') & (all_bdays.birthDate < date_data_received)]

    with open('16k_reg_and_cc_bdays.p','wb') as bday_file:
        pickle.dump(all_bdays, bday_file)

    # Pickle dsns that have cc or both reg+bday info
    dsns_with_info = set(registration_bdays.dsn.values).intersection(registration.dsn.values).union(cc_bdays.dsn.values)
    with open('all_dsns_with_info.p','wb') as info_file:
        pickle.dump(dsns_with_info, info_file)

In [12]:
# registration = pd.read_csv('/Users/brodriguez/Documents/parsed_registration_pre_20190113.csv', compression='gzip', index_col=0)
# bday_gender = pd.read_csv('/Users/brodriguez/Documents/parsed_datapoint_pre_20190113.csv', compression='gzip', index_col=0)
# cc_baby_info = pickle.load(open("cc_baby_info_may30_2019.p", "rb"))

# save_bday_info(registration, bday_gender, cc_baby_info)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Load birthday dataframe (contains valid birthdays from registration data and connected care data)

In [5]:
# Birthdays found in registration and connected care data.
# NOTE(review): save_bday_info writes '16k_reg_and_cc_bdays.p' while this cell reads
# '16k_reg_and_cc_bdays4.p' - confirm how the '4' file was produced.
with open('16k_reg_and_cc_bdays4.p','rb') as bday_file:
    baby_info = pickle.load(bday_file)


In [7]:
def count_bdays(df):
    '''Return a dictionary mapping each dsn to the set of reported birthdays.

    Parameters:
        df: DataFrame with `dsn` and `birthDate` columns.

    Returns:
        dict of dsn -> set of birthDate values, with keys in the order given by
        df.dsn.value_counts().
    '''
    # Group once so each dsn is a lookup rather than a scan of the whole frame.
    bdays_by_dsn = df.groupby('dsn', sort=False).birthDate
    dsn_bday_values = {}
    for dsn in tqdm_notebook(list(df.dsn.value_counts().index)):
        dsn_bday_values[dsn] = set(bdays_by_dsn.get_group(dsn).values)

    return dsn_bday_values
        
def dsns_with_1_bday(dsn_bday_values):
    '''Return a list of the dsns that have exactly 1 reported birthday'''
    return [dsn for dsn, bdays in tqdm_notebook(dsn_bday_values.items()) if len(bdays) == 1]
            
def dsns_with_many_bdays(dsn_bday_values):
    '''Return a list of the dsns that have more than 5 reported birthdays'''
    # With more than 5 reported birthdays, I've noticed many are devices being used in the office
    return [dsn for dsn, bdays in tqdm_notebook(dsn_bday_values.items()) if len(bdays) > 5]

Save list of dsns with 1 bday and dsns with > 5 bdays (very few have more than 5 and those that do seem to be Owlet testing devices)

In [364]:
dsn_bday_values = count_bdays(baby_info)
dsns_1_bday = dsns_with_1_bday(dsn_bday_values)
dsns_many_bdays = dsns_with_many_bdays(dsn_bday_values)

# Persist both lists for this round; `with` closes each file after writing.
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_1_bday.p', 'wb') as one_bday_file:
    pickle.dump(dsns_1_bday, one_bday_file)
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_more_than_5_bdays.p', 'wb') as many_bday_file:
    pickle.dump(dsns_many_bdays, many_bday_file)

List of dsns to use in flatline algorithm

In [6]:
# Inclusion set = dsns that have rollups AND birthday/registration info AND are V2 devices,
# minus the dsns with more than 5 reported birthdays.
with open('all_dsns_with_info.p','rb') as info_file:
    dsns_with_info = pickle.load(info_file) # includes cc dsns
with open('/Users/brodriguez/Documents/Owlet-code/V2_monitoring_data/v2_devices.p', 'rb') as v2_file:
    v2_devices = pickle.load(v2_file)
inclusion_dsns = set(list_of_dsns).intersection(dsns_with_info).intersection(v2_devices)

with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_more_than_5_bdays.p', 'rb') as many_bday_file:
    more_than_5_bdays = set(pickle.load(many_bday_file)).intersection(inclusion_dsns)
# In-place set difference; same effect as removing each dsn one by one
inclusion_dsns -= more_than_5_bdays

with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p', 'wb') as inclusion_file:
    pickle.dump(inclusion_dsns, inclusion_file)


In [76]:
# Number of entries loaded from v2_devices.p.
len(v2_devices)

266415

In [52]:
# V2 dsns that have rollups but no registration / connected-care info
no_registration = (list_of_dsns.intersection(v2_devices) - dsns_with_info) #(list_of_dsns - dsns_with_info) - v2_devices
# Of those, keep the dsns with any rollup row after 2017-02-01 that has more than 2 valid readings
no_reg_but_data = []
for dsn in tqdm_notebook(no_registration):
    # `with` closes each dataframe file instead of leaving one handle open per dsn
    with open(f'/Volumes/baileyWD/{folder}/16000_dfs/{dsn}_df.p', 'rb') as df_file:
        df = pickle.load(df_file)
    df = df.loc[(df.index > '2017-02-01 00:00:00') & (df.valid_count > 2)]
    if df.shape[0] > 0:
        no_reg_but_data.append(dsn)

HBox(children=(IntProgress(value=0, max=492), HTML(value='')))

In [57]:
# Spot check: is this single dsn part of the V2 device set?
'AC000W000660989' in v2_devices #....?

False

In [59]:
# Number of dsns with rollups but no registration / connected-care info (before the V2 filter).
len(list_of_dsns - dsns_with_info)

3572

In [10]:
# Scratch arithmetic. NOTE(review): the origin of these two counts is not recorded in
# this notebook - label the operands or delete the cell.
2875-1251

1624

In [58]:
# Report how many V2 dsns have no registration / connected-care info.
print('No registration', len(no_registration))
# print('No registration but there is data', len(no_reg_but_data))

No registration 492


In [56]:
# Report how many of those dsns still have valid rollup rows after 2017-02-01.
print('No registration and data extends past Feb 2017', len(no_reg_but_data))

No registration and data extends past Feb 2017 395


In [98]:
# Display the unregistered dsns that still have data after Feb 2017.
no_reg_but_data # IF THESE ARE IN AYLA I HAVE TO GET THEM SOMEHOW!
# Some started using in 2016, probably switched from V1 to V2..
    # What if the last use was after Feb 2017 and the baby was still < 1 yr

['AC000W000339118',
 'AC000W000330979',
 'AC000W000241618',
 'AC000W000504481',
 'AC000W000671328',
 'AC000W000453330',
 'AC000W000419009',
 'AC000W000329352',
 'AC000W000453120',
 'AC000W000659019',
 'AC000W002452911',
 'AC000W001038942',
 'AC000W000342529',
 'AC000W000420511',
 'AC000W000338336',
 'AC000W000658970',
 'AC000W000338718',
 'AC000W000668797',
 'AC000W000336468',
 'AC000W000459367',
 'AC000W000662455',
 'AC000W000671237',
 'AC000W000654855',
 'AC000W000336766',
 'AC000W000418335',
 'AC000W000098642',
 'AC000W001038950',
 'AC000W000333939',
 'AC000W000672733',
 'AC000W001040398',
 'AC000W000458040',
 'AC000W000235985',
 'AC000W000446039',
 'AC000W001083775',
 'AC000W000327731',
 'AC000W000256077',
 'AC000W000504703',
 'AC000W000400637',
 'AC000W000261304',
 'AC000W000233644',
 'AC000W000235767',
 'AC000W000458616',
 'AC000W001038530',
 'AC000W001039039',
 'AC000W000503586',
 'AC000W000253410',
 'AC000W000323711',
 'AC000W001112604',
 'AC000W000419990',
 'AC000W000670478',


Get locations for inclusion dsns

In [20]:
location_data = pd.read_csv('/Users/brodriguez/Documents/Owlet-code/GPS_locations_Mar_2019.csv', compression='gzip')
def dsn_in_16000(x):
    '''Return True when the row's dsn is one of the inclusion dsns'''
    return x['dsn'] in inclusion_dsns

# Location info for only the 16000 dsns (otherwise the df is too big and it slows everything down)
# isin() tests the whole column at once instead of calling dsn_in_16000 once per row
in_16000 = location_data['dsn'].isin(inclusion_dsns)
location_data = location_data.loc[in_16000].sort_values(by='created_at')

with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p', 'wb') as location_file:
    pickle.dump(location_data, location_file)

  mask |= (ar1 == a)


# Flatline Algorithm

In [7]:
def get_df_day(df_all, day):
    '''Return the rows of df_all from the 2 hours up to and including `day`.

    Both ends of the window are inclusive; if fewer than 2 hours of data exist
    before `day`, whatever falls inside the window is returned.
    '''
    window_start = day - pd.Timedelta(hours=2)
    in_window = (df_all.index >= window_start) & (df_all.index <= day)
    return df_all[in_window]
    
def find_last_days(df_all, dsn_bdays, dsns_1_bday):
    '''Return the timestamps treated as a "last day of use" for one dsn.

    Parameters:
        df_all: non-empty rollup DataFrame for a single dsn, with a `dsn` column
            and an index that can be compared with date strings
            (presumably a sorted DatetimeIndex - confirm against the rollup files).
        dsn_bdays: DataFrame of this dsn's reported birthdays with a string
            `created_at` column ('YYYY-MM-DD HH:MM:SS').
        dsns_1_bday: collection of dsns that have only 1 unique birthday.

    Returns:
        list of index values from df_all. For a dsn with a single birthday this
        is just the final index value. Otherwise it is the final index value
        plus, for each earlier birthday report, the last reading that falls
        between that report and the day the next birthday was reported.
        Duplicates are removed; the order of the list is not meaningful.
    '''
    # if there is only 1 unique bday: return only very last day of use
    # .iloc[0] is explicitly positional; `df_all.dsn[0]` depends on the deprecated
    # integer-key fallback when the index is not integer-labelled
    if df_all.dsn.iloc[0] in dsns_1_bday:
        return [df_all.index[-1]]

    # Loop through birthdays by created_at date (descending) take last use before next birthday was reported
    dsn_bdays = dsn_bdays.sort_values(by='created_at', ascending=False)
    prev_bday = ''
    last_days = []
    for row in dsn_bdays.itertuples():
        if prev_bday == '':
            # most recently reported birthday: its last use is the end of the data
            last_days.append(df_all.index[-1])
        else:
            bday_data = df_all.loc[(df_all.index < prev_bday) & (df_all.index >= row.created_at)]
            if bday_data.shape[0] != 0:
                last_days.append(bday_data.index[-1])
        prev_bday = row.created_at[:10] # Dont keep time: want entire day to belong to the bday reported that day
    # make sure there are not duplicate last days of use
    return list(set(last_days))
    
def possible_flatline(df, low_HR_thres, high_HR_thres, low_O2_thres, valid_thres):
    '''Find critical events (high or low heart rate or low oxygen) in the data given.

    Parameters:
        df: rollup rows with `valid_count`, `count`, `heart_rate_raw_min`,
            `heart_rate_raw_max`, `oxygen_raw_min` and `oxygen_avg_min` columns.
        low_HR_thres: a raw minimum heart rate below this counts as low
            (only together with a raw minimum oxygen below 90).
        high_HR_thres: a raw maximum heart rate above this counts as high.
        low_O2_thres: a raw minimum oxygen below this counts as low.
        valid_thres: minimum valid_count / count ratio for a row to be trusted.

    Returns:
        One of the strings 'False', 'True', 'True, instantaneous',
        'True, many instantaneous' or 'True, low baseline'.
    '''
    valid_percent = df.valid_count/df['count']

    # We don't alert for low HR unless O2 is also low.
    low_hr = (df.heart_rate_raw_min < low_HR_thres) & (df.oxygen_raw_min < 90)
    low_o2 = df.oxygen_raw_min < low_O2_thres
    high_hr = df.heart_rate_raw_max > high_HR_thres
    critical_vitals = low_hr | low_o2 | high_hr

    if not (critical_vitals & (valid_percent >= valid_thres)).any():
        return 'False'

    # Use the caller's high-HR threshold here as well (this was a hard-coded 220,
    # which only agreed with the trigger above for the default threshold).
    hr_never_high = (df.heart_rate_raw_max < high_HR_thres).all()

    # Low HR won't be valid if the O2 is not actually dropping
    if hr_never_high and (df.oxygen_avg_min >= 90).all():
        # Check how many times oxygen_raw_min was below 60
        if (df.oxygen_raw_min < 60).sum() >= 5:
            # For cases of Oxygen Noise Index (indicating bad hardware)
            return 'True, many instantaneous'
        return 'True, instantaneous'

    if hr_never_high and (df.oxygen_avg_min < 90).all():
        # (df.oxygen_avg_mean.mean() < 93) this one makes nearly everything low baseline..
            # meaning when they are fluctuating it still says low baseline, don't want that...
        # this category is just a "nice to know"
        return 'True, low baseline'

    return 'True'
    
def last_vitals_2(df):
    '''Label the vitals at the end of df.

    Looks at the last three rows that have any valid readings (30 min or less)
    and returns 'not valid' for an empty df, otherwise the first match of
    'low HR', 'high HR', 'low O2', falling back to 'Good vitals'.
    '''
    if df.shape[0] == 0:
        return 'not valid'

    recent = df.loc[df.valid_count > 0].iloc[-3:] # last 30 min or less
    lowest_hr = recent.heart_rate_avg_min.min()
    # the high HR check only uses the last two of those rows (dont want 30 min)
    highest_hr = recent.iloc[-2:].heart_rate_avg_max.max()
    lowest_avg_o2 = recent.oxygen_avg_min.min()
    lowest_raw_o2 = recent.oxygen_raw_min.min()

    if (lowest_hr < 60) & (lowest_avg_o2 < 80): # TODO O2 threshold here may be too low
        return 'low HR'
    if highest_hr > 220:
        return 'high HR'
    # if o2 < 70 make extra low o2 category?
    if (lowest_raw_o2 < 80) & (lowest_avg_o2 < 85): # Avg min? < 90?
        return 'low O2'
    return 'Good vitals'
    
def at_risk(df):
    '''Return True when the running total of base_state_7 never reaches 144.

        The original note describes 144 as at least 24 hours of base 7 data; if that
        much is missing this could be bad hardware or a baby that has a preexisting condition'''
    reached_144 = df.base_state_7.cumsum() >= 144
    return not reached_144.any()

def had_cc(dsn, cc_dsns):
    '''Return 1 if the dsn is among the connected care dsns, otherwise 0'''
    return 1 if dsn in cc_dsns else 0
        
def in_US(day, location_df, no_reg):
    '''Return True if the latest location recorded on or before `day` is the US.

    Unregistered devices (no_reg) are always treated as US. With no location on
    or before `day` the location is unknown and False is returned. The last
    matching row is used, so location_df is expected to be ordered by created_at.
    '''
    if no_reg:
        return True

    cutoff = str(day)
    seen_so_far = location_df[location_df.created_at <= cutoff]
    if len(seen_so_far) == 0:
        # Don't know the location
        return False
    return bool(seen_so_far.cc.values[-1] == 'US')
    
def baby_age(day, baby_df, no_reg):
    '''Return (age on `day`, birthday string) using the most recently reported birthday.

    Only birthdays reported on or before `day` are considered. Unregistered
    devices (no_reg) get a placeholder age of 1 day, and a device with no
    birthday reported by `day` gets a placeholder age of -1 day; both come with
    an empty birthday string.
    '''
    # Still would need to modify the birthday data to have a column with the reported date
    if no_reg:
        return pd.Timedelta(days=1), ''

    reported = baby_df[baby_df.created_at <= str(day)] # What was reported before the last use
    if len(reported) == 0:
        return pd.Timedelta(days=-1), ''

    # Use the last reported bday
    newest_report = max(reported.created_at.values)
    bday = reported.loc[reported.created_at == newest_report, 'birthDate'].values[0]

    born = date(int(bday[:4]), int(bday[4:6]), int(bday[6:]))
    return day.date() - born, bday # diff between birthday and day of incident

def dead_battery(df):
    '''Return True when battery_level in the last row of df is 0'''
    final_level = df.iloc[-1].battery_level
    return bool(final_level == 0)
    
def find_critical_events(dsn_list, location_data, baby_info, dsns_1_bday, low_HR_thres=60, high_HR_thres=220, O2_thres=70, valid_thres=.4, no_reg=False):
    '''Find all last use cases and classify critical events.

    For every dsn the pickled rollup dataframe is loaded, candidate last-use
    timestamps come from find_last_days, and each one is classified with
    possible_flatline / last_vitals_2 when the baby was under 1 year old, the
    timestamp lies inside the study window and the device was in the US.
    Last uses of babies aged 1 year or more are recorded as non-critical rows.

    Reads the module-level `folder` and `two_weeks_before_received` settings and
    the local file 'cc_dsns.p'.

    Parameters:
        dsn_list: dsns to process.
        location_data: DataFrame with `dsn`, `created_at` and `cc` columns.
        baby_info: DataFrame with `dsn`, `birthDate` and `created_at` columns.
        dsns_1_bday: dsns that have only 1 reported birthday.
        low_HR_thres, high_HR_thres, O2_thres, valid_thres: thresholds passed
            on to possible_flatline.
        no_reg: True for devices without registration data; age and location
            checks then use the placeholder values of baby_age / in_US.

    Returns:
        DataFrame with columns dsn, date, critical_event, last_10_minutes, cc,
        at_risk_or_issues, baby_age and birthday, sorted by date and holding
        only the latest row for each (dsn, birthday) pair.
    '''
    count_total = 0
    count = 0
    classifications = []
    one_year = pd.Timedelta(days=365)

    # NOTE: pickle can execute code while loading; only use on trusted files.
    with open('cc_dsns.p', 'rb') as cc_file:
        cc_dsns = pickle.load(cc_file) # dsns with connected care

    for dsn in tqdm_notebook(dsn_list):
        # sorted/duplicate timestamps have been dropped (rollup algorithm)
        with open(f'/Volumes/baileyWD/{folder}/16000_dfs/{dsn}_df.p', 'rb') as df_file:
            df_all = pickle.load(df_file)
        df_all = df_all.drop_duplicates()
        at_risk_or_hardware = at_risk(df_all)
        valid_df = df_all.loc[df_all.valid_count > 2] # > 2 so we don't miss possible cases, but also dont base critical event on 1 reading

        # df could be empty! if so, skip it
        if valid_df.shape[0] == 0:
            continue

        dsn_location = location_data.loc[location_data.dsn == dsn]
        dsn_baby_info = baby_info.loc[baby_info.dsn == dsn]
        gaps = find_last_days(valid_df, dsn_baby_info, dsns_1_bday)
        cc = had_cc(dsn, cc_dsns)

        for day in gaps:
            # Calculate age of baby and dont check for critical events if they are older than 1
            age, bday = baby_age(day, dsn_baby_info, no_reg)
            count_total += 1
            if (age < one_year) and (age >= pd.Timedelta(days=0)):
                count += 1
                # if day is w/in 2 weeks of when Tanner did the rollups - disregard.
                in_study_window = (str(day) <= two_weeks_before_received) and (str(day) >= '2017-01-31 23:59:59')
                # Check if in the US
                if in_study_window and in_US(day, dsn_location, no_reg):
                    df_day = get_df_day(valid_df, day)
                    flatline = possible_flatline(df_day, low_HR_thres, high_HR_thres, O2_thres, valid_thres)
                    vitals = last_vitals_2(df_day)
                    classifications.append((dsn, day.date(), flatline, vitals, cc, at_risk_or_hardware, age, bday))

            elif age >= one_year:
                # add row for older babies so we know the actual last day of use
                classifications.append((dsn, day.date(), 'False', 'Good vitals', cc, at_risk_or_hardware, age, bday))

    print('total babies with last day', count_total)
    print('total babies < 1 on last day', count)
    df_columns = ['dsn', 'date', 'critical_event', 'last_10_minutes', 'cc', 'at_risk_or_issues', 'baby_age', 'birthday']
    df_classified = pd.DataFrame(classifications, columns=df_columns)

    # Get rid of multiple last days for individual babies
    df_classified = df_classified.sort_values(by='date').drop_duplicates(subset=['dsn', 'birthday'], keep='last')

    return df_classified

#### Functions for sock off and signal loss

In [19]:
def load_2sec(dsn, day, time_index=False):
    '''Load the 2 second data for the given dsn and day.

    Round 1 files are zip archives and later rounds are gzip files (chosen from
    the module-level `folder`). Rows are ordered by timestamp, duplicate
    timestamps and rows with ble_rssi == 0 are dropped, and the row index is
    renumbered. With time_index=True the timestamp becomes the index.
    '''
    if folder == '16k_round_1':
        raw = pd.read_csv(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_2sec/{day}/{dsn}.csv.zip',names=column_names)
    else:
        raw = pd.read_csv(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_2sec/{day}/{dsn}.csv.gz',compression='gzip',names=column_names)

    # timestamps are stored as seconds since the epoch
    raw['timestamp'] = pd.to_datetime(raw.timestamp, unit='s')
    ordered = raw.sort_values(by=['timestamp']).drop_duplicates('timestamp')
    cleaned = ordered.loc[ordered.ble_rssi != 0].reset_index(drop=True)

    return cleaned.set_index('timestamp') if time_index else cleaned

def lost_signal(df):
    '''Return the index label of the last row at which at least 15 of the most recent
    16 notification_mask values were 0, or 0 when no row qualifies'''
    zero_run = df.notification_mask.eq(0).rolling(window=16).sum().fillna(0)
    qualifying = zero_run[zero_run >= 15]
    return qualifying.index[-1] if qualifying.shape[0] != 0 else 0

def multiple_sock_off(df, thresh=-.05):
    '''Find the index labels at which the sock appears to have been taken off.

    A candidate is any point where the 300-sample rolling mean of the change in
    skin_temperature drops below `thresh`. For each candidate the preceding
    275 samples are searched for where a run of -1 steps begins.

    Parameters:
        df: DataFrame with a `skin_temperature` column. Index labels are also
            used as positions, so a 0..n-1 index is expected (as produced by
            reset_index(drop=True)).
        thresh: rolling-mean threshold that marks a temperature drop.

    Returns:
        list of index labels (possibly empty).
    '''
    diff_new = df.skin_temperature.diff().fillna(0)
    off_indices = []
    if diff_new.empty:
        # nothing to scan; min() below would raise on an empty sequence
        return off_indices
    # .bfill() is the supported spelling of the deprecated fillna(method='bfill')
    diff_rolling = diff_new.rolling(300).mean().bfill() # different window?
    # find ALL indices of min..
    if min(diff_rolling) < thresh: # < must match one for less_than
        less_than = diff_rolling[diff_rolling < thresh]
        
        # 0 accounts for first time it is below threshold, > 30 accounts for multiple sock offs
        drop_indices = [0] + list(np.where(np.diff(less_than.index.values) > 30)[0] + 1)
        drop_indices = less_than.index.values[drop_indices]
        
        # For each index:
        for i in drop_indices:
            if i == 0:
                off_indices.append(0)
            else:
                # look back at most 275 samples and keep only the samples where the temperature changed
                off = diff_new.iloc[max(0,(i-275)):i]
                off = off.loc[off != 0]
                
                if off.shape[0] == 0:
                    off_indices.append(i)#??????
                else:
                    # - 3 because window = 4, max(0, ) because if idxmax < 3 you get a negative index
                    off_index_reset = max(0, off.eq(-1).reset_index().skin_temperature.rolling(window=4).sum().fillna(0).idxmax() - 3)
                    count_neg_1 = off.iloc[off_index_reset:].eq(-1).value_counts()

                    #  if there are no False or no True:
                    if len(count_neg_1) == 1:
                        if count_neg_1.index[0]:
                            off_index = off.index[off_index_reset]
                            off_indices.append(off_index)
                    elif count_neg_1[True]/(count_neg_1[False] + count_neg_1[True]) >= .7: # What threshhold?
                        off_index = off.index[off_index_reset] # This is the index where the sock came off
                        off_indices.append(off_index)
                    
    return off_indices

def mvmt(x):
    '''Return bit 1 (the second least significant bit) of x, or 1 when x is 0.

    The previous string-based version, int(bin(x)[-2]), raised ValueError for
    x == 1 because bin(1) is '0b1' and the character read was 'b'.
    '''
    if x == 0:
        return 1
    # abs() keeps the sign-magnitude reading that bin() gave for negative values
    return int((abs(x) >> 1) & 1)

def signal_sock(x):
    '''Determine if the sock came off before the signal was lost

    Parameters:
        x: row with `dsn` and `date` fields; the date is turned into YYYYMMDD
           to locate the 2 second data file.

    Returns one of the strings 'Need data' (no 2 second file found), 'sock off',
    'data cut off', 'unknown', 'signal lost before sock off' or 'battery died'.
    '''
    five_min = pd.Timedelta(minutes=5) #  is 5 min right choice
    try:
        df = load_2sec(x['dsn'], str(x['date']).replace('-','')) 
    except FileNotFoundError:
        return 'Need data'
    
    # Keep rows with base_state > 3 and a non-zero raw heart rate
    # (presumably the readings taken while not charging - confirm the base_state codes)
    df_non_charging = df.loc[(df.base_state > 3) & (df.heart_rate_raw != 0)]
    df_non_charging.reset_index(drop=True, inplace=True)
    
    # Any skin temperature strictly between 0 and 120 marks the sensor as broken
    broken_sensor = any((df_non_charging.skin_temperature < 120) & (df_non_charging.skin_temperature > 0))
    
    # time_values maps the renumbered row labels to timestamps
    time_values = df_non_charging.timestamp
    last_reading = df_non_charging.timestamp.values[-1]
    
    # loss: row label returned by lost_signal (0 when nothing qualifies)
    # sock: row labels returned by multiple_sock_off
    loss = lost_signal(df_non_charging)
    sock = multiple_sock_off(df_non_charging)
    
    # does the sock come off within 15 minutes of signal loss?
    # (base_state of the last row, in the unfiltered data, up to 15 minutes after the loss time)
    last_base_state = df.loc[df.timestamp <= time_values[loss] + pd.Timedelta(15,'m')].iloc[-1].base_state 
#     print('last base state',last_base_state, 'signal loss', time_values[loss])
    # make timestamp the index
    df_non_charging.set_index('timestamp',inplace=True)
    # mvmt value for every reading strictly between the loss time and the last reading
    mvmt_flag = df_non_charging.loc[(df_non_charging.index > time_values[loss]) & (df_non_charging.index < last_reading)].notification_mask.apply(mvmt) # are thresholds good?
    
    # if lost signal is within 1 min of last reading (before charging) return 'data cut off' 
    if abs(time_values[loss] - last_reading) < pd.Timedelta(minutes=1): # is 1 min good?
        if last_base_state == 3:
            return 'sock off'
        else:
            return 'data cut off'
        
    elif broken_sensor == True:
        if last_base_state == 3:
            return 'sock off'
        else:
            return 'unknown'
    
    # if sock comes off w/in ~5 min of lost signal return 'sock off'
    # sock off has to be at the end not beginning***
    elif len(sock) > 0: 
        for sock_off in sock:
            # if sock off is way before loss no sock off...
            # (sock_off and loss are row labels, so 1800 rows; 30 minutes assumes one row every 2 seconds with no gaps)
            if abs(sock_off - loss) > 1800: # Don't care about sock off more than 30 minutes before the signal was lost
                # if the sock came off before going on again
                pass
            elif (sock_off < loss) or (abs(time_values[sock_off] - time_values[loss]) <= five_min): # is 5 minutes good?
                return 'sock off' 
            else:
                # if time btw loss and sock off has movement > 20% of the time => cut off 
                # NOTE(review): fillna(method='bfill') is deprecated in recent pandas; .bfill() is the replacement
                loss_to_off = mvmt_flag.loc[mvmt_flag.index < (time_values[sock_off] - pd.Timedelta(minutes=5))].rolling(window=100).sum().fillna(method='bfill')
                if loss_to_off.min() < 15: # 20% or more?
                    return 'signal lost before sock off'
                else:
                    return 'sock off' # sock off or data cut off?
        # If every sock-off was skipped above, execution continues with the movement check
        # below; the battery check is not reached because it is part of this elif chain.
    
    elif dead_battery(df_non_charging):
        # Check for dead battery after checking for sock off (if battery dies after sock off we don't care)
        return 'battery died'
    
    # if time btw loss and last reading has movement > 20% => cut off
    # NOTE(review): fillna(method='bfill') is deprecated in recent pandas; .bfill() is the replacement
    loss_to_last = mvmt_flag.rolling(window=100).sum().fillna(method='bfill')
    if loss_to_last.min() > 15: # 20% or more?
        if last_base_state == 3:
            return 'sock off'
        else:
            return 'data cut off'
    
    if last_base_state == 3:
        return 'sock off'
    return 'signal lost before sock off'


#### Function for identifying possible expected deaths

In [None]:
def expected(x):
    '''Placeholder for identifying possible expected deaths; not implemented yet (returns None).'''
    # Planned steps:
    # Load rollup df
    # identify how many days the device was used total
    # What is the range of days used
    # Return whether or not it looks like the baby had preexisting condition
    pass

# Run Algorithm

In [27]:
# Load all pickled files if the kernel has been restarted
# (`with` closes each file as soon as it has been read)
folder = '16k_round_2'
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p', 'rb') as pickled:
    inclusion_dsns = pickle.load(pickled)
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p', 'rb') as pickled:
    location_data = pickle.load(pickled)
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_1_bday.p', 'rb') as pickled:
    dsns_1_bday = pickle.load(pickled)
with open('16k_reg_and_cc_bdays4.p','rb') as pickled:
    baby_info = pickle.load(pickled)

In [60]:
# Classify the unregistered dsns. no_reg_but_data is also passed as dsns_1_bday, so every
# dsn is treated as having a single birthday and only its very last reading is examined;
# no_reg=True makes baby_age / in_US return their placeholder values.
df_classified_no_reg = find_critical_events(no_reg_but_data, location_data, baby_info, no_reg_but_data, no_reg=True) 
# df_classified = find_critical_events(inclusion_dsns, location_data, baby_info, dsns_1_bday) 


HBox(children=(IntProgress(value=0, max=395), HTML(value='')))

total babies with last day 395
total babies < 1 on last day 395


In [62]:
# Unregistered dsns with a critical event, abnormal final vitals and no at-risk/hardware flag.
df_classified_no_reg[(df_classified_no_reg.critical_event !='False') & 
              (df_classified_no_reg.last_10_minutes !='Good vitals') &
              (df_classified_no_reg.at_risk_or_issues == False)]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
285,AC000W002496826,2018-04-30,True,low O2,0,False,1 days,
153,AC000W001204959,2018-07-15,"True, low baseline",low O2,0,False,1 days,
209,AC000W001111840,2018-08-24,True,low O2,0,False,1 days,


In [42]:
# (rows, columns) of the classified last-use table.
df_classified.shape

(1210, 8)

In [24]:
# Reload the saved round 2 classification results (`with` closes each file after reading).
folder = '16k_round_2'
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_classified_jun_28.p', 'rb') as pickled:
    df_classified = pickle.load(pickled)
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_critical_jun_28.p', 'rb') as pickled:
    df_critical = pickle.load(pickled)
# df_classified_no_reg = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_classified_no_reg_jul_11.p', 'rb'))


In [28]:
# Spot check: birthdays on record for one dsn.
baby_info[baby_info.dsn=='AC000W000503823']

Unnamed: 0,dsn,birthDate,created_at


In [26]:
# Spot check: classification rows for the same dsn.
df_classified[df_classified.dsn == 'AC000W000503823']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday


In [76]:
# df_classified[(df_classified.birthday > '20170222') & (df_classified.birthday < '20171017')].dsn.value_counts()#[df_classified.dsn == 'AC000W001041852']


In [77]:
# df_critical[(df_critical.birthday > '20170222') & 
#             (df_critical.birthday < '20171017') &
#             (df_critical.last_10_minutes != 'Good vitals')]#.dsn.value_counts()

In [11]:
# Spot check: critical-event row for one dsn.
df_critical[df_critical.dsn == 'AC000W001129425']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
889,AC000W001129425,2018-10-06,True,high variance,1,False,85 days,20180713,sock off


Test a few dsns:

In [262]:
# dsns flagged as critical events even though their final vitals were good and no at-risk flag was set.
# NOTE(review): `@true` refers to the list `true`, which is only defined in later cells of
# this notebook, so this cell fails on a fresh top-to-bottom run.
good_vital_dsns = list(df_classified.query("critical_event in @true and last_10_minutes == 'Good vitals' and at_risk_or_issues == False").dsn.value_counts().index)

In [658]:
# Birthdays that were after the data collection 
# bdays_after_nov_2018 = baby_info.query("birthDate > '20181130'")
# for dsn in inclusion_dsns:
#     results = bdays_after_nov_2018.query('dsn == @dsn').birthDate.values
#     if len(results) > 0:
#         print(results)

In [1]:
# Re-run the classification on round 1 data for a hand-picked list of dsns.
# Setting `folder` here also changes where find_critical_events looks for the dsn dataframes.
with open(f'16k_reg_and_cc_bdays4.p', 'rb') as pickled:
    baby_info = pickle.load(pickled)
folder = '16k_round_1'
with open(f'/Volumes/baileyWD/{folder}/16000_location_data.p', 'rb') as pickled:
    loc_round1 = pickle.load(pickled)
with open(f'/Volumes/baileyWD/{folder}/dsns_1_bday.p', 'rb') as pickled:
    one_bday_round1 = pickle.load(pickled)
dsns = ['AC000W002497208']#good_vital_dsns
df_classify_round1 = find_critical_events(dsns, loc_round1, baby_info, one_bday_round1)


In [40]:
# Display the round 1 classification result.
df_classify_round1

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
0,AC000W002432785,2017-12-22,"True, low baseline",low O2,1,True,151 days,20170724


In [362]:
# Round 1 rows whose final vitals were not good.
df_classify_round1[df_classify_round1.last_10_minutes != 'Good vitals']#.sort_values(by='dsn').iloc[0:]#.drop_duplicates(['dsn','date'])

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
92,AC000W001042848,2017-04-30,"True, low baseline",low O2,0,False,7 days,20170423
17,AC000W001053602,2017-07-25,True,low O2,0,False,8 days,20170717
167,AC000W001057901,2017-10-08,"True, low baseline",low O2,0,False,187 days,20170404
157,AC000W001115978,2017-11-27,True,low O2,1,False,55 days,20171003
75,AC000W001102134,2018-01-11,True,low O2,0,False,263 days,20170423
3,AC000W001041852,2018-02-26,True,low O2,1,False,324 days,20170408
2,AC000W001041852,2018-05-03,True,low O2,1,False,93 days,20180130
117,AC000W001134190,2018-05-21,"True, low baseline",low O2,0,False,312 days,20170713
65,AC000W001180258,2018-05-23,False,low O2,1,False,342 days,20170615
170,AC000W002663450,2018-07-10,True,low O2,0,False,154 days,20180206


In [51]:
# critical_event labels treated as a critical event for the round 1 check
true = ['True', 'True, low baseline', 'True, instantaneous']
# .copy() gives an independent frame, so adding the column below does not write to a
# slice of df_classify_round1 (which triggers SettingWithCopyWarning)
df_critical_round1 = df_classify_round1.query('critical_event in @true').copy()
df_critical_round1['signal_sock'] = df_critical_round1.apply(signal_sock, axis=1)

last base state 4


In [383]:
# Display the round 1 critical events with their signal/sock classification.
df_critical_round1

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
0,AC000W002449423,2018-10-28,True,low O2,1,False,262 days,20180208,sock off


#### Get 2 second data for critical events before further classification

In [25]:
# Don't apply signal_sock to the whole df, just to the critical events
# (the 2-second data has to be on this computer beforehand).
true = ['True', 'True, low baseline', 'True, instantaneous', 'True, many instantaneous']
# TODO include False w/ last 10 min low o2?? Look at some
# .copy() makes the subset an independent frame, so adding the column below
# does not write to a slice of df_classified (SettingWithCopyWarning).
df_critical = df_classified.query('critical_event in @true and at_risk_or_issues == False').copy()
# Skip the row-wise apply when nothing matched, so no column is assigned from
# an empty apply result.
if df_critical.shape[0] != 0:
    df_critical['signal_sock'] = df_critical.apply(signal_sock, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
df_critical

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
6458,AC000W002497208,2018-05-02,True,low O2,1,False,165 days,20171118,sock off


In [65]:
# pickle.dump(df_classified, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_classified_jun_28.p', 'wb'))
# pickle.dump(df_critical, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_critical_jun_28.p', 'wb'))
# pickle.dump(df_classified_no_reg, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/df_classified_no_reg_jul_11.p','wb'))

In [42]:
df_classified[df_classified.dsn == 'AC000W002432785']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
1486,AC000W002432785,2017-12-22,"True, low baseline",low O2,1,True,151 days,20170724


## Breakdown of classifications: `**`, `*`, and `?` cases

In [56]:
# ** cases
# Not at-risk, last 10 minutes not 'Good vitals', not an instantaneous
# classification, and the signal/sock result is neither 'sock off' nor
# 'data cut off'.
df_critical.loc[
    lambda events: (
        (events.at_risk_or_issues == False)
        & (events.last_10_minutes != 'Good vitals')
        & ~events.critical_event.isin(['True, instantaneous', 'True, many instantaneous'])
        & ~events.signal_sock.isin(['sock off', 'data cut off'])
    )
]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
4678,AC000W001105945,2017-08-20,"True, low baseline",low O2,0,False,105 days,20170507,signal lost before sock off
3821,AC000W001052267,2017-10-20,True,low HR,1,False,9 days,20171011,signal lost before sock off
5015,AC000W001060959,2018-02-12,"True, low baseline",low O2,0,False,251 days,20170606,signal lost before sock off
5508,AC000W001099202,2018-02-22,"True, low baseline",low O2,0,False,209 days,20170728,signal lost before sock off
4894,AC000W001134190,2018-05-21,"True, low baseline",low O2,0,False,312 days,20170713,signal lost before sock off
4579,AC000W001137082,2018-06-07,"True, low baseline",low O2,0,False,323 days,20170719,unknown
4096,AC000W002663450,2018-07-10,"True, low baseline",low O2,0,False,154 days,20180206,signal lost before sock off
5798,AC000W001110174,2018-07-19,True,low O2,1,False,262 days,20171030,signal lost before sock off
5774,AC000W001150173,2018-08-16,True,low O2,0,False,15 days,20180801,signal lost before sock off
2207,AC000W002665726,2018-09-16,True,low O2,0,False,207 days,20180221,signal lost before sock off


In [57]:
# * cases
# Not at-risk, last 10 minutes not 'Good vitals', not 'True, instantaneous',
# and the signal/sock result is none of 'signal lost before sock off',
# 'unknown' or 'battery died'.
df_critical.loc[
    lambda events: (
        (events.at_risk_or_issues == False)
        & (events.last_10_minutes != 'Good vitals')
        & (events.critical_event != 'True, instantaneous')
        & ~events.signal_sock.isin(['signal lost before sock off', 'unknown', 'battery died'])
    )
]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
6821,AC000W001178921,2017-04-07,True,low O2,0,False,1 days,20170406,sock off
7428,AC000W001042848,2017-04-30,"True, low baseline",low O2,0,False,7 days,20170423,sock off
756,AC000W001053602,2017-07-25,"True, low baseline",low O2,0,False,8 days,20170717,sock off
2649,AC000W001132720,2017-08-30,"True, low baseline",low O2,0,False,9 days,20170821,sock off
7619,AC000W001057901,2017-10-08,"True, low baseline",low O2,0,False,187 days,20170404,data cut off
7415,AC000W001115978,2017-11-27,"True, low baseline",low O2,1,False,55 days,20171003,sock off
8227,AC000W001045152,2017-12-22,True,low O2,0,False,2 days,20171220,sock off
4338,AC000W001102134,2018-01-11,True,low O2,0,False,263 days,20170423,sock off
4531,AC000W001152020,2018-01-17,True,high HR,1,False,0 days,20180117,data cut off
1938,AC000W001041852,2018-02-26,"True, low baseline",low O2,1,False,324 days,20170408,sock off


In [58]:
# ? cases
# Not at-risk, and either:
#   - 'True, instantaneous' without 'Good vitals' in the last 10 minutes and
#     without a 'sock off' result, or
#   - 'True, many instantaneous' (regardless of the other columns).
df_critical.loc[
    lambda events: (events.at_risk_or_issues == False)
    & (
        (
            (events.last_10_minutes != 'Good vitals')
            & (events.critical_event == 'True, instantaneous')
            & (events.signal_sock != 'sock off')
        )
        | (events.critical_event == 'True, many instantaneous')
    )
]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
8034,AC000W001181803,2018-03-22,"True, many instantaneous",Good vitals,1,False,237 days,20170728,signal lost before sock off
6202,AC000W001131035,2018-07-26,"True, many instantaneous",Good vitals,1,False,56 days,20180531,sock off
1804,AC000W001072954,2018-08-01,"True, many instantaneous",Good vitals,1,False,301 days,20171004,unknown


In [29]:
# nothing..

# df_critical[(df_critical.at_risk_or_issues == False) &
#             (df_critical.critical_event == 'True, instantaneous') &
#             (df_critical.signal_sock == 'sock off')]

In [239]:
# Cases where there was never 24 hours of base state 7 but there was "continued use" 
    # (maybe 7 or more days; or the span of time it was used)
    # and monitoring was just turned off most or all of the time
    # Likely indicator that the baby had some pre-existing condition => not going to be a SUID case

In [14]:
# At-risk / issue rows that are still labelled as a true critical event and
# whose last 10 minutes are not 'Good vitals'.
# NOTE(review): `@true` resolves to whichever `true` list was assigned most
# recently in the kernel (two different lists are defined in this notebook);
# confirm which one is intended before relying on this table.
df_classified.query("at_risk_or_issues == True and critical_event in @true and last_10_minutes != 'Good vitals'")

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
790,AC000W001084337,2017-02-25,True,low O2,0,True,27 days,20170129
2869,AC000W000424430,2017-02-27,"True, low baseline",low O2,0,True,10 days,20170217
3385,AC000W001124472,2017-03-16,"True, low baseline",low O2,0,True,28 days,20170216
7174,AC000W001123812,2017-03-22,"True, low baseline",low O2,0,True,5 days,20170317
8123,AC000W001181483,2017-03-22,"True, low baseline",low O2,0,True,12 days,20170310
6427,AC000W001124776,2017-04-02,"True, low baseline",low O2,0,True,30 days,20170303
7617,AC000W001123919,2017-04-15,True,low O2,1,True,3 days,20170412
3584,AC000W001038945,2017-04-15,"True, low baseline",low O2,1,True,3 days,20170412
7944,AC000W001122185,2017-04-26,"True, low baseline",low O2,0,True,9 days,20170417
8063,AC000W001042935,2017-05-01,"True, low baseline",low O2,0,True,3 days,20170428


2-second data that is still needed

In [17]:
df_critical[df_critical.signal_sock == 'Need data']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock


In [231]:
'AC000W001183543' in v2_devices

True

### Get 2 second data (find what folder the data is in)

In [None]:
cp BinaryMask9/20180804/AC000W001110671* ~/Documents/Owlet-code/16k_round_2/16000_2sec/20180804/

In [678]:
# for row in df_critical.itertuples():
#     print(row.dsn, row.date)

In [43]:
def get_all_dsns2(all_file_paths):
    '''Return the set of unique dsns found in the given file paths.

    Nothing is pickled here; the caller decides whether to persist the result.

    The dsn is taken from a fixed position in each path: the 15 characters
    that come right before the last 7 characters (``path[-22:-7]``), e.g.
    ``.../AC000W001110671.csv.gz`` -> ``AC000W001110671``. Paths with a
    different suffix length or dsn length will yield a wrong slice.

    Parameters
    ----------
    all_file_paths : iterable of str
        File paths whose names end with the dsn followed by a 7-character suffix.

    Returns
    -------
    set of str
        The distinct dsn slices, one per unique value.
    '''
    return {path[-22:-7] for path in all_file_paths}

In [40]:
folder_dsns = {}

In [42]:
# binarymask9_files = glob('/Volumes/My Passport/BinaryMask9/*/*.csv.gz')
# binarymask10_files = glob('/Volumes/My Passport/BinaryMask10/*/*.csv.gz')
# binarymask11_files = glob('/Volumes/My Passport/BinaryMask11/*/*.csv.gz')
# binarymask12_files = glob('/Volumes/My Passport/BinaryMask12/*/*.csv.gz')
# binarymask13_files = glob('/Volumes/My Passport/BinaryMask13/*/*.csv.gz')
# binarymask14_files = glob('/Volumes/My Passport/BinaryMask14/*/*.csv.gz')
# binarymask15_files = glob('/Volumes/My Passport/BinaryMask15/*/*.csv.gz')
# binarymask16_files = glob('/Volumes/My Passport/BinaryMask16/*/*.csv.gz')

In [45]:
# folder_dsns['BinaryMask9'] = get_all_dsns2(binarymask9_files)
# folder_dsns['BinaryMask10'] = get_all_dsns2(binarymask10_files)
# folder_dsns['BinaryMask11'] = get_all_dsns2(binarymask11_files)
# folder_dsns['BinaryMask12'] = get_all_dsns2(binarymask12_files)
# folder_dsns['BinaryMask13'] = get_all_dsns2(binarymask13_files)
# folder_dsns['BinaryMask14'] = get_all_dsns2(binarymask14_files)
# folder_dsns['BinaryMask15'] = get_all_dsns2(binarymask15_files)
# folder_dsns['BinaryMask16'] = get_all_dsns2(binarymask16_files)

In [47]:
# pickle.dump(folder_dsns, open('folder_dsns.p', 'wb'))

In [67]:
# Print, for every critical-event row, each BinaryMask folder whose dsn
# collection contains that row's dsn; `count` tallies the total matches.
# The folder list does not change per row, so it is built once up front.
folders_ = [f'BinaryMask{mask_number}' for mask_number in range(9, 17)]
count = 0
for row in df_critical.itertuples():
    for f in folders_:
        # folder_dsns[f] is the dsn collection recorded for folder f.
        if row.dsn in folder_dsns[f]:
            count += 1
            print(f)

BinaryMask14
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask14
BinaryMask16
BinaryMask10
BinaryMask15
BinaryMask15
BinaryMask11
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask16
BinaryMask13
BinaryMask15
BinaryMask13
BinaryMask13
BinaryMask11
BinaryMask9
BinaryMask10
BinaryMask15
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask15
BinaryMask16
BinaryMask14
BinaryMask13
BinaryMask9
BinaryMask15
BinaryMask11
BinaryMask14
BinaryMask10
BinaryMask10
BinaryMask14
BinaryMask9
BinaryMask14
BinaryMask12
BinaryMask15
BinaryMask16
BinaryMask16
BinaryMask9
BinaryMask12
BinaryMask15
BinaryMask9
BinaryMask14
BinaryMask14
BinaryMask9
BinaryMask13
BinaryMask13
BinaryMask11
BinaryMask9
BinaryMask16
BinaryMask15
BinaryMask14
BinaryMask11
BinaryMask15
BinaryMask10
BinaryMask11
BinaryMask12
BinaryMask14
BinaryMask10
BinaryMask11
BinaryMask16
BinaryMask14
BinaryMask10
BinaryMask14
BinaryMask12
BinaryMask12
BinaryMask14
BinaryMask10
BinaryMask9
BinaryMask16
BinaryMask14
BinaryMask10
BinaryMask9
BinaryMask1

In [2]:
# Reload the folder -> dsns mapping saved earlier; the context manager closes
# the file once the pickle has been read.
with open('folder_dsns.p', 'rb') as folder_dsns_file:
    folder_dsns = pickle.load(folder_dsns_file)

In [23]:
# Show, for one dsn, whether each BinaryMask folder (9 through 16) lists it.
dsn = 'AC000W003420495'
for mask_number in range(9, 17):
    print(str(mask_number), dsn in folder_dsns[f'BinaryMask{mask_number}'])

9 False
10 False
11 False
12 False
13 True
14 False
15 False
16 False


In [20]:
AC000W002441380	2018-08-20
cp TwoSecondSockMask13/AC000W003420495/AC000W003420495_20181110* ~/Documents/Owlet-code/16k_round_2/16000_2sec/20181110/AC000W003420495.csv.gz
# cp BinaryMask13/20180523/AC000W001180258* ~/Documents/Owlet-code/16k_round_2/16000_2sec/20180523/
# cp BinaryMask15/20180117/AC000W001152020* ~/Documents/Owlet-code/16k_round_2/16000_2sec/20180117/


In [None]:
cp Parsed16000/20170423/AC000W001051578* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170423/