In [6]:
import getpass
import os
import pickle
import statistics
from datetime import date, datetime, timedelta
from glob import glob
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import sqlalchemy as sql
from tqdm import tqdm_notebook

In [None]:
all_files = glob('10minData/Mask*/*') # Substitute actual folder names
def get_all_dsns(all_files):
    '''Pickle the set of unique dsns found in a list of rollup file paths.

    The dsn is taken to be the first 15 characters of each file *name*.
    The paths come from a glob that includes directory components, so the
    directory part is stripped before slicing; slicing the raw path would
    return the folder prefix instead of the dsn.

    Parameters
    ----------
    all_files : iterable of str
        Paths to rollup files whose file names start with the dsn.

    Side effects
    ------------
    Writes the set to ``rollups_dsns.p`` under the notebook-level ``folder``.
    '''
    unique_dsns = {os.path.basename(path)[:15] for path in all_files}

    # NOTE(review): `folder` is a notebook-level variable; it must be set
    # before this function is called.
    with open(f"/Users/brodriguez/Documents/Owlet-code/{folder}/rollups_dsns.p", "wb") as out_file:
        pickle.dump(unique_dsns, out_file)
    
    
    
def find_critical_events(dsn_list, location_data, baby_info, dsns_1_bday, size_of_gap=pd.Timedelta(21,'D'), low_HR_thres=60, high_HR_thres=220, O2_thres=70, valid_thres=.4):
    '''Find all last use cases and classify critical events.

    For every dsn in `dsn_list` the ten-minute rollup frame is read from a
    msgpack file, rows with valid_count > 2 are kept, and each day returned
    by find_last_days is classified with possible_flatline / last_vitals_2.

    Parameters:
        dsn_list: iterable of dsns to process.
        location_data: frame with a `dsn` column, filtered per dsn and passed to in_US.
        baby_info: frame with a `dsn` column, filtered per dsn and passed to baby_age / find_last_days.
        dsns_1_bday: forwarded unchanged to find_last_days.
        size_of_gap: accepted but not used anywhere in this body.
        low_HR_thres, high_HR_thres, O2_thres, valid_thres: forwarded to possible_flatline.

    Returns:
        DataFrame with columns dsn, date, critical_event, last_10_minutes, cc,
        at_risk_or_issues, baby_age, birthday; for each (dsn, birthday) pair only
        the row with the latest date is kept.

    NOTE(review): a later cell in this notebook defines find_critical_events again
    (reading from SQL); on a top-to-bottom run that definition replaces this one.
    NOTE(review): find_last_days and two_weeks_before_received are not defined in
    the visible part of the notebook, and baby_age / in_US are called here with one
    argument fewer than the definitions visible further down accept - confirm
    before re-running this version.
    '''
    _errors = []  # never read in this function
    count_total = 0  # number of last-use days examined
    count = 0  # number of those where the age is in [0, 365) days
    classifications = []
    cc_dsns = pickle.load(open('cc_dsns.p', 'rb')) # dsns with connected care
    for dsn in tqdm_notebook(dsn_list):
        # sorted/duplicate timestamps have been dropped (rollup algorithm)

        # not pickle anymore
        # NOTE(review): the path contains a literal '?'; it is passed as-is to read_msgpack (no glob expansion here)
        df_all = pd.read_msgpack(f'TenMindata/Mask?/{dsn}.TenMinutemask.msgpack') # get details when the rollups are done
#             df_all = pickle.load(open(f'/Volumes/baileyWD/{folder}/16000_dfs/{dsn}_df.p', 'rb')) 
        df_all = df_all.drop_duplicates()
        at_risk_or_hardware = at_risk(df_all)
        valid_df = df_all.loc[df_all.valid_count > 2] # > 2 so we don't miss possible cases, but also dont base critical event on 1 reading
        dsn_location = location_data.loc[location_data.dsn == dsn] 
        dsn_baby_info = baby_info.loc[baby_info.dsn == dsn] 

        # df could be empty! if so, skip it
        if valid_df.shape[0] != 0:
            gaps = find_last_days(valid_df, baby_info.loc[baby_info.dsn == dsn],dsns_1_bday)
            # cc is 1 when the dsn is in the connected-care set, otherwise 0
            if dsn in cc_dsns:
                cc = 1
            else:
                cc = 0
            for day in gaps:
                # Calculate age of baby and dont check for critical events if they are older than 1
                age, bday = baby_age(day, dsn_baby_info)
                count_total += 1
                if (age < pd.Timedelta(days=365)) and (age >= pd.Timedelta(days=0)):
                    count += 1
                    # if day is w/in 2 weeks of when Tanner did the rollups - disregard. I got the data Jan 30
                    # (the comparison below is done on the string form of the timestamp)
                    if (str(day) <= two_weeks_before_received) & (str(day) >= '2017-01-31 00:00:00'):

                        # Check if in the US
                        if in_US(day, dsn_location):
                            df_day = get_df_day(valid_df, day)
                            flatline = possible_flatline(df_day, low_HR_thres, high_HR_thres, O2_thres, valid_thres)
                            vitals = last_vitals_2(df_day) 
                            classifications.append((dsn, day.date(), flatline, vitals, cc, at_risk_or_hardware, age, bday))

                elif age >= pd.Timedelta(days=365):
                    # add row for older babies so we know the actual last day of use
                    # (critical_event is the boolean False here, not a string)
                    classifications.append((dsn, day.date(), False, 'Good vitals', cc, at_risk_or_hardware, age, bday))
        
    print('total babies with last day', count_total)
    print('total babies < 1 on last day', count)
    df_columns = ['dsn', 'date', 'critical_event', 'last_10_minutes', 'cc', 'at_risk_or_issues', 'baby_age', 'birthday']
    df_classified = pd.DataFrame(classifications, columns=df_columns)
    
    # Get rid of multiple last days for individual babies
    # (sort by date, then keep only the latest row per dsn/birthday pair)
    df_classified.sort_values(by='date', inplace=True)
    df_classified.drop_duplicates(subset=['dsn', 'birthday'], keep='last', inplace=True)
    
    return df_classified


# Base state is now a bit mask.. 
def at_risk(df):
    '''Report whether the frame lacks 24 hours of base 7 data.

    Returns True when the running total of `base_state_7` never reaches 144
    (144 ten-minute rows = 24 hours), which could mean bad hardware or a baby
    with a preexisting condition; returns False otherwise.
    '''
    # df.BaseStateMask (want all rows where the bit for base 7 is true...)
    running_total = df.base_state_7.cumsum()
    reached_full_day = running_total >= 144
    return not reached_full_day.any()

## Using big mama

### Functions

Pickle list of dsns in this round

In [None]:
# Load dsns from each mask
# combine them into 1 list
# pickle the list

Need list of dsns that actually have 1st reported bday in the correct range

Pickle the inclusion dsns (the only case not yet taken into account should be dsns with no data, I think).

    *Remember that we are only doing babies w/ bdays in a certain range and that are the first to use the device*

In [None]:
#TODO intersect inclusions with dsns with 1st bday in correct range
# NOTE(review): `list_of_dsns` and `folder` are not defined above this cell in the
# visible notebook; they must already exist in the kernel for this cell to run.
# NOTE: pickle.load can execute code stored in the file - only load trusted pickles.
with open('all_dsns_with_info.p', 'rb') as info_file:
    dsns_with_info = pickle.load(info_file) # includes cc dsns
with open('/Users/brodriguez/Documents/Owlet-code/V2_monitoring_data/v2_devices.p', 'rb') as devices_file:
    v2_devices = pickle.load(devices_file)
inclusion_dsns = set(list_of_dsns).intersection(dsns_with_info).intersection(v2_devices)

# Drop devices that reported more than 5 birthdays
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_more_than_5_bdays.p', 'rb') as bdays_file:
    more_than_5_bdays = set(pickle.load(bdays_file)).intersection(inclusion_dsns)
inclusion_dsns -= more_than_5_bdays

# Closing the handle guarantees the pickle is fully written before it is read back
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p', 'wb') as out_file:
    pickle.dump(inclusion_dsns, out_file)


Get locations for inclusion dsns

In [None]:
location_data = pd.read_csv('/Users/brodriguez/Documents/Owlet-code/GPS_locations_Mar_2019.csv', compression='gzip')
def dsn_in_16000(x):
    '''Return True when the row's dsn is one of the inclusion dsns.'''
    return x['dsn'] in inclusion_dsns

# Location info for only the 16000 dsns (otherwise the df is too big and it slows everything down)
# Vectorised membership test: same mask as applying dsn_in_16000 to every row,
# without a Python-level call per row.
in_16000 = location_data['dsn'].isin(inclusion_dsns)
location_data = location_data.loc[in_16000].sort_values(by='created_at')

# Closing the handle guarantees the pickle is fully written before it is read back
with open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p', 'wb') as out_file:
    pickle.dump(location_data, out_file)

Find critical events (load 10-minute rollups by querying big mama)

In [99]:
# adjust to changes in accessing data
          # changes in column names
          # changes in column types (base state..not sure if others have changed: need to test along the way)
# valid_count == ValidSamples
# index is not timestamp..should I make it the index or change the code?
def binary(x):
    '''Return the positions of the 1-bits of an int, counting from the least significant bit (position 0).'''
    # bin() gives e.g. '0b101'; drop the '0b' prefix and flip so that index 0 is the lowest bit
    lsb_first = np.array(list(reversed(bin(x)[2:])))
    set_positions = np.where(lsb_first == "1")[0]
    return set_positions

def at_risk(df):
    '''Return False only when each of the first 144 rows has bit 7 set in BaseStateMask.

    Frames with fewer than 144 rows are always reported as at risk (True).

    NOTE(review): only the first 144 rows are inspected, so a single row without
    bit 7 among them yields True - confirm this is the intended definition of
    "24 hours of base 7 data".
    '''
    required_rows = 144
    if df.shape[0] < required_rows:
        return True
    rows_with_base_7 = sum(
        1
        for row in df.iloc[:required_rows].itertuples()
        if 7 in binary(int(row.BaseStateMask))
    )
    return rows_with_base_7 < required_rows
    
# find_last_days should now only return 1 day (since we only care about the first user for a given device)
def find_last_day(dsn, df_all, dsn_bdays, dsns_1_bday):
    '''Return the last TimeWindowStartTime attributable to the device's first baby.

    Parameters
    ----------
    dsn : device serial number being processed.
    df_all : frame with a `TimeWindowStartTime` column; the last row is treated
        as the most recent one, so the caller is expected to pass it sorted.
    dsn_bdays : frame of birthday reports for this dsn with a `created_at` column.
    dsns_1_bday : container of dsns known to have a single unique birthday.

    Returns
    -------
    The last `TimeWindowStartTime` value, or -1 when there is no usable data
    (empty `df_all`, or no rows between the first two birthday reports).
    '''
    if df_all.shape[0] == 0:
        # nothing to pick a last day from
        return -1

    report_times = list(dsn_bdays.sort_values(by='created_at').created_at)
    # With zero or one birthday report there is no second baby to bound the
    # window, so the very last day of use is the answer.
    if (dsn in dsns_1_bday) or (len(report_times) <= 1):
        return df_all.TimeWindowStartTime.iloc[-1]

    # take last use before second birthday was reported
    first_report, second_report = report_times[0], report_times[1]
    first_baby = df_all.loc[(df_all.TimeWindowStartTime >= first_report) & (df_all.TimeWindowStartTime < second_report)]
    if first_baby.shape[0] == 0:
        # No data between 1st 2 reported bdays
        return -1
    return first_baby.TimeWindowStartTime.iloc[-1]
    

def baby_age(day, baby_df, no_reg):
    '''Age of the baby on `day`, based on the latest birthday reported on or before that day.

    Returns a tuple (age, birthday string):
      * no_reg truthy                      -> (1 day, '')
      * no birthday reported before `day`  -> (-1 day, '')
      * otherwise                          -> (day's date minus the birthday, birthDate value)
    '''
    # Still would need to modify the birthday data to have a column with the reported date
    if no_reg:
        return pd.Timedelta(days=1), ''

    # What was reported before the last use (compared against the string form of `day`)
    reported_before = baby_df.loc[baby_df.created_at <= str(day)]
    if reported_before.shape[0] == 0:
        return pd.Timedelta(days=-1), ''

    # Use the last reported bday
    latest_report = max(reported_before.created_at.values)
    is_latest = reported_before.created_at == latest_report
    bday = reported_before.loc[is_latest, 'birthDate'].values[0]

    # birthDate is sliced as YYYYMMDD
    born = date(int(bday[:4]), int(bday[4:6]), int(bday[6:]))
    return day.date() - born, bday

def in_US(day, location_df, no_reg):
    '''Tell whether the most recent location recorded on or before `day` is 'US'.

    A truthy `no_reg` short-circuits to True. With no location row on or before
    `day` the location is unknown and the answer is False.
    '''
    if no_reg:
        return True

    known_so_far = location_df.loc[location_df.created_at <= str(day)]
    if known_so_far.shape[0] == 0:
        # Don't know the location
        return False

    # last row is taken as the most recent one
    return not (known_so_far.cc.iloc[-1] != 'US')

def get_df_day(df_all, day):
    '''Select the rows whose TimeWindowStartTime lies in the 2 hours up to and including `day`.

    Both ends of the window are inclusive; if fewer rows exist in that window,
    whatever is there is returned.
    '''
    window_start = day - pd.Timedelta(hours=2)
    in_window = df_all.TimeWindowStartTime.between(window_start, day)  # .TimeWindowStartTime or .FirstReadingTime
    return df_all[in_window]


def possible_flatline(df, low_HR_thres, high_HR_thres, low_O2_thres, valid_thres):
    '''Find critical events (high or low heart rate or low oxygen) in the data given.

    Parameters
    ----------
    df : frame with columns ValidSamples, TotalSamples, HeartRateRawMin,
        HeartRateRawMax, OxygenRawMin and OxygenAvgMin.
    low_HR_thres : HeartRateRawMin below this counts as low HR (only together with OxygenRawMin < 90).
    high_HR_thres : HeartRateRawMax above this counts as high HR.
    low_O2_thres : OxygenRawMin below this counts as low oxygen.
    valid_thres : minimum ValidSamples / TotalSamples ratio for a row to count.

    Returns
    -------
    str : 'False' when no valid row is critical, otherwise one of
        'True, many instantaneous', 'True, instantaneous', 'True, low baseline' or 'True'.
    '''
    valid_percent = df.ValidSamples / df.TotalSamples

    # We don't alert for low HR unless O2 is also low.
    low_hr_with_low_o2 = (df.HeartRateRawMin < low_HR_thres) & (df.OxygenRawMin < 90)
    low_o2 = df.OxygenRawMin < low_O2_thres
    high_hr = df.HeartRateRawMax > high_HR_thres
    critical_vitals = low_hr_with_low_o2 | low_o2 | high_hr

    if not (critical_vitals & (valid_percent >= valid_thres)).any():
        return 'False'

    # Use the caller's high-HR threshold here as well, so that a high-HR event
    # is never re-labelled as an oxygen-only category when the threshold is not 220.
    no_high_hr = bool((df.HeartRateRawMax < high_HR_thres).all())

    # Low HR won't be valid if the O2 is not actually dropping
    if no_high_hr and bool((df.OxygenAvgMin >= 90).all()):
        # Check how many times oxygen_raw_min was below 60
        if (df.OxygenRawMin < 60).sum() >= 5:
            # For cases of Oxygen Noise Index (indicating bad hardware)
            return 'True, many instantaneous'
        return 'True, instantaneous'
    if no_high_hr and bool((df.OxygenAvgMin < 90).all()):
        # (df.OxygenAvgAvg.mean() < 93) this one makes nearly everything low baseline..
        #     meaning when they are fluctuating it still says low baseline, don't want that...
        # this category is just a "nice to know"
        return 'True, low baseline'
    return 'True'
    

def last_vitals_2(df):
    '''Label the vitals seen in the last rows of `df` that have ValidSamples > 0.

    Heart-rate minimum and oxygen minima are taken over the last three such
    rows (30 minutes or less); the heart-rate maximum over the last two only.
    Returns 'not valid' for an empty frame, otherwise 'low HR', 'high HR',
    'low O2' or 'Good vitals'.
    '''
    if df.shape[0] == 0:
        return 'not valid'

    recent = df.loc[df.ValidSamples > 0].iloc[-3:]  # last 30 min or less
    lowest_hr = recent.HeartRateAvgMin.min()
    highest_hr = recent.iloc[-2:].HeartRateAvgMax.max()  # dont want 30 min
    lowest_avg_o2 = recent.OxygenAvgMin.min()
    lowest_raw_o2 = recent.OxygenRawMin.min()

    if lowest_hr < 60 and lowest_avg_o2 < 80:  # TODO O2 threshold here may be too low
        return 'low HR'
    if highest_hr > 220:
        return 'high HR'
    # if o2 < 70 make extra low o2 category?
    if lowest_raw_o2 < 80 and lowest_avg_o2 < 85:  # Avg min? < 90?
        return 'low O2'
    return 'Good vitals'
    

def find_critical_events(dsn_list, location_data, baby_info, dsns_1_bday, conn, low_HR_thres=60, high_HR_thres=220, O2_thres=70, valid_thres=.4, no_reg=False):
    '''Find the last use of each device's first baby and classify critical events.

    For every dsn the ten-minute rollups are read from the `tenminsock` table,
    rows with ValidSamples > 2 are kept, and the last day of use (find_last_day)
    is classified from the two hours of data leading up to it.

    Parameters
    ----------
    dsn_list : iterable of dsns to process.
    location_data : frame with a `dsn` column; the rows of each dsn are passed to in_US.
    baby_info : frame with a `dsn` column; the rows of each dsn are passed to
        find_last_day and baby_age.
    dsns_1_bday : forwarded to find_last_day.
    conn : database connection handed to pandas.read_sql.
    low_HR_thres, high_HR_thres, O2_thres, valid_thres : forwarded to possible_flatline.
    no_reg : forwarded to baby_age and in_US.

    Returns
    -------
    DataFrame with columns dsn, date, critical_event, last_10_minutes, cc,
    at_risk_or_issues, baby_age, birthday. Babies under one year (and located
    in the US per in_US) get a classified row; babies of one year or more get a
    'False' / 'Good vitals' row recording the last day of use. Rows are not
    added when baby_age returns a negative age.
    '''
    count_total = 0  # devices for which a last day was found
    count = 0        # of those, babies aged [0, 365) days on that day
    classifications = []
    one_year = pd.Timedelta(days=365)
    newborn = pd.Timedelta(days=0)

    # dsns with connected care
    # NOTE: pickle.load can execute code stored in the file - only load trusted pickles.
    with open('cc_dsns.p', 'rb') as cc_file:
        cc_dsns = pickle.load(cc_file)

    for dsn in tqdm_notebook(dsn_list):
        # sorted/duplicate timestamps have been dropped (rollup algorithm)
        df_all = pd.read_sql('select * from tenminsock where dsn = %(d)s', conn, params={'d':dsn})
        df_all = df_all.drop_duplicates().sort_values(by='TimeWindowStartTime')

        at_risk_or_hardware = at_risk(df_all)
        # > 2 so we don't miss possible cases, but also dont base critical event on 1 reading
        valid_df = df_all.loc[df_all.ValidSamples > 2]

        # df could be empty! if so, skip it
        if valid_df.shape[0] == 0:
            continue

        dsn_location = location_data.loc[location_data.dsn == dsn]
        dsn_baby_info = baby_info.loc[baby_info.dsn == dsn]

        last_day = find_last_day(dsn, valid_df, dsn_baby_info, dsns_1_bday)
        if last_day != -1:
            # there is data for 1st baby
            cc = 1 if dsn in cc_dsns else 0

            # Calculate age of baby and dont check for critical events if they are older than 1
            age, bday = baby_age(last_day, dsn_baby_info, no_reg)
            count_total += 1
            if (age < one_year) and (age >= newborn):
                count += 1
                # Check if in the US
                if in_US(last_day, dsn_location, no_reg):
                    df_day = get_df_day(valid_df, last_day)
                    flatline = possible_flatline(df_day, low_HR_thres, high_HR_thres, O2_thres, valid_thres)
                    vitals = last_vitals_2(df_day)
                    classifications.append((dsn, last_day.date(), flatline, vitals, cc, at_risk_or_hardware, age, bday))
            elif age >= one_year:
                # add row for older babies so we know the actual last day of use
                classifications.append((dsn, last_day.date(), 'False', 'Good vitals', cc, at_risk_or_hardware, age, bday))

    print('total babies with last day', count_total)
    print('total babies < 1 on last day', count)
    df_columns = ['dsn', 'date', 'critical_event', 'last_10_minutes', 'cc', 'at_risk_or_issues', 'baby_age', 'birthday']
    df_classified = pd.DataFrame(classifications, columns=df_columns)

    return df_classified

Sock-off classifications: how do we access the 2-second data?

### Run algorithm

In [63]:
DB_USER = 'brodriguez'
# getpass does not echo what is typed, so the password never ends up in the saved cell output
DB_PASSWORD = getpass.getpass('Database password: ')
DB_CONN = 'localhost'
DB_NAME = 'owletsock'
# SQL login
# quote_plus escapes characters such as '@', ':' or '/' that would otherwise break the URL
s = f'mysql+pymysql://{DB_USER}:{quote_plus(DB_PASSWORD)}@{DB_CONN}/{DB_NAME}'
engine = sql.create_engine(s)
conn = engine.connect()

[password input redacted]


In [4]:
folder = '16k_round_2'

def _load_pickle(path):
    '''Load one pickle file and close the handle afterwards.

    NOTE: pickle.load can execute code stored in the file - only load trusted pickles.
    '''
    with open(path, 'rb') as handle:
        return pickle.load(handle)

inclusion_dsns = _load_pickle(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p')
location_data = _load_pickle(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p')
dsns_1_bday = _load_pickle(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_1_bday.p')
baby_info = _load_pickle('16k_reg_and_cc_bdays4.p')

In [95]:
# Keep only the dsns that also appear in baby_info
inclusion_dsns = inclusion_dsns & set(baby_info.dsn.values)

In [None]:
# Classify the last use of every inclusion dsn (default thresholds)
df_classified = find_critical_events(
    dsn_list=inclusion_dsns,
    location_data=location_data,
    baby_info=baby_info,
    dsns_1_bday=dsns_1_bday,
    conn=conn,
)

HBox(children=(IntProgress(value=0, max=10889), HTML(value='')))

In [60]:
# Release the database connection once the classification run is finished
conn.close()

In [88]:
# Display the classification table produced by find_critical_events
df_classified

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
0,AC000W001218386,2018-12-18,False,Good vitals,1,True,294 days,20180227


#### Once I have 2 second data:

In [None]:
#dont want to apply this to whole df, just to critical events (have to get 2sec data on computer before)
# NOTE(review): `signal_sock` is not defined in the visible part of the notebook.
true = ['True', 'True, low baseline', 'True, instantaneous', 'True, many instantaneous']
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of df_classified (avoids SettingWithCopyWarning).
df_critical = df_classified.query('critical_event in @true and at_risk_or_issues == False').copy()
if df_critical.shape[0] != 0:
    df_critical['signal_sock'] = df_critical.apply(signal_sock, axis=1)
else:
    # Keep the column present so the breakdown cells that filter on
    # df_critical.signal_sock still run when there are no critical events.
    df_critical['signal_sock'] = pd.Series(dtype=object)

## Breakdown of classifications: ** * ?

In [None]:
# ** cases
# not at risk, last vitals not good, event is neither instantaneous category,
# and signal_sock is neither 'sock off' nor 'data cut off'

df_critical[df_critical.at_risk_or_issues.eq(False) &
            df_critical.last_10_minutes.ne('Good vitals') &
            ~df_critical.critical_event.isin(['True, instantaneous', 'True, many instantaneous']) &
            ~df_critical.signal_sock.isin(['sock off', 'data cut off'])]

In [None]:
# * cases
# not at risk, last vitals not good, event is not 'True, instantaneous',
# and signal_sock is none of 'signal lost before sock off', 'unknown', 'battery died'

df_critical[df_critical.at_risk_or_issues.eq(False) &
            df_critical.last_10_minutes.ne('Good vitals') &
            df_critical.critical_event.ne('True, instantaneous') &
            ~df_critical.signal_sock.isin(['signal lost before sock off', 'unknown', 'battery died'])]

In [None]:
# ? cases
# not at risk and either
#   (last vitals not good, 'True, instantaneous' event, signal_sock not 'sock off')
#   or a 'True, many instantaneous' event

df_critical[df_critical.at_risk_or_issues.eq(False) &
            ((df_critical.last_10_minutes.ne('Good vitals') &
              df_critical.critical_event.eq('True, instantaneous') &
              df_critical.signal_sock.ne('sock off')) |
             df_critical.critical_event.eq('True, many instantaneous'))]

2 second data still needed:

In [None]:
# Critical events whose signal_sock label is 'Need data'
df_critical.loc[df_critical.signal_sock.eq('Need data')]