In [4]:
import os
import pickle
import statistics
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm_notebook
from datetime import date, datetime, timedelta

### *Before running flatline algorithm, look at the statistics for the 16,000 to understand what the data looks like

## Code to run in preparation for flatline algorithm

In [2]:
folder = '16k_round_2'
date_data_received = '20190520' #actually want date data was saved
two_weeks_before_received = '2019-05-06 00:00:00' # last use needs to be prior to 2 weeks of when data was received
column_names = pd.read_csv('column_names.txt').columns

In [54]:
# all_file_paths = glob(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_day_rollups/*/*.p')
# additional_paths = glob(f'/Volumes/Seagate Backup Plus Drive/Work/{folder}/16000_day_rollups/*/*.p')
# all_file_paths += additional_paths
all_file_paths = sorted(all_file_paths)

In [5]:
# Modified for starting with 2 second data that is divided into folders containing data for 2000 dsns
def get_all_dsns(all_file_paths):
    '''Pickle a set of the unique dsns'''
    my_list = set([])
    for path in all_file_paths:
        my_list.add(path[-17:-2]) 
    
    pickle.dump(my_list, open(f"/Users/brodriguez/Documents/Owlet-code/{folder}/rollups_dsns.p", "wb"))
    
# def get_file_paths():
#     '''Gets a list of all file paths for every dsn'''
    
#     filepaths = glob(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_day_rollups/*/*.p')
#     return sorted(filepaths)

def get_dsn_files(dsn, all_file_paths):
    '''Get all files for a given dsn'''
    files = []
    for file in all_file_paths:
        if dsn in file:
            files.append(file)            
    return sorted(files)

def all_dsn_files(dsn_list, all_file_paths):
    '''Takes a list of dsns and creates a dictionary mapping the dsn to a 
            list of the days for which the dsn has data'''
    dictionary = {}
    for dsn in tqdm_notebook(dsn_list):
        all_files = get_dsn_files(dsn, all_file_paths)
        dictionary[dsn] = all_files
    return dictionary

def load_all_days(all_days):
    '''Loads the files in all_days and combines them into 1 dataframe'''
    load_dfs = [pickle.load(open(file, 'rb')) for file in all_days]
    if len(load_dfs) == 0:
        return pd.DataFrame()
    else:
        df_all = pd.concat(load_dfs)
#         df_all = clean_df(df_all)
        
    return df_all

# def clean_df(df):
#     '''Get rid of data on 2016-01-01'''
#     if df.shape[0] == 0:
#         return df
#     else:
#         return df.loc[(df.index > '2016-01-01 23:59:59') | (df.index < '2016-01-01 00:00:00')]

def pickle_full_dfs(dsn_list, all_files):
    '''For each dsn, pickle all the data'''
    for dsn in tqdm_notebook(dsn_list):
        df_all = load_all_days(all_files[dsn])
        pickle.dump(df_all, open(f'/Volumes/Seagate Backup Plus Drive/Work/{folder}/16000_dfs/{dsn}_df.p', 'wb')) 

Get list of all dsns

In [55]:
get_all_dsns(all_file_paths) # pickles a list of the dsns for this 16000
list_of_dsns = pickle.load(open(f"/Users/brodriguez/Documents/Owlet-code/{folder}/rollups_dsns.p", "rb"))

Get mapping of dsn to filenames

In [4]:
# dictionary = all_dsn_files(list_of_dsns, all_file_paths)
# pickle.dump(dictionary, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dictionary.p','wb'))
dictionary = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dictionary.p', 'rb'))

Pickle dataframe for each dsn

In [6]:
pickle_full_dfs(list_of_dsns, dictionary) 

HBox(children=(IntProgress(value=0, max=15992), HTML(value='')))




Get updated cc and registration data then pickle

In [6]:
def save_bday_info(registration, bday_gender, cc_baby_info):
    def clean_bdays(x):
        return x[:10].replace('-','')
    # combine bdays from cc & baby profile
    cc_bdays = cc_baby_info[['dsn','birthDate']]
    cc_bdays.birthDate = cc_bdays.birthDate.apply(clean_bdays)
    cc_bdays = cc_bdays.loc[cc_bdays.birthDate != '']

    registration_bdays = bday_gender.loc[(bday_gender.property_name == 'BIRTHDATE')][['dsn','val_string']]
    registration_bdays.columns = ['dsn','birthDate']
    all_bdays = cc_bdays.append(registration_bdays)
    all_bdays = all_bdays.drop_duplicates()
    all_bdays = all_bdays[(all_bdays.birthDate > '20160101') & (all_bdays.birthDate < '20190506')]

    pickle.dump(all_bdays, open('16k_reg_and_cc_bdays.p','wb'))

    # Pickle dsns that have cc or both reg+bday info
    dsns_with_info = set(list(set(registration_bdays.dsn.values).intersection(registration.dsn.values)) + list(cc_bdays.dsn.values))
    pickle.dump(dsns_with_info, open('all_dsns_with_info.p','wb'))

In [12]:
registration = pd.read_csv('/Users/brodriguez/Documents/parsed_registration_pre_20190113.csv', compression='gzip', index_col=0)
bday_gender = pd.read_csv('/Users/brodriguez/Documents/parsed_datapoint_pre_20190113.csv', compression='gzip', index_col=0)
cc_baby_info = pickle.load(open("cc_baby_info_may30_2019.p", "rb"))

save_bday_info(registration, bday_gender, cc_baby_info)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Load birthday dataframe (contains valid birthdays from registration data and connected care data)

In [11]:
baby_info = pickle.load(open('16k_reg_and_cc_bdays.p','rb')) # Birthdays found in registration and connected care data

In [7]:
def count_bdays(df):
    '''Save a dictionary mapping each dsn to the set of reported birthdays'''
    dsn_bday_values = {}
    for dsn in tqdm_notebook(list(df.dsn.value_counts().index)):
        bdays = df.loc[(df.dsn == dsn)].birthDate.values
        dsn_bday_values[dsn] = set(bdays)
        
    return dsn_bday_values
        
def dsns_with_1_bday(dsn_bday_values):
    '''Save a list of dsns that have only 1 reported birthday'''
    dsns_1_bday = []
    for dsn, bdays in tqdm_notebook(dsn_bday_values.items()):
        if len(bdays) == 1:
            dsns_1_bday.append(dsn)
    return dsns_1_bday
            
def dsns_with_many_bdays(dsn_bday_values):
    '''Save a list of dsns with more than 5 reported birthdays'''
    # With more than 5 reported birthdays, I've noticed many are devices being used in the office
    dsns_many_bdays = []
    for dsn, bdays in tqdm_notebook(dsn_bday_values.items()):
        if len(bdays) > 5:
            dsns_many_bdays.append(dsn)
    return dsns_many_bdays

Save list of dsns with 1 bday and dsns with > 5 bdays (very few have more than 5 and those that do seem to be Owlet testing devices)

In [16]:
dsn_bday_values = count_bdays(baby_info)
dsns_1_bday = dsns_with_1_bday(dsn_bday_values)
dsns_many_bdays = dsns_with_many_bdays(dsn_bday_values)

pickle.dump(dsns_1_bday, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_1_bday.p', 'wb'))
pickle.dump(dsns_many_bdays, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_more_than_5_bdays.p', 'wb'))

HBox(children=(IntProgress(value=0, max=240924), HTML(value='')))




HBox(children=(IntProgress(value=0, max=240924), HTML(value='')))




List of dsns to use in flatline algorithm

In [14]:
dsns_with_info = pickle.load(open('all_dsns_with_info.p','rb')) # includes cc dsns
v2_devices = pickle.load(open('/Users/brodriguez/Documents/Owlet-code/V2_monitoring_data/v2_devices.p', 'rb'))
inclusion_dsns = set(list_of_dsns).intersection(dsns_with_info).intersection(v2_devices)

more_than_5_bdays = set(pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_more_than_5_bdays.p', 'rb'))).intersection(inclusion_dsns)
for dsn in more_than_5_bdays:
    inclusion_dsns.remove(dsn)
    
pickle.dump(inclusion_dsns, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p', 'wb'))

Get locations for inclusion dsns

In [20]:
location_data = pd.read_csv('/Users/brodriguez/Documents/GPS_locations.csv', compression='gzip', index_col=0)
def dsn_in_16000(x):
    if x['dsn'] in inclusion_dsns:
        return True
    else:
        return False

# Location info for only the 16000 dsns (otherwise the df is too big and it slows everything down)
in_16000 = location_data.apply(dsn_in_16000, axis=1)
location_data = location_data.loc[in_16000].sort_values(by='created_at')

pickle.dump(location_data, open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p', 'wb'))

  mask |= (ar1 == a)


# Flatline Algorithm

In [8]:
def get_df_day(df_all, day):
    '''Get 2 hours of data just prior to the time given'''
    # return the 2 hours just prior to the datetime (and include the datetime)
    # if there is not data in the 2 hours before, it will get what is there
    prior_2_hrs = day - pd.Timedelta(120,'m')
    return df_all[(df_all.index >= prior_2_hrs) & (df_all.index <= day)]

def find_gaps_from_df(df_all, size_of_gap, dsns_1_bday):
    '''Find the date just before a gap in data of the size given'''
    # if there is only 1 unique bday: return only very last day of use, else find 3 week gaps
    if df_all.dsn[0] in dsns_1_bday: 
        return [df_all.index[-1]]
    else:
        # fillna(size_of_gap) includes the very last day of use as a day before a gap
        gaps = df_all.index.to_series(keep_tz=True).diff(-1).fillna(size_of_gap).abs().ge(size_of_gap)
        day_before = df_all[gaps]
        return list(day_before.index)

def possible_flatline(df, low_HR_thres, high_HR_thres, low_O2_thres, valid_thres):
    '''Find critical events (high or low heart rate or low oxygen) in the data given'''
    valid_percent = df.valid_count/df['count']
    
    # We don't alert for low HR unless O2 is also low.
    critical_vitals = ((df.heart_rate_raw_min < low_HR_thres) & (df.oxygen_raw_min < 90)) | (df.oxygen_raw_min < low_O2_thres) | (df.heart_rate_raw_max > high_HR_thres)
    critical_event = any(critical_vitals & (valid_percent >= valid_thres))

    if critical_event:
        return True
    else:
        return False

def last_vitals(df):
    '''Classify the last 10 minutes of use
            Low O2, High HR, Low HR, Good vitals, or high variance (high standard deviation in vitals)'''
    if df.shape[0] == 0:
        return 'not valid'
    elif df.shape[0] == 1:
        end_min_hr = df.iloc[-1].heart_rate_raw_min
        end_max_hr = df.iloc[-1].heart_rate_raw_max
        end_o2 = df.iloc[-1].oxygen_raw_min
    else:
        last_20_min = df.loc[df.valid_count > 0].iloc[-2:]
        end_min_hr = last_20_min.heart_rate_raw_mean.mean()
        end_max_hr = last_20_min.heart_rate_raw_mean.mean()
        end_o2 = last_20_min.oxygen_raw_mean.mean()
    
    if (df.iloc[-1].heart_rate_raw_std > 10) or (df.iloc[-1].oxygen_raw_std > 10):
        if (end_min_hr > 60) & (end_max_hr < 220) & (df.iloc[-1].oxygen_raw_min > 80):
            return 'Good vitals'
        elif end_o2 < 60:
            return 'low O2' 
        else:
            return 'high variance'
    # Check the last 20 minutes avgs vs last 10 minutes min (critical if either is outside threshold)
    elif (df.iloc[-1].heart_rate_raw_min <= 60) or (end_min_hr < 60):
        return 'low HR'
    elif (df.iloc[-1].heart_rate_raw_max >= 220) or (end_max_hr > 220):
        return 'high HR'
    elif ((df.iloc[-1].oxygen_raw_min < 80) & (df.iloc[-1].oxygen_raw_mean < 98)) or (end_o2 < 80): 
        return 'low O2'
    else:
        return 'Good vitals'
    
def at_risk(df):
    '''is there at least 24 hours of base 7 data
        If not this could be bad hardware or a baby that has a preexisting condition'''
    base_7 = df.base_state_7.cumsum()
    mask = base_7 >= 144 
    if df.loc[mask].shape[0] == 0:
        return True
    else:
        return False

def had_cc(dsn, cc_dsns):
    '''Return True if the parent was using connected care'''
    if dsn in cc_dsns:
        return 1
    else:
        return 0
        
def in_US(day, location_df):
    '''Return True if device was used in the US'''
    location = location_df.loc[location_df.created_at <= str(day)]
    if location.shape[0] == 0:
        # Don't know the location
        return False
    elif location.cc.iloc[-1] != 'US':
        return False
    else:
        return True
    
def baby_age(day, baby_df):
    '''Return the age of the baby on the day of last use and the birthday'''
    birthdays = baby_df.loc[(baby_df.birthDate <= str(day.date()).replace('-',''))]
    if birthdays.shape[0] == 0:
        return pd.Timedelta(days=-1), ''
    else:
        bday = max(birthdays.birthDate.values) # Latest bday that is before the incident
        age = day.date() - date(int(bday[:4]), int(bday[4:6]), int(bday[6:]))# diff between birthday and day of incident
        return age, bday   
    
def find_critical_events(dsn_list, location_data, baby_info, dsns_1_bday, size_of_gap=pd.Timedelta(21,'D'), low_HR_thres=60, high_HR_thres=220, O2_thres=70, valid_thres=.4):
    '''Find all last use cases and classify critical events'''
    count_total = 0
    count = 0
    classifications = []
    cc_dsns = pickle.load(open('cc_dsns.p', 'rb')) # dsns with connected care
    for dsn in tqdm_notebook(dsn_list):
        df_all = pickle.load(open(f'/Volumes/Seagate Backup Plus Drive/Work/{folder}/16000_dfs/{dsn}_df.p', 'rb')) # sorted/duplicate timestamps have been dropped (rollup algorithm)
        at_risk_or_hardware = at_risk(df_all)
        valid_df = df_all.loc[df_all.valid_count > 2] # > 2 so there is a standard deviation for the vitals
        dsn_location = location_data.loc[location_data.dsn == dsn] 
        dsn_baby_info = baby_info.loc[baby_info.dsn == dsn] 
        
        # df could be empty! if so, skip it
        if valid_df.shape[0] != 0:
            gaps = find_gaps_from_df(valid_df, size_of_gap, dsns_1_bday)
            if dsn in cc_dsns:
                cc = 1
            else:
                cc = 0
            for day in gaps:
                # Calculate age of baby and dont add a row if they are older than 1
                age, bday = baby_age(day, dsn_baby_info)
                count_total += 1
                if (age < pd.Timedelta(days=365)) and (age >= pd.Timedelta(days=0)):
                    count += 1
                    
                    # if day is w/in 2 weeks of when Tanner did the rollups - disregard. I got the data Jan 30
                    if (str(day) <= two_weeks_before_received) & (str(day) >= '2017-01-01 00:00:00'):
                        # Check if in the US
                        if in_US(day, dsn_location):
                            df_day = get_df_day(valid_df, day)
                            flatline = possible_flatline(df_day, low_HR_thres, high_HR_thres, O2_thres, valid_thres)
                            vitals = last_vitals(df_day) 
                            classifications.append((dsn, day.date(), flatline, vitals, cc, at_risk_or_hardware, age, bday))
    
    print('total babies with last day', count_total)
    print('total babies < 1 on last day', count)
    df_columns = ['dsn', 'date', 'critical_event', 'last_10_minutes', 'cc', 'at_risk_or_issues', 'baby_age', 'birthday']
    df_classified = pd.DataFrame(classifications, columns=df_columns)
    
    # Get rid of multiple last days for individual babies
    df_classified.sort_values(by='date', inplace=True)
    df_classified.drop_duplicates(subset=['dsn', 'birthday'], keep='last', inplace=True)
    return df_classified

#### Functions for sock off and signal loss

In [9]:
def load_2sec(dsn, day, time_index=False):
    '''Load 2 second data for the given dsn and day'''
    df_2sec = pd.read_csv(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_2sec/{day}/{dsn}.csv.gz', compression='gzip',names=column_names)
    df_2sec.timestamp = pd.to_datetime(df_2sec.timestamp, unit='s')
    df_2sec = df_2sec.sort_values(by=['timestamp'])
    df_2sec.drop_duplicates('timestamp',inplace=True)

    df_2sec = df_2sec.loc[(df_2sec.base_state > 3) & (df_2sec.ble_rssi != 0) & (df_2sec.heart_rate_raw >0)]
    df_2sec.reset_index(inplace=True, drop=True)
    
    if time_index:
        df_2sec.set_index('timestamp',inplace=True)
    
    return df_2sec

def lost_signal(df):
    '''Return the timestamp when the signal is lost'''
    consecutive_valid = df.notification_mask.eq(0).rolling(window=16).sum().fillna(0)
    if consecutive_valid[consecutive_valid>=15].shape[0] == 0:
        return 0
    return consecutive_valid[consecutive_valid>=15].index[-1]

def multiple_sock_off(df, thresh=-.05):
    '''Find the timestamps when the sock is taken off'''
    diff_new = df.skin_temperature.diff().fillna(0)
    diff_rolling = diff_new.rolling(300).mean().fillna(method='bfill') # different window?
    off_indices = []
    
    # find ALL indices of min..
    if min(diff_rolling) < thresh: # < must match one for less_than
        less_than = diff_rolling[diff_rolling < thresh]
        drop_indices = [0] + list(np.where(np.diff(less_than.index.values) > 30)[0] + 1)
        drop_indices = less_than.index.values[drop_indices]
        
        # For each index:
        for i in drop_indices:
            if i == 0:
                off_indices.append(0)
            else:
                off = diff_new.iloc[max(0,(i-275)):i]
                off = off.loc[off != 0]
                
                if off.shape[0] == 0:
                    off_indices.append(i)#??????
                else:
                    # - 3 because window = 4, max(0, ) because if idxmax < 3 you get a negative index
                    off_index_reset = max(0, off.eq(-1).reset_index().skin_temperature.rolling(window=4).sum().fillna(0).idxmax() - 3)
                    count_neg_1 = off.iloc[off_index_reset:].eq(-1).value_counts()

                    #  if there are no False or no True:
                    if len(count_neg_1) == 1:
                        if count_neg_1.index[0]:
                            off_index = off.index[off_index_reset]
                            off_indices.append(off_index)
                    elif count_neg_1[True]/(count_neg_1[False] + count_neg_1[True]) >= .7: # What threshhold?
                        off_index = off.index[off_index_reset] # This is the index where the sock came off
                        off_indices.append(off_index)
            
    return off_indices

def mvmt(x):
    if x == 0:
        return 1
    return int(bin(x)[-2])

def signal_sock(x):
    '''Determine if the sock came off before the signal was lost'''
    five_min = pd.Timedelta(minutes=5) #  is 5 min right choice
    try:
        df = load_2sec(x['dsn'], str(x['date']).replace('-','')) 
    except FileNotFoundError:
        return 'Need data'
    
    time_values = df.timestamp
    last_reading = df.loc[df.base_state > 3].timestamp.values[-1]
    loss = lost_signal(df) 
    sock = multiple_sock_off(df)
    # make timestamp the index
    df.set_index('timestamp',inplace=True)
    mvmt_flag = df.loc[(df.index > time_values[loss]) & (df.index < last_reading)].notification_mask.apply(mvmt) # are thresholds good?
    
    # if lost signal is within 1 min of last reading (before charging) return 'data cut off' 
    if abs(time_values[loss] - last_reading) < pd.Timedelta(minutes=1): # is 1 min good?
        return 'data cut off'
    # if sock comes off w/in ~5 min of lost signal return 'sock off'
    elif len(sock) > 0: 
        for sock_off in sock:
            if (sock_off < loss) or (abs(time_values[sock_off] - time_values[loss]) <= five_min): # is 5 minutes good?
                return 'sock off' 
            else:
                # if time btw loss and sock off has movement > 20% of the time => cut off 
                loss_to_off = mvmt_flag.loc[mvmt_flag.index < (time_values[sock_off] - pd.Timedelta(minutes=5))].rolling(window=100).sum().fillna(method='bfill')
                if loss_to_off.min() <= 20: # 20% or more?
                    return 'signal lost before sock off'
                else:
                    return 'sock off' # sock off or data cut off?
    
    # if time btw loss and last reading has movement > 20% => cut off
    loss_to_last = mvmt_flag.rolling(window=100).sum().fillna(method='bfill')
    if loss_to_last.min() > 20: # 20% or more?
        return 'data cut off'
    return 'signal lost before sock off'

# Run Algorithm

In [29]:
# Load all pickled files if kernal has been restarted
folder = '16k_round_2'
inclusion_dsns = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/inclusion_dsns.p', 'rb'))
location_data = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/16000_location_data.p', 'rb'))
# reg_baby_info = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/reg_baby_data_test.p', 'rb')).sort_values(by='registered_at')
dsns_1_bday = pickle.load(open(f'/Users/brodriguez/Documents/Owlet-code/{folder}/dsns_1_bday.p', 'rb'))

In [262]:
df_classified = find_critical_events(inclusion_dsns, location_data, baby_info, dsns_1_bday) 

HBox(children=(IntProgress(value=0, max=10932), HTML(value='')))


total babies with last day 14421
total babies < 1 on last day 11630


In [37]:
# Birthdays that were after the data collection 
# bdays_after_nov_2018 = baby_info.query("birthDate > '20181130'")
# for dsn in inclusion_dsns:
#     results = bdays_after_nov_2018.query('dsn == @dsn').birthDate.values
#     if len(results) > 0:
#         print(results)

In [263]:
14421-11630

2791

In [280]:
df_classified[(df_classified.critical_event == True) & (df_classified.at_risk_or_issues == True)].shape

(44, 8)

In [281]:
df_classified[df_classified.dsn == 'AC000W001132720']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
3630,AC000W001132720,2017-08-30,True,low O2,0,False,9 days,20170821


In [12]:
loc_round1 = pickle.load(open(f'/Volumes/Seagate Backup Plus Drive/Work/16k_round_1/16000_location_data.p', 'rb'))
folder = '16k_round_1'
one_bday_round1 = pickle.load(open(f'/Volumes/Seagate Backup Plus Drive/Work/16k_round_1/dsns_1_bday.p', 'rb'))
dsns = ['AC000W001052578']
df_classify_round1 = find_critical_events(dsns, loc_round1, baby_info, one_bday_round1)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


total babies with last day 1
total babies < 1 on last day 1


In [13]:
df_classify_round1

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
0,AC000W001052578,2017-09-24,False,Good vitals,0,False,27 days,20170828


In [26]:
'AC000W001052578' in v2_devices

True

In [235]:
df_classify_round1

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday


In [222]:
df_classify_round1[df_classify_round1.critical_event == False]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday
15,AC000W000422937,2017-04-24,False,low O2,0,True,12 days,20170412
6,AC000W001056677,2017-06-14,False,low HR,0,False,37 days,20170508
3,AC000W001204540,2017-09-16,False,Good vitals,1,True,25 days,20170822
10,AC000W001052578,2017-09-24,False,Good vitals,0,False,27 days,20170828
13,AC000W001179577,2017-11-03,False,Good vitals,1,False,187 days,20170430
5,AC000W001084878,2018-03-07,False,Good vitals,0,False,293 days,20170518
12,AC000W000339336,2018-07-03,False,Good vitals,1,False,230 days,20171115
0,AC000W001059756,2018-07-05,False,low O2,1,False,245 days,20171102
9,AC000W001212809,2018-07-08,False,Good vitals,0,True,21 days,20180617
1,AC000W002428238,2018-10-06,False,Good vitals,0,False,134 days,20180525


In [219]:
df_critical_round1 = df_classify_round1.loc[df_classify_round1.critical_event == True]
df_critical_round1['signal_sock'] = df_critical_round1.apply(signal_sock, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [220]:
df_critical_round1

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
11,AC000W001045532,2017-03-13,True,low O2,0,True,10 days,20170303,sock off
14,AC000W001096216,2017-09-23,True,high variance,0,True,2 days,20170921,sock off
4,AC000W001055680,2017-10-13,True,low O2,0,True,8 days,20171005,sock off
7,AC000W001211941,2018-02-11,True,low O2,1,True,230 days,20170626,sock off
8,AC000W001212809,2018-03-07,True,low O2,0,True,40 days,20180126,sock off
2,AC000W002502642,2018-07-21,True,low O2,0,True,72 days,20180510,sock off


In [127]:
df_classified.dsn.value_counts()

AC000W001041852    4
AC000W001081517    4
AC000W001047242    3
AC000W001125606    3
AC000W001127650    3
AC000W000420460    3
AC000W001041773    3
AC000W001092496    3
AC000W001202272    3
AC000W001041179    3
AC000W001127565    3
AC000W001038633    3
AC000W001047347    3
AC000W001128003    3
AC000W000459039    3
AC000W001205161    3
AC000W001181389    3
AC000W001048314    3
AC000W001042493    3
AC000W001114072    3
AC000W001132958    3
AC000W001041980    3
AC000W000331316    3
AC000W001052267    3
AC000W000400721    3
AC000W001205087    3
AC000W001123028    3
AC000W001125055    3
AC000W001081571    3
AC000W000022700    3
                  ..
AC000W001151217    1
AC000W001199159    1
AC000W001212917    1
AC000W002516086    1
AC000W001118493    1
AC000W001153595    1
AC000W003423038    1
AC000W002435338    1
AC000W001043827    1
AC000W001149392    1
AC000W001052980    1
AC000W001111317    1
AC000W001074252    1
AC000W002623751    1
AC000W001117138    1
AC000W001210962    1
AC000W0026664

In [132]:
len(list_of_dsns.intersection(v2_devices))
#dsns_with_info

11427

Get 2 second data for critical events before further classification

In [115]:
#dont want to apply this to whole df, just to critical events (have to get 2sec data on computer before)
df_critical = df_classified.loc[df_classified.critical_event == True]
if df_critical.shape[0] != 0:
    df_critical['signal_sock'] = df_critical.apply(signal_sock, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [138]:
# ** cases 
df_critical[(df_critical.last_10_minutes != 'high variance') &
            (df_critical.last_10_minutes != 'Good vitals') &
            (df_critical.last_10_minutes != 'not valid') &
            (df_critical.at_risk_or_issues == False) &
            (df_critical.signal_sock != 'sock off') & 
            (df_critical.signal_sock != 'Need data')]

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
6083,AC000W001048490,2017-05-29,True,low O2,0,False,71 days,20170319,signal lost before sock off
10291,AC000W001105945,2017-08-20,True,low O2,0,False,105 days,20170507,signal lost before sock off
3658,AC000W001052267,2017-10-20,True,low HR,1,False,9 days,20171011,signal lost before sock off
7480,AC000W001071406,2017-11-27,True,low O2,1,False,4 days,20171123,signal lost before sock off
4871,AC000W001099202,2018-02-22,True,low O2,0,False,209 days,20170728,signal lost before sock off
7712,AC000W001137082,2018-06-07,True,low O2,0,False,323 days,20170719,signal lost before sock off
6385,AC000W003378821,2018-07-03,True,low O2,0,False,120 days,20180305,data cut off
4984,AC000W002575870,2018-08-16,True,low O2,1,False,241 days,20171218,data cut off
2468,AC000W001150173,2018-08-16,True,low O2,0,False,15 days,20180801,signal lost before sock off
1923,AC000W001110819,2018-09-07,True,low O2,0,False,357 days,20170915,signal lost before sock off


In [139]:
# * cases
df_critical[(df_critical.last_10_minutes != 'Good vitals') &
            (df_critical.last_10_minutes != 'not valid') & 
            (df_critical.at_risk_or_issues == False) & 
            (((df_critical.last_10_minutes != 'high variance') & (df_critical.signal_sock == 'sock off')) | 
            ((df_critical.last_10_minutes == 'high variance') & (df_critical.signal_sock == 'signal lost before sock off')))]
 

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
3630,AC000W001132720,2017-08-30,True,low O2,0,False,9 days,20170821,sock off
1231,AC000W001205165,2017-11-01,True,low O2,0,False,33 days,20170929,sock off
6807,AC000W001099882,2018-01-05,True,low O2,0,False,182 days,20170707,sock off
3553,AC000W001060959,2018-02-12,True,low O2,0,False,251 days,20170606,sock off
2236,AC000W001181803,2018-03-22,True,high variance,1,False,237 days,20170728,signal lost before sock off
8322,AC000W001109623,2018-04-16,True,high variance,0,False,102 days,20180104,signal lost before sock off
2658,AC000W001204138,2018-05-19,True,low O2,0,False,206 days,20171025,sock off
2235,AC000W001145618,2018-05-31,True,low O2,0,False,268 days,20170905,sock off
3401,AC000W001153969,2018-07-22,True,low O2,1,False,349 days,20170807,sock off
1557,AC000W002669445,2018-08-21,True,low O2,1,False,52 days,20180630,sock off


In [140]:
# ? cases
df_critical[(df_critical.signal_sock != 'Need data') & 
            (df_critical.last_10_minutes != 'not valid') &
            (df_critical.last_10_minutes != 'Good vitals') &
            (((df_critical.last_10_minutes == 'high variance') & (df_critical.signal_sock != 'signal lost before sock off') & (df_critical.at_risk_or_issues == False)) |
            ((df_critical.at_risk_or_issues == True) & (df_critical.signal_sock != 'sock off')))]


Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock
8004,AC000W002432785,2017-12-22,True,high variance,1,True,151 days,20170724,signal lost before sock off
4485,AC000W001205315,2018-05-07,True,low O2,0,True,12 days,20180425,data cut off
6937,AC000W001106356,2018-05-16,True,high variance,0,False,18 days,20180428,data cut off
10219,AC000W001215123,2018-06-18,True,low O2,0,True,3 days,20180615,signal lost before sock off
10241,AC000W001057264,2018-07-02,True,low O2,0,True,33 days,20180530,data cut off
4817,AC000W001072954,2018-08-01,True,high variance,1,False,301 days,20171004,sock off
5724,AC000W003419384,2018-09-07,True,low O2,0,True,8 days,20180830,signal lost before sock off
9259,AC000W002455662,2018-09-07,True,low O2,1,True,21 days,20180817,signal lost before sock off
3328,AC000W001082234,2018-10-08,True,high variance,1,False,327 days,20171115,sock off
7474,AC000W001041852,2018-11-03,True,high variance,1,False,81 days,20180814,sock off


In [231]:
'AC000W001183543' in v2_devices

True

2 second that is still needed

In [119]:
df_critical[df_critical.signal_sock == 'Need data']

Unnamed: 0,dsn,date,critical_event,last_10_minutes,cc,at_risk_or_issues,baby_age,birthday,signal_sock


In [37]:
for row in df_critical.itertuples():
    print(row.dsn, row.date)

AC000W001084337 2017-02-25
AC000W000424430 2017-02-27
AC000W001124472 2017-03-16
AC000W001123812 2017-03-22
AC000W001181483 2017-03-22
AC000W001124776 2017-04-02
AC000W001051517 2017-04-10
AC000W001123919 2017-04-15
AC000W001038945 2017-04-15
AC000W001092350 2017-04-26
AC000W001122185 2017-04-26
AC000W001042848 2017-04-30
AC000W001042935 2017-05-01
AC000W001057151 2017-05-06
AC000W001179694 2017-05-12
AC000W000419890 2017-05-18
AC000W001084993 2017-05-25
AC000W001048490 2017-05-29
AC000W001134913 2017-06-14
AC000W001099714 2017-07-11
AC000W001059658 2017-07-21
AC000W001053602 2017-07-25
AC000W001100036 2017-07-27
AC000W001126420 2017-07-31
AC000W001090283 2017-08-16
AC000W001105945 2017-08-20
AC000W001132720 2017-08-30
AC000W001095726 2017-09-02
AC000W001081522 2017-09-10
AC000W001103068 2017-09-21
AC000W001199159 2017-09-22
AC000W001153751 2017-10-03
AC000W001057901 2017-10-08
AC000W001052267 2017-10-20
AC000W001204434 2017-10-21
AC000W001127719 2017-10-29
AC000W001205165 2017-11-01
A

In [43]:
def get_all_dsns2(all_file_paths):
    '''Pickle a set of the unique dsns'''
    my_list = set([])
    for path in all_file_paths:
        my_list.add(path[-22:-7]) 
    return my_list

In [40]:
folder_dsns = {}

In [41]:
binarymask9_files = glob('/Volumes/My Passport/BinaryMask9/*/*.csv.gz')

In [42]:
binarymask10_files = glob('/Volumes/My Passport/BinaryMask10/*/*.csv.gz')
binarymask11_files = glob('/Volumes/My Passport/BinaryMask11/*/*.csv.gz')
binarymask12_files = glob('/Volumes/My Passport/BinaryMask12/*/*.csv.gz')
binarymask13_files = glob('/Volumes/My Passport/BinaryMask13/*/*.csv.gz')
binarymask14_files = glob('/Volumes/My Passport/BinaryMask14/*/*.csv.gz')
binarymask15_files = glob('/Volumes/My Passport/BinaryMask15/*/*.csv.gz')
binarymask16_files = glob('/Volumes/My Passport/BinaryMask16/*/*.csv.gz')

In [49]:
binarymask9_files[0][-22:-7]

'AC000W000098114'

In [44]:
folder_dsns['BinaryMask9'] = get_all_dsns2(binarymask9_files)

In [45]:
folder_dsns['BinaryMask10'] = get_all_dsns2(binarymask10_files)
folder_dsns['BinaryMask11'] = get_all_dsns2(binarymask11_files)
folder_dsns['BinaryMask12'] = get_all_dsns2(binarymask12_files)
folder_dsns['BinaryMask13'] = get_all_dsns2(binarymask13_files)
folder_dsns['BinaryMask14'] = get_all_dsns2(binarymask14_files)
folder_dsns['BinaryMask15'] = get_all_dsns2(binarymask15_files)
folder_dsns['BinaryMask16'] = get_all_dsns2(binarymask16_files)

In [47]:
pickle.dump(folder_dsns, open('folder_dsns.p', 'wb'))

In [67]:
count = 0
for row in df_critical.itertuples():
    folders_ = ['BinaryMask9', 'BinaryMask10', 'BinaryMask11', 'BinaryMask12', 'BinaryMask13', 'BinaryMask14', 'BinaryMask15', 'BinaryMask16']
    for f in folders_:
        if row.dsn in folder_dsns[f]:
            count += 1
            print(f)

BinaryMask14
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask14
BinaryMask16
BinaryMask10
BinaryMask15
BinaryMask15
BinaryMask11
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask16
BinaryMask13
BinaryMask15
BinaryMask13
BinaryMask13
BinaryMask11
BinaryMask9
BinaryMask10
BinaryMask15
BinaryMask13
BinaryMask9
BinaryMask11
BinaryMask15
BinaryMask16
BinaryMask14
BinaryMask13
BinaryMask9
BinaryMask15
BinaryMask11
BinaryMask14
BinaryMask10
BinaryMask10
BinaryMask14
BinaryMask9
BinaryMask14
BinaryMask12
BinaryMask15
BinaryMask16
BinaryMask16
BinaryMask9
BinaryMask12
BinaryMask15
BinaryMask9
BinaryMask14
BinaryMask14
BinaryMask9
BinaryMask13
BinaryMask13
BinaryMask11
BinaryMask9
BinaryMask16
BinaryMask15
BinaryMask14
BinaryMask11
BinaryMask15
BinaryMask10
BinaryMask11
BinaryMask12
BinaryMask14
BinaryMask10
BinaryMask11
BinaryMask16
BinaryMask14
BinaryMask10
BinaryMask14
BinaryMask12
BinaryMask12
BinaryMask14
BinaryMask10
BinaryMask9
BinaryMask16
BinaryMask14
BinaryMask10
BinaryMask9
BinaryMask1

In [76]:
count

223

In [None]:
cp BinaryMask11/20181103/AC000W001041852* ~/Documents/Owlet-code/16k_round_2/16000_2sec/20181103/

In [76]:
'AC000W001048490' in folder_dsns['BinaryMask13']

True

In [113]:
dictionary['AC000W002665178']

['/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20160101/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180227/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180506/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180507/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180508/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180509/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180510/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180511/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180512/AC000W002665178.p',
 '/Users/brodriguez/Documents/Owlet-code/16k_round_2/16000_day_rollups/20180513/AC000W002665178.p',


In [None]:
cp Parsed16000/20170608/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170608/
cp Parsed16000/20170607/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170607/
cp Parsed16000/20170606/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170606/
cp Parsed16000/20170605/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170605/
cp Parsed16000/20170604/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170604/
cp Parsed16000/20170603/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170603/
cp Parsed16000/20170602/AC000W000421078* ~/Documents/Owlet-code/16k_round_1/16000_2sec/20170602/